In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install fancyimpute

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m928.8 kB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29881 sha256=b4758f3ba8fc6350f6949f38b9f9a57d2bc9f350d89719aa41ca217a933dcf0f
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-non

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif

In [4]:
# Read the data
PATH = '/content/drive/MyDrive/Dataset/'
df = pd.read_csv(PATH + 'csv_preprocessed.csv')
print(df.shape)

(1215675, 63)


In [5]:
# Define features and target
X = df.drop(columns=['Attack Type'])
y = df['Attack Type']

In [6]:
# Perform mutual information feature selection
mutual_info_arr = mutual_info_classif(X, y)
top_features = X.columns[np.argsort(mutual_info_arr)[::-1][:15]]
X_selected = X[top_features]

In [7]:
# Perform PCA
pca = PCA(n_components=15)
X_pca = pca.fit_transform(X_selected)

In [8]:
# Define the parameter grid - Part 1
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

In [9]:
# Initialize Random Forest Classifier
rfc = RandomForestClassifier(random_state=42)

In [10]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring='accuracy')

In [11]:
# Perform Grid Search
grid_search.fit(X_pca, y)

In [12]:
# Get the best parameters
best_params_part1 = grid_search.best_params_

In [13]:
# Train the model with the best parameters from Part 1
best_rfc_part1 = RandomForestClassifier(**best_params_part1, random_state=42)
best_rfc_part1.fit(X_pca, y)

In [14]:
# Predictions
y_pred_part1 = best_rfc_part1.predict(X_pca)

In [15]:
# Evaluate the model
accuracy_part1 = accuracy_score(y, y_pred_part1)
precision_part1 = precision_score(y, y_pred_part1, average='weighted')
recall_part1 = recall_score(y, y_pred_part1, average='weighted')
f1_part1 = f1_score(y, y_pred_part1, average='weighted')

In [16]:
# Print evaluation metrics
print(f'Accuracy: {accuracy_part1}')
print(f'Precision: {precision_part1}')
print(f'Recall: {recall_part1}')
print(f'F1 Score: {f1_part1}')

Accuracy: 0.9996257223353281
Precision: 0.9996258865366346
Recall: 0.9996257223353281
F1 Score: 0.9996256136310316
