In [1]:
!pip install fancyimpute
!pip install pandas



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif

In [3]:
# Read the data
df = pd.read_csv('C:/Users/user/Downloads/517/csv_preprocessed.csv')
print(df.shape)

(1215675, 63)


In [4]:
# Define features and target
X = df.drop(columns=['Attack Type'])
y = df['Attack Type']

In [5]:
# Perform mutual information feature selection
mutual_info_arr = mutual_info_classif(X, y)
top_features = X.columns[np.argsort(mutual_info_arr)[::-1][:15]]
X_selected = X[top_features]

In [6]:
# Perform PCA
pca = PCA(n_components=15)
X_pca = pca.fit_transform(X_selected)

In [7]:
# Define the parameter grid
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

In [8]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, scoring='accuracy', verbose=10)

In [9]:
# Perform Grid Search
grid_search.fit(X_pca, y)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV 1/5; 1/324] START bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10
[CV 1/5; 1/324] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.989 total time=   7.0s
[CV 2/5; 1/324] START bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10
[CV 2/5; 1/324] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.989 total time=   7.5s
[CV 3/5; 1/324] START bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10
[CV 3/5; 1/324] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.985 total time=   7.3s
[CV 4/5; 1/324] START bootstrap=True, criterion=gini, max_depth=None, min_samples

In [10]:
# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [11]:
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: 0.995631233676764


In [12]:
# Train the model with the best parameters
best_rfc = grid_search.best_estimator_

In [13]:
# Predictions
y_pred = best_rfc.predict(X_pca)

In [14]:
# Evaluate the model
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='weighted')
recall = recall_score(y, y_pred, average='weighted')
f1 = f1_score(y, y_pred, average='weighted')

In [15]:
# Print evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9999506447035598
Precision: 0.9999506639476778
Recall: 0.9999506447035598
F1 Score: 0.9999506467722727
