In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib



In [7]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be read by path in the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
# Read cleaned.csv from the mounted Drive folder into a DataFrame.
csv_path = '/content/drive/MyDrive/dataset/cleaned.csv'
df = pd.read_csv(csv_path)

In [27]:
# Preview the first five rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Protocol_6,Protocol_17,Label
0,112640768.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,56300000.0,138.592929,56300000.0,56300000.0,0,0,Benign
1,112641773.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,56300000.0,263.750829,56300000.0,56300000.0,0,0,Benign
2,20784143.0,23.0,44.0,2416.0,1344.0,240.0,64.0,105.043478,54.542292,64.0,...,0.0,2624734.0,2624734.0,9058214.0,0.0,9058214.0,9058214.0,1,0,Benign
3,112640836.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,56300000.0,82.024387,56300000.0,56300000.0,0,0,Benign
4,20.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,Benign


In [28]:
# (rows, columns) of the loaded frame.
df.shape

(2192355, 79)

In [29]:
# Normalise the class column: cast every value to str and trim surrounding
# whitespace so that otherwise-identical labels are not treated as distinct.
df["Label"] = df["Label"].astype(str).str.strip()   # clean strings

In [30]:
# Turn the string class names into integer codes. The original "Label" column
# is left in place, so df carries both the readable and the encoded label.
le = LabelEncoder()
encoded_labels = le.fit_transform(df["Label"])
df["Label_enc"] = encoded_labels

# Features: every column except the two label columns (copied so later edits
# to X never touch df). Target: the integer-encoded label.
label_columns = ["Label", "Label_enc"]
X = df.drop(columns=label_columns, errors="ignore").copy()
y = df["Label_enc"]

In [31]:
# Feature matrix dimensions after dropping the label columns.
X.shape

(2192355, 78)

In [32]:
# Target vector length; must match the row count of X.
y.shape

(2192355,)

In [33]:
# Turn +inf / -inf into NaN so that the NaN handling below also removes
# infinite values. X is rebound to the result rather than edited in place.
infinite_values = [np.inf, -np.inf]
X = X.replace(infinite_values, np.nan)


In [34]:
# Total number of missing cells, first in the features, then in the target.
for part in (X, y):
    print(part.isna().sum().sum())

686
0


In [35]:
# Flag the rows of X in which every feature is present.
mask = X.notna().all(axis=1)   # True = row has no NaN

# Filter X and y with the same mask so features and target stay aligned.
X, y = X.loc[mask].copy(), y.loc[mask].copy()

print("After dropping NaN rows,  X:", X.shape, "y:", y.shape)


After dropping NaN rows,  X: (2192012, 78) y: (2192012,)


In [36]:
# Re-count missing cells after the row drop; both numbers should now be zero.
missing_in_features = X.isna().sum().sum()
missing_in_target = y.isna().sum().sum()
print(missing_in_features)
print(missing_in_target)

0
0


In [37]:
# Feature matrix dimensions after the NaN rows were removed.
X.shape

(2192012, 78)

In [38]:
# Target length after the NaN rows were removed; must still match X.
y.shape

(2192012,)

In [39]:
# Sanity check on the cleaned feature matrix before splitting: no +inf, -inf
# or NaN may remain. The array under test has to be X: y only holds the
# integer codes produced by LabelEncoder, so scanning it for infinities would
# say nothing about the features.
arr = X.to_numpy()

has_posinf = np.isposinf(arr).any()
has_neginf = np.isneginf(arr).any()
print("Has +inf?", has_posinf, "| Has -inf?", has_neginf)
# The printed label says "Train", but at this point the check covers all of X
# (the train/test split happens afterwards).
print("Train has NaN?", X.isna().any().any())

Has +inf? False | Has -inf? False
Train has NaN? False


In [40]:
# Hold out 20% of the rows as the Test set and keep 80% as Train.
# Stratifying on y keeps the class proportions the same in both parts, and
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=101, stratify=y
)

In [41]:
# Dimensions of the 80% Train portion.
X_train.shape

(1753609, 78)

In [42]:
# Carve a Validation split out of Train: 85% of Train stays as Train_fit and
# 15% becomes Validation (0.15 * 0.80 = 12% of the full dataset).
X_train_fit, X_val, y_train_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=101,
    stratify=y_train,   # same class proportions in Train_fit and Validation
)

In [44]:
# Dimensions of the Train_fit portion (Train minus Validation).
X_train_fit.shape

(1490567, 78)

In [45]:
# Confirm the Train_fit features contain no infinities and no NaN before
# any model is fitted on them.
arr = X_train_fit.to_numpy()

has_posinf = (arr == np.inf).any()
has_neginf = (arr == -np.inf).any()
print("Has +inf?", has_posinf, "| Has -inf?", has_neginf)
print("Train has NaN?", X_train_fit.isna().to_numpy().any())

Has +inf? False | Has -inf? False
Train has NaN? False


In [46]:
# Features and target of Train_fit must have the same number of rows.
for part in (X_train_fit, y_train_fit):
    print(part.shape)

(1490567, 78)
(1490567,)


In [47]:
# Persist every split (plus the full X / y) in a single joblib file written
# to the current working directory.
# The fitted LabelEncoder is stored alongside them: the y_* entries hold
# integer codes only, and without `le` those codes cannot be mapped back to
# the original class names after the bundle is reloaded. Existing keys are
# unchanged, so code that already reads this bundle keeps working.
joblib.dump({
    "X_train": X_train,
    "X_train_fit": X_train_fit,
    "X_val": X_val,
    "X_test": X_test,
    "y_train_fit": y_train_fit,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test,
    "X" : X,
    "y" : y,
    "label_encoder": le,
}, "splits_bundle.joblib")

['splits_bundle.joblib']

Random Forest Feature Selection

In [57]:
# Base forest configuration. The search grid below varies n_estimators, so
# the 500 given here only applies if `model` is fitted on its own.
rf_settings = dict(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    max_features="sqrt",
    class_weight="balanced_subsample",
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=101,
)
model = RandomForestClassifier(**rf_settings)

# Candidate forest sizes for the search: 100, 150, ..., 400 trees.
hyperparameters = {'n_estimators': list(range(100, 401, 50))}

In [58]:
# 5-fold cross-validated search over `hyperparameters`, using all available
# CPU cores and printing progress.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score — confirm that is the metric wanted here.
search_options = {"cv": 5, "verbose": 1, "n_jobs": -1}
rf = GridSearchCV(model, hyperparameters, **search_options)

In [59]:
# Run the search on the Train_fit split only; the Validation and Test splits
# are not passed to the search.
rf.fit(X_train_fit, y_train_fit)


Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [69]:
# Keep the winning forest and report the search outcome in order:
# best cross-validation score, chosen parameters, and the estimator itself.
rf_model = rf.best_estimator_
for outcome in (rf.best_score_, rf.best_params_, rf_model):
    print(outcome)

0.9994626205744239
{'n_estimators': 250}
RandomForestClassifier(class_weight='balanced_subsample', n_estimators=250,
                       n_jobs=-1, random_state=101)


In [70]:
# Rank every feature by the best forest's feature_importances_, highest
# first, labelled with the Train_fit column names.
imp = (
    pd.Series(rf_model.feature_importances_, index=X_train_fit.columns)
    .sort_values(ascending=False)
)
imp


Unnamed: 0,0
Fwd Seg Size Min,0.069594
Init Fwd Win Byts,0.053572
Bwd Pkts/s,0.048814
Flow Duration,0.045217
Fwd IAT Min,0.038021
...,...
Bwd Blk Rate Avg,0.000000
Fwd Pkts/b Avg,0.000000
Fwd Byts/b Avg,0.000000
CWE Flag Count,0.000000


In [None]:
# Pick the smallest number of top-ranked features whose cumulative importance
# reaches the threshold.
IMPORTANCE_THRESHOLD = 0.92

# `imp` is sorted in descending order, so `cum` is the running total of the
# ranked importances.
cum = imp.cumsum()

# Every feature whose running total is still strictly below the threshold is
# needed, plus the one that reaches it. The comparison must be strict: with
# `<=`, a running total that lands exactly on the threshold would pull in one
# feature more than the minimum. The clamp keeps topk within the number of
# available features if the threshold is never reached.
topk = min(int((cum < IMPORTANCE_THRESHOLD).sum()) + 1, len(imp))
feat_selected = imp.index[:topk].tolist()
print(f"Selected {topk} features to reach ≥{IMPORTANCE_THRESHOLD:.0%} cumulative importance.")


Selected 40 features to reach ≥92% cumulative importance.


In [75]:
# Display the selected feature names, ordered from most to least important.
feat_selected

['Fwd Seg Size Min',
 'Init Fwd Win Byts',
 'Bwd Pkts/s',
 'Flow Duration',
 'Fwd IAT Min',
 'Flow Pkts/s',
 'Fwd IAT Tot',
 'Fwd Pkts/s',
 'Flow Byts/s',
 'Flow IAT Mean',
 'Subflow Fwd Pkts',
 'Fwd IAT Max',
 'Flow IAT Max',
 'Fwd Header Len',
 'Fwd IAT Mean',
 'Fwd Pkt Len Mean',
 'Tot Fwd Pkts',
 'Flow IAT Min',
 'Subflow Fwd Byts',
 'Pkt Size Avg',
 'Fwd Seg Size Avg',
 'Pkt Len Mean',
 'RST Flag Cnt',
 'TotLen Fwd Pkts',
 'ECE Flag Cnt',
 'Pkt Len Max',
 'Bwd Pkt Len Mean',
 'Flow IAT Std',
 'Bwd Pkt Len Max',
 'Bwd Seg Size Avg',
 'TotLen Bwd Pkts',
 'Down/Up Ratio',
 'Fwd Pkt Len Max',
 'Pkt Len Std',
 'Bwd Header Len',
 'Subflow Bwd Byts',
 'Bwd Pkt Len Std',
 'Pkt Len Var',
 'Fwd Pkt Len Std',
 'Fwd IAT Std']

In [76]:
# Restrict the Train_fit, Validation and Test splits to the RF-selected
# columns, in the same column order, copying so the full-width splits stay
# untouched.
X_train_fit_selected, X_val_selected, X_test_selected = (
    split[feat_selected].copy() for split in (X_train_fit, X_val, X_test)
)

In [77]:
# The reduced feature matrix must still line up row-for-row with its target.
for part in (X_train_fit_selected, y_train_fit):
    print(part.shape)

(1490567, 40)
(1490567,)


In [78]:
# Save the three reduced (selected-columns) splits together in one joblib file.
selected_splits = {
    "X_train_fit_selected": X_train_fit_selected,
    "X_val_selected": X_val_selected,
    "X_test_selected": X_test_selected,
}
joblib.dump(selected_splits, "extracted_features_bundle.joblib")

['extracted_features_bundle.joblib']

In [79]:
# Unfitted scaler with default settings; it is fitted in the next cell.
scaler = StandardScaler()


In [80]:
# Fit the scaler on the Train_fit features only, so that no statistics from
# the Validation or Test splits enter the scaling parameters.
scaler.fit(X_train_fit_selected)

In [81]:
def _scaled(frame):
    """Transform `frame` with the fitted scaler, returning a DataFrame that keeps
    the selected column names and the row index of `frame`."""
    return pd.DataFrame(scaler.transform(frame), columns=feat_selected, index=frame.index)


# Apply the same Train_fit-fitted scaling to all three splits.
X_train_fit_selected_scaled = _scaled(X_train_fit_selected)
X_val_selected_scaled = _scaled(X_val_selected)
X_test_selected_scaled = _scaled(X_test_selected)

In [82]:
# Save the scaled splits together with what is needed to reproduce the
# scaling on new data: the fitted StandardScaler and the ordered list of
# columns it was fitted on. Without these two entries the bundle holds only
# already-transformed frames, so unseen samples could not be scaled
# consistently. The three original keys are unchanged.
joblib.dump(
    {
    "X_train_fit_selected_scaled" : X_train_fit_selected_scaled,
    "X_val_selected_scaled" : X_val_selected_scaled,
    "X_test_selected_scaled" : X_test_selected_scaled,
    "scaler" : scaler,
    "feat_selected" : feat_selected
    },
    "standard_scaler_bundle.joblib"
)


['standard_scaler_bundle.joblib']

In [83]:
# Scaling must not change the shape: same rows as the target, same columns.
for part in (X_train_fit_selected_scaled, y_train_fit):
    print(part.shape)

(1490567, 40)
(1490567,)
