Imports + Load encoded CSVs

In [1]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Read the already-encoded train and test CSVs into DataFrames.
# The generator yields the frames in the same order as the paths, so the
# first one is bound to `train_df` and the second to `test_df`.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/nsl_kdd_train_encoded.csv",
        "../data/processed/nsl_kdd_test_encoded.csv",
    )
)

# Quick shape check: (rows, columns) of each split.
print(train_df.shape, test_df.shape)


(125973, 125) (22544, 125)


Build X/y (NO LEAKAGE: `label`, `attack_type`, and `difficulty` are excluded from the features)

In [3]:
# `label` is the prediction target. It is removed from the feature matrix
# together with `attack_type` and `difficulty`, so none of these three
# columns is available to the model as an input.
drop_cols = ["attack_type", "difficulty", "label"]

# Each split yields a (features, target) pair built from the same frame.
X_train, y_train = train_df.drop(columns=drop_cols), train_df["label"]
X_test, y_test = test_df.drop(columns=drop_cols), test_df["label"]

# Shape check: feature matrices and target vectors for both splits.
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test :", X_test.shape,  "y_test :", y_test.shape)


X_train: (125973, 122) y_train: (125973,)
X_test : (22544, 122) y_test : (22544,)


Train Baseline Random Forest (no tuning)

In [4]:
# Baseline random forest with fixed, untuned hyperparameters.
rf_params = dict(
    n_estimators=200,         # number of trees in the forest
    random_state=42,          # fixed seed so the fit is reproducible
    n_jobs=-1,                # train trees in parallel on all available cores
    class_weight="balanced",  # helps because train labels are not perfectly balanced
)

# `fit` returns the estimator itself, so `rf` is the fitted classifier.
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)
print("Random Forest trained.")


Random Forest trained.


Evaluate on test set

In [5]:
# Predict on the test features with the fitted forest.
y_pred = rf.predict(X_test)

# Compute each metric once, then report it.
test_accuracy = accuracy_score(y_test, y_pred)
# scikit-learn convention: rows are true classes, columns are predicted classes.
test_confusion = confusion_matrix(y_test, y_pred)
# Per-class precision / recall / F1, formatted to 4 decimal places.
test_report = classification_report(y_test, y_pred, digits=4)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


Accuracy: 0.7648154719659332

Confusion Matrix:
 [[9448  263]
 [5039 7794]]

Classification Report:
               precision    recall  f1-score   support

           0     0.6522    0.9729    0.7809      9711
           1     0.9674    0.6073    0.7462     12833

    accuracy                         0.7648     22544
   macro avg     0.8098    0.7901    0.7635     22544
weighted avg     0.8316    0.7648    0.7611     22544



Feature Importance (Top 25)

In [6]:
# Pair every training feature name with the importance score the fitted
# forest assigned to it, then order the table from most to least important.
# The original positional index is kept, so it still identifies each
# feature's column position in X_train.
feat_imp = (
    pd.DataFrame({"feature": X_train.columns})
    .assign(importance=rf.feature_importances_)
    .sort_values(by="importance", ascending=False)
)

# Display the 25 highest-ranked features.
feat_imp.head(25)


Unnamed: 0,feature,importance
1,src_bytes,0.139637
2,dst_bytes,0.088775
120,flag_SF,0.085726
29,dst_host_srv_count,0.061388
8,logged_in,0.052917
25,same_srv_rate,0.050022
30,dst_host_same_srv_rate,0.044055
26,diff_srv_rate,0.041608
35,dst_host_srv_serror_rate,0.040203
19,count,0.032551


Save model + importance

In [7]:
# Persist the fitted baseline model and the feature-importance table.
model_path = "../models/rf/rf_baseline.joblib"
importance_path = "../reports/rf_feature_importance.csv"

# Create the parent directory of *every* output file before writing.
# Previously only ../models/rf was created, so `to_csv` failed on a fresh
# checkout where ../reports did not exist yet.
for out_path in (model_path, importance_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

joblib.dump(rf, model_path)
# index=False: the positional index is not written to the CSV.
feat_imp.to_csv(importance_path, index=False)

print("Saved model: models/rf/rf_baseline.joblib")
print("Saved importance: reports/rf_feature_importance.csv")


Saved model: models/rf/rf_baseline.joblib
Saved importance: reports/rf_feature_importance.csv
