In [40]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")


In [44]:
# Cell 2: Load dataset
# Read the flow-level CSV into `df`; every later cell derives its inputs from it.
df = pd.read_csv(filepath_or_buffer='balanced_synthetic_cicids.csv')

# Report (rows, columns), then preview the first five rows as the cell output.
print("✅ Data loaded. Shape:", df.shape)
df.head(n=5)


✅ Data loaded. Shape: (99996, 16)


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packet Length Mean,Bwd Packet Length Mean,Flow Bytes/s,Flow Packets/s,Fwd IAT Mean,Bwd IAT Mean,Fwd Header Length,Bwd Header Length,Packet Length Mean,Down/Up Ratio,Avg Fwd Segment Size,Avg Bwd Segment Size,Label
0,372911,40,61,207.578808,498.354903,590056.393368,5340.1596,279.382916,71.379364,30,50,251.537041,7,276.77057,869.4848,BENIGN
1,416833,75,54,470.777028,391.274887,981285.62966,9623.016867,196.067989,337.417042,51,25,765.04709,5,582.283212,116.340323,BENIGN
2,658691,25,8,143.566882,182.194382,68432.128204,776.293122,432.614975,635.716569,40,51,573.098893,9,329.492926,29.667906,BENIGN
3,242783,82,16,197.040143,278.526344,914901.025178,1528.357143,371.101643,364.160552,23,47,798.631238,4,90.656616,727.16539,BENIGN
4,845150,64,41,243.68787,312.489184,273477.596633,6911.016963,665.448001,248.391561,34,21,241.411675,4,692.342537,968.006828,BENIGN


# 1. Feature Engineering
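# Use OneHotEncoder for categorical features (method, endpoint, attack_type)
# Use TF-IDF for the 'payload' feature (textual data)
# NOTE: the dataset loaded above has no 'method', 'endpoint', 'attack_type' or 'payload' columns, so neither step is applied in this notebook; the preprocessing cell below only label-encodes 'Label' and standard-scales the numeric features.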

# Define the target (attack type)

In [45]:
# Cell 3: Preprocessing
# Builds the feature matrix (X, X_scaled) and the integer-encoded target (y)
# from `df` WITHOUT modifying `df`, so re-running this cell always gives the
# same result. Encoding df['Label'] in place meant a second run re-fitted the
# encoder on already-encoded integers and lost the original class names in
# label_encoder.classes_.

# Identifier-style columns to exclude; only the ones actually present are dropped.
columns_to_drop = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
present_to_drop = [col for col in columns_to_drop if col in df.columns]

# Encode the class labels as integers. y stays a Series aligned with df's index.
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(df['Label']), index=df.index, name='Label')

# Separate features from the label (and from the identifier columns).
X = df.drop(columns=present_to_drop + ['Label'])

# Scale features to zero mean / unit variance.
# NOTE(review): the scaler is fitted on ALL rows, i.e. before the train/test
# split performed in the later cells, so test-set statistics influence the
# scaling. Tree ensembles are insensitive to feature scaling, but for a strict
# evaluation (or a scale-sensitive model) fit the scaler on the training
# portion only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("✅ Preprocessing complete. Features shape:", X_scaled.shape)


✅ Preprocessing complete. Features shape: (99996, 15)


In [32]:
# Optional: oversample minority classes with SMOTE.
# The split is done FIRST and SMOTE is fitted on the training portion only, so
# no synthetic sample is interpolated from rows that end up in the test set
# (resampling before splitting leaks test information into training).
# NOTE(review): "Cell 4: Train-Test Split" below re-splits X_scaled and
# overwrites X_train / X_test / y_train / y_test, so on a top-to-bottom run the
# model is NOT trained on the resampled data produced here.
from imblearn.over_sampling import SMOTE

# Stratified hold-out split, using the same settings as Cell 4.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# Resample the training data only; the test set keeps its real distribution.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)
X_train, y_train = X_res, y_res


In [46]:
# Cell 4: Train-Test Split
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("🔄 Train size:", len(X_train), " | Test size:", len(X_test))


🔄 Train size: 79996  | Test size: 20000


In [47]:
# Cell 5: Train Random Forest Classifier
# 100 trees with a fixed seed; class_weight='balanced' weights classes inversely
# to their frequency in the training labels. fit() returns the estimator itself,
# so `model` is the fitted classifier.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)
print("✅ Model training complete.")


✅ Model training complete.


In [35]:

# Persist the fitted model together with the label encoder and scaler it was
# trained with, one pickle file per object, written in this order.
for filename, artifact in (
    ('rf_model.pkl', model),
    ('label_encoder.pkl', label_encoder),
    ('scaler.pkl', scaler),
):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print("✅ Model, label encoder, and scaler saved to disk.")


✅ Model, label encoder, and scaler saved to disk.


In [48]:
# Cell 6: Evaluate Model
# Predict on the held-out rows, then report overall accuracy and per-class
# precision / recall / F1 using the original class names from the encoder.
# NOTE(review): the recorded run scored about 16% on six equally sized classes,
# which is chance level (1/6), so the features appear to carry no signal about
# the label in this dataset. Confirm before relying on the saved model.
y_pred = model.predict(X_test)

acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("🎯 Accuracy:", round(acc * 100, 2), "%")
print(
    "\n📊 Classification Report:\n",
    classification_report(
        y_test,
        y_pred,
        target_names=[str(name) for name in label_encoder.classes_],
    ),
)



🎯 Accuracy: 16.12 %

📊 Classification Report:
               precision    recall  f1-score   support

      BENIGN       0.16      0.18      0.17      3333
  BruteForce       0.16      0.17      0.17      3333
        DDoS       0.16      0.16      0.16      3333
    PortScan       0.16      0.16      0.16      3334
        SQLi       0.16      0.16      0.16      3334
         XSS       0.16      0.14      0.15      3333

    accuracy                           0.16     20000
   macro avg       0.16      0.16      0.16     20000
weighted avg       0.16      0.16      0.16     20000



In [None]:
# Cell 7: Confusion Matrix
# Heatmap of test-set counts: rows are the true classes, columns the predicted
# classes, both labelled with the encoder's class names.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()
