In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

In [3]:
train_path = '/content/drive/MyDrive/UNSW/UNSW_NB15_training-set.csv'
test_path = '/content/drive/MyDrive/UNSW/UNSW_NB15_testing-set.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("Training shape:", train_df.shape)
print("Testing shape:", test_df.shape)
train_df.head()

Training shape: (175341, 45)
Testing shape: (82332, 45)


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [4]:
X_train = train_df.drop(['id','label','attack_cat'], axis=1, errors='ignore')
y_train = train_df['label']

X_test = test_df.drop(['id','label','attack_cat'], axis=1, errors='ignore')
y_test = test_df['label']


In [6]:
from sklearn.preprocessing import OrdinalEncoder
import joblib

cat_cols = []
for col in ["proto", "service", "state"]:
    if col in X_train.columns and X_train[col].dtype == "object":
        cat_cols.append(col)

print("Categorical columns to encode:", cat_cols)

feature_encoders = {}

for col in cat_cols:
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    X_train[[col]] = enc.fit_transform(X_train[[col]].astype(str))
    X_test[[col]] = enc.transform(X_test[[col]].astype(str))
    feature_encoders[col] = enc
    joblib.dump(enc, f"{col}_encoder.pkl")   # ✅ save locally in notebook folder

print("✅ Encoders saved for:", cat_cols if cat_cols else "No categorical columns found")



Categorical columns to encode: ['service', 'state']
✅ Encoders saved for: ['service', 'state']


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Scale features
scaler_rf = StandardScaler()
X_train_scaled = scaler_rf.fit_transform(X_train)
X_test_scaled = scaler_rf.transform(X_test)

# Train RandomForest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = rf_model.predict(X_test_scaled)
print("RF Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ✅ Save locally (not Drive)
joblib.dump(rf_model, "rf_model.pkl")
joblib.dump(scaler_rf, "scaler_rf.pkl")

print("✅ Random Forest model + scaler saved locally")


RF Accuracy: 0.8713379973764758
              precision    recall  f1-score   support

           0       0.97      0.73      0.84     37000
           1       0.82      0.98      0.89     45332

    accuracy                           0.87     82332
   macro avg       0.90      0.86      0.87     82332
weighted avg       0.89      0.87      0.87     82332

✅ Random Forest model + scaler saved locally


In [9]:
from google.colab import files

files.download("rf_model.pkl")
files.download("scaler_rf.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
from sklearn.preprocessing import OrdinalEncoder
import joblib

# Columns we care about
cat_cols = ["proto", "service", "state"]
feature_encoders = {}

for col in cat_cols:
    if col in X_train.columns and X_train[col].dtype == "object":
        enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        X_train[[col]] = enc.fit_transform(X_train[[col]].astype(str))
        X_test[[col]] = enc.transform(X_test[[col]].astype(str))
        feature_encoders[col] = enc
        joblib.dump(enc, f"{col}_encoder.pkl")   # ✅ save locally
        print(f"✅ Saved {col}_encoder.pkl")
    else:
        print(f"⚠️ Skipped {col} (not object type or missing)")

print("✅ Encoders ready:", list(feature_encoders.keys()))


⚠️ Skipped proto (not object type or missing)
⚠️ Skipped service (not object type or missing)
⚠️ Skipped state (not object type or missing)
✅ Encoders ready: []


In [12]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ---------------------------
# Prepare multi-class dataset
# ---------------------------
X_train_mc = train_df.drop(['id','label','attack_cat'], axis=1, errors='ignore')
y_train_mc = train_df['attack_cat']

X_test_mc = test_df.drop(['id','label','attack_cat'], axis=1, errors='ignore')
y_test_mc = test_df['attack_cat']

print("X_train_mc shape:", X_train_mc.shape)

# ---------------------------
# Encode categorical columns (proto, service, state, etc.)
# ---------------------------
cat_cols_mc = X_train_mc.select_dtypes(include=['object']).columns
print("Categorical columns in multi-class dataset:", list(cat_cols_mc))

encoder_mc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_train_mc[cat_cols_mc] = encoder_mc.fit_transform(X_train_mc[cat_cols_mc].astype(str))
X_test_mc[cat_cols_mc] = encoder_mc.transform(X_test_mc[cat_cols_mc].astype(str))

print("✅ Categorical columns encoded")

# ---------------------------
# Scale features
# ---------------------------
scaler_mc = StandardScaler()
X_train_mc_scaled = scaler_mc.fit_transform(X_train_mc)
X_test_mc_scaled = scaler_mc.transform(X_test_mc)

# ---------------------------
# Encode attack labels
# ---------------------------
attack_encoder = LabelEncoder()
y_train_mc_enc = attack_encoder.fit_transform(y_train_mc)
y_test_mc_enc = attack_encoder.transform(y_test_mc)

# ---------------------------
# Train XGBoost
# ---------------------------
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=len(attack_encoder.classes_),
    random_state=42,
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    n_jobs=-1
)

xgb_model.fit(X_train_mc_scaled, y_train_mc_enc)

# ---------------------------
# Evaluate
# ---------------------------
y_pred_mc = xgb_model.predict(X_test_mc_scaled)
print("XGB Accuracy:", accuracy_score(y_test_mc_enc, y_pred_mc))
print(classification_report(y_test_mc_enc, y_pred_mc, target_names=attack_encoder.classes_))

# ---------------------------
# Save models locally
# ---------------------------
joblib.dump(xgb_model, "xgb_model.pkl")
joblib.dump(scaler_mc, "scaler_mc.pkl")
joblib.dump(attack_encoder, "attack_encoder.pkl")
joblib.dump(encoder_mc, "feature_encoder.pkl")

print("✅ Saved: xgb_model.pkl, scaler_mc.pkl, attack_encoder.pkl, feature_encoder.pkl")



X_train_mc shape: (175341, 42)
Categorical columns in multi-class dataset: ['proto', 'service', 'state']
✅ Categorical columns encoded
XGB Accuracy: 0.7637856483505806
                precision    recall  f1-score   support

      Analysis       0.09      0.19      0.12       677
      Backdoor       0.02      0.06      0.03       583
           DoS       0.45      0.12      0.19      4089
      Exploits       0.62      0.84      0.71     11132
       Fuzzers       0.30      0.55      0.38      6062
       Generic       1.00      0.97      0.98     18871
        Normal       0.96      0.76      0.85     37000
Reconnaissance       0.93      0.81      0.86      3496
     Shellcode       0.34      0.81      0.48       378
         Worms       0.47      0.41      0.44        44

      accuracy                           0.76     82332
     macro avg       0.52      0.55      0.51     82332
  weighted avg       0.83      0.76      0.78     82332

✅ Saved: xgb_model.pkl, scaler_mc.pkl, attack

In [13]:
from google.colab import files

# Download all XGBoost-related files
files.download("xgb_model.pkl")
files.download("scaler_mc.pkl")
files.download("attack_encoder.pkl")
files.download("feature_encoder.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>