# Imports

In [58]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Loading

In [59]:
# Relative locations of the two CSV partitions; they resolve against the
# working directory the notebook is run from.
DATA_DIR = Path("Dataset_UNSW_NB15/Training_Testing")
TRAIN_PATH = DATA_DIR.joinpath("UNSW_NB15_training-set.csv")
TEST_PATH = DATA_DIR.joinpath("UNSW_NB15_testing-set.csv")

# Exploration

In [60]:
# Read both partitions, then show their (rows, columns) shapes as a quick sanity check.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

print(train_df.shape, test_df.shape)

(175341, 45) (82332, 45)


In [61]:
# Print every column name to see which features and target columns the file provides.
print(train_df.columns.tolist())

['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label']


In [62]:
# Preview the first rows of the training frame (rendered as this cell's output).
train_df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


# Data Processing 

**Cleaning**

In [63]:
def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the frame passed in is not modified.

    Each column-specific step runs only when that column is present:

    * ``id`` is dropped (row identifier, not a feature).
    * ``label`` is coerced to integers; values that cannot be parsed, or are
      missing, become 0.
    * ``attack_cat`` is cast to ``str``. It is kept for analysis only and must
      not be fed to the binary model, because it reveals the label.

    Finally every +inf / -inf in the frame is replaced with NaN so that the
    imputers used later can fill it.
    """
    cleaned = df.copy().drop(columns=["id"], errors="ignore")

    if "label" in cleaned.columns:
        numeric_label = pd.to_numeric(cleaned["label"], errors="coerce")
        cleaned["label"] = numeric_label.fillna(0).astype(int)

    if "attack_cat" in cleaned.columns:
        cleaned["attack_cat"] = cleaned["attack_cat"].astype(str)

    return cleaned.replace([np.inf, -np.inf], np.nan)

# Clean both partitions the same way; the names are rebound to the cleaned copies.
train_df = basic_clean(train_df)
test_df  = basic_clean(test_df)

# Class balance of the binary target in the training partition.
train_df[["label"]].value_counts().head()


label
1        119341
0         56000
Name: count, dtype: int64

**Feature Engineering**

In [64]:
def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``throughput_mbps`` and ``pps`` columns added.

    * ``throughput_mbps`` = 8 * (sbytes + dbytes) / dur / 1e6
    * ``pps``             = (spkts + dpkts) / dur

    ``dur`` is floored at 1e-6 before dividing so zero-length flows do not
    divide by zero. When the columns a feature needs are not all present, that
    feature is filled with NaN instead.
    """
    enriched = df.copy()
    available = set(enriched.columns)

    def _floored_duration() -> pd.Series:
        # Same floor for both features: avoids /0 on zero-duration flows.
        return enriched["dur"].astype(float).clip(lower=1e-6)

    if {"sbytes", "dbytes", "dur"} <= available:
        byte_total = enriched["sbytes"].astype(float) + enriched["dbytes"].astype(float)
        enriched["throughput_mbps"] = (8.0 * byte_total / _floored_duration()) / 1e6
    else:
        enriched["throughput_mbps"] = np.nan

    if {"spkts", "dpkts", "dur"} <= available:
        packet_total = enriched["spkts"].astype(float) + enriched["dpkts"].astype(float)
        enriched["pps"] = packet_total / _floored_duration()
    else:
        enriched["pps"] = np.nan

    return enriched

# Add the derived rate features to both partitions; the names are rebound to the new copies.
train_df = add_engineered_features(train_df)
test_df  = add_engineered_features(test_df)

# Summary statistics of the two new columns, as a sanity check on their scale.
train_df[["throughput_mbps","pps"]].describe()


Unnamed: 0,throughput_mbps,pps
count,175341.0,175341.0
mean,153.258544,206035.3
std,378.429589,351445.4
min,6e-06,0.01666906
25%,0.054976,38.38223
50%,8.607235,4514.673
75%,177.777778,250000.0
max,12032.0,14000000.0


**Adding 6G features**

In [65]:
def add_6g_fields(df: pd.DataFrame, seed: int = 42) -> pd.DataFrame:
    """Return a copy of ``df`` with heuristic "6G" context columns for the dashboard.

    Added columns: ``throughput_mbps``, ``slice_type`` (eMBB / mMTC / URLLC),
    ``simulated_latency_ms``, ``latency_profile``, ``throughput_class`` and
    ``priority``. The slice assignment is a rule of thumb and the latency is
    drawn at random per slice; neither is measured from the traffic.

    The "small flow", "many packets" and "short flow" thresholds are quantiles
    of the frame passed in, so the result depends on which rows ``df`` holds.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``sbytes``, ``dbytes``, ``spkts``, ``dpkts`` and ``dur``.
        ``dsport`` is optional: when it is absent no row is flagged as using an
        IoT port; when present, values that are not numeric are treated as
        unknown ports.
    seed : int
        Seed for the random generator that draws the simulated latencies.

    Returns
    -------
    pd.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    rng = np.random.default_rng(seed)
    out = df.copy()

    # --- throughput (Mbps); duration is floored at 1e-6 to avoid division by zero ---
    total_bytes = out["sbytes"].astype(float) + out["dbytes"].astype(float)
    dur_raw = out["dur"].astype(float)
    out["throughput_mbps"] = (8.0 * total_bytes / dur_raw.clip(lower=1e-6)) / 1e6

    # --- slice_type (simple heuristic baseline) ---
    total_pkts = out["spkts"].astype(float) + out["dpkts"].astype(float)
    small_flow = total_bytes < total_bytes.quantile(0.25)
    many_pkts = total_pkts > total_pkts.quantile(0.75)
    short_flow = dur_raw < dur_raw.quantile(0.25)

    # The destination port is not always available (the frames loaded above do
    # not have it), and where it exists it may hold non-numeric text. Missing or
    # unparseable ports map to -1, which matches none of the IoT ports.
    if "dsport" in out.columns:
        dsport = pd.to_numeric(out["dsport"], errors="coerce").fillna(-1).astype(int)
        iot_ports = dsport.isin([1883, 5683, 8883])
    else:
        iot_ports = pd.Series(False, index=out.index)

    # mMTC: small flows with many packets or an IoT port.
    # URLLC: short, small flows that did not qualify as mMTC. Everything else: eMBB.
    mmtc_like = many_pkts | iot_ports
    out["slice_type"] = "eMBB"
    out.loc[small_flow & mmtc_like, "slice_type"] = "mMTC"
    out.loc[short_flow & small_flow & ~mmtc_like, "slice_type"] = "URLLC"

    # --- simulated latency per slice (the draw order is fixed so a seed reproduces) ---
    latency_ranges_ms = (
        ("URLLC", 0.1, 1.0),
        ("eMBB", 5.0, 30.0),
        ("mMTC", 20.0, 200.0),
    )
    out["simulated_latency_ms"] = np.nan
    for slice_name, low_ms, high_ms in latency_ranges_ms:
        in_slice = out["slice_type"] == slice_name
        out.loc[in_slice, "simulated_latency_ms"] = rng.uniform(low_ms, high_ms, in_slice.sum())

    # latency profile (categorical)
    out["latency_profile"] = pd.cut(
        out["simulated_latency_ms"],
        bins=[0, 1, 30, 10_000],
        labels=["ultra_low", "medium", "high"],
        include_lowest=True,
    ).astype(str)

    # throughput class (categorical); thresholds in Mbps and open to tuning
    out["throughput_class"] = pd.cut(
        out["throughput_mbps"].clip(lower=0),
        bins=[-1, 10, 100, 1e12],
        labels=["low", "mid", "high"],
    ).astype(str)

    # priority: URLLC highest, then eMBB, then mMTC
    priority_map = {"URLLC": 3, "eMBB": 2, "mMTC": 1}
    out["priority"] = out["slice_type"].map(priority_map).astype(int)

    return out


**Build X/y with correct feature selection**

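If the `label` column is missing, the wrong CSV was loaded.

Categorical columns may contain NaN, so the one-hot encoder must be preceded by an imputer.

Train and test may have different columns, so the test frame is aligned to the training columns.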

In [66]:
TARGET = "label"

# Target-derived columns: feeding them to the model would leak the answer.
LEAKY_COLS = [c for c in ("attack_cat", TARGET) if c in train_df.columns]

y = train_df[TARGET].astype(int)
X = train_df.drop(columns=LEAKY_COLS)

# errors="ignore" skips any leaky column the test partition does not have.
X_test_full = test_df.drop(columns=LEAKY_COLS, errors="ignore")

# Report schema differences between the two partitions before aligning them.
missing_in_test = set(X.columns).difference(X_test_full.columns)
extra_in_test   = set(X_test_full.columns).difference(X.columns)
print("Missing in test:", missing_in_test)
print("Extra in test:", extra_in_test)

# Columns the test frame lacks are added as all-NaN so the imputers can fill them.
for c in missing_in_test:
    X_test_full[c] = np.nan

# Selecting by the training columns drops extras and fixes the column order.
X_test_full = X_test_full[X.columns]


Missing in test: set()
Extra in test: set()


**Preprocessing Pipeline**

In [67]:
# Split the features by dtype. Testing for "numeric" instead of `dtype == "object"`
# keeps text columns categorical even when pandas stores them with a string or
# category dtype; for plain object/numeric frames the two lists are unchanged.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# Numeric features: fill gaps with the median, then scale to unit variance.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False))  # sparse-friendly
])

# Categorical features: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" lets categories unseen during fit pass through transform.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

# Unfitted preprocessing template; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop"
)

# Hold out 20% of the training partition for validation, stratified on the label.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(X_train.shape, X_valid.shape, y_train.mean(), y_valid.mean())


(140272, 44) (35069, 44) 0.680620508725904 0.6806296159000826


In [68]:
# Unfitted preprocessing template (`clone` is imported in the notebook's import cell).
preprocessor_base = preprocessor

# Independent unfitted copies for the binary and the attack-type pipelines, so
# fitting one never changes the state held by the other.
preprocessor_bin = clone(preprocessor_base)
preprocessor_atk = clone(preprocessor_base)


# Attack detection model (binary)

## Logistic Regression 

In [69]:
# Baseline for the binary label: shared preprocessing followed by logistic regression.
lr_clf = Pipeline([
    ("preprocess", preprocessor_bin),
    ("model", LogisticRegression(max_iter=1000, n_jobs=None)),
])
lr_clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of label 1; predict 1 from 0.5 upward.
valid_proba = lr_clf.predict_proba(X_valid)[:, 1]
valid_pred = (valid_proba >= 0.5).astype(int)

# Validation metrics: ranking quality, per-class report, and the raw confusion counts.
print("ROC-AUC:", roc_auc_score(y_valid, valid_proba))
print(classification_report(y_valid, valid_pred))
print(confusion_matrix(y_valid, valid_pred))


ROC-AUC: 0.9842644224726634
              precision    recall  f1-score   support

           0       0.97      0.83      0.89     11200
           1       0.92      0.99      0.95     23869

    accuracy                           0.94     35069
   macro avg       0.95      0.91      0.92     35069
weighted avg       0.94      0.94      0.93     35069

[[ 9251  1949]
 [  279 23590]]


## Random Forest 

In [70]:
# Give the forest its own preprocessor. Pipeline.fit fits the step objects it is
# handed, so passing preprocessor_bin again would re-fit the very instance that
# lr_clf holds. A fresh clone has identical settings and no shared state.
rf_clf = Pipeline(steps=[
    ("preprocess", clone(preprocessor_base)),
    ("model", RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced_subsample"
    ))
])

rf_clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of label 1; predict 1 from 0.5 upward.
valid_proba = rf_clf.predict_proba(X_valid)[:, 1]
valid_pred  = (valid_proba >= 0.5).astype(int)

print("ROC-AUC:", roc_auc_score(y_valid, valid_proba))
print(classification_report(y_valid, valid_pred))
print(confusion_matrix(y_valid, valid_pred))


ROC-AUC: 0.9937385704260757
              precision    recall  f1-score   support

           0       0.95      0.92      0.93     11200
           1       0.96      0.98      0.97     23869

    accuracy                           0.96     35069
   macro avg       0.96      0.95      0.95     35069
weighted avg       0.96      0.96      0.96     35069

[[10281   919]
 [  523 23346]]


In [71]:

# Persist the fitted binary pipeline (preprocessing + forest) under models/.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(rf_clf, "models/binary_attack_detector.joblib")

['models/binary_attack_detector.joblib']

# Classification Model (which type of attack it is)

**Preparing the dataset for attack classification**

In [72]:
# Keep only attacks
# Only rows labelled as attacks (label == 1) are used to learn the attack category.
# The copy keeps later edits to attack_df from touching train_df.
attack_df = train_df.loc[train_df["label"].eq(1)].copy()

print("Attack-only shape:", attack_df.shape)
print(attack_df["attack_cat"].value_counts())


Attack-only shape: (119341, 46)
attack_cat
Generic           40000
Exploits          33393
Fuzzers           18184
DoS               12264
Reconnaissance    10491
Analysis           2000
Backdoor           1746
Shellcode          1133
Worms               130
Name: count, dtype: int64


**Defining X/y**

In [73]:
TARGET_ATTACK = "attack_cat"

# Both target columns are removed from the features: the category is what we
# predict, and the binary label is constant (1) on this subset.
DROP_COLS = ["label", "attack_cat"]

y_attack = attack_df[TARGET_ATTACK].astype(str)
X_attack = attack_df.drop(columns=DROP_COLS, errors="ignore")


**Train-Validation split**

In [74]:
# 80/20 split stratified on the attack category, so rare categories appear in
# both parts. train_test_split comes from the notebook's import cell.
X_train_a, X_valid_a, y_train_a, y_valid_a = train_test_split(
    X_attack, y_attack, test_size=0.2, random_state=42, stratify=y_attack
)


## Random Forest

In [76]:
# Multiclass attack-type model: its own preprocessor clone plus a random forest.
# class_weight="balanced" reweights classes inversely to their frequency.
attack_clf = Pipeline([
    ("preprocess", preprocessor_atk),
    (
        "model",
        RandomForestClassifier(
            n_estimators=300,
            class_weight="balanced",
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

attack_clf.fit(X_train_a, y_train_a)


In [77]:
# Per-category precision / recall / F1 on the held-out attack rows.
# classification_report comes from the notebook's import cell.
y_pred_a = attack_clf.predict(X_valid_a)

print(classification_report(y_valid_a, y_pred_a))


                precision    recall  f1-score   support

      Analysis       0.10      0.22      0.14       400
      Backdoor       0.03      0.14      0.05       349
           DoS       0.28      0.42      0.34      2453
      Exploits       0.83      0.57      0.67      6679
       Fuzzers       0.95      0.89      0.92      3637
       Generic       1.00      0.98      0.99      8000
Reconnaissance       0.92      0.74      0.82      2098
     Shellcode       0.68      0.62      0.65       227
         Worms       0.64      0.27      0.38        26

      accuracy                           0.74     23869
     macro avg       0.60      0.54      0.55     23869
  weighted avg       0.83      0.74      0.78     23869



In [78]:

# Persist the fitted attack-type pipeline next to the binary detector.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(attack_clf, "models/attack_type_model.joblib")


['models/attack_type_model.joblib']

# Dashboard ready csv creation

In [79]:
# Aliases for the two fitted pipelines, named after the role each plays in the dashboard export.
binary_clf = rf_clf          # fitted random-forest pipeline for the binary label (0/1)
attack_type_clf = attack_clf # fitted pipeline for attack_cat (multiclass)


In [80]:

def make_dashboard_table(
    df_original: pd.DataFrame,
    binary_clf,
    attack_type_clf=None,
    drop_cols_for_inference=("label", "attack_cat")
) -> pd.DataFrame:
    """Return a copy of ``df_original`` with model predictions appended as columns.

    Parameters
    ----------
    df_original : pd.DataFrame
        Rows to score. Not modified.
    binary_clf
        Fitted classifier with ``predict_proba``; column 1 of its output is used
        as the attack probability.
    attack_type_clf : optional
        Fitted classifier with ``predict`` (and optionally ``predict_proba``).
        It is applied only to rows the binary model predicts as attacks.
    drop_cols_for_inference : iterable of str
        Columns removed before the frame is passed to the models; names that are
        not in the frame are ignored. They remain in the returned table.

    Returns
    -------
    pd.DataFrame
        The input columns plus ``pred_label`` (1 when the attack probability is
        at least 0.5) and ``pred_proba_attack``. When ``attack_type_clf`` is
        given, also ``pred_attack_cat`` ("normal" for rows predicted benign) and
        ``pred_attack_cat_proba`` (highest class probability, NaN for rows
        predicted benign or when the model has no ``predict_proba``).

    Notes
    -----
    Results are written back by row position, so the frame's index may contain
    duplicate labels. An empty frame is returned with the prediction columns
    present and no call made to either model.
    """
    df_dash = df_original.copy()
    n_rows = len(df_dash)

    # Features for inference (label columns removed; absent names are skipped).
    X_bin = df_dash.drop(columns=list(drop_cols_for_inference), errors="ignore")

    # Binary model. Estimators generally reject zero-row input, so skip the call then.
    if n_rows == 0:
        proba_attack = np.empty(0, dtype=float)
    else:
        proba_attack = np.asarray(binary_clf.predict_proba(X_bin))[:, 1]
    pred_label = (proba_attack >= 0.5).astype(int)

    df_dash["pred_label"] = pred_label
    df_dash["pred_proba_attack"] = proba_attack

    # Attack-type model (only for predicted attacks).
    if attack_type_clf is not None:
        pred_cat = np.full(n_rows, "normal", dtype=object)
        pred_cat_proba = np.full(n_rows, np.nan, dtype=float)

        attack_rows = np.flatnonzero(pred_label == 1)
        if attack_rows.size > 0:
            X_attack = X_bin.iloc[attack_rows]
            # Positional assignment into arrays: label-based .loc would select
            # extra rows whenever the index holds duplicate labels.
            pred_cat[attack_rows] = attack_type_clf.predict(X_attack)

            if hasattr(attack_type_clf, "predict_proba"):
                class_proba = np.asarray(attack_type_clf.predict_proba(X_attack))
                pred_cat_proba[attack_rows] = class_proba.max(axis=1)

        df_dash["pred_attack_cat"] = pred_cat
        df_dash["pred_attack_cat_proba"] = pred_cat_proba

    return df_dash


In [81]:
# Output folder for the dashboard CSVs (Path comes from the notebook's import cell).
Path("data/processed").mkdir(parents=True, exist_ok=True)

# Score both partitions. The models were fitted on rows of train_df, so the
# predictions in dash_train are largely in-sample and will look better than
# those in dash_test.
dash_train = make_dashboard_table(train_df, binary_clf=binary_clf, attack_type_clf=attack_type_clf)
dash_test = make_dashboard_table(test_df, binary_clf=binary_clf, attack_type_clf=attack_type_clf)

# index=False: the row index is not written to the files.
dash_train.to_csv("data/processed/unsw_train_dashboard.csv", index=False)
dash_test.to_csv("data/processed/unsw_test_dashboard.csv", index=False)

print("✅ Saved:", "data/processed/unsw_train_dashboard.csv", "and", "data/processed/unsw_test_dashboard.csv")


✅ Saved: data/processed/unsw_train_dashboard.csv and data/processed/unsw_test_dashboard.csv
