In [7]:
import pandas as pd

from google.colab import files

# Prompt for a file upload; the returned mapping is keyed by file name.
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was dismissed): fail with a clear
    # message instead of an opaque IndexError on the lookup below.
    raise ValueError("No file was uploaded; re-run this cell and select the training CSV.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))
df = pd.read_csv(filename)

# NOTE: the preview shows a leading "Unnamed: 0" column, i.e. the CSV carries
# a saved row index. It is an identifier, not a measured feature.
df.head()


Saving train_data.csv to train_data.csv


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [8]:
# Normalise headers (trim surrounding whitespace, lower-case) so later cells
# can refer to columns by a predictable name.
df.columns = df.columns.str.strip().str.lower()

# Name of the target column after normalisation.
LABEL_COL = "class"


Convert labels to binary (normal = 0, anomaly = 1)

In [9]:
# Map the textual labels to a binary target: "normal" / "normal." -> 0,
# anything else -> 1.
#
# The conversion runs only while the column is still non-numeric. Without this
# guard, re-running the cell would see the already-converted 0/1 values, find
# that none of them equals "normal", and silently relabel every row as 1.
if not pd.api.types.is_numeric_dtype(df[LABEL_COL]):
    df[LABEL_COL] = (~df[LABEL_COL].isin(['normal', 'normal.'])).astype(int)

# Class balance check.
df[LABEL_COL].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
0,13449
1,11743


Split Numerical and Categorical columns

In [10]:
# Categorical features that need encoding before they reach the model.
CAT_COLS = ["protocol_type", "service", "flag"]

# Everything else is treated as numeric, except:
#   * the target column, and
#   * the saved row index that pandas names "Unnamed: 0" ("unnamed: 0" once
#     headers are lower-cased). It is a row identifier, not a traffic feature;
#     feeding it to the model lets it learn from row order.
NUM_COLS = [
    col for col in df.columns
    if col not in CAT_COLS + [LABEL_COL]
    and not str(col).lower().startswith("unnamed")
]

Define Preprocessing (one-hot encode categorical columns, pass numerical columns through)

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Categorical columns are one-hot encoded; categories that were not present at
# fit time are ignored (handle_unknown='ignore') instead of raising.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), CAT_COLS)

# Numerical columns are forwarded to the model unchanged.
numeric_step = ('num', 'passthrough', NUM_COLS)

preprocess = ColumnTransformer(transformers=[categorical_step, numeric_step])

Train-Test Split

In [12]:
from sklearn.model_selection import train_test_split

# Separate the target from the feature columns.
y = df[LABEL_COL]
X = df.drop(columns=[LABEL_COL])

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and uses a fixed seed so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

Build XGBoost Pipeline

In [13]:
# Hyperparameters for the XGBoost binary classifier, gathered in one place so
# they are easy to read and tune.
xgb_params = {
    "max_depth": 6,
    "learning_rate": 0.1,
    "n_estimators": 300,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
}
model = XGBClassifier(**xgb_params)

# Chain preprocessing and the classifier so both are fitted, applied and saved
# as a single object.
pipeline = Pipeline(steps=[("preprocess", preprocess), ("xgb", model)])

Train Model

In [14]:
# Fit the preprocessing step and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

Evaluate Performance

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out split.
y_pred = pipeline.predict(X_test)

# Print the confusion matrix first, then the per-class precision/recall report.
for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

[[2687    3]
 [   9 2340]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2690
           1       1.00      1.00      1.00      2349

    accuracy                           1.00      5039
   macro avg       1.00      1.00      1.00      5039
weighted avg       1.00      1.00      1.00      5039



Save the Fitted Pipeline (encoder + model)

In [None]:
import os

import joblib
from google.colab import drive

MODEL_PATH = "/content/drive/MyDrive/IDS/xgb_pipeline.joblib"

# The target lives on Google Drive, which is not available until it has been
# mounted. Mount before creating any directories: making folders under the
# mount point first would leave it non-empty and get in the way of mounting.
drive.mount("/content/drive")

# Create the destination folder if needed; joblib.dump does not create parent
# directories and would fail if it is missing.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Persist the whole fitted pipeline (preprocessing + classifier) as one file.
joblib.dump(pipeline, MODEL_PATH)

print("Model saved to:", MODEL_PATH)