# NSL-KDD Model Training & Intrusion Detection

This notebook covers preprocessing, model training, evaluation, and a real-time intrusion detection function using the NSL-KDD dataset.

In [6]:

from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [8]:

# Column names (from dataset description). The raw files have no header row;
# the last two columns are the attack label and a per-record difficulty score.
column_names = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes",
    "land","wrong_fragment","urgent","hot","num_failed_logins","logged_in",
    "num_compromised","root_shell","su_attempted","num_root","num_file_creations",
    "num_shells","num_access_files","num_outbound_cmds","is_host_login",
    "is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
    "rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate",
    "srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
    "dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate",
    "class","difficulty"
]

# Single place to change when the dataset lives somewhere else.
DATA_DIR = Path("data/NSL-KDD")
train_path = DATA_DIR / "KDDTrain+.txt"
test_path = DATA_DIR / "KDDTest+.txt"

# Fail early with an actionable message instead of pandas' bare errno text.
missing_files = [str(p) for p in (train_path, test_path) if not p.is_file()]
if missing_files:
    raise FileNotFoundError(
        f"NSL-KDD file(s) not found: {missing_files}. "
        f"Place KDDTrain+.txt and KDDTest+.txt under {DATA_DIR.resolve()} "
        "or update DATA_DIR."
    )

# Load training & testing datasets and drop 'difficulty' (not needed).
# Chained .drop() returns a new frame, so re-running the cell is idempotent.
train_df = pd.read_csv(train_path, header=None, names=column_names).drop(columns=["difficulty"])
test_df = pd.read_csv(test_path, header=None, names=column_names).drop(columns=["difficulty"])

print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)


FileNotFoundError: [Errno 2] No such file or directory: 'data/NSL-KDD/KDDTrain+.txt'

In [None]:

# Binary target variable: 0=normal, 1=attack (vectorised comparison instead of
# a per-row lambda; anything that is not exactly 'normal' counts as an attack).
train_df['attack_binary'] = (train_df['class'] != 'normal').astype(int)
test_df['attack_binary'] = (test_df['class'] != 'normal').astype(int)

# Split features and labels
label_cols = ['class', 'attack_binary']
X_train_raw = train_df.drop(columns=label_cols)
y_train = train_df['attack_binary']

X_test_raw = test_df.drop(columns=label_cols)
y_test = test_df['attack_binary']

# Categorical & numerical columns
categorical_cols = ['protocol_type', 'service', 'flag']
numerical_cols = X_train_raw.columns.drop(categorical_cols)

# One-hot encoding.
# The baseline category is dropped on the TRAINING frame only. Dropping it
# independently on the test frame could remove a different category whenever
# the two splits do not share the same first level, silently zeroing a real
# feature. The test frame is encoded in full and then projected onto the
# training columns below.
X_train_encoded = pd.get_dummies(X_train_raw, columns=categorical_cols, drop_first=True)
X_test_encoded = pd.get_dummies(X_test_raw, columns=categorical_cols)

# Align columns: the training layout is the single source of truth.
train_cols = X_train_encoded.columns
test_cols = X_test_encoded.columns  # test layout before alignment, kept for inspection

# reindex adds dummies the test split never produced (filled with 0), discards
# categories unseen during training, and enforces the training column order.
# The training frame is left untouched so both frames share one feature set.
X_test_encoded = X_test_encoded.reindex(columns=train_cols, fill_value=0)

assert X_train_encoded.columns.equals(X_test_encoded.columns), \
    "train/test feature columns diverged after alignment"

print("Encoded training shape:", X_train_encoded.shape)
print("Encoded testing shape:", X_test_encoded.shape)

# Scale numerical features: fit on train only, then reuse the fitted scaler on
# test so no test-set statistics leak into preprocessing.
scaler = StandardScaler()
X_train_scaled = X_train_encoded.copy()
X_test_scaled = X_test_encoded.copy()

X_train_scaled[numerical_cols] = scaler.fit_transform(X_train_encoded[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test_encoded[numerical_cols])


In [None]:

# Candidate classifiers keyed by display name; every estimator uses the same
# fixed seed so repeated runs give the same scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        n_jobs=-1,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Fit each candidate on the scaled training data and report its accuracy on
# the held-out test split.
for model_name in models:
    estimator = models[model_name]
    print(f"Training {model_name}...")
    estimator.fit(X_train_scaled, y_train)
    test_accuracy = accuracy_score(y_test, estimator.predict(X_test_scaled))
    print(f"{model_name} Accuracy: {test_accuracy:.4f}\n")


In [None]:

# Final model: a Random Forest trained on the scaled training features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)

# Display names for the binary target, in label order (0, 1).
class_labels = ['Normal', 'Attack']

rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred, target_names=class_labels)
print("Random Forest Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_report)

# Confusion matrix drawn as an annotated heatmap (rows = true, columns = predicted).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()


In [None]:

def detect_intrusion(new_traffic_sample):
    """Classify one traffic record as normal or malicious and print the verdict.

    The sample is pushed through the same preprocessing as the training data
    (one-hot encoding, projection onto the training columns, scaling) using the
    notebook-level ``categorical_cols``, ``train_cols``, ``numerical_cols``,
    ``scaler`` and ``rf_model`` objects.

    Parameters
    ----------
    new_traffic_sample : pandas.DataFrame
        Raw, un-encoded feature columns. Only the FIRST row is reported; any
        additional rows are ignored.

    Returns
    -------
    None
        The verdict and the model's confidence are printed.

    Raises
    ------
    ValueError
        If ``new_traffic_sample`` contains no rows.
    """
    if len(new_traffic_sample) == 0:
        raise ValueError("new_traffic_sample must contain at least one row")

    # One-hot encode. `columns=` is explicit so that an object-dtype frame
    # (e.g. built via Series.to_frame().T) does not get its numeric columns
    # expanded into dummies as well.
    encoded = pd.get_dummies(new_traffic_sample, columns=categorical_cols)

    # Project onto the training layout in one step: dummies the sample did not
    # produce are filled with 0, categories never seen in training are dropped,
    # and the column order matches what the model was fitted on.
    features = encoded.reindex(columns=train_cols, fill_value=0)

    # Scale numerical features. The float cast makes object-dtype single-row
    # frames safe to hand to the scaler.
    features[numerical_cols] = scaler.transform(features[numerical_cols].astype(float))

    # Prediction: one predict_proba call gives both the label and its
    # confidence. A random forest's predicted class is the argmax of its
    # averaged class probabilities, so this matches rf_model.predict while
    # avoiding a second pass, and it does not assume classes_ is exactly [0, 1].
    first_row_proba = rf_model.predict_proba(features)[0]
    best = first_row_proba.argmax()
    predicted_label = rf_model.classes_[best]
    confidence = first_row_proba[best] * 100

    if predicted_label == 1:
        print("🚨 ALERT: Malicious Traffic Detected!")
    else:
        print("✅ Normal Traffic Detected.")
    print(f"Confidence: {confidence:.2f}%")

# Test with one attack & one normal sample.
# Rows are picked by POSITION: X_test_raw and y_test come from the same frame,
# so they are row-aligned, and positional lookup stays correct even if the
# index is not a default RangeIndex. iloc[[pos]] returns a one-row DataFrame
# that keeps each column's dtype (Series.to_frame().T would turn every column
# into object dtype).
test_labels = y_test.to_numpy()
attack_pos = np.flatnonzero(test_labels == 1)[0]
normal_pos = np.flatnonzero(test_labels == 0)[0]

attack_sample = X_test_raw.iloc[[attack_pos]]
normal_sample = X_test_raw.iloc[[normal_pos]]

print("--- Testing Attack Sample ---")
detect_intrusion(attack_sample)

print("\n--- Testing Normal Sample ---")
detect_intrusion(normal_sample)
