# Intrusion Detection 
This notebook demonstrates **data cleaning**, **feature engineering**, **feature selection**, and **model training** on the uploaded `cybersecurity_intrusion_data.csv`. It's prepared as a university project deliverable: readable, reproducible, and annotated.

It contains example code for hyperparameter tuning and for model explanation (SHAP). Run cells in order.

Author: ChatGPT (ML training assistant)



In [None]:
# Imports
# Single import cell for the notebook. Several later cells re-import some of
# these names (SimpleImputer, the feature-selection helpers, train_test_split,
# StandardScaler, the metrics, RandomizedSearchCV); those re-imports are
# redundant once this cell has run. `scipy.stats.randint` is only imported in
# the hyperparameter-tuning cell, and `display` is provided by IPython/Jupyter
# rather than imported here.
# --- data handling / plotting ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# --- scikit-learn: splitting, preprocessing, selection, models, metrics ---
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, RocCurveDisplay
from sklearn.pipeline import Pipeline
# --- persistence / OS helpers ---
import joblib
import os
# Sanity check that the kernel picked up the expected pandas installation.
print('Environment ready, pandas', pd.__version__)


In [None]:
# Load dataset
# The raw frame lives in `df`; the cleaning cell works on a copy of it, so
# this cell only needs to be re-run when the CSV itself changes.
fn = '/mnt/data/cybersecurity_intrusion_data.csv'
df = pd.read_csv(filepath_or_buffer=fn)
print('Shape:', df.shape)
# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:
# Quick EDA
# 1) Schema overview: one dtype per column.
print('Columns and dtypes:')
display(df.dtypes)

# 2) Null counts per column, largest first.
print('\nMissing values per column:')
missing_per_col = df.isnull().sum()
display(missing_per_col.sort_values(ascending=False))

# 3) Ten most frequent values for (at most) the first five text/categorical columns.
print('\nValue counts for object columns (sample):')
text_like_cols = df.select_dtypes(include=['object', 'category']).columns
for col_name in text_like_cols[:5]:
    print('\n---', col_name, '---')
    top_values = df[col_name].value_counts().head(10)
    display(top_values)


In [None]:
# Identify target column (heuristic).
# A column whose lower-cased name is one of the well-known label names wins
# (first match in column order); when nothing matches, the last column of the
# frame is taken as the target.
possible_targets = [
    c for c in df.columns
    if c.lower() in ('label', 'target', 'attack', 'intrusion', 'class', 'is_intrusion', 'attack_type')
]
if possible_targets:
    target_col = possible_targets[0]
    print('Using target column:', target_col)
else:
    target_col = df.columns[-1]
    print('No canonical target found. Using last column as target:', target_col)

# Class balance, with missing labels counted as their own bucket.
print('\nTarget distribution:')
target_counts = df[target_col].value_counts(dropna=False)
display(target_counts)


In [None]:
# Data cleaning
# Work on a copy so the raw `df` stays untouched and this cell can be re-run.
df_clean = df.copy()

# Drop duplicates
dups = df_clean.duplicated().sum()
print('Duplicate rows:', dups)
if dups > 0:
    df_clean = df_clean.drop_duplicates().reset_index(drop=True)

# Rows without a label cannot be used for supervised learning, and imputing
# the label would invent ground truth -- drop them instead.
n_missing_target = int(df_clean[target_col].isnull().sum())
if n_missing_target > 0:
    print('Dropping rows with missing target:', n_missing_target)
    df_clean = df_clean.dropna(subset=[target_col]).reset_index(drop=True)

# Impute missing values: numeric -> median, categorical -> mode.
# The target is excluded from BOTH lists. A numeric target (e.g. 0/1) must not
# be median-imputed or cast to float by the imputer; otherwise the later
# `astype(str)` produces class names such as '0.0' / '1.0'.
num_cols = (
    df_clean.select_dtypes(include=[np.number]).columns
    .drop(target_col, errors='ignore')
    .tolist()
)
cat_cols = (
    df_clean.select_dtypes(include=['object', 'category']).columns
    .drop(target_col, errors='ignore')
    .tolist()
)

num_imp = SimpleImputer(strategy='median')
cat_imp = SimpleImputer(strategy='most_frequent')

# NOTE(review): the fill values are estimated on the full dataset, i.e. before
# the train/test split performed later in the notebook. For a strict
# evaluation, fit the imputers on the training rows only.
if num_cols:
    df_clean[num_cols] = num_imp.fit_transform(df_clean[num_cols])
if cat_cols:
    df_clean[cat_cols] = cat_imp.fit_transform(df_clean[cat_cols])

print('After imputation, any missing?', df_clean.isnull().sum().sum())


In [None]:
# Feature engineering (examples tailored to common IDS features).
# Every derived column is guarded: it is only created when all of its source
# columns are present. `derived` records what was built and from what.
df_fe = df_clean.copy()
derived = {}
available = set(df_fe.columns)

# Source/destination byte ratio; the +1 keeps the denominator non-zero.
if {'src_bytes', 'dst_bytes'}.issubset(available):
    df_fe['bytes_ratio_src_dst'] = df_fe['src_bytes'].div(df_fe['dst_bytes'] + 1)
    derived['bytes_ratio_src_dst'] = 'src_bytes/dst_bytes'

# Packet size relative to session length; the epsilon guards zero-length sessions.
if {'session_duration', 'network_packet_size'}.issubset(available):
    df_fe['packets_per_second'] = df_fe['network_packet_size'].div(df_fe['session_duration'] + 1e-6)
    derived['packets_per_second'] = 'network_packet_size/session_duration'

# Login attempts normalised by the session length expressed in minutes.
if {'login_attempts', 'session_duration'}.issubset(available):
    df_fe['attempts_per_min'] = df_fe['login_attempts'].div(df_fe['session_duration']/60 + 1e-6)
    derived['attempts_per_min'] = 'login_attempts/(duration/60)'

print('Derived features created:', derived)
df_fe.shape


In [None]:
# Prepare X, y and encoding
from sklearn.preprocessing import OneHotEncoder

# Target: cast to string, then map each distinct label to an integer code.
y = df_fe[target_col].astype(str)
le = LabelEncoder()
y_enc = le.fit_transform(y)
print('Classes:', {code: label for code, label in enumerate(le.classes_)})

# Features: everything except the target column.
X = df_fe.drop(columns=[target_col]).copy()

# Column groups by dtype.
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Categorical features are routed by cardinality:
#   <= 20 distinct values -> one-hot (first level dropped)
#   >  20 distinct values -> integer codes via LabelEncoder
low_card, high_card = [], []
for col in cat_cols:
    bucket = low_card if X[col].nunique() <= 20 else high_card
    bucket.append(col)
print('Low-card categorical:', low_card)
print('High-card categorical:', high_card)

# High-cardinality columns: a fresh encoder per column, fitted on the string form.
for col in high_card:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col].astype(str))

# Low-cardinality columns: expand into dummy columns.
if low_card:
    X = pd.get_dummies(X, columns=low_card, drop_first=True)

print('Shape after encoding:', X.shape)


In [None]:
# Feature selection pipeline:
# 1) VarianceThreshold to remove near-constant columns
# 2) Drop highly correlated features (> 0.95)
# 3) SelectKBest (mutual information) to pick top features
#
# NOTE(review): all three steps are fitted on the full dataset, i.e. before the
# train/test split done in the next cell, so test rows influence which features
# are kept. For a strict evaluation, fit the selectors on the training rows only.


def _seeded_mutual_info(features, target):
    """Mutual-information scores with a fixed seed.

    `mutual_info_classif` adds random noise to continuous features; without a
    fixed `random_state` the scores (and therefore the selected columns) can
    change from run to run.
    """
    return mutual_info_classif(features, target, random_state=42)


X_num = X.fillna(0).copy()
vt = VarianceThreshold(threshold=1e-5)
vt.fit(X_num)
cols_vt = X_num.columns[vt.get_support()]
X_vt = X_num[cols_vt]

# Drop highly correlated: inspect only the strict upper triangle so that, of
# each correlated pair, just the later column is flagged.
corr = X_vt.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if (upper[c] > 0.95).any()]
print('Dropping due to high correlation:', to_drop)
# Rebind instead of `inplace=True`: X_vt was produced by column selection, and
# mutating it in place triggers SettingWithCopy warnings and makes the cell
# non-idempotent.
X_vt = X_vt.drop(columns=to_drop)

# SelectKBest
k = min(30, X_vt.shape[1])
skb = SelectKBest(score_func=_seeded_mutual_info, k=k)
skb.fit(X_vt, y_enc)
mask = skb.get_support()
selected_features = X_vt.columns[mask].tolist()
print('Selected features ({}):'.format(len(selected_features)))
print(selected_features)
# Score table covers every candidate column (selected or not), best first.
display(pd.DataFrame({'feature': X_vt.columns, 'score': skb.scores_}).sort_values('score', ascending=False).head(40))

X_sel = X_vt[selected_features].copy()


In [None]:
# Train/Test split and scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 70/30 split, stratified on the encoded target, with a fixed seed.
split_options = dict(test_size=0.3, random_state=42, stratify=y_enc)
X_train, X_test, y_train, y_test = train_test_split(X_sel, y_enc, **split_options)

# The scaler learns mean/std from the training rows only and is then applied
# to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

print('Train shape:', X_train_s.shape, 'Test shape:', X_test_s.shape)


In [None]:
# Train and evaluate models
from sklearn.metrics import classification_report, confusion_matrix

# Candidate estimators, keyed by the name used in the printed reports.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=200, random_state=42)
}

# Fit each estimator on the scaled training data, report on the scaled test
# data, and keep the fitted object in `results` for later cells.
results = {}
for model_name in models:
    estimator = models[model_name]
    print('\nTraining', model_name)
    estimator.fit(X_train_s, y_train)
    test_preds = estimator.predict(X_test_s)
    print('Classification report for', model_name)
    print(classification_report(y_test, test_preds))
    print('Confusion matrix:\n', confusion_matrix(y_test, test_preds))
    results[model_name] = estimator

# Save best model (example: GradientBoosting)
# The choice is hard-coded, not derived from the metrics printed above.
gb_model = results['GradientBoosting']
joblib.dump(gb_model, 'best_model_joblib.pkl')
print('Saved best model to best_model_joblib.pkl')


In [None]:
# Hyperparameter tuning example (RandomizedSearchCV on RandomForest)
from scipy.stats import randint as sp_randint

# Search space: integer ranges sampled uniformly (upper bound exclusive).
param_dist = {
    'n_estimators': sp_randint(50, 400),
    'max_depth': sp_randint(3, 20),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 20)
}

# Plain 'f1' is a binary-only scorer. The target heuristic earlier in the
# notebook can pick a multi-class column (e.g. 'attack_type'); in that case
# every CV fit would fail to score, so fall back to macro-averaged F1.
n_classes = len(np.unique(y_train))
scoring = 'f1' if n_classes == 2 else 'f1_macro'
print('Scoring metric:', scoring)

rf = RandomForestClassifier(random_state=42)
rs = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=30, scoring=scoring, cv=3, verbose=1, n_jobs=-1, random_state=42)
rs.fit(X_train_s, y_train)
print('Best params:', rs.best_params_)
print('Best CV score:', rs.best_score_)
# With the default refit=True this estimator is already re-trained on the
# whole training set using the best parameters.
best_rf = rs.best_estimator_
# Evaluate on test set
preds = best_rf.predict(X_test_s)
print('\nTest classification report (best RF):')
print(classification_report(y_test, preds))


In [None]:
# Model explainability with SHAP (if available).
# If SHAP is not installed, run `pip install shap` in the kernel's environment
# and re-run this cell.
# This cell is best-effort: failures are reported, not raised, so the rest of
# the notebook still runs.
try:
    import shap
    print('SHAP version', shap.__version__)
    # NOTE(review): passing the full training matrix as background data can
    # make the explainer slow; consider a subsample if this cell takes too long.
    explainer = shap.Explainer(results['RandomForest'], X_train_s)
    shap_values = explainer(X_test_s)
    # For a classifier the explanation holds one attribution matrix per class,
    # shaped (samples, features, classes). summary_plot expects a 2-D matrix
    # and would otherwise interpret the 3-D array as interaction values, so
    # plot a single class: the last one (the positive class for a 0/1 target).
    if getattr(shap_values.values, 'ndim', 2) == 3:
        class_idx = shap_values.values.shape[2] - 1
        print('Plotting SHAP values for encoded class:', results['RandomForest'].classes_[class_idx])
        shap_values = shap_values[:, :, class_idx]
    # Plot summary (force matplotlib). The unscaled X_test supplies readable
    # feature values and column names for colouring/labelling.
    shap.summary_plot(shap_values, features=X_test, feature_names=X_test.columns, show=True)
except ImportError as e:
    # Only a genuine import failure warrants the install hint.
    print('SHAP not available or failed to run:', e)
    print('If you want SHAP, run: pip install shap')
except Exception as e:
    print('SHAP not available or failed to run:', e)


In [None]:
# Save cleaned dataset and selected features to /mnt/data for submission
# Two CSVs are written without the index column: the full selected-feature
# matrix, and the first 200 rows of the feature-engineered frame as a sample.
selected_path = '/mnt/data/selected_features.csv'
sample_path = '/mnt/data/cleaned_dataset_sample.csv'

X_sel.to_csv(selected_path, index=False)
sample_rows = df_fe.head(200)
sample_rows.to_csv(sample_path, index=False)
print('Saved selected_features.csv and cleaned_dataset_sample.csv in /mnt/data') 


# Final notes

**Suggestions for the report**

1. Describe data provenance and any assumptions (e.g., how the target was defined).
2. Show EDA plots (class imbalance, feature distributions).
3. Report cross-validated metrics and a confusion matrix with per-class precision/recall.
4. Discuss limitations and next steps.

Good luck with your project — you can run each cell in order and modify hyperparameters, add visualizations, or extend with other models (XGBoost/LightGBM) if desired.
