In [1]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import os
import glob
import re
import time
import joblib

# --- Machine Learning ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# --- Visualization ---
import seaborn as sns
import matplotlib.pyplot as plt

# --- Configuration ---
DATASET_DIR = 'datasets'
OUTPUT_DIR = 'models'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
print("Setup complete.")

Setup complete.


In [2]:
print("--- [Step 1/4] Starting Simplified Preprocessing with Data Augmentation ---")

files_to_process = glob.glob(os.path.join(DATASET_DIR, "dataset*.csv"))
all_processed_dfs = []

label_map = {
    'DoS attacks-Hulk': 'dos_ddos', 'DoS attacks-GoldenEye': 'dos_ddos',
    'DoS attacks-Slowloris': 'dos_ddos', 'DoS attacks-SlowHTTPTest': 'dos_ddos',
    'DDOS attack-HOIC': 'dos_ddos', 'DDoS attacks-LOIC-HTTP': 'dos_ddos',
    'DDOS attack-LOIC-UDP': 'dos_ddos', 'FTP-BruteForce': 'bruteforce',
    'SSH-BruteForce': 'bruteforce', 'Brute Force -Web': 'bruteforce',
    'Brute Force -XSS': 'bruteforce', 'Bot': 'bot',
    'Infilteration': 'infiltration', 'SQL Injection': 'sql_injection', 'Benign': 'benign'
}

# ðŸ’¡ NEW: Define how many samples we want for our minority classes
augmentation_targets = {
    'infiltration': 10000,
    'sql_injection': 1000
}

for filepath in files_to_process:
    filename = os.path.basename(filepath)
    print(f"--> Processing {filename}...")
    df = pd.read_csv(filepath, low_memory=False)
    
    df = df.loc[:, ~df.columns.duplicated()]
    if 'Label' in df.columns and 'Label' in df['Label'].unique():
        df = df[df['Label'] != 'Label'].copy()
        
    new_cols = [re.sub(r'[^a-zA-Z0-9]+', '_', col).lower().strip('_') for col in df.columns]
    df.columns = new_cols
    
    if 'label' in df.columns:
        df['label'] = df['label'].map(label_map).fillna(df['label'])

        # --- ðŸ’¡ NEW: DATA AUGMENTATION / OVERSAMPLING LOGIC ---
        new_samples = []
        for label, target_count in augmentation_targets.items():
            if label in df['label'].values:
                label_df = df[df['label'] == label]
                current_count = len(label_df)
                
                if current_count > 0 and current_count < target_count:
                    # Calculate how many new samples to add
                    num_to_add = target_count - current_count
                    # Create new samples by duplicating existing ones
                    augmented_samples = label_df.sample(n=num_to_add, replace=True, random_state=42)
                    new_samples.append(augmented_samples)
                    print(f"    -> Augmenting '{label}' with {num_to_add} new samples.")
        
        # Add the new samples back to the dataframe
        if new_samples:
            df = pd.concat([df] + new_samples, ignore_index=True)
        # --- END OF NEW LOGIC ---

        class_counts = df['label'].value_counts()
        if len(class_counts) > 1:
            majority_class_name = class_counts.index[0]
            target_size = class_counts.iloc[1]
            df_majority = df[df['label'] == majority_class_name]
            df_others = df[df['label'] != majority_class_name]
            df_majority_downsampled = df_majority.sample(n=target_size, random_state=42)
            df_balanced = pd.concat([df_majority_downsampled, df_others])
            all_processed_dfs.append(df_balanced)
        else:
            all_processed_dfs.append(df)

print("\nâœ… All raw datasets have been processed, augmented, and balanced in memory.")

--- [Step 1/4] Starting Simplified Preprocessing with Data Augmentation ---
--> Processing dataset1.csv...
--> Processing dataset10.csv...
--> Processing dataset2.csv...
--> Processing dataset3.csv...
    -> Augmenting 'sql_injection' with 947 new samples.
--> Processing dataset4.csv...
--> Processing dataset5.csv...
--> Processing dataset6.csv...
--> Processing dataset7.csv...
    -> Augmenting 'sql_injection' with 966 new samples.
--> Processing dataset8.csv...
--> Processing dataset9.csv...

âœ… All raw datasets have been processed, augmented, and balanced in memory.


In [3]:
print("--- [Step 2/4] Merging and Final Bulletproof Cleaning ---")

# --- Merge all dataframes first ---
final_df = pd.concat(all_processed_dfs, ignore_index=True)
print(f"-> Combined dataset shape: {final_df.shape}")

# --- Identify feature and identifier columns ---
identifier_cols = ['src_ip', 'src_port', 'dst_ip', 'timestamp', 'flow_id']
label_col = 'label'
# Get all columns that are features for the model
feature_cols = [col for col in final_df.columns if col not in identifier_cols + [label_col]]

# --- Force all feature columns to numeric, coercing errors ---
print("--> Forcing feature columns to numeric type...")
final_df[feature_cols] = final_df[feature_cols].apply(pd.to_numeric, errors='coerce')

# --- Replace infinities, fill all NaNs, and clip large values ---
print("--> Cleaning and clipping data...")
final_df.replace([np.inf, -np.inf], np.nan, inplace=True)
final_df.fillna(0, inplace=True)
finfo = np.finfo(np.float32)
final_df[feature_cols] = final_df[feature_cols].clip(finfo.min, finfo.max)

print("\nâœ… Merging and final cleaning complete. Data is guaranteed to be valid.")

--- [Step 2/4] Merging and Final Bulletproof Cleaning ---
-> Combined dataset shape: (4347988, 84)
--> Forcing feature columns to numeric type...
--> Cleaning and clipping data...

âœ… Merging and final cleaning complete. Data is guaranteed to be valid.


In [4]:
print("--- Final Dataset Class Distribution ---")

# Use value_counts() on the 'label' column of your final DataFrame
label_counts = final_df['label'].value_counts()

print(label_counts)

--- Final Dataset Class Distribution ---
label
benign            2079779
dos_ddos          1436294
bot                286191
bruteforce         194201
SSH-Bruteforce     187589
infiltration       161934
sql_injection        2000
Name: count, dtype: int64


In [5]:
print("--- [Step 3/4] Preparing Data for Modeling ---")

# --- Select only the feature columns for X ---
feature_cols = [col for col in final_df.columns if col not in ['label'] + identifier_cols]
X = final_df[feature_cols]
y = final_df['label']

# Encode labels and split data
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print(f"-> Data split into training ({X_train.shape}) and testing ({X_test.shape}) sets.")

--- [Step 3/4] Preparing Data for Modeling ---
-> Data split into training ((3478390, 78)) and testing ((869598, 78)) sets.


In [6]:
print("--- [Step 4/4] Training and Evaluating Models ---")

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(random_state=42, n_jobs=-1, eval_metric='mlogloss'),
    "LightGBM": LGBMClassifier(random_state=42, n_jobs=-1)
}

for name, model in models.items():
    print(f"\n{'='*15} Training {name} {'='*15}")
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = time.time()
    print(f"-> Training completed in {((end_time - start_time) / 60):.2f} minutes.")
    
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions) * 100
    
    print(f"\n--- {name} Classification Report ---")
    print(classification_report(y_test, predictions, target_names=label_encoder.classes_))
    print(f"-> {name} Accuracy: {accuracy:.2f}%")

print("\nâœ… All models trained and evaluated.")

--- [Step 4/4] Training and Evaluating Models ---

-> Training completed in 3.25 minutes.

--- Random Forest Classification Report ---
                precision    recall  f1-score   support

SSH-Bruteforce       1.00      1.00      1.00     37518
        benign       0.95      0.98      0.96    415956
           bot       1.00      1.00      1.00     57238
    bruteforce       0.76      0.91      0.83     38840
      dos_ddos       0.99      0.96      0.97    287259
  infiltration       0.50      0.31      0.39     32387
 sql_injection       0.99      1.00      0.99       400

      accuracy                           0.95    869598
     macro avg       0.88      0.88      0.88    869598
  weighted avg       0.94      0.95      0.94    869598

-> Random Forest Accuracy: 94.60%

-> Training completed in 1.68 minutes.

--- XGBoost Classification Report ---
                precision    recall  f1-score   support

SSH-Bruteforce       1.00      1.00      1.00     37518
        benign      

In [8]:
import joblib
import os

print("--- [Final Step] Saving Trained Models and Encoder ---")

# This assumes the 'models' dictionary from the training cell is available
# and OUTPUT_DIR is set to 'models'

# Save each trained model from the dictionary
for name, model in models.items():
    filename = f"cyber_threat_model_{name.lower().replace(' ', '')}.joblib"
    output_path = os.path.join(OUTPUT_DIR, filename)
    joblib.dump(model, output_path)
    print(f"-> {name} model saved to: {output_path}")

# Save the label encoder, which is essential for decoding predictions later
encoder_filename = os.path.join(OUTPUT_DIR, 'label_encoder.joblib')
joblib.dump(label_encoder, encoder_filename)
print(f"-> Label encoder saved to: {encoder_filename}")

print("\nâœ… All baseline assets exported successfully!")

--- [Final Step] Saving Trained Models and Encoder ---
-> Random Forest model saved to: models\cyber_threat_model_randomforest.joblib
-> XGBoost model saved to: models\cyber_threat_model_xgboost.joblib
-> LightGBM model saved to: models\cyber_threat_model_lightgbm.joblib
-> Label encoder saved to: models\label_encoder.joblib

âœ… All baseline assets exported successfully!
