In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import joblib
import json
from joblib import Parallel, delayed
import os
from datetime import datetime


# Quieten third-party noise. The specs are applied in the listed order:
# UserWarning category, xgboost module, lightgbm module, FutureWarning category.
for _filter_spec in (
    {"category": UserWarning},
    {"module": "xgboost"},
    {"module": "lightgbm"},
    {"category": FutureWarning},
):
    warnings.filterwarnings("ignore", **_filter_spec)

# Suppress TensorFlow warnings
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Folder that receives the trained artefacts; created on first run.
models_dir = "models"
os.makedirs(models_dir, exist_ok=True)

In [2]:
# Filesystem layout: input documents and persisted model artefacts.
DOCS_DIR, MODEL_DIR = './docs', './models'
# CSV read by the data-loading cell.
sample_file = DOCS_DIR + '/preprocessed_data.csv'

In [3]:
# === Loader for the persisted scaler plus one trained model ===
def load_model(model_name, models_dir="./models"):
    """
    Fetch the fitted scaler and a trained model from disk.

    Parameters:
    model_name (str): Human-readable model name (e.g., 'Random Forest', 'XGBoost').
        It is lower-cased and spaces become underscores to form '<name>.pkl'.
    models_dir (str): Folder holding the pickled artefacts.

    Returns:
    tuple: (scaler, model)

    Raises:
    FileNotFoundError: If the model file or 'scaler.pkl' is missing
        (the model file is checked first).
    """
    # Derive both artefact locations from the directory and the model name.
    file_stem = model_name.lower().replace(' ', '_')
    model_path = os.path.join(models_dir, file_stem + ".pkl")
    scaler_path = os.path.join(models_dir, "scaler.pkl")

    # Fail fast with a clear message before attempting to unpickle anything.
    model_missing = not os.path.exists(model_path)
    if model_missing:
        raise FileNotFoundError(f"❌ Model file not found: {model_path}")
    scaler_missing = not os.path.exists(scaler_path)
    if scaler_missing:
        raise FileNotFoundError(f"❌ Scaler file not found: {scaler_path}")

    # joblib.load unpickles the files, so only point this at trusted artefacts.
    # The scaler is read first, then the model.
    return joblib.load(scaler_path), joblib.load(model_path)

# 💡 Usage hint printed when the cell runs
print(
    "\n💡 To load a saved model and scaler, use:",
    "   scaler, model = load_model('Random Forest')",
    sep="\n",
)



💡 To load a saved model and scaler, use:
   scaler, model = load_model('Random Forest')


In [4]:
# Integer class id -> human-readable traffic label (id = position in the list).
label_mapping = dict(enumerate([
    "BENIGN",
    "Bot",
    "DDoS",
    "DoS GoldenEye",
    "DoS Hulk",
    "DoS Slowhttptest",
    "DoS slowloris",
    "FTP-Patator",
    "Heartbleed",
    "Infiltration",
    "PortScan",
    "SSH-Patator",
    "Web Attack - Brute Force",
    "Web Attack - Sql Injection",
    "Web Attack - XSS",
]))

In [8]:
# Load the dataset from the CSV path held in `sample_file`.
df = pd.read_csv(sample_file)

In [11]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565423 entries, 0 to 565422
Data columns (total 16 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   total_length_of_fwd_packets  565423 non-null  float64
 1   total_length_of_bwd_packets  565423 non-null  float64
 2   fwd_packet_length_max        565423 non-null  float64
 3   bwd_packet_length_max        565423 non-null  float64
 4   bwd_packet_length_mean       565423 non-null  float64
 5   max_packet_length            565423 non-null  float64
 6   packet_length_mean           565423 non-null  float64
 7   packet_length_std            565423 non-null  float64
 8   packet_length_variance       565423 non-null  float64
 9   average_packet_size          565423 non-null  float64
 10  avg_bwd_segment_size         565423 non-null  float64
 11  subflow_fwd_bytes            565423 non-null  float64
 12  subflow_bwd_bytes            565423 non-null  float64
 13 

In [12]:
# Every column except the last is a feature; the last column is the target.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

In [13]:
# Display the feature frame (all columns of `df` except the last).
x

Unnamed: 0,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,bwd_packet_length_max,bwd_packet_length_mean,max_packet_length,packet_length_mean,packet_length_std,packet_length_variance,average_packet_size,avg_bwd_segment_size,subflow_fwd_bytes,subflow_bwd_bytes,init_win_bytes_forward,init_win_bytes_backward
0,0.0,0.0,0.0,0.0,0.0000,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,264.0,-1.0
1,62.0,208.0,31.0,104.0,104.0000,104.0,60.200000,39.983747,1598.700000,75.250000,104.0000,62.0,208.0,-1.0,-1.0
2,12.0,0.0,6.0,0.0,0.0000,6.0,6.000000,0.000000,0.000000,9.000000,0.0000,12.0,0.0,258.0,-1.0
3,80.0,112.0,40.0,56.0,56.0000,56.0,46.400000,8.763561,76.800000,58.000000,56.0000,80.0,112.0,-1.0,-1.0
4,57.0,173.0,57.0,173.0,173.0000,173.0,95.666667,66.972631,4485.333333,143.500000,173.0000,57.0,173.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565418,0.0,0.0,0.0,0.0,0.0000,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,243.0,290.0
565419,43.0,171.0,43.0,171.0,171.0000,171.0,85.666667,73.900834,5461.333333,128.500000,171.0000,43.0,171.0,-1.0,-1.0
565420,1176.0,5339.0,693.0,1448.0,667.3750,1448.0,296.136364,506.385257,256426.028139,310.238095,667.3750,1176.0,5339.0,65535.0,246.0
565421,5823.0,7281.0,2912.0,1448.0,455.0625,2912.0,468.000000,859.325101,738439.629600,485.333333,455.0625,5823.0,7281.0,29200.0,75.0


In [14]:
# Display the target series (the last column of `df`).
y

0         0
1         0
2         0
3         0
4         0
         ..
565418    0
565419    0
565420    0
565421    0
565422    0
Name: label, Length: 565423, dtype: int64

In [15]:
# Standardise the feature columns and keep the fitted scaler for later reuse.
# NOTE(review): the scaler is fitted on every row of `x`, i.e. before any
# train/test split, so rows later held out for testing contribute to the
# scaling statistics.
scaler = StandardScaler().fit(x)
X_scaled = scaler.transform(x)

# Persist the fitted scaler next to the trained models.
joblib.dump(scaler, f"{MODEL_DIR}/scaler.pkl")
print("Scaler saved to models/scaler.pkl")


Scaler saved to models/scaler.pkl


In [16]:
# Display the standardised feature array produced by the scaler.
X_scaled

array([[-0.07876342, -0.00722908, -0.28899089, ..., -0.00722949,
        -0.4684919 , -0.23464292],
       [-0.06988111, -0.00714797, -0.24586022, ..., -0.00714837,
        -0.48702575, -0.23464292],
       [-0.07704426, -0.00722908, -0.28064302, ..., -0.00722949,
        -0.46891153, -0.23464292],
       ...,
       [ 0.08971393, -0.00514712,  0.67518827, ..., -0.00514718,
         4.09650071, -0.20523186],
       [ 0.75545735, -0.00438984,  3.76250943, ..., -0.00438976,
         1.55526484, -0.22559336],
       [-0.07117048, -0.00718384, -0.21525136, ..., -0.00718425,
        -0.48702575, -0.23464292]], shape=(565423, 15))

In [17]:
# Split the RAW features first, then fit the scaler on the training rows only.
# Fitting the scaler on the full dataset before splitting lets test-set
# statistics leak into the features the models are trained and scored on.
# The split uses the same stratification and seed, so the same rows land in
# each partition as when splitting pre-scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Persist the train-only scaler so inference applies exactly the scaling the
# models were trained with.
joblib.dump(scaler, f"{MODEL_DIR}/scaler.pkl")

In [None]:
# Single seed shared by every estimator that accepts one, so runs are repeatable
# and the value is changed in one place.
RANDOM_STATE = 42

# Candidate classifiers keyed by display name; the name also determines the
# file name each fitted model is saved under.
# NOTE(review): several estimators request all cores (n_jobs=-1) and are also
# dispatched through an outer Parallel(n_jobs=-1) — confirm this does not
# oversubscribe the CPUs.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', n_jobs=-1, random_state=RANDOM_STATE, verbosity=0),
    "LightGBM": LGBMClassifier(n_jobs=-1, random_state=RANDOM_STATE, verbose=-1, force_col_wise=True)
}

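# Sequential training (disabled, kept for reference). It calls a `train_and_evaluate`
# helper that is not defined in the visible cells, so it cannot be re-enabled as-is.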
# results = []
# model_paths = []
# print("Starting model training and evaluation...\n")
# for name, model in tqdm(models.items(), desc="Overall Progress", unit="model"):
#     result, model_path = train_and_evaluate(name, model, X_train, y_train, X_test, y_test)
#     results.append(result)
#     model_paths.append({"Model": name, "Path": model_path})

# Parallel training with model saving and tracking
def train_and_evaluate_parallel(name, model):
    """
    Fit one estimator, score it on the held-out split, and pickle it.

    Reads the notebook-level `X_train`, `y_train`, `X_test`, `y_test` and
    `models_dir`.

    Parameters:
    name (str): Display name of the model. It also determines the file name:
        lower-cased, spaces replaced by underscores, with a '.pkl' suffix.
    model: Unfitted estimator exposing `fit` and `predict`.

    Returns:
    dict: Keys "Model", "Accuracy", "Precision", "Recall", "F1-Score" and
        "Path". Precision, recall and F1 are macro-averaged with
        zero_division=0.
    """
    print(f"Training {name} in parallel...")

    # LightGBM is given DataFrames: when the splits are plain arrays they are
    # wrapped with synthetic 'feature_<i>' column names. Every other model
    # (and LightGBM when the splits already have columns) uses them as-is.
    fit_features, eval_features = X_train, X_test
    if name == "LightGBM" and not hasattr(X_train, 'columns'):
        feature_names = [f'feature_{i}' for i in range(X_train.shape[1])]
        fit_features = pd.DataFrame(X_train, columns=feature_names)
        eval_features = pd.DataFrame(X_test, columns=feature_names)

    model.fit(fit_features, y_train)
    y_pred = model.predict(eval_features)

    # Save model
    model_filename = f"{name.replace(' ', '_').lower()}.pkl"
    model_path = os.path.join(models_dir, model_filename)
    joblib.dump(model, model_path)

    # Score once and reuse the value for both the log line and the result row.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"✓ {name} completed - Accuracy: {accuracy:.4f} - Saved to {model_path}")

    return {
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
        "F1-Score": f1_score(y_test, y_pred, average='macro', zero_division=0),
        "Path": model_path
    }

def _print_banner(title, width=60):
    """Print a blank line, then `title` framed between two '=' rules."""
    rule = "=" * width
    print("\n" + rule)
    print(title)
    print(rule)


# Dispatch one training task per (name, estimator) pair; each task returns a
# metrics dict that also carries the path of the pickled model.
print("Training models in parallel...\n")
model_items = list(models.items())
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(train_and_evaluate_parallel)(name, model) for name, model in model_items
)

# Keep only the name -> file location pairs for the "saved files" report.
model_paths = [{key: entry[key] for key in ("Model", "Path")} for entry in results]

# Leaderboard: metrics only, best accuracy first.
_print_banner("MODEL EVALUATION RESULTS")
results_df = (
    pd.DataFrame(results)
    .drop(columns="Path")
    .sort_values(by="Accuracy", ascending=False)
)
print(results_df.to_string(index=False, float_format='%.4f'))

# The first row is the winner because of the descending sort above.
best_row = results_df.iloc[0]
print(f"\nBest performing model: {best_row['Model']}")
print(f"Best accuracy: {best_row['Accuracy']:.4f}")

# Both CSV summaries share one timestamp so they can be matched up later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

results_filename = f"model_evaluation_results_{timestamp}.csv"
results_path = os.path.join(models_dir, results_filename)
results_df.to_csv(results_path, index=False)

paths_filename = f"model_paths_{timestamp}.csv"
paths_path = os.path.join(models_dir, paths_filename)
model_paths_df = pd.DataFrame(model_paths)
model_paths_df.to_csv(paths_path, index=False)

# Final report of everything written to disk.
_print_banner("SAVED FILES")
print("Trained models saved:")
for model_info in model_paths:
    print(f"  • {model_info['Model']}: {model_info['Path']}")

print(f"\nResults summary saved to: {results_path}")
print(f"Model paths saved to: {paths_path}")
print(f"\n📁 All files saved in the '{models_dir}' directory")


Training models in parallel...



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:   52.7s remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.5min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:  1.8min remaining:   53.7s


In [5]:
# === Step 1: Load the persisted scaler and the Random Forest model ===
scaler, model = load_model("Random Forest")

# === Step 2: One hand-written sample, keyed by feature name ===
input_json = {
    "total_length_of_fwd_packets": 369,
    "total_length_of_bwd_packets": 11595,
    "fwd_packet_length_max": 369,
    "bwd_packet_length_max": 8688,
    "bwd_packet_length_mean": 2898.75,
    "max_packet_length": 8688,
    "packet_length_mean": 854.5714,
    "packet_length_std": 2383.3406,
    "packet_length_variance": 5680312.418,
    "average_packet_size": 920.3077,
    "avg_bwd_segment_size": 2898.75,
    "subflow_fwd_bytes": 369,
    "subflow_bwd_bytes": 11595,
    "init_win_bytes_forward": 251,
    "init_win_bytes_backward": 235
}

# === Step 3: Wrap the sample in a one-row DataFrame ===
input_df = pd.DataFrame([input_json])

# === Step 4: Apply the loaded scaler ===
input_scaled = scaler.transform(input_df)

# === Step 5: Predict the class, plus probabilities when the model offers them ===
y_pred = model.predict(input_scaled)
y_proba = None
if hasattr(model, "predict_proba"):
    y_proba = model.predict_proba(input_scaled)

# === Step 6: Translate the class id into its label ===
predicted_label = y_pred[0]
predicted_class_name = label_mapping.get(predicted_label, "Unknown")

# === Step 7: Show the outcome ===
print(f"Predicted Class is {y_pred} which is {predicted_class_name}")

if y_proba is not None:
    print("Class Probabilities:", y_proba[0])


Predicted Class is [4] which is DoS Hulk
Class Probabilities: [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [7]:

# === Step 1: Load model and scaler ===
scaler, model = load_model("Random Forest")

# === Step 2: Read the labelled samples used by the prediction helpers ===
# The encoding is given explicitly: JSON text is UTF-8, and the platform
# default encoding may differ (e.g. on Windows), which would mis-decode any
# non-ASCII character in the file.
with open(f"{DOCS_DIR}/test.json", "r", encoding="utf-8") as f:
    input_data = json.load(f)

# === Step 3: Function to predict for a single sample ===
def predict_threat(threat_type):
    """
    Predict the threat class for one named sample and print a report.

    Reads the notebook-level `input_data`, `scaler`, `model` and
    `label_mapping`.

    Parameters:
    threat_type (str): Key of the sample in `input_data`. It is also treated
        as the expected label when reporting whether the prediction is correct.

    Returns:
    None. All results are printed. An unknown `threat_type` prints an error
    message and returns without predicting.
    """
    if threat_type not in input_data:
        print(f"Error: '{threat_type}' not found in input data")
        return

    # One-row frame for the selected sample, scaled with the loaded scaler.
    sample_df = pd.DataFrame([input_data[threat_type]])
    sample_scaled = scaler.transform(sample_df)

    # Predict the class, plus per-class probabilities when the model has them.
    y_pred = model.predict(sample_scaled)
    y_proba = model.predict_proba(sample_scaled) if hasattr(model, "predict_proba") else None

    predicted_class = y_pred[0]
    threat_label = label_mapping.get(predicted_class, "Unknown")

    # Output results
    print(f"\n=== Prediction for {threat_type} ===")
    print("Predicted Class ID:", predicted_class)
    print("Predicted Threat Type:", threat_label)
    print("Actual Threat Type:", threat_type)
    print("Prediction Correct:", threat_label == threat_type)

    if y_proba is not None:
        # Probability columns follow `model.classes_`, which is not guaranteed
        # to be 0..n-1, so label each column by its class id rather than its
        # position. Fall back to positions for models without `classes_`.
        class_ids = getattr(model, "classes_", range(len(y_proba[0])))
        print("\nClass Probabilities:")
        for class_id, prob in zip(class_ids, y_proba[0]):
            print(f" {class_id} - {label_mapping.get(class_id, class_id)}: {prob:.4f}")

# === Step 4: Run the single-sample prediction over every stored sample ===
def predict_all_threats():
    """Print a header, then call `predict_threat` once per key of `input_data`, in order."""
    print("=== Predicting all threat types ===")

    for sample_name in input_data:
        predict_threat(sample_name)

# === Step 5: Function to create DataFrame from all samples for batch prediction ===
def batch_predict():
    """
    Predict every sample in `input_data` in a single batch and print the results.

    Reads the notebook-level `input_data`, `scaler`, `model` and
    `label_mapping`. Each key of `input_data` is treated as the expected label
    of its sample.

    Returns:
    None. Results are printed, one line per sample, followed by the classes
    whose probability exceeds 0.1 when the model provides probabilities.
    """
    if not input_data:
        # Nothing to scale or predict; avoid failing on an empty frame.
        print("No samples available for batch prediction")
        return

    # Keys are the expected labels; values are the per-sample feature dicts.
    labels = list(input_data.keys())
    samples = list(input_data.values())

    # Create DataFrame and scale it with the loaded scaler.
    input_df = pd.DataFrame(samples)
    input_scaled = scaler.transform(input_df)

    # Predict
    y_pred = model.predict(input_scaled)
    y_proba = model.predict_proba(input_scaled) if hasattr(model, "predict_proba") else None

    class_ids = None
    if y_proba is not None:
        # Probability columns follow `model.classes_`, not necessarily 0..n-1.
        # Fall back to positions for models without `classes_`.
        class_ids = getattr(model, "classes_", range(len(y_proba[0])))

    print("\n=== Batch Prediction Results ===")
    for i, (actual, predicted_id) in enumerate(zip(labels, y_pred)):
        predicted_label = label_mapping.get(predicted_id, "Unknown")
        verdict = '✓' if actual == predicted_label else '✗'
        print(f"Sample {i+1}: {actual} -> Predicted: {predicted_label} (ID: {predicted_id}) - {verdict}")

        if y_proba is not None:
            # `.get` keeps an unmapped class id printable instead of raising KeyError.
            top = ', '.join(
                f'{label_mapping.get(class_id, class_id)}: {prob:.3f}'
                for class_id, prob in zip(class_ids, y_proba[i])
                if prob > 0.1
            )
            print(f"  Top probabilities: {top}")

# === Example Usage ===
# The guarded body runs only when the module name is "__main__".
if __name__ == "__main__":
    # Predict for a single threat type (the "Bot" sample from the input data)
    predict_threat("Bot")
    
    # Or predict for all threat types
    # predict_all_threats()
    
    # Or do batch prediction
    # batch_predict()


=== Prediction for Bot ===
Predicted Class ID: 1
Predicted Threat Type: Bot
Actual Threat Type: Bot
Prediction Correct: True

Class Probabilities:
 0 - BENIGN: 0.0989
 1 - Bot: 0.9011
 2 - DDoS: 0.0000
 3 - DoS GoldenEye: 0.0000
 4 - DoS Hulk: 0.0000
 5 - DoS Slowhttptest: 0.0000
 6 - DoS slowloris: 0.0000
 7 - FTP-Patator: 0.0000
 8 - Heartbleed: 0.0000
 9 - Infiltration: 0.0000
 10 - PortScan: 0.0000
 11 - SSH-Patator: 0.0000
 12 - Web Attack - Brute Force: 0.0000
 13 - Web Attack - Sql Injection: 0.0000
 14 - Web Attack - XSS: 0.0000
