In [6]:
# Cell 1: Imports
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import dump_svmlight_file
import xgboost as xgb


In [7]:
# Cell 2: CUDA Check
from xgboost import XGBClassifier
print("✅ XGBoost version:", xgb.__version__)
try:
    test_model = XGBClassifier(tree_method='hist', device='cuda')
    print("🚀 GPU is available and will be used for training.")
except Exception as e:
    print("❌ GPU check failed:", e)


✅ XGBoost version: 3.0.2
🚀 GPU is available and will be used for training.


In [4]:
# Cell 3: Convert CSV logs to Multi-Class LibSVM format (with global label encoder)
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.datasets import dump_svmlight_file

folder_path = 'datasets'
output_libsvm = 'multiclass_dataset.libsvm'
csv_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Phase 1: Collect all labels first
print("🔍 Scanning all labels...")
all_labels = []
for file in csv_files:
    chunks = pd.read_csv(file, chunksize=100_000)
    for chunk in chunks:
        chunk.columns = chunk.columns.str.strip()
        if 'Label' in chunk.columns:
            all_labels.extend(chunk['Label'].dropna().unique())
all_labels = pd.Series(all_labels).unique()

# Fit encoder on all unique labels
encoder = LabelEncoder()
encoder.fit(all_labels)
print(f"✅ Label encoder fitted with classes: {list(encoder.classes_)}")

# Phase 2: Process and save dataset
scaler = StandardScaler()
first_chunk = True
chunk_count = 0

for file in csv_files:
    chunks = pd.read_csv(file, chunksize=100_000)
    for chunk in chunks:
        try:
            chunk.columns = chunk.columns.str.strip()
            if 'Label' not in chunk.columns:
                continue

            # Drop extra columns to avoid join/merge confusion
            drop_cols = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
            chunk.drop(columns=[col for col in drop_cols if col in chunk.columns], inplace=True, errors='ignore')

            # Keep only numeric + label
            chunk = chunk.select_dtypes(include=[np.number]).join(chunk['Label'])

            chunk.replace([np.inf, -np.inf], np.nan, inplace=True)
            chunk.dropna(inplace=True)

            # Encode labels using pre-fitted encoder
            y = encoder.transform(chunk['Label'])
            X = chunk.drop(columns=['Label'])

            X_scaled = scaler.fit_transform(X.astype(np.float32)) if first_chunk else scaler.transform(X.astype(np.float32))

            # Save as LibSVM
            mode = 'wb' if first_chunk else 'ab'
            with open(output_libsvm, mode) as f:
                dump_svmlight_file(X_scaled, y, f, zero_based=True)

            first_chunk = False
            chunk_count += 1
            print(f"✅ Saved chunk {chunk_count} with {len(chunk)} rows")

        except Exception as e:
            print(f"❌ Chunk error: {e}")

print("✅ All chunks processed and saved as Multi-Class LibSVM")



🔍 Scanning all labels...
✅ Label encoder fitted with classes: ['BENIGN', 'Bot', 'DDoS', 'DoS GoldenEye', 'DoS Hulk', 'DoS Slowhttptest', 'DoS slowloris', 'FTP-Patator', 'Heartbleed', 'Infiltration', 'PortScan', 'SSH-Patator', 'Web Attack � Brute Force', 'Web Attack � Sql Injection', 'Web Attack � XSS']
✅ Saved chunk 1 with 99984 rows
✅ Saved chunk 2 with 99994 rows
✅ Saved chunk 3 with 25733 rows
✅ Saved chunk 4 with 99935 rows
✅ Saved chunk 5 with 99811 rows
✅ Saved chunk 6 with 86350 rows
✅ Saved chunk 7 with 99930 rows
✅ Saved chunk 8 with 90981 rows
✅ Saved chunk 9 with 99906 rows
✅ Saved chunk 10 with 99915 rows
✅ Saved chunk 11 with 99948 rows
✅ Saved chunk 12 with 99912 rows
✅ Saved chunk 13 with 99906 rows
✅ Saved chunk 14 with 29894 rows
✅ Saved chunk 15 with 99964 rows
✅ Saved chunk 16 with 99886 rows
✅ Saved chunk 17 with 88545 rows
✅ Saved chunk 18 with 99910 rows
✅ Saved chunk 19 with 70321 rows
✅ Saved chunk 20 with 99929 rows
✅ Saved chunk 21 with 99943 rows
✅ Saved chun

In [5]:
# Cell 4: Train XGBoost Multi-Class Model using full dataset (GPU)
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.datasets import load_svmlight_file

# Load dataset
print("📦 Loading full multi-class dataset from LibSVM format...")
X, y = load_svmlight_file("multiclass_dataset.libsvm")
print(f"✅ Loaded: {X.shape}")

# Detect number of classes
num_classes = len(set(y))
print(f"🔢 Number of classes detected: {num_classes}")

# Convert to DMatrix for XGBoost
dtrain = xgb.DMatrix(X, label=y)

# Define training parameters
params = {
    'objective': 'multi:softprob',
    'num_class': num_classes,
    'eval_metric': 'mlogloss',
    'tree_method': 'hist',       # Tip: For GPU support, use 'hist' + 'device': 'cuda'
    'device': 'cuda',
    'max_depth': 6,
    'eta': 0.3,
    'verbosity': 1
}

# Train the model
print("🧠 Training multi-class XGBoost model with GPU...")
model = xgb.train(params, dtrain, num_boost_round=100)
print("✅ Model training complete.")

# Predict and evaluate
print("🔍 Evaluating model...")
y_pred = model.predict(dtrain)
y_pred_labels = y_pred.argmax(axis=1)

# Evaluation metrics
acc = accuracy_score(y, y_pred_labels)
print(f"✅ Accuracy: {acc}")
print(classification_report(y, y_pred_labels))


📦 Loading full multi-class dataset from LibSVM format...
✅ Loaded: (2827876, 78)
🔢 Number of classes detected: 15
🧠 Training multi-class XGBoost model with GPU...
✅ Model training complete.
🔍 Evaluating model...
✅ Accuracy: 0.9991806571433826
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   2271320
         1.0       0.96      0.79      0.87      1956
         2.0       1.00      1.00      1.00    128025
         3.0       1.00      1.00      1.00     10293
         4.0       1.00      1.00      1.00    230124
         5.0       0.99      1.00      0.99      5499
         6.0       1.00      1.00      1.00      5796
         7.0       1.00      1.00      1.00      7935
         8.0       1.00      1.00      1.00        11
         9.0       1.00      1.00      1.00        36
        10.0       0.99      1.00      1.00    158804
        11.0       1.00      1.00      1.00      5897
        12.0       0.79      0.95      0.86      1507


In [6]:
# Cell 5: Save Trained Multi-Class Model & Label Encoder
import joblib
import os

os.makedirs("ai/models", exist_ok=True)

# Define paths
model_path = os.path.join("ai", "models", "xgboost_multiclass.model")
encoder_path = os.path.join("ai", "models", "label_encoder.pkl")

# Save model and encoder
model.save_model(model_path)
joblib.dump(encoder, encoder_path)

print(f"✅ Model saved to: {model_path}")
print(f"✅ Label encoder saved to: {encoder_path}")


✅ Model saved to: ai\models\xgboost_multiclass.model
✅ Label encoder saved to: ai\models\label_encoder.pkl


  model.save_model(model_path)


In [7]:
# Cell 6: Load trained model and encoder for prediction
import os
import xgboost as xgb
import joblib
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import classification_report

# Correct paths
model_path = os.path.join("ai", "models", "xgboost_multiclass.model")
encoder_path = os.path.join("ai", "models", "label_encoder.pkl")
dataset_path = "multiclass_dataset.libsvm"  

# Load model and encoder
model = xgb.Booster()
model.load_model(model_path)
encoder = joblib.load(encoder_path)
print("✅ Model and label encoder loaded.")

# Load dataset
X, y_true = load_svmlight_file(dataset_path)

# Predict
dtest = xgb.DMatrix(X)
y_pred_prob = model.predict(dtest)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluation
print("📊 Classification Report:")
print(classification_report(y_true, y_pred))



✅ Model and label encoder loaded.
📊 Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   2271320
         1.0       0.96      0.79      0.87      1956
         2.0       1.00      1.00      1.00    128025
         3.0       1.00      1.00      1.00     10293
         4.0       1.00      1.00      1.00    230124
         5.0       0.99      1.00      0.99      5499
         6.0       1.00      1.00      1.00      5796
         7.0       1.00      1.00      1.00      7935
         8.0       1.00      1.00      1.00        11
         9.0       1.00      1.00      1.00        36
        10.0       0.99      1.00      1.00    158804
        11.0       1.00      1.00      1.00      5897
        12.0       0.79      0.95      0.86      1507
        13.0       1.00      1.00      1.00        21
        14.0       0.79      0.43      0.56       652

    accuracy                           1.00   2827876
   macro avg       0.

In [10]:
# Cell 7: Predict, Score, Flag, and Save Results (Fixed)
import xgboost as xgb
import joblib
import pandas as pd
import numpy as np
import os
from sklearn.datasets import load_svmlight_file

# Paths
model_path = os.path.join("ai", "models", "xgboost_multiclass.model")
encoder_path = os.path.join("ai", "models", "label_encoder.pkl")
dataset_path = "multiclass_dataset.libsvm"

# Load model and encoder
model = xgb.Booster()
model.load_model(model_path)
encoder = joblib.load(encoder_path)
print("✅ Model and encoder loaded")

# Load dataset
X, y_true = load_svmlight_file(dataset_path)
dtest = xgb.DMatrix(X)

# Predict probabilities
y_probs = model.predict(dtest)
y_pred = np.argmax(y_probs, axis=1)

# Map encoded class numbers to string labels
unique_class_indices = np.unique(y_pred)
class_labels = encoder.inverse_transform(unique_class_indices)
label_mapping = dict(zip(unique_class_indices, class_labels))

predicted_labels = [label_mapping[i] for i in y_pred]
true_labels = encoder.inverse_transform(y_true.astype(int))

# Compute threat scores: confidence * 100 only for non-benign, else 0
threat_scores = []
for i, pred_class in enumerate(y_pred):
    label = label_mapping[pred_class]
    score = float(np.max(y_probs[i])) * 100 if label != "BENIGN" else 0.0
    threat_scores.append(round(score, 2))

# Flag if score ≥ 75 and not BENIGN
is_flagged = [(label != "BENIGN") and (score >= 75) for label, score in zip(predicted_labels, threat_scores)]

# Create results DataFrame
df_results = pd.DataFrame({
    'true_label': true_labels,
    'predicted_label': predicted_labels,
    'threat_score': threat_scores,
    'is_flagged': is_flagged
})

# Add class-wise confidence columns
for i, label in label_mapping.items():
    df_results[f'conf_{label}'] = (y_probs[:, i] * 100).round(2)

# Save results
output_path = os.path.join("ai", "predictions_with_flags.csv")
df_results.to_csv(output_path, index=False)
print(f"✅ Predictions saved to: {output_path}")


✅ Model and encoder loaded
✅ Predictions saved to: ai\predictions_with_flags.csv
