In [6]:
# Cell 1: Imports
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import dump_svmlight_file
import xgboost as xgb


In [7]:
# Cell 2: CUDA Check
from xgboost import XGBClassifier
print("✅ XGBoost version:", xgb.__version__)
try:
    test_model = XGBClassifier(tree_method='hist', device='cuda')
    print("🚀 GPU is available and will be used for training.")
except Exception as e:
    print("❌ GPU check failed:", e)


✅ XGBoost version: 3.0.2
🚀 GPU is available and will be used for training.


In [10]:
# Cell 3: Convert CSV logs to Multi-Class LibSVM format
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.datasets import dump_svmlight_file

folder_path = 'datasets'
output_libsvm = 'multiclass_dataset.libsvm'
csv_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

scaler = StandardScaler()
encoder = LabelEncoder()
first_chunk = True
label_list = []

chunk_count = 0

for file in csv_files:
    print(f"📥 Processing: {file}")
    chunks = pd.read_csv(file, chunksize=100_000)

    for chunk in chunks:
        try:
            chunk.columns = chunk.columns.str.strip()
            if 'Label' not in chunk.columns:
                continue

            # Drop unnecessary columns
            drop_cols = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
            chunk.drop(columns=[col for col in drop_cols if col in chunk.columns], inplace=True, errors='ignore')

            # Filter numeric + label
            chunk = chunk.select_dtypes(include=[np.number]).join(chunk['Label'])

            chunk.replace([np.inf, -np.inf], np.nan, inplace=True)
            chunk.dropna(inplace=True)

            # Encode multi-class labels
            labels = encoder.fit_transform(chunk['Label'])
            y = pd.Series(labels)
            label_list.extend(chunk['Label'].unique())  # collect original labels
            X = chunk.drop(columns=['Label'])

            # Scale features
            X_scaled = scaler.fit_transform(X.astype(np.float32)) if first_chunk else scaler.transform(X.astype(np.float32))

            # Dump to file with manual append handling
            mode = 'ab' if not first_chunk else 'wb'
            with open(output_libsvm, mode) as f:
                dump_svmlight_file(X_scaled, y, f, zero_based=True)

            first_chunk = False
            chunk_count += 1
            print(f"✅ Saved chunk {chunk_count} with {len(chunk)} rows")

        except Exception as e:
            print(f"❌ Chunk error: {e}")

print("✅ All chunks processed and saved as Multi-Class LibSVM")


📥 Processing: datasets\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
✅ Saved chunk 1 with 99984 rows
✅ Saved chunk 2 with 99994 rows
✅ Saved chunk 3 with 25733 rows
📥 Processing: datasets\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
✅ Saved chunk 4 with 99935 rows
✅ Saved chunk 5 with 99811 rows
✅ Saved chunk 6 with 86350 rows
📥 Processing: datasets\Friday-WorkingHours-Morning.pcap_ISCX.csv
✅ Saved chunk 7 with 99930 rows
✅ Saved chunk 8 with 90981 rows
📥 Processing: datasets\Monday-WorkingHours.pcap_ISCX.csv
✅ Saved chunk 9 with 99906 rows
✅ Saved chunk 10 with 99915 rows
✅ Saved chunk 11 with 99948 rows
✅ Saved chunk 12 with 99912 rows
✅ Saved chunk 13 with 99906 rows
✅ Saved chunk 14 with 29894 rows
📥 Processing: datasets\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
✅ Saved chunk 15 with 99964 rows
✅ Saved chunk 16 with 99886 rows
✅ Saved chunk 17 with 88545 rows
📥 Processing: datasets\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
✅ Saved chunk 18 

In [11]:
# Cell 4: Train XGBoost Multi-Class Model using full dataset (GPU)
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.datasets import load_svmlight_file

# Load dataset
print("📦 Loading full multi-class dataset from LibSVM format...")
X, y = load_svmlight_file("multiclass_dataset.libsvm")
print(f"✅ Loaded: {X.shape}")

# Detect number of classes
num_classes = len(set(y))
print(f"🔢 Number of classes detected: {num_classes}")

# Create DMatrix
dtrain = xgb.DMatrix(X, label=y)

# Define parameters
params = {
    'objective': 'multi:softprob',
    'num_class': num_classes,
    'eval_metric': 'mlogloss',
    'tree_method': 'hist',      # GPU tip: use 'hist' and add 'device': 'cuda'
    'device': 'cuda',
    'max_depth': 6,
    'eta': 0.3,
    'verbosity': 1
}

# Train model
print("🧠 Training multi-class XGBoost model with GPU...")
model = xgb.train(params, dtrain, num_boost_round=100)
print("✅ Model training complete.")

# Predict
print("🔍 Evaluating model...")
y_pred = model.predict(dtrain)
y_pred_labels = y_pred.argmax(axis=1)

# Report
acc = accuracy_score(y, y_pred_labels)
print(f"✅ Accuracy: {acc}")
print(classification_report(y, y_pred_labels))


📦 Loading full multi-class dataset from LibSVM format...
✅ Loaded: (2827876, 78)
🔢 Number of classes detected: 4
🧠 Training multi-class XGBoost model with GPU...
✅ Model training complete.
🔍 Evaluating model...
✅ Accuracy: 0.9962257892495994
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   2271320
         1.0       0.98      1.00      0.99    522473
         2.0       0.96      0.73      0.83     27635
         3.0       0.99      0.93      0.96      6448

    accuracy                           1.00   2827876
   macro avg       0.98      0.91      0.94   2827876
weighted avg       1.00      1.00      1.00   2827876



In [14]:
# Cell 5: Save Trained Multi-Class Model & Label Encoder
import joblib
import os

os.makedirs("ai/models", exist_ok=True)

# Define paths
model_path = os.path.join("ai", "models", "xgboost_multiclass.model")
encoder_path = os.path.join("ai", "models", "label_encoder.pkl")

# Save model and encoder
model.save_model(model_path)
joblib.dump(encoder, encoder_path)

print(f"✅ Model saved to: {model_path}")
print(f"✅ Label encoder saved to: {encoder_path}")


✅ Model saved to: ai\models\xgboost_multiclass.model
✅ Label encoder saved to: ai\models\label_encoder.pkl


  model.save_model(model_path)


In [16]:
# Cell 6: Load trained model and encoder for prediction
import os
import xgboost as xgb
import joblib
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import classification_report

# Correct paths
model_path = os.path.join("ai", "models", "xgboost_multiclass.model")
encoder_path = os.path.join("ai", "models", "label_encoder.pkl")
dataset_path = "multiclass_dataset.libsvm"  

# Load model and encoder
model = xgb.Booster()
model.load_model(model_path)
encoder = joblib.load(encoder_path)
print("✅ Model and label encoder loaded.")

# Load dataset
X, y_true = load_svmlight_file(dataset_path)

# Predict
dtest = xgb.DMatrix(X)
y_pred_prob = model.predict(dtest)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluation
print("📊 Classification Report:")
print(classification_report(y_true, y_pred))



✅ Model and label encoder loaded.
📊 Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   2271320
         1.0       0.98      1.00      0.99    522473
         2.0       0.96      0.73      0.83     27635
         3.0       0.99      0.93      0.96      6448

    accuracy                           1.00   2827876
   macro avg       0.98      0.91      0.94   2827876
weighted avg       1.00      1.00      1.00   2827876

