## Step 1: Install PySpark and Import Libraries

In [1]:
# Install PySpark (Colab only)
!pip install pyspark -q

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics
import time
import json
import os

print("‚úÖ Libraries imported successfully")

‚úÖ Libraries imported successfully


## Step 2: Mount Google Drive

In [2]:
# Mount Google Drive when running on Colab; otherwise use a local data directory.
#
# Only a missing `google.colab` module is treated as "running locally". A mount
# that fails inside Colab now raises, instead of silently switching BASE_DIR to
# a local path that does not exist on the Colab VM.
try:
    from google.colab import drive
except ImportError:
    IS_COLAB = False
    # The local default can be overridden with the NIDS_DATA_DIR environment
    # variable so the notebook is not tied to one machine's directory layout.
    BASE_DIR = os.environ.get(
        "NIDS_DATA_DIR",
        "d:/Coding/real-time-network-intrusion-detection-spark-kafka/data",
    )
    print(f"‚úÖ Running locally. Data directory: {BASE_DIR}")
else:
    drive.mount('/content/drive')
    IS_COLAB = True
    BASE_DIR = "/content/drive/MyDrive/NetworkIDS"
    print(f"‚úÖ Google Drive mounted successfully!")

# Define paths (input Parquet dataset and output directory for saved models)
DATA_PATH = f"{BASE_DIR}/output/parquet/cicids_merged_harmonized"
MODEL_DIR = f"{BASE_DIR}/output/models"
os.makedirs(MODEL_DIR, exist_ok=True)  # no error if the directory already exists

print(f"üìÇ Data path: {DATA_PATH}")
print(f"üìÇ Model directory: {MODEL_DIR}")

Mounted at /content/drive
‚úÖ Google Drive mounted successfully!
üìÇ Data path: /content/drive/MyDrive/NetworkIDS/output/parquet/cicids_merged_harmonized
üìÇ Model directory: /content/drive/MyDrive/NetworkIDS/output/models


## Step 3: Create Spark Session

In [3]:
# Build (or reuse) the Spark session used for all training cells.
# Everything runs in local mode on all available cores.
spark = (
    SparkSession.builder
    .appName("NIDS-ModelTraining")
    .config("spark.driver.memory", "10g")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Smaller Parquet read batches with the vectorized reader switched off.
    # NOTE(review): presumably chosen to limit memory use on Colab - confirm.
    .config("spark.sql.parquet.columnarReaderBatchSize", "1024")
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    .master("local[*]")
    .getOrCreate()
)

# Keep Spark's own logging quiet so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")
print("‚úÖ Spark session created")
print("üìä Spark version: {}".format(spark.version))

‚úÖ Spark session created
üìä Spark version: 3.5.1


## Step 4: Load Harmonized Dataset

In [4]:
# Read the harmonized Parquet dataset and report the wall-clock time of the read call.
print("Loading harmonized dataset...")
start_time = time.time()

df = spark.read.parquet(DATA_PATH)

# Basic info: elapsed time, column count, and the types of the columns used for training
print("‚úÖ Dataset loaded in {:.2f} seconds".format(time.time() - start_time))
print("üìä Columns: {}".format(len(df.columns)))
print("\nSchema (key columns):")
for col_name in ('features_scaled', 'binary_label', 'unified_label', 'sample_weight', 'multiclass_weight'):
    # Skip names that are absent from the dataset instead of failing on the lookup.
    if col_name not in df.columns:
        continue
    print("  - {}: {}".format(col_name, df.schema[col_name].dataType))

Loading harmonized dataset...
‚úÖ Dataset loaded in 10.49 seconds
üìä Columns: 34

Schema (key columns):
  - features_scaled: VectorUDT()
  - binary_label: IntegerType()
  - unified_label: IntegerType()
  - sample_weight: DoubleType()
  - multiclass_weight: DoubleType()


In [5]:
# Row counts per class for both label columns.
print("Binary Label Distribution:")
df.groupBy('binary_label') \
    .count() \
    .show()

print("\nUnified Label Distribution:")
# Sorted by label id; show at most 10 rows.
df.groupBy('unified_label') \
    .count() \
    .orderBy('unified_label') \
    .show(10)

Binary Label Distribution:
+------------+--------+
|binary_label|   count|
+------------+--------+
|           1| 2779281|
|           0|15484134|
+------------+--------+


Unified Label Distribution:
+-------------+--------+
|unified_label|   count|
+-------------+--------+
|            0|15484134|
|            1|  699820|
|            2|  705921|
|            3|  165820|
|            4|     928|
|            5|  161095|
|            6|  284263|
|            7|   90819|
|            8|  670615|
+-------------+--------+



## Step 5: Prepare Data for Training

We use a seeded random 80/20 split (`randomSplit`). The split is not stratified, so class proportions in the train and test sets are preserved only approximately.

In [6]:
# Keep only the columns the models read (features, both labels, both weight
# columns) so that less data is carried through training.
df_train = df.select(*[
    'features_scaled',
    'binary_label',
    'unified_label',
    'sample_weight',
    'multiclass_weight',
])

print("‚úÖ Selected {} columns for training".format(len(df_train.columns)))
df_train.printSchema()

‚úÖ Selected 5 columns for training
root
 |-- features_scaled: vector (nullable = true)
 |-- binary_label: integer (nullable = true)
 |-- unified_label: integer (nullable = true)
 |-- sample_weight: double (nullable = true)
 |-- multiclass_weight: double (nullable = true)



In [8]:
# 80/20 train/test split, seeded for repeatability.
print("Splitting data into train/test sets...")

# Plain random split (not stratified). Nothing is cached on purpose: Colab has
# roughly 12GB of RAM, so Spark recomputes the split whenever it is needed
# (slower, but avoids running out of memory).
train_df, test_df = df_train.randomSplit(weights=[0.8, 0.2], seed=42)

print("\n".join((
    "‚úÖ Train/Test split complete (80/20)",
    "   Estimated: ~14.6M train, ~3.6M test records",
    "   (No caching to conserve RAM)",
)))

Splitting data into train/test sets...
‚úÖ Train/Test split complete (80/20)
   Estimated: ~14.6M train, ~3.6M test records
   (No caching to conserve RAM)


In [9]:
# Sanity check: display a handful of label pairs only, rather than running an
# expensive groupBy over the full training set.
print("Sample from train set:")
train_df \
    .select('binary_label', 'unified_label') \
    .show(5)

print("‚úÖ Data ready for training")

Sample from train set:


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

## Step 6: Train Binary Classification Models

### 6.1 Random Forest - Binary Classification

In [None]:
# Random Forest, binary task (label column: binary_label), with reduced
# hyper-parameters to shorten training time.
print("="*60, "Training Random Forest - Binary Classification", "="*60, sep="\n")

start_time = time.time()

rf_binary = RandomForestClassifier(
    labelCol='binary_label',
    featuresCol='features_scaled',
    weightCol='sample_weight',   # per-row weights taken from the dataset
    numTrees=50,                 # reduced from 100
    maxDepth=8,                  # reduced from 10
    maxBins=32,
    seed=42,
)

print("Training model (this may take 10-20 minutes)...")
rf_binary_model = rf_binary.fit(train_df)

elapsed = time.time() - start_time
print("‚úÖ Training completed in {:.2f} minutes".format(elapsed/60))

In [None]:
# Score the binary Random Forest on the held-out test split.
print("Evaluating Random Forest - Binary Classification...")

# Predictions on the test set
rf_binary_preds = rf_binary_model.transform(test_df)

# Ranking metrics computed from the raw prediction column.
# These evaluator objects are reused by the GBT evaluation cell.
binary_evaluator_auc = BinaryClassificationEvaluator(
    metricName='areaUnderROC', labelCol='binary_label', rawPredictionCol='rawPrediction'
)
binary_evaluator_pr = BinaryClassificationEvaluator(
    metricName='areaUnderPR', labelCol='binary_label', rawPredictionCol='rawPrediction'
)

# Label-based metrics; the metric name is overridden per evaluate() call below.
multi_evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction', labelCol='binary_label'
)

auc_roc = binary_evaluator_auc.evaluate(rf_binary_preds)
auc_pr = binary_evaluator_pr.evaluate(rf_binary_preds)
# Note: "precision" and "recall" here are the class-weighted averages.
accuracy, f1, precision, recall = (
    multi_evaluator.evaluate(rf_binary_preds, {multi_evaluator.metricName: metric})
    for metric in ('accuracy', 'f1', 'weightedPrecision', 'weightedRecall')
)

print("\n" + "="*50, "Random Forest - Binary Classification Results", "="*50, sep="\n")
print("\n".join(
    f"{caption:<11}{score:.4f}"
    for caption, score in (
        ("AUC-ROC:", auc_roc),
        ("AUC-PR:", auc_pr),
        ("Accuracy:", accuracy),
        ("F1 Score:", f1),
        ("Precision:", precision),
        ("Recall:", recall),
    )
))

# Keep the numbers for the saved summary and the final comparison table
rf_binary_results = dict(
    model='Random Forest',
    task='Binary Classification',
    auc_roc=auc_roc,
    auc_pr=auc_pr,
    accuracy=accuracy,
    f1=f1,
    precision=precision,
    recall=recall,
)

### 6.2 Gradient Boosted Trees - Binary Classification

In [None]:
# Gradient Boosted Trees, binary task (label column: binary_label), with
# reduced hyper-parameters to shorten training time.
print("="*60, "Training Gradient Boosted Trees - Binary Classification", "="*60, sep="\n")

start_time = time.time()

gbt_binary = GBTClassifier(
    labelCol='binary_label',
    featuresCol='features_scaled',
    weightCol='sample_weight',   # per-row weights taken from the dataset
    maxIter=30,                  # reduced from 50
    maxDepth=6,                  # reduced from 8
    seed=42,
)

print("Training model (this may take 15-30 minutes)...")
gbt_binary_model = gbt_binary.fit(train_df)

elapsed = time.time() - start_time
print("‚úÖ Training completed in {:.2f} minutes".format(elapsed/60))

In [None]:
# Score the binary GBT model on the held-out test split.
# Relies on the evaluator objects created in the Random Forest evaluation cell.
print("Evaluating GBT - Binary Classification...")

gbt_binary_preds = gbt_binary_model.transform(test_df)

auc_roc = binary_evaluator_auc.evaluate(gbt_binary_preds)
auc_pr = binary_evaluator_pr.evaluate(gbt_binary_preds)
# Note: "precision" and "recall" here are the class-weighted averages.
accuracy, f1, precision, recall = (
    multi_evaluator.evaluate(gbt_binary_preds, {multi_evaluator.metricName: metric})
    for metric in ('accuracy', 'f1', 'weightedPrecision', 'weightedRecall')
)

print("\n" + "="*50, "GBT - Binary Classification Results", "="*50, sep="\n")
print("\n".join(
    f"{caption:<11}{score:.4f}"
    for caption, score in (
        ("AUC-ROC:", auc_roc),
        ("AUC-PR:", auc_pr),
        ("Accuracy:", accuracy),
        ("F1 Score:", f1),
        ("Precision:", precision),
        ("Recall:", recall),
    )
))

# Keep the numbers for the saved summary and the final comparison table
gbt_binary_results = dict(
    model='Gradient Boosted Trees',
    task='Binary Classification',
    auc_roc=auc_roc,
    auc_pr=auc_pr,
    accuracy=accuracy,
    f1=f1,
    precision=precision,
    recall=recall,
)

## Step 7: Train Multi-class Classification Models

### 7.1 Random Forest - Multi-class (9 classes: benign + 8 attack types)

In [None]:
# Random Forest, multi-class task (label column: unified_label), with reduced
# hyper-parameters to shorten training time.
print("="*60, "Training Random Forest - Multi-class Classification (9 classes)", "="*60, sep="\n")

start_time = time.time()

rf_multi = RandomForestClassifier(
    labelCol='unified_label',
    featuresCol='features_scaled',
    weightCol='multiclass_weight',   # per-row weights taken from the dataset
    numTrees=50,                     # reduced from 100
    maxDepth=10,                     # reduced from 12
    maxBins=32,
    seed=42,
)

print("Training model (this may take 15-25 minutes)...")
rf_multi_model = rf_multi.fit(train_df)

elapsed = time.time() - start_time
print("‚úÖ Training completed in {:.2f} minutes".format(elapsed/60))

In [None]:
# Score the multi-class Random Forest on the held-out test split.
print("Evaluating Random Forest - Multi-class Classification...")

rf_multi_preds = rf_multi_model.transform(test_df)

# One evaluator object; the metric name is overridden per evaluate() call below.
mc_evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction', labelCol='unified_label'
)

# Note: "precision" and "recall" here are the class-weighted averages.
accuracy, f1, precision, recall = (
    mc_evaluator.evaluate(rf_multi_preds, {mc_evaluator.metricName: metric})
    for metric in ('accuracy', 'f1', 'weightedPrecision', 'weightedRecall')
)

print("\n" + "="*50, "Random Forest - Multi-class Classification Results", "="*50, sep="\n")
print("\n".join(
    f"{caption:<11}{score:.4f}"
    for caption, score in (
        ("Accuracy:", accuracy),
        ("F1 Score:", f1),
        ("Precision:", precision),
        ("Recall:", recall),
    )
))

# Keep the numbers for the saved summary and the final comparison table
rf_multi_results = dict(
    model='Random Forest',
    task='Multi-class Classification (9 classes)',
    accuracy=accuracy,
    f1=f1,
    precision=precision,
    recall=recall,
)

In [None]:
# Confusion matrix in long form: one row per (actual label, predicted label) pair.
print("\nConfusion Matrix (Predicted vs Actual):")
(
    rf_multi_preds
    .groupBy('unified_label', 'prediction')
    .count()
    .orderBy('unified_label', 'prediction')
    .show(50)
)

# For each actual label: total rows, rows predicted correctly, and their ratio.
# Because the grouping is by the true label, the "accuracy" column is the
# per-class recall.
print("\nPer-class Prediction Counts:")
is_correct = F.col('prediction') == F.col('unified_label')
(
    rf_multi_preds
    .groupBy('unified_label')
    .agg(
        F.count('*').alias('total'),
        F.sum(F.when(is_correct, 1).otherwise(0)).alias('correct'),
    )
    .withColumn('accuracy', F.round(F.col('correct') / F.col('total'), 4))
    .orderBy('unified_label')
    .show()
)

## Step 8: Save Trained Models

In [None]:
# Re-verify the Google Drive connection before saving (Colab only).
# Long training runs can outlive the Drive mount, so probe the base directory
# and remount if it can no longer be listed.
if IS_COLAB:
    try:
        os.listdir(BASE_DIR)
    except OSError:
        # Only filesystem errors (missing path, broken mount, I/O error) trigger
        # a remount; anything else, including KeyboardInterrupt, propagates.
        print("‚ö†Ô∏è Drive disconnected! Remounting...")
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        print("‚úÖ Drive remounted successfully")
    else:
        print("‚úÖ Google Drive connection verified")

In [None]:
# Persist the three fitted models under MODEL_DIR, replacing any earlier copy.
print("Saving trained models...")

# Random Forest - Binary
rf_binary_path = "/".join((MODEL_DIR, "rf_binary_classifier"))
rf_binary_model.write().overwrite().save(rf_binary_path)
print("‚úÖ Saved: " + rf_binary_path)

# GBT - Binary
gbt_binary_path = "/".join((MODEL_DIR, "gbt_binary_classifier"))
gbt_binary_model.write().overwrite().save(gbt_binary_path)
print("‚úÖ Saved: " + gbt_binary_path)

# Random Forest - Multi-class
rf_multi_path = "/".join((MODEL_DIR, "rf_multiclass_classifier"))
rf_multi_model.write().overwrite().save(rf_multi_path)
print("‚úÖ Saved: " + rf_multi_path)

In [None]:
# Save training results summary (metrics of all three models plus split sizes)
# as JSON next to the saved models.

# The split sizes are computed here because no earlier cell defines them;
# without this the cell fails with NameError on a fresh Restart & Run All.
# The splits are not cached, so each count() is a full pass over the data.
train_count = train_df.count()
test_count = test_df.count()

all_results = {
    'rf_binary': rf_binary_results,
    'gbt_binary': gbt_binary_results,
    'rf_multiclass': rf_multi_results,
    'train_size': train_count,
    'test_size': test_count,
    'total_records': train_count + test_count
}

results_path = f"{MODEL_DIR}/training_results.json"
with open(results_path, 'w') as f:
    json.dump(all_results, f, indent=2)

print(f"\n‚úÖ Results saved to: {results_path}")

## Step 9: Model Comparison Summary

In [None]:
# Final side-by-side comparison of the stored metrics.
# Row layout: a 25-wide model name followed by 10-wide, left-aligned cells.
print("\n" + "="*70, "MODEL TRAINING SUMMARY", "="*70, sep="\n")

print("\nüìä BINARY CLASSIFICATION (Attack vs Benign)")
print("-"*70)
print("{:<25} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
    'Model', 'AUC-ROC', 'Accuracy', 'F1', 'Precision', 'Recall'))
print("-"*70)
print("{:<25} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
    'Random Forest',
    *(rf_binary_results[key] for key in ('auc_roc', 'accuracy', 'f1', 'precision', 'recall'))))
print("{:<25} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
    'Gradient Boosted Trees',
    *(gbt_binary_results[key] for key in ('auc_roc', 'accuracy', 'f1', 'precision', 'recall'))))

print("\nüìä MULTI-CLASS CLASSIFICATION (9 Attack Types)")
print("-"*70)
print("{:<25} {:<10} {:<10} {:<10} {:<10}".format(
    'Model', 'Accuracy', 'F1', 'Precision', 'Recall'))
print("-"*70)
print("{:<25} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
    'Random Forest',
    *(rf_multi_results[key] for key in ('accuracy', 'f1', 'precision', 'recall'))))

print("\n" + "="*70)
print("‚úÖ All models trained and saved successfully!")
print("üìÅ Models location: {}".format(MODEL_DIR))
print("="*70)

## Step 10: Feature Importance (Optional)

In [None]:
# Rank the binary Random Forest's feature importances and list the top 20.
# Features are identified by their index in the feature vector, not by name.
print("Top 20 Most Important Features (Random Forest - Binary):")
print("="*50)

importances = rf_binary_model.featureImportances.toArray()

# (index, importance) pairs, highest importance first
feature_importance = sorted(
    enumerate(importances),
    key=lambda pair: pair[1],
    reverse=True,
)

print("{:<6} {:<15} {:<12}".format('Rank', 'Feature Index', 'Importance'))
print("-"*35)
for rank, (idx, imp) in enumerate(feature_importance[:20], start=1):
    print("{:<6} {:<15} {:.6f}".format(rank, idx, imp))

## Summary

### Models Trained:
1. **Random Forest - Binary** (`rf_binary_classifier`)
   - Task: Attack vs Benign
   - Use case: Quick attack detection

2. **Gradient Boosted Trees - Binary** (`gbt_binary_classifier`)
   - Task: Attack vs Benign
   - Use case: Attack detection with a boosted ensemble (expected to be more accurate; confirm against the Random Forest metrics in Step 9)

3. **Random Forest - Multi-class** (`rf_multiclass_classifier`)
   - Task: Identify specific attack type (9 classes)
   - Use case: Detailed threat classification

### Saved Artifacts:
- Models: `/content/drive/MyDrive/NetworkIDS/output/models/`
- Results: `training_results.json`

### Next Steps:
1. Deploy models for real-time inference
2. Integrate with Kafka streaming pipeline
3. Build alerting/monitoring dashboard

In [None]:
# Cleanup: release both splits, then shut the Spark session down.
# NOTE(review): no visible cell caches either split, so unpersist() is expected
# to be a harmless no-op here - confirm before removing.
for split_df in (train_df, test_df):
    split_df.unpersist()
spark.stop()
print("‚úÖ Spark session stopped")
print("\nüéâ Model training complete! Ready for deployment.")