In [None]:
# Install dependencies
!pip install pyspark pandas matplotlib -q

import time
import json
import random
import os
from datetime import datetime
from collections import deque
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output, display, HTML

print("‚úÖ Libraries imported")

In [None]:
# Mount Google Drive (Colab only) so the project folder is reachable
from google.colab import drive
drive.mount('/content/drive')

# Project root on the mounted drive; input data and saved models live under it
BASE_DIR = "/content/drive/MyDrive/NetworkIDS"
DATA_PATH = "/".join([BASE_DIR, "output", "parquet", "cicids_merged_harmonized"])
MODEL_DIR = "/".join([BASE_DIR, "output", "models"])

# Echo the resolved locations so a wrong path is visible before anything loads
print(f"üìÇ Data: {DATA_PATH}")
print(f"üìÇ Models: {MODEL_DIR}")

In [None]:
# Create Spark session
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.classification import RandomForestClassificationModel, GBTClassificationModel

# Run a garbage-collection pass before the session is started
gc.collect()

# Local two-thread session; getOrCreate() reuses a session if one already exists
spark = (
    SparkSession.builder
    .appName("NIDS-Streaming-Simulation")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .master("local[2]")
    .getOrCreate()
)

# Only show Spark errors so the notebook output stays readable
spark.sparkContext.setLogLevel("ERROR")
print("‚úÖ Spark session created")

In [None]:
# Load the two saved Random Forest classifiers from MODEL_DIR
print("Loading models...")

# Binary classifier (its prediction is compared against 1.0 downstream)
binary_model_path = f"{MODEL_DIR}/rf_binary_classifier"
rf_binary_model = RandomForestClassificationModel.load(binary_model_path)
print("‚úÖ Binary classifier loaded")

# Multi-class classifier (its prediction is looked up in ATTACK_TYPES downstream)
multiclass_model_path = f"{MODEL_DIR}/rf_multiclass_classifier"
rf_multi_model = RandomForestClassificationModel.load(multiclass_model_path)
print("‚úÖ Multi-class classifier loaded")

# Attack type mapping: integer class id -> attack name.
# The position in the list is the class id (0 = Benign ... 8 = Heartbleed).
ATTACK_TYPES = dict(enumerate([
    "Benign",
    "DoS",
    "DDoS",
    "PortScan",
    "BruteForce",
    "WebAttack",
    "Infiltration",
    "Botnet",
    "Heartbleed",
]))

# Attack name -> severity label shown in the dashboard, grouped by level.
SEVERITY_MAP = {
    **dict.fromkeys(("DDoS", "DoS", "Infiltration", "Heartbleed"), "üî¥ HIGH"),
    **dict.fromkeys(("BruteForce", "WebAttack", "Botnet"), "üü° MEDIUM"),
    "PortScan": "üü¢ LOW",
    "Benign": "‚ö™ NONE",
}

In [None]:
# Load test data for simulation
print("Loading test data for simulation...")

# Only the feature vector and the two label columns are needed downstream
selected_columns = ['features_scaled', 'binary_label', 'unified_label']
df = spark.read.parquet(DATA_PATH)
df_test = df.select(*selected_columns)

# Fixed-seed 5% sample, cached because it is counted and then collected
simulation_data = df_test.sample(fraction=0.05, seed=42)
simulation_data.cache()
total_records = simulation_data.count()

print(f"‚úÖ Loaded {total_records:,} records for simulation")

# Collect the sample to the driver as pandas so batches can be sliced by position
sim_pdf = simulation_data.toPandas()
print(f"‚úÖ Converted to pandas")

## 🎮 Real-Time Streaming Simulation

The next cell defines a `RealTimeSimulator` class, and the cell after it runs that class to simulate a real-time intrusion detection system processing network traffic in small batches.

In [None]:
class RealTimeSimulator:
    """Simulates real-time intrusion detection over small batches of records.

    Every batch is scored by a binary model and a multi-class model. Running
    counters, the most recent alerts and a per-batch timeline are kept on the
    instance and can be rendered as an HTML dashboard in the notebook.
    """

    def __init__(self, binary_model, multi_model, spark_session):
        """Store the models and session and reset all statistics.

        Args:
            binary_model: fitted model whose ``transform`` adds a ``prediction``
                column that is compared against 1.0 to flag an attack.
            multi_model: fitted model whose ``transform`` adds a ``prediction``
                column that is looked up in ``ATTACK_TYPES``.
            spark_session: session used to turn pandas batches into Spark
                DataFrames.
        """
        self.binary_model = binary_model
        self.multi_model = multi_model
        self.spark = spark_session

        # Statistics
        self.total_processed = 0
        self.attacks_detected = 0
        self.alerts_by_type = {}
        self.alerts_by_severity = {"HIGH": 0, "MEDIUM": 0, "LOW": 0}
        # Bounded, newest-first buffer of alerts (the dashboard shows the first 8)
        self.recent_alerts = deque(maxlen=10)
        # One entry per processed batch: {'time', 'attacks', 'total'}
        self.timeline = []

    def predict_batch(self, batch_df):
        """Make predictions on a batch of records with both models.

        Args:
            batch_df: pandas DataFrame holding the batch; it is handed to
                ``createDataFrame`` unchanged.

        Returns:
            pandas DataFrame with the input columns, ``binary_pred``,
            ``multi_pred`` and the multi-class model's other output columns.
        """
        # Convert to Spark DataFrame
        spark_df = self.spark.createDataFrame(batch_df)

        # Binary prediction. Spark ML refuses to transform a DataFrame that
        # already contains one of the model's output columns, and classifiers
        # write to 'rawPrediction' / 'probability' unless configured otherwise,
        # so the binary model's auxiliary outputs are dropped before the second
        # model runs. Dropping a column that is absent is a no-op.
        binary_preds = (
            self.binary_model.transform(spark_df)
            .withColumnRenamed('prediction', 'binary_pred')
            .drop('rawPrediction', 'probability')
        )

        # Multi-class prediction
        multi_preds = (
            self.multi_model.transform(binary_preds)
            .withColumnRenamed('prediction', 'multi_pred')
        )

        return multi_preds.toPandas()

    def process_predictions(self, results_df):
        """Process predictions and update statistics.

        Args:
            results_df: pandas DataFrame with ``binary_pred``, ``multi_pred``
                and ``unified_label`` columns (as returned by ``predict_batch``).

        Returns:
            List of alert dicts (``timestamp``, ``attack_type``, ``severity``,
            ``true_label``), one per row the binary model flagged as an attack.
        """
        alerts = []

        for _, row in results_df.iterrows():
            self.total_processed += 1

            # Only rows flagged by the binary model produce an alert
            if row['binary_pred'] != 1.0:
                continue

            self.attacks_detected += 1

            # The two models are independent, so a flagged row can still be
            # named "Benign" by the multi-class model; it is counted as an
            # attack but falls into no severity bucket.
            attack_type = ATTACK_TYPES.get(int(row['multi_pred']), 'Unknown')

            # Update attack type counts
            self.alerts_by_type[attack_type] = self.alerts_by_type.get(attack_type, 0) + 1

            # Update severity counts (the label embeds the level name)
            severity_str = SEVERITY_MAP.get(attack_type, "NONE")
            for level in ("HIGH", "MEDIUM", "LOW"):
                if level in severity_str:
                    self.alerts_by_severity[level] += 1
                    break

            alert = {
                'timestamp': datetime.now().strftime('%H:%M:%S'),
                'attack_type': attack_type,
                'severity': severity_str,
                # assumes unified_label holds numeric class ids -- TODO confirm
                'true_label': ATTACK_TYPES.get(int(row['unified_label']), 'Unknown')
            }
            self.recent_alerts.appendleft(alert)
            alerts.append(alert)

        # Update timeline
        self.timeline.append({
            'time': datetime.now(),
            'attacks': len(alerts),
            'total': len(results_df)
        })

        return alerts

    def display_dashboard(self):
        """Display real-time dashboard.

        Clears the current cell output and renders the counters, the eight
        most recent alerts and the per-type breakdown as a single HTML block.
        """
        clear_output(wait=True)

        # Share of processed records that were flagged as attacks, in percent
        detection_rate = (self.attacks_detected / self.total_processed * 100) if self.total_processed > 0 else 0

        html = f"""
        <div style="font-family: 'Courier New', monospace; background: #1a1a2e; color: white; padding: 20px; border-radius: 10px;">
            <h1 style="color: #00d9ff; text-align: center;">üõ°Ô∏è Network Intrusion Detection System</h1>
            <h3 style="color: #aaa; text-align: center;">Real-Time Streaming Simulation</h3>
            
            <div style="display: flex; justify-content: space-around; margin: 20px 0;">
                <div style="text-align: center; background: #2a2a4e; padding: 15px; border-radius: 10px; min-width: 120px;">
                    <div style="font-size: 2em; color: #00ff88;">{self.total_processed:,}</div>
                    <div style="color: #aaa;">Processed</div>
                </div>
                <div style="text-align: center; background: #2a2a4e; padding: 15px; border-radius: 10px; min-width: 120px;">
                    <div style="font-size: 2em; color: #ff4444;">{self.attacks_detected:,}</div>
                    <div style="color: #aaa;">Attacks</div>
                </div>
                <div style="text-align: center; background: #2a2a4e; padding: 15px; border-radius: 10px; min-width: 120px;">
                    <div style="font-size: 2em; color: #ff4444;">{self.alerts_by_severity['HIGH']}</div>
                    <div style="color: #aaa;">üî¥ High</div>
                </div>
                <div style="text-align: center; background: #2a2a4e; padding: 15px; border-radius: 10px; min-width: 120px;">
                    <div style="font-size: 2em; color: #ffaa00;">{self.alerts_by_severity['MEDIUM']}</div>
                    <div style="color: #aaa;">üü° Medium</div>
                </div>
                <div style="text-align: center; background: #2a2a4e; padding: 15px; border-radius: 10px; min-width: 120px;">
                    <div style="font-size: 2em; color: #00d9ff;">{self.alerts_by_severity['LOW']}</div>
                    <div style="color: #aaa;">üü¢ Low</div>
                </div>
            </div>
            
            <h3 style="color: #00d9ff;">üìã Recent Alerts</h3>
            <table style="width: 100%; border-collapse: collapse; margin: 10px 0;">
                <tr style="background: #2a2a4e;">
                    <th style="padding: 10px; text-align: left; color: #00d9ff;">Time</th>
                    <th style="padding: 10px; text-align: left; color: #00d9ff;">Attack Type</th>
                    <th style="padding: 10px; text-align: left; color: #00d9ff;">Severity</th>
                    <th style="padding: 10px; text-align: left; color: #00d9ff;">True Label</th>
                </tr>
        """

        # Newest alerts first; the table is capped at eight rows
        for alert in list(self.recent_alerts)[:8]:
            html += f"""
                <tr style="border-bottom: 1px solid #333;">
                    <td style="padding: 8px;">{alert['timestamp']}</td>
                    <td style="padding: 8px; color: #ff88aa;">{alert['attack_type']}</td>
                    <td style="padding: 8px;">{alert['severity']}</td>
                    <td style="padding: 8px; color: #888;">{alert['true_label']}</td>
                </tr>
            """

        if not self.recent_alerts:
            html += '<tr><td colspan="4" style="padding: 20px; text-align: center; color: #666;">No alerts yet...</td></tr>'

        # Attack type breakdown
        html += """
            </table>
            <h3 style="color: #00d9ff; margin-top: 20px;">üìä Attack Types Detected</h3>
            <div style="display: flex; flex-wrap: wrap; gap: 10px;">
        """

        # Most frequent attack type first
        for attack_type, count in sorted(self.alerts_by_type.items(), key=lambda x: -x[1]):
            html += f"""
                <div style="background: #2a2a4e; padding: 10px 15px; border-radius: 5px;">
                    <span style="color: #ff88aa;">{attack_type}</span>: 
                    <span style="color: #00ff88;">{count}</span>
                </div>
            """

        # Plain template + str.format: the only placeholder is the rate
        html += """
            </div>
            <div style="margin-top: 20px; padding: 10px; background: #2a2a4e; border-radius: 5px; text-align: center;">
                <span style="color: #aaa;">Detection Rate: </span>
                <span style="color: #00ff88; font-size: 1.2em;">{:.1f}%</span>
            </div>
        </div>
        """.format(detection_rate)

        display(HTML(html))

print("‚úÖ Simulator class defined")

In [None]:
# Run the real-time simulation
print("üöÄ Starting Real-Time Simulation...")
print("Press the STOP button to end the simulation")
time.sleep(2)

simulator = RealTimeSimulator(rf_binary_model, rf_multi_model, spark)

BATCH_SIZE = 50  # Records per batch
DELAY = 1.5      # Seconds between batches
MAX_BATCHES = 30 # Maximum batches to process
RANDOM_SEED = 42 # Fixes which batches are drawn so a re-run is reproducible

if len(sim_pdf) == 0:
    raise ValueError("sim_pdf is empty - there are no records to simulate")

random.seed(RANDOM_SEED)

# Highest start index that still yields a full batch. randint is inclusive on
# both ends, so this lets the last row be drawn; clamping at 0 gives one short
# batch (instead of a ValueError) when the sample is smaller than BATCH_SIZE.
max_start_idx = max(len(sim_pdf) - BATCH_SIZE, 0)

try:
    for batch_num in range(MAX_BATCHES):
        # Get random batch (a contiguous slice starting at a random position)
        start_idx = random.randint(0, max_start_idx)
        batch_df = sim_pdf.iloc[start_idx:start_idx + BATCH_SIZE].copy()

        # Make predictions
        results = simulator.predict_batch(batch_df)

        # Process and update stats
        simulator.process_predictions(results)

        # Update dashboard
        simulator.display_dashboard()

        time.sleep(DELAY)

except KeyboardInterrupt:
    print("\n‚èπÔ∏è Simulation stopped by user")

# An interrupt before the first batch finishes leaves total_processed at 0,
# so the rate is guarded against division by zero.
if simulator.total_processed > 0:
    final_rate_pct = simulator.attacks_detected / simulator.total_processed * 100
else:
    final_rate_pct = 0.0

print("\n" + "="*60)
print("SIMULATION COMPLETE")
print("="*60)
print(f"Total processed: {simulator.total_processed:,}")
print(f"Attacks detected: {simulator.attacks_detected:,}")
print(f"Detection rate: {final_rate_pct:.1f}%")

## 📈 Simulation Results Visualization

In [None]:
# Visualize simulation results: attack-type shares, severity counts, per-batch timeline
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# 1. Attack types pie chart
if simulator.alerts_by_type:
    labels = list(simulator.alerts_by_type.keys())
    sizes = list(simulator.alerts_by_type.values())
    axes[0].pie(sizes, labels=labels, autopct='%1.1f%%', colors=plt.cm.Set3(np.linspace(0, 1, len(labels))))
    axes[0].set_title('Detected Attack Types', fontweight='bold')
else:
    axes[0].text(0.5, 0.5, 'No attacks detected', ha='center', va='center')
    axes[0].set_title('Detected Attack Types')

# 2. Severity distribution
severities = ['HIGH', 'MEDIUM', 'LOW']
severity_counts = [simulator.alerts_by_severity[s] for s in severities]
colors = ['#ff4444', '#ffaa00', '#00d9ff']
bars = axes[1].bar(severities, severity_counts, color=colors)
axes[1].set_title('Alerts by Severity', fontweight='bold')
axes[1].set_ylabel('Count')
# Print each count just above its bar
for bar, count in zip(bars, severity_counts):
    axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, str(count), ha='center')

# 3. Timeline (x axis is the batch index, not wall-clock time)
if simulator.timeline:
    attacks = [t['attacks'] for t in simulator.timeline]
    batch_indices = range(len(attacks))
    axes[2].plot(batch_indices, attacks, 'r-', linewidth=2, marker='o')
    axes[2].fill_between(batch_indices, attacks, alpha=0.3, color='red')
    axes[2].set_title('Attacks Over Time', fontweight='bold')
    axes[2].set_xlabel('Batch Number')
    axes[2].set_ylabel('Attacks Detected')
else:
    # Keep the panel labelled instead of leaving an empty, untitled axes
    axes[2].text(0.5, 0.5, 'No batches processed', ha='center', va='center')
    axes[2].set_title('Attacks Over Time')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
viz_dir = f"{BASE_DIR}/output/visualizations"
os.makedirs(viz_dir, exist_ok=True)
results_path = f"{viz_dir}/simulation_results.png"
plt.savefig(results_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"\n‚úÖ Results saved to: {results_path}")

In [None]:
# Export simulation summary
processed_count = simulator.total_processed
detected_count = simulator.attacks_detected

# Fraction (not percent) of processed records flagged as attacks; 0 if nothing ran
if processed_count > 0:
    detection_rate = detected_count / processed_count
else:
    detection_rate = 0

summary = dict(
    total_processed=processed_count,
    attacks_detected=detected_count,
    detection_rate=detection_rate,
    alerts_by_type=simulator.alerts_by_type,
    alerts_by_severity=simulator.alerts_by_severity,
    simulation_time=datetime.now().isoformat(),
)

# Serialise once, then write the same text to disk and to the cell output
summary_json = json.dumps(summary, indent=2)

with open(f"{BASE_DIR}/output/simulation_summary.json", 'w') as f:
    f.write(summary_json)

print("üìä Simulation Summary:")
print(summary_json)

In [None]:
# Cleanup: release the cached sample, stop Spark, then run a garbage collection
simulation_data.unpersist()
spark.stop()
gc.collect()

# Closing banner and pointer to the production scripts
banner = "=" * 60
closing_lines = (
    "\n" + banner,
    "‚úÖ Simulation Complete!",
    banner,
    "\nThis demonstrates how the real-time system would work.",
    "For production deployment with Kafka, use the streaming/ folder scripts.",
)
for line in closing_lines:
    print(line)