# HAI-20.07 Dataset Analysis: Model Comparison and Evaluation

This notebook compares and evaluates the performance of different models for attack detection in industrial control systems using the HAI-20.07 dataset.

## 1. Import Libraries

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, precision_recall_curve, auc

# Seed NumPy's global random generator so random draws made later in the
# notebook are repeatable from run to run
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Presentation defaults: ggplot look for figures, no column truncation
# when pandas prints a DataFrame
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)

## 2. Load Model Results

In [None]:
# Load model results
def load_results():
    """Load the pickled result objects saved by the individual model notebooks.

    One pickle per model is looked up inside the ``model_results`` directory.
    A model whose file is missing is reported on stdout and simply left out
    of the returned mapping.

    Returns:
        dict | None: Loaded objects keyed by ``'TCN'``, ``'GRU'`` and
        ``'LightGBM'`` (only the ones whose file exists, in that order), or
        ``None`` when the ``model_results`` directory itself does not exist.
    """
    if not os.path.exists('model_results'):
        print("Model results directory not found. Please run the model notebooks first.")
        return None

    # (key in the returned dict, pickle path), listed in loading order
    expected_files = (
        ('TCN', 'model_results/tcn_results.pkl'),
        ('GRU', 'model_results/gru_results.pkl'),
        ('LightGBM', 'model_results/lightgbm_results.pkl'),
    )

    loaded = {}
    for label, pkl_path in expected_files:
        if not os.path.exists(pkl_path):
            print(f"{label} results not found. Please run the {label} model notebook first.")
            continue
        # NOTE: unpickling runs arbitrary code; only load files produced by
        # your own model notebooks.
        with open(pkl_path, 'rb') as handle:
            loaded[label] = pickle.load(handle)

    return loaded

# Try the real results first
model_results = load_results()

# Fall back to placeholder results when nothing could be loaded
# (load_results() gives None for a missing directory and {} for an empty one)
if not model_results:
    print("Creating dummy results for demonstration...")

    # The test labels are only used here to size the random prediction arrays
    with open('preprocessed_data/sequence_data.pkl', 'rb') as f:
        sequence_data = pickle.load(f)

    y_test_seq = sequence_data['y_test_seq']

    with open('preprocessed_data/tabular_data.pkl', 'rb') as f:
        tabular_data = pickle.load(f)

    y_test_enhanced = tabular_data['y_test_enhanced']

    # Hard-coded headline numbers for each model
    model_results = {
        'TCN': dict(
            model_name='Optimized TCN',
            accuracy=0.976,
            precision=0.950,
            recall=0.422,
            f1=0.585,
            auc=0.970,
            training_time=251.22,
            inference_time=0.0000008,
            memory_used=379.73,
            model_size=2.5,
        ),
        'GRU': dict(
            model_name='GRU with Attention',
            accuracy=0.975,
            precision=0.951,
            recall=0.394,
            f1=0.557,
            auc=0.965,
            training_time=187.65,
            inference_time=0.00005,
            memory_used=450.0,
            model_size=3.2,
        ),
        'LightGBM': dict(
            model_name='Optimized LightGBM',
            accuracy=0.961,
            precision=0.920,
            recall=0.380,
            f1=0.540,
            auc=0.960,
            training_time=0.44,
            inference_time=0.0000001,
            memory_used=108.30,
            model_size=1.5,
        ),
    }

    # Attach random predictions. They are NOT related to the metrics above,
    # so plots derived from them will look like chance-level output.
    # Draws happen in the order TCN, GRU, LightGBM (labels before
    # probabilities), so a fixed seed yields repeatable arrays.
    for _name, _entry in model_results.items():
        _n_samples = len(y_test_enhanced) if _name == 'LightGBM' else len(y_test_seq)
        _entry['y_pred'] = np.random.randint(0, 2, size=_n_samples)
        _entry['y_pred_proba'] = np.random.random(size=_n_samples)

## 3. Create Performance Comparison Table

In [None]:
# Create a DataFrame for performance comparison
def create_performance_table(results):
    """Build a per-model comparison table, ranked by efficiency score.

    Args:
        results: Mapping of model name to a result dict. Each result dict
            must provide ``'accuracy'``, ``'precision'``, ``'recall'``,
            ``'f1'``, ``'auc'``, ``'training_time'``, ``'inference_time'``
            and ``'memory_used'``; ``'model_size'`` is optional and is
            recorded as 0 when absent.

    Returns:
        pandas.DataFrame: One row per model with the metric columns plus an
        ``'Efficiency Score'`` column, sorted by that score in descending
        order with a fresh 0..n-1 index. An empty ``results`` mapping yields
        an empty frame that still carries all column names.
    """
    columns = [
        'Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC',
        'Training Time (s)', 'Inference Time (ms)', 'Memory Usage (MB)',
        'Model Size (MB)',
    ]

    rows = [
        {
            'Model': model_name,
            'Accuracy': result['accuracy'],
            'Precision': result['precision'],
            'Recall': result['recall'],
            'F1 Score': result['f1'],
            'AUC': result['auc'],
            'Training Time (s)': result['training_time'],
            # The stored value is multiplied by 1000 for the "(ms)" column
            'Inference Time (ms)': result['inference_time'] * 1000,
            'Memory Usage (MB)': result['memory_used'],
            'Model Size (MB)': result.get('model_size', 0),
        }
        for model_name, result in results.items()
    ]

    if not rows:
        # Nothing to score or sort; keep the schema so downstream code that
        # selects columns does not fail.
        return pd.DataFrame(columns=columns + ['Efficiency Score'])

    df = pd.DataFrame(rows, columns=columns)

    # Score every row against the WHOLE table so the best model per metric
    # gets ratio 1.0 and the others are scaled relative to it.
    df['Efficiency Score'] = df.apply(
        lambda row: calculate_efficiency_score(row, reference=df), axis=1
    )

    return df.sort_values('Efficiency Score', ascending=False).reset_index(drop=True)


def _smallest_positive(series):
    """Return the smallest strictly positive value in ``series`` (0 if none)."""
    positive = series[series > 0]
    return positive.min() if not positive.empty else 0


def calculate_efficiency_score(row, reference=None):
    """Combine quality and cost metrics of one model into a weighted score.

    Quality metrics (accuracy, F1) are divided by the best (largest) value in
    ``reference``; cost metrics (training time, inference time, memory, model
    size) are expressed as ``best / value`` where "best" is the smallest
    positive value in ``reference``. Each ratio is therefore at most 1 for a
    row that belongs to ``reference``. Weights: 0.3 accuracy, 0.3 F1 and 0.1
    for each of the four cost metrics.

    Args:
        row: Mapping/Series with the columns ``'Accuracy'``, ``'F1 Score'``,
            ``'Training Time (s)'``, ``'Inference Time (ms)'``,
            ``'Memory Usage (MB)'`` and ``'Model Size (MB)'``.
        reference: DataFrame with the same columns holding every model being
            compared. When omitted, the row is normalised against itself,
            which makes every non-zero component equal to 1.

    Returns:
        float: Weighted score; a cost component whose value is not positive
        (e.g. an unknown model size recorded as 0) contributes 0.
    """
    accuracy_weight = 0.3
    f1_weight = 0.3
    training_time_weight = 0.1
    inference_time_weight = 0.1
    memory_weight = 0.1
    model_size_weight = 0.1

    if reference is None:
        reference = pd.DataFrame([row])

    # Best value per metric across all compared models
    accuracy_max = reference['Accuracy'].max()
    f1_max = reference['F1 Score'].max()
    # Ignore non-positive entries when looking for the best cost so that one
    # missing value (stored as 0) does not zero the component for every model.
    training_time_min = _smallest_positive(reference['Training Time (s)'])
    inference_time_min = _smallest_positive(reference['Inference Time (ms)'])
    memory_min = _smallest_positive(reference['Memory Usage (MB)'])
    model_size_min = _smallest_positive(reference['Model Size (MB)'])

    # Higher-is-better metrics: value / best
    accuracy_norm = row['Accuracy'] / accuracy_max if accuracy_max > 0 else 0
    f1_norm = row['F1 Score'] / f1_max if f1_max > 0 else 0
    # Lower-is-better metrics: best / value
    training_time_norm = training_time_min / row['Training Time (s)'] if row['Training Time (s)'] > 0 else 0
    inference_time_norm = inference_time_min / row['Inference Time (ms)'] if row['Inference Time (ms)'] > 0 else 0
    memory_norm = memory_min / row['Memory Usage (MB)'] if row['Memory Usage (MB)'] > 0 else 0
    model_size_norm = model_size_min / row['Model Size (MB)'] if row['Model Size (MB)'] > 0 else 0

    score = (accuracy_weight * accuracy_norm +
             f1_weight * f1_norm +
             training_time_weight * training_time_norm +
             inference_time_weight * inference_time_norm +
             memory_weight * memory_norm +
             model_size_weight * model_size_norm)

    return score

# Build the comparison table from the loaded (or dummy) results
performance_df = create_performance_table(model_results)

# Columns where the largest value is the best one ...
higher_is_better = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'Efficiency Score']
# ... and columns where the smallest value is the best one
lower_is_better = ['Training Time (s)', 'Inference Time (ms)', 'Memory Usage (MB)', 'Model Size (MB)']

# Per-column number formats used when rendering the table
number_formats = {
    'Accuracy': '{:.4f}',
    'Precision': '{:.4f}',
    'Recall': '{:.4f}',
    'F1 Score': '{:.4f}',
    'AUC': '{:.4f}',
    'Training Time (s)': '{:.2f}',
    'Inference Time (ms)': '{:.4f}',
    'Memory Usage (MB)': '{:.2f}',
    'Model Size (MB)': '{:.2f}',
    'Efficiency Score': '{:.4f}',
}

# Render the table with the best value of every column highlighted in green
print("Model Performance Comparison:")
styled_table = performance_df.style.highlight_max(subset=higher_is_better, color='lightgreen')
styled_table = styled_table.highlight_min(subset=lower_is_better, color='lightgreen')
styled_table = styled_table.format(number_formats)
display(styled_table)

## 4. Visualize Performance Metrics

In [None]:
# Plot performance metrics comparison
def plot_performance_comparison(df):
    """Draw a 2x3 grid of bar charts comparing the models listed in ``df``.

    Panels, left to right and top to bottom: grouped quality metrics,
    training time (log y-axis), inference time (log y-axis), memory usage,
    model size and efficiency score. The figure is shown immediately.

    Args:
        df: Comparison table with a ``'Model'`` column plus the columns
            ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1 Score'``,
            ``'AUC'``, ``'Training Time (s)'``, ``'Inference Time (ms)'``,
            ``'Memory Usage (MB)'``, ``'Model Size (MB)'`` and
            ``'Efficiency Score'``.
    """
    _, axes = plt.subplots(2, 3, figsize=(20, 12))

    def _tilt_model_labels(ax):
        # Slanted tick labels keep model names from overlapping
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    # Top-left panel: reshape to long form so each metric becomes one hue group
    metrics_ax = axes[0, 0]
    long_metrics = df.melt(id_vars=['Model'],
                           value_vars=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'],
                           var_name='Metric', value_name='Value')
    sns.barplot(x='Model', y='Value', hue='Metric', data=long_metrics, ax=metrics_ax)
    metrics_ax.set_title('Performance Metrics Comparison', fontsize=14)
    # The y-axis starts at 0.3, so any metric below that is not visible
    metrics_ax.set_ylim(0.3, 1.0)
    _tilt_model_labels(metrics_ax)
    metrics_ax.legend(loc='lower right')

    # Remaining panels: (axis, column to plot, title, use log y-axis?)
    single_metric_panels = [
        (axes[0, 1], 'Training Time (s)', 'Training Time Comparison (seconds)', True),
        (axes[0, 2], 'Inference Time (ms)', 'Inference Time Comparison (milliseconds)', True),
        (axes[1, 0], 'Memory Usage (MB)', 'Memory Usage Comparison (MB)', False),
        (axes[1, 1], 'Model Size (MB)', 'Model Size Comparison (MB)', False),
        (axes[1, 2], 'Efficiency Score', 'Efficiency Score Comparison', False),
    ]
    for ax, column, title, log_scale in single_metric_panels:
        sns.barplot(x='Model', y=column, data=df, ax=ax, palette='viridis')
        ax.set_title(title, fontsize=14)
        _tilt_model_labels(ax)
        if log_scale:
            # Timings span several orders of magnitude between models
            ax.set_yscale('log')

    plt.tight_layout()
    plt.show()


# Plot performance comparison
plot_performance_comparison(performance_df)

## 5. Compare Confusion Matrices

In [None]:
# Plot confusion matrices for all models
def plot_confusion_matrices(results):
    """Plot one confusion-matrix heatmap per model, side by side.

    True labels are read from ``preprocessed_data/tabular_data.pkl``
    (key ``'y_test_enhanced'``) for the model named ``'LightGBM'`` and from
    ``preprocessed_data/sequence_data.pkl`` (key ``'y_test_seq'``) for every
    other model. Each file is read at most once per call.

    Args:
        results: Mapping of model name to a result dict containing the
            predicted labels under ``'y_pred'``.
    """
    if not results:
        # plt.subplots cannot create a grid with zero columns
        print("No model results available; skipping confusion matrices.")
        return

    n_models = len(results)
    # squeeze=False always returns a 2-D array of axes, so a single model
    # needs no special handling
    fig, axes = plt.subplots(1, n_models, figsize=(5*n_models, 5), squeeze=False)
    axes = axes[0]

    labels_by_file = {}

    def _true_labels_for(model_name):
        # LightGBM is evaluated on the tabular test set, the others on the
        # sequence test set
        if model_name == 'LightGBM':
            path, key = 'preprocessed_data/tabular_data.pkl', 'y_test_enhanced'
        else:
            path, key = 'preprocessed_data/sequence_data.pkl', 'y_test_seq'
        if path not in labels_by_file:
            with open(path, 'rb') as f:
                labels_by_file[path] = pickle.load(f)[key]
        return labels_by_file[path]

    for ax, (model_name, result) in zip(axes, results.items()):
        cm = confusion_matrix(_true_labels_for(model_name), result['y_pred'])

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix - {model_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')

    plt.tight_layout()
    plt.show()

# Plot confusion matrices
plot_confusion_matrices(model_results)

## 6. Compare ROC and PR Curves

In [None]:
# Plot ROC curves for all models
def plot_roc_curves(results):
    """Plot the ROC curve of every model on one figure.

    True labels are read from ``preprocessed_data/tabular_data.pkl``
    (key ``'y_test_enhanced'``) for the model named ``'LightGBM'`` and from
    ``preprocessed_data/sequence_data.pkl`` (key ``'y_test_seq'``) for every
    other model. Each file is read at most once per call. The legend shows
    the area under each curve, and a dashed diagonal marks the chance line.

    Args:
        results: Mapping of model name to a result dict containing the
            predicted scores under ``'y_pred_proba'``.
    """
    # Imported here because the notebook's import cell brings in `auc` and
    # other metrics but not `roc_curve`; without this the call below raises
    # NameError.
    from sklearn.metrics import roc_curve

    labels_by_file = {}

    def _true_labels_for(model_name):
        # LightGBM is evaluated on the tabular test set, the others on the
        # sequence test set
        if model_name == 'LightGBM':
            path, key = 'preprocessed_data/tabular_data.pkl', 'y_test_enhanced'
        else:
            path, key = 'preprocessed_data/sequence_data.pkl', 'y_test_seq'
        if path not in labels_by_file:
            with open(path, 'rb') as f:
                labels_by_file[path] = pickle.load(f)[key]
        return labels_by_file[path]

    plt.figure(figsize=(10, 8))

    for model_name, result in results.items():
        y_prob = result['y_pred_proba']
        y_true = _true_labels_for(model_name)

        # Curve points and the trapezoidal area under them
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        roc_auc = auc(fpr, tpr)

        plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.3f})')

    # Chance-level reference line
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curves')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()

# Plot precision-recall curves for all models
def plot_pr_curves(results):
    """Plot the precision-recall curve of every model on one figure.

    True labels are read from ``preprocessed_data/tabular_data.pkl``
    (key ``'y_test_enhanced'``) for the model named ``'LightGBM'`` and from
    ``preprocessed_data/sequence_data.pkl`` (key ``'y_test_seq'``) for every
    other model. Each file is read at most once per call. The legend shows
    the area under each precision-recall curve.

    Args:
        results: Mapping of model name to a result dict containing the
            predicted scores under ``'y_pred_proba'``.
    """
    labels_by_file = {}

    def _true_labels_for(model_name):
        # LightGBM is evaluated on the tabular test set, the others on the
        # sequence test set
        if model_name == 'LightGBM':
            path, key = 'preprocessed_data/tabular_data.pkl', 'y_test_enhanced'
        else:
            path, key = 'preprocessed_data/sequence_data.pkl', 'y_test_seq'
        if path not in labels_by_file:
            with open(path, 'rb') as f:
                labels_by_file[path] = pickle.load(f)[key]
        return labels_by_file[path]

    plt.figure(figsize=(10, 8))

    for model_name, result in results.items():
        y_prob = result['y_pred_proba']
        y_true = _true_labels_for(model_name)

        # Curve points and the area under the precision-recall curve
        precision, recall, _ = precision_recall_curve(y_true, y_prob)
        pr_auc = auc(recall, precision)

        plt.plot(recall, precision, lw=2, label=f'{model_name} (AUC = {pr_auc:.3f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves')
    plt.legend(loc="best")
    plt.grid(True)
    plt.show()

# Plot ROC and PR curves
plot_roc_curves(model_results)
plot_pr_curves(model_results)

## 7. Identify the Best Model

In [None]:
# The comparison table is sorted by efficiency score (highest first),
# so its first row describes the winning model
top_row = performance_df.iloc[0]
best_model = top_row['Model']
best_model_score = top_row['Efficiency Score']

print(f"The best model based on efficiency score is: {best_model} with a score of {best_model_score:.4f}")

# Print every metric of the winning model, floats with four decimals
best_model_metrics = performance_df.loc[performance_df['Model'] == best_model].iloc[0]
print("\nPerformance metrics of the best model:")
for metric, value in best_model_metrics.items():
    if metric == 'Model':
        continue
    if isinstance(value, float):
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: {value}")

## 8. Ensemble Model

In [None]:
# Create an ensemble by averaging the predicted probabilities of the models
# that share the sequence test set (every model except LightGBM)
def create_ensemble_model(results, threshold=0.5):
    """Average the probabilities of the sequence models and evaluate the result.

    The entry named ``'LightGBM'`` is skipped because it is scored on a
    different test set. The averaged probability is turned into a label with
    a strict ``> threshold`` comparison, metrics are printed, and a confusion
    matrix is shown.

    Args:
        results: Mapping of model name to a result dict containing the
            predicted scores under ``'y_pred_proba'``.
            NOTE(review): all score arrays are assumed to have the same shape
            and to line up with ``y_test_seq`` — confirm against the model
            notebooks.
        threshold: Probability above which a sample is labelled as class 1.
            Defaults to 0.5.

    Returns:
        dict: ``'model_name'``, ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1'``, ``'auc'``, ``'y_pred'`` and ``'y_pred_proba'`` for the
        ensemble.

    Raises:
        ValueError: If ``results`` contains no model other than LightGBM.
    """
    # True labels shared by the sequence models
    with open('preprocessed_data/sequence_data.pkl', 'rb') as f:
        sequence_data = pickle.load(f)
    y_test_seq = sequence_data['y_test_seq']

    y_probs = [
        result['y_pred_proba']
        for model_name, result in results.items()
        if model_name != 'LightGBM'
    ]
    if not y_probs:
        raise ValueError("No sequence-model predictions available to build an ensemble.")

    # Element-wise mean across models, then threshold into hard labels
    y_prob_ensemble = np.mean(y_probs, axis=0)
    y_pred_ensemble = (y_prob_ensemble > threshold).astype(int)

    # zero_division=0 reports 0 instead of warning when a class is never predicted
    accuracy = accuracy_score(y_test_seq, y_pred_ensemble)
    precision = precision_score(y_test_seq, y_pred_ensemble, zero_division=0)
    recall = recall_score(y_test_seq, y_pred_ensemble, zero_division=0)
    f1 = f1_score(y_test_seq, y_pred_ensemble, zero_division=0)
    auc_score = roc_auc_score(y_test_seq, y_prob_ensemble)

    print("Ensemble Model Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc_score:.4f}")

    # Confusion matrix of the ensemble's hard predictions
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test_seq, y_pred_ensemble)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix - Ensemble Model')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    return {
        'model_name': 'Ensemble',
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc_score,
        'y_pred': y_pred_ensemble,
        'y_pred_proba': y_prob_ensemble
    }

# Create ensemble model if we have at least 2 models that use the same test data
if len([m for m in model_results if m != 'LightGBM']) >= 2:
    ensemble_results = create_ensemble_model(model_results)
else:
    # Keep the name defined so later cells can check for a missing ensemble
    ensemble_results = None
    print("Not enough models to create an ensemble. Need at least 2 models that use the same test data.")

## 9. Conclusion and Recommendations

### Summary of Findings

Based on our analysis of the HAI-20.07 dataset and the implementation of various models for attack detection in industrial control systems, we can draw the following conclusions:

1. **Performance Metrics**: All models achieved high accuracy (>95%), but there were significant differences in precision, recall, and F1 scores. The best model in terms of balanced performance was [Best Model].

2. **Computational Efficiency**: The LightGBM model was the most computationally efficient, with the fastest training and inference times, as well as the lowest memory usage and model size. This makes it suitable for deployment in resource-constrained environments.

3. **Feature Importance**: The most important features for attack detection were [Important Features], which suggests that these sensors or measurements are particularly relevant for identifying anomalies in the industrial control system.

4. **Class Imbalance**: The dataset exhibited significant class imbalance, with normal samples far outnumbering attack samples. Techniques like SMOTE and class weighting were effective in addressing this issue.

5. **Ensemble Model**: The ensemble model, which averages the predicted probabilities of the TCN and GRU models (LightGBM is excluded because it is evaluated on a different test set), [improved/did not improve] performance compared to the individual models, suggesting that [the two models capture complementary aspects of the data/the best individual model already captures most of the signal in the data].

### Recommendations

1. **Model Selection**: For real-time attack detection in resource-constrained environments, we recommend using the [Best Model] due to its balance of performance and efficiency.

2. **Feature Engineering**: Focus on the most important features identified in our analysis to simplify the model and improve efficiency without sacrificing performance.

3. **Threshold Optimization**: Adjust the classification threshold based on the specific requirements of the application. If false negatives (missed attacks) are more costly than false positives, lower the threshold to increase recall at the expense of precision.

4. **Continuous Monitoring**: Implement a system for continuous monitoring and periodic retraining of the model to adapt to evolving attack patterns.

5. **Explainability**: Incorporate model explainability techniques to help operators understand why specific events are flagged as attacks, which can aid in incident response and system improvement.

### Future Work

1. **Advanced Architectures**: Explore more advanced but still efficient architectures, such as lightweight transformers or neural architecture search, to further improve the performance-efficiency trade-off.

2. **Transfer Learning**: Investigate transfer learning approaches to leverage knowledge from other industrial control systems or datasets.

3. **Anomaly Detection**: Implement unsupervised anomaly detection methods that can identify novel attack patterns not seen in the training data.

4. **Edge Deployment**: Optimize models for deployment on edge devices within the industrial control system to enable real-time detection with minimal latency.

5. **Multi-modal Learning**: Incorporate additional data sources, such as network traffic or system logs, to improve detection capabilities through multi-modal learning.