# HAI-20.07 Dataset Analysis: Optimized LightGBM Model

This notebook implements an optimized LightGBM model for attack detection in industrial control systems using the HAI-20.07 dataset.

## 1. Import Libraries

In [None]:
# Import necessary libraries
import gc
import os
import pickle
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import psutil
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Set random seeds for reproducibility.
# np.random.seed only seeds NumPy's global RNG; RANDOM_SEED is additionally
# passed explicitly to LightGBM further down (`random_state` in the search
# params and `seed=` in the lgb.cv call).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 2. Load Preprocessed Data

In [None]:
# Read the pickled bundle written by the preprocessing step.
# NOTE: unpickling runs arbitrary code, so only load files you produced yourself.
with open('preprocessed_data/tabular_data.pkl', 'rb') as bundle_file:
    tabular_data = pickle.load(bundle_file)

# Bind every entry of the bundle to a notebook-level name equal to its key.
(
    X_train_enhanced_scaled,
    y_train_enhanced,
    X_test_enhanced_scaled,
    y_test_enhanced,
    X_train_enhanced_balanced,
    y_train_enhanced_balanced,
    feature_columns_enhanced,
) = (
    tabular_data[bundle_key]
    for bundle_key in (
        'X_train_enhanced_scaled',
        'y_train_enhanced',
        'X_test_enhanced_scaled',
        'y_test_enhanced',
        'X_train_enhanced_balanced',
        'y_train_enhanced_balanced',
        'feature_columns_enhanced',
    )
)

# Quick look at the shapes that the training and evaluation cells consume.
print(f"X_train_enhanced_balanced shape: {X_train_enhanced_balanced.shape}")
print(f"y_train_enhanced_balanced shape: {y_train_enhanced_balanced.shape}")
print(f"X_test_enhanced_scaled shape: {X_test_enhanced_scaled.shape}")
print(f"y_test_enhanced shape: {y_test_enhanced.shape}")
print(f"Number of features: {len(feature_columns_enhanced)}")

## 3. Define Utility Functions

In [None]:
# Define a function to measure memory usage accurately
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    A garbage-collection pass is run first so that collectable objects do
    not inflate the reading.
    """
    gc.collect()
    bytes_per_mb = 1024 * 1024
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb

# Function to calculate model size in MB
def get_lgb_model_size(model):
    """Return the pickled size of ``model`` in MB.

    The model is serialised in memory with ``pickle.dumps`` instead of being
    written to a fixed-name temporary file. The byte count is the same as the
    size of the file ``pickle.dump`` would have produced, but nothing is
    written to the working directory: no file is left behind when pickling
    fails, and an unrelated ``temp_model.pkl`` cannot be overwritten/deleted.

    Parameters
    ----------
    model : object
        Any picklable object (the notebook passes the trained LightGBM model).

    Returns
    -------
    float
        Size of the pickled representation in megabytes (bytes / 1024**2).

    Raises
    ------
    Exception
        Whatever ``pickle.dumps`` raises if ``model`` cannot be pickled.
    """
    size_bytes = len(pickle.dumps(model))
    return size_bytes / (1024 * 1024)  # Convert to MB

## 4. Hyperparameter Optimization with Optuna

In [None]:
# Define an objective function for Optuna
def objective(trial):
    """Optuna objective: cross-validated AUC of one LightGBM configuration.

    Samples a hyperparameter set from ``trial``, runs 5-fold stratified
    cross-validation on the balanced training data (notebook globals
    ``X_train_enhanced_balanced`` / ``y_train_enhanced_balanced``) and returns
    the mean validation AUC at the last retained boosting round.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean AUC across folds (to be maximised by the study).
    """
    # Fixed settings plus the search space explored by Optuna
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 15, 63),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 1.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 1.0),
        'verbose': -1,
        'random_state': RANDOM_SEED
    }

    # A fresh Dataset is built for every trial so trials stay independent
    train_data = lgb.Dataset(X_train_enhanced_balanced, label=y_train_enhanced_balanced)

    # Early stopping goes through the callback API (the same one used by
    # lgb.train later in this notebook). The `early_stopping_rounds` and
    # `verbose_eval` keyword arguments were removed from lgb.cv in
    # LightGBM 4.0 and raise TypeError there.
    cv_results = lgb.cv(
        params,
        train_data,
        num_boost_round=100,
        nfold=5,
        stratified=True,
        seed=RANDOM_SEED,
        callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)]
    )

    # LightGBM >= 4.0 prefixes the result keys with the dataset name
    # ('valid auc-mean'); older releases use plain 'auc-mean'.
    auc_key = 'valid auc-mean' if 'valid auc-mean' in cv_results else 'auc-mean'

    # Return the AUC of the last retained boosting round
    return cv_results[auc_key][-1]

# Run hyperparameter optimization
def optimize_hyperparameters(n_trials=50):
    """Search LightGBM hyperparameters with Optuna and return the best set.

    The study maximises the value returned by ``objective``. The TPE sampler
    (Optuna's default algorithm) is created explicitly so it can be seeded
    with ``RANDOM_SEED``; without a seed every run explores a different
    sequence of trials, which defeats the notebook's reproducibility setup.

    Parameters
    ----------
    n_trials : int, default 50
        Number of Optuna trials to run.

    Returns
    -------
    dict
        ``params`` of the best trial, i.e. only the values suggested inside
        ``objective`` (fixed settings such as 'objective' are not included).
    """
    print("Optimizing LightGBM hyperparameters...")
    sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print("Best trial:")
    trial = study.best_trial
    print(f"  Value: {trial.value:.4f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    return trial.params

In [None]:
# Run hyperparameter optimization with a small number of trials for demonstration.
# NOTE: best_params holds only the values suggested inside objective()
# (it is the best trial's `params`); fixed settings such as 'objective' and
# 'metric' are not part of this dict.
best_params = optimize_hyperparameters(n_trials=20)

## 5. Train and Evaluate LightGBM Model

In [None]:
# Train an optimized LightGBM model
def train_lightgbm_model(X_train, y_train, X_test, y_test, params=None, valid_fraction=0.2):
    """Train a binary LightGBM classifier, evaluate it and persist the results.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and binary labels (0 / 1).
    X_test, y_test : array-like
        Held-out features and labels used only for the final evaluation.
    params : dict, optional
        LightGBM parameters. They are merged on top of the fixed settings
        (binary objective, AUC metric, gbdt, quiet logging, notebook seed), so
        a dict that only contains tuned values - such as Optuna's
        ``trial.params`` - still trains a binary classifier. Without this
        merge LightGBM would fall back to its default regression objective and
        the predictions would not be probabilities. When ``None``, a default
        configuration including ``scale_pos_weight`` is used.
    valid_fraction : float, default 0.2
        Share of the training data held out (stratified) for early stopping.
        LightGBM's early stopping ignores the training set, so monitoring only
        ``train_data`` never stops early. Pass 0 or ``None`` to train on all
        rows for the full 1000 rounds without early stopping.

    Returns
    -------
    tuple
        ``(model, y_pred, y_pred_proba, results)`` where ``results`` is the
        dict that is also pickled to ``model_results/lightgbm_results.pkl``.

    Side effects
    ------------
    Prints metrics, shows a confusion-matrix plot and writes
    ``model_results/lightgbm_results.pkl`` and ``model_results/lightgbm_model.txt``.
    """
    # Measure memory usage before training
    memory_before = get_memory_usage()

    # Settings that must always be present, whatever the caller supplies
    fixed_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'verbose': -1,
        'random_state': RANDOM_SEED
    }

    if params is None:
        labels = np.asarray(y_train)
        n_negative = int(np.sum(labels == 0))
        n_positive = int(np.sum(labels == 1))
        tuned_params = {
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'num_threads': 4
        }
        if n_positive > 0:
            # Handle class imbalance (skipped when there are no positives to
            # avoid a division by zero)
            tuned_params['scale_pos_weight'] = n_negative / n_positive
    else:
        tuned_params = dict(params)  # copy: never mutate the caller's dict

    # Caller-supplied values win over the fixed settings
    params = {**fixed_params, **tuned_params}

    # Hold out a stratified validation split so early stopping has a signal
    use_validation = bool(valid_fraction) and valid_fraction > 0
    if use_validation:
        X_fit, X_valid, y_fit, y_valid = train_test_split(
            X_train, y_train,
            test_size=valid_fraction,
            stratify=y_train,
            random_state=RANDOM_SEED
        )
    else:
        X_fit, y_fit = X_train, y_train

    # Train model
    start_time = time.time()

    # Create datasets for LightGBM
    train_data = lgb.Dataset(X_fit, label=y_fit)
    callbacks = [lgb.log_evaluation(period=100)]
    if use_validation:
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)
        valid_sets = [valid_data]
        valid_names = ['valid']
        callbacks.insert(0, lgb.early_stopping(stopping_rounds=50, verbose=True))
    else:
        # Only the training metric is logged; no early stopping is possible
        valid_sets = [train_data]
        valid_names = ['train']

    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=valid_sets,
        valid_names=valid_names,
        callbacks=callbacks
    )

    training_time = time.time() - start_time
    print(f"Training time: {training_time:.2f} seconds")

    # Measure memory usage after training
    memory_after = get_memory_usage()
    memory_used = memory_after - memory_before
    print(f"Memory used: {memory_used:.2f} MB")

    # Calculate model size
    model_size = get_lgb_model_size(model)
    print(f"Model size: {model_size:.2f} MB")

    # Make predictions (per-sample latency = batch time / number of rows)
    inference_start = time.time()
    y_pred_proba = model.predict(X_test)
    inference_time = (time.time() - inference_start) / len(X_test)
    print(f"Average inference time per sample: {inference_time*1000:.4f} ms")

    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc_score:.4f}")

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix - Optimized LightGBM')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    # Collect results
    results = {
        'model_name': 'Optimized LightGBM',
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc_score,
        'training_time': training_time,
        'inference_time': inference_time,
        'memory_used': memory_used,
        'model_size': model_size,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'params': params
    }

    # Create directory for results if it doesn't exist
    os.makedirs('model_results', exist_ok=True)

    # Save results
    with open('model_results/lightgbm_results.pkl', 'wb') as f:
        pickle.dump(results, f)

    # Save model
    model.save_model('model_results/lightgbm_model.txt')

    return model, y_pred, y_pred_proba, results

In [None]:
# Fit on the balanced training set with the tuned hyperparameters and score
# the model on the scaled test set.
print("Training Optimized LightGBM model...")
lgbm_model, y_pred_lgbm, y_prob_lgbm, lgbm_results = train_lightgbm_model(
    X_train=X_train_enhanced_balanced,
    y_train=y_train_enhanced_balanced,
    X_test=X_test_enhanced_scaled,
    y_test=y_test_enhanced,
    params=best_params,
)

## 6. Feature Importance Analysis

In [None]:
# Gain-based importance, one value per model feature
feature_importance = lgbm_model.feature_importance(importance_type='gain')

# Use the real column names when their count matches the importance vector;
# otherwise fall back to positional placeholder names.
feature_names = (
    feature_columns_enhanced
    if len(feature_columns_enhanced) == len(feature_importance)
    else [f'Feature_{i}' for i in range(len(feature_importance))]
)

# Build the importance table, most important feature first
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)

# The same top-20 slice feeds both the printed table and the bar chart
top_features = importance_df.head(20)

print("Top 20 most important features:")
print(top_features)

plt.figure(figsize=(12, 8))
sns.barplot(data=top_features, x='Importance', y='Feature')
plt.title('Top 20 Feature Importance (Gain)')
plt.tight_layout()
plt.show()

## 7. Visualize Model Performance

In [None]:
# --- ROC curve on the test set ---
from sklearn.metrics import roc_curve, auc

fpr, tpr, _ = roc_curve(y_test_enhanced, y_prob_lgbm)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.3f})')
# Diagonal reference line (TPR == FPR)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve - Optimized LightGBM')
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
plt.show()

# --- Precision-recall curve on the test set ---
from sklearn.metrics import precision_recall_curve

precision, recall, _ = precision_recall_curve(y_test_enhanced, y_prob_lgbm)
# Area under the PR curve, integrated over recall with the same auc() helper
pr_auc = auc(recall, precision)

pr_fig, pr_ax = plt.subplots(figsize=(10, 8))
pr_ax.plot(recall, precision, color='blue', lw=2, label=f'PR curve (area = {pr_auc:.3f})')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve - Optimized LightGBM')
pr_ax.legend(loc="best")
pr_ax.grid(True)
plt.show()

## 8. Threshold Optimization

In [None]:
# Optimize the decision threshold for a better F1 score.
# CAVEAT: the threshold is selected on the test set itself, so the "optimized"
# metrics below are optimistic; pick the threshold on a validation split if an
# unbiased estimate is required.

# 16 evenly spaced thresholds from 0.10 to 0.85. np.linspace is used instead of
# np.arange because a float step makes arange's length/endpoint depend on
# rounding error.
thresholds = np.linspace(0.1, 0.85, 16)
f1_scores = []

for threshold in thresholds:
    y_pred_threshold = (y_prob_lgbm > threshold).astype(int)
    # zero_division=0 matches the metric calls below and avoids an
    # UndefinedMetricWarning when a threshold yields no positive predictions
    f1 = f1_score(y_test_enhanced, y_pred_threshold, zero_division=0)
    f1_scores.append(f1)

# Find the best threshold (first maximum if several thresholds tie)
best_threshold_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_idx]
best_f1 = f1_scores[best_threshold_idx]

print(f"Best threshold: {best_threshold:.2f} with F1 score: {best_f1:.4f}")

# Plot F1 scores for different thresholds
plt.figure(figsize=(10, 6))
plt.plot(thresholds, f1_scores, marker='o')
plt.axvline(x=best_threshold, color='r', linestyle='--', label=f'Best threshold: {best_threshold:.2f}')
plt.xlabel('Threshold')
plt.ylabel('F1 Score')
plt.title('F1 Score vs. Threshold')
plt.legend()
plt.grid(True)
plt.show()

# Recalculate metrics with the optimized threshold
y_pred_optimized = (y_prob_lgbm > best_threshold).astype(int)
accuracy_optimized = accuracy_score(y_test_enhanced, y_pred_optimized)
precision_optimized = precision_score(y_test_enhanced, y_pred_optimized, zero_division=0)
recall_optimized = recall_score(y_test_enhanced, y_pred_optimized, zero_division=0)
f1_optimized = f1_score(y_test_enhanced, y_pred_optimized, zero_division=0)

print(f"Optimized Metrics:")
print(f"Accuracy: {accuracy_optimized:.4f}")
print(f"Precision: {precision_optimized:.4f}")
print(f"Recall: {recall_optimized:.4f}")
print(f"F1 Score: {f1_optimized:.4f}")

# Plot confusion matrix with optimized threshold
plt.figure(figsize=(8, 6))
cm_optimized = confusion_matrix(y_test_enhanced, y_pred_optimized)
sns.heatmap(cm_optimized, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Optimized LightGBM (Optimized Threshold)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()