# HAI-20.07 Dataset Analysis with Optimized Efficient Architectures

This notebook analyzes the HAI-20.07 dataset and implements optimized, computationally efficient architectures for attack detection in industrial control systems. The focus is on achieving high detection performance at low computational cost ("低消耗高效能", i.e. "low consumption, high efficiency").

Improvements over the previous analysis include:
1. Better handling of class imbalance
2. Accurate memory usage tracking
3. More efficient model architectures
4. Improved evaluation metrics and visualization
5. Proper dataset loading and preprocessing

## 1. Setup and Configuration

In [None]:
# Check if running in Colab
# Detection is done by testing whether the `google.colab` module is already
# loaded in this kernel; IN_COLAB is reused by later cells to pick paths.
import sys
IN_COLAB = 'google.colab' in sys.modules
print(f"Running in Google Colab: {IN_COLAB}")

if IN_COLAB:
    # Install required packages
    # NOTE(review): no versions are pinned, so results may change as packages
    # are released; `%pip` (rather than `!pip`) is the IPython-recommended way
    # to install into the running kernel's environment.
    !pip install -q xgboost scikit-learn matplotlib seaborn torch tensorflow tqdm psutil memory_profiler imbalanced-learn lightgbm optuna

## 2. Download Dataset from Kaggle (if in Colab)

In [None]:
# Download and extract the HAI Security Dataset
# Only runs in Colab. Steps: mount Drive, and (if the dataset folder is absent)
# ask for a kaggle.json token, install it for the Kaggle CLI, download the
# archive to /content/ and unzip it there.
if IN_COLAB:
    import os
    
    # Mount Google Drive
    # NOTE(review): nothing else in this cell reads from /content/drive.
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Check if dataset already exists
    # NOTE(review): the archive below is unzipped into /content/, while this
    # check (and the later data path) expect /content/hai-security-dataset/.
    # They only agree if the zip has a top-level hai-security-dataset/ folder -
    # confirm against the actual archive layout.
    if not os.path.exists('/content/hai-security-dataset'):
        print("Downloading HAI Security Dataset from Kaggle...")
        
        # Install Kaggle API if not already installed
        !pip install -q kaggle
        
        # Create Kaggle API configuration directory
        !mkdir -p ~/.kaggle
        
        # Request user to upload kaggle.json file
        from google.colab import files
        print("Please upload your kaggle.json file (can be obtained from Kaggle > Account > API > Create New API Token)")
        uploaded = files.upload()
        
        # Check if the correct file was uploaded
        # (the key must be exactly 'kaggle.json'; any other name is rejected)
        if 'kaggle.json' in uploaded:
            # Copy to Kaggle's configuration directory
            # presumably files.upload() saved the file into the current working
            # directory, which is where `cp` looks for it - verify
            !cp kaggle.json ~/.kaggle/kaggle.json
            # Set correct permissions
            # (owner read/write only, since the file holds an API credential)
            !chmod 600 ~/.kaggle/kaggle.json
            print("Credentials successfully set")
            
            # Download the dataset
            !kaggle datasets download icsdataset/hai-security-dataset -p /content/
            
            # Extract the dataset
            print("\nExtracting dataset...")
            !unzip -q /content/hai-security-dataset.zip -d /content/
            print("Dataset extracted successfully.")
        else:
            print("kaggle.json file not found, please ensure you upload the correct file")
    else:
        print("HAI Security Dataset already exists in /content/hai-security-dataset")
    
    # List the contents of the dataset directory
    # NOTE(review): this also runs when the upload branch above was skipped, in
    # which case `ls` will simply report that the directory does not exist.
    print("\nContents of the dataset directory:")
    !ls -la /content/hai-security-dataset/hai-20.07

## 3. Import Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, precision_recall_curve, auc
import warnings
import time
import os
import psutil
import gc
from tqdm.notebook import tqdm

# Fix the global NumPy seed; RANDOM_SEED is also handed to seeded estimators later
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Notebook ergonomics: silence library warnings and never truncate wide frames
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

## 4. Define Utility Functions

In [None]:
# Define a function to measure memory usage accurately
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    A garbage-collection pass is forced first so that unreachable objects
    do not inflate the reading.
    """
    gc.collect()
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    bytes_per_mb = 1024 * 1024
    return rss_bytes / bytes_per_mb

# Define a class to track model performance and efficiency
class ModelEvaluator:
    """Accumulate per-model metrics and render comparison plots.

    Each call to ``add_result`` appends one row (a dict) to ``self.results``.
    ``get_results_df`` exposes the rows as a DataFrame, and the ``plot_*``
    methods visualise them with matplotlib/seaborn. Every plotting method
    returns quietly (after printing a short notice) when it has nothing to
    draw, instead of raising from inside pandas/matplotlib.
    """

    # Columns written by ``add_result``, in table order. Used to build a
    # well-formed zero-row table before any result has been recorded.
    _COLUMNS = [
        'Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC',
        'Training Time (s)', 'Inference Time (ms)', 'Memory Usage (MB)',
        'Model Size (MB)',
    ]
    # Quality metrics shown together in the grouped bar chart.
    _METRIC_COLUMNS = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC']

    def __init__(self):
        self.results = []

    def add_result(self, model_name, accuracy, precision, recall, f1, auc_score,
                   training_time, inference_time, memory_usage, model_size=None):
        """Record one model's metrics.

        Args:
            model_name: Label used on every plot axis.
            accuracy, precision, recall, f1, auc_score: Quality metrics,
                stored unchanged.
            training_time: Training duration in seconds.
            inference_time: Inference duration in seconds; stored in
                milliseconds.
            memory_usage: Memory figure in MB, stored unchanged.
            model_size: Model size in MB; ``None`` is stored as 0.
        """
        self.results.append({
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'AUC': auc_score,
            'Training Time (s)': training_time,
            'Inference Time (ms)': inference_time * 1000,  # Convert to milliseconds
            'Memory Usage (MB)': memory_usage,
            'Model Size (MB)': model_size if model_size is not None else 0
        })

    def get_results_df(self):
        """Return the recorded results as a DataFrame (one row per model).

        With no results yet, a zero-row frame carrying the standard columns
        is returned so callers can still select columns by name.
        """
        if not self.results:
            return pd.DataFrame(columns=self._COLUMNS)
        # Built from the raw dicts so any extra keys added to the rows by the
        # caller (e.g. an 'Efficiency Score') are preserved.
        return pd.DataFrame(self.results)

    @staticmethod
    def _rotate_model_labels(ax):
        """Tilt the x tick labels so long model names do not overlap."""
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    @classmethod
    def _bar_panel(cls, ax, results_df, column, title, log_scale=False):
        """Draw one 'metric per model' bar chart on ``ax``."""
        sns.barplot(x='Model', y=column, data=results_df, ax=ax)
        ax.set_title(title)
        cls._rotate_model_labels(ax)
        if log_scale:
            ax.set_yscale('log')  # Log scale for better visualization

    def plot_comparison(self):
        """Plot a 2x3 dashboard comparing all recorded models.

        Panels: quality metrics, training time (log), inference time (log),
        memory usage, model size and - only if the results carry an
        'Efficiency Score' column - efficiency score. Panels without data
        are hidden rather than left as empty axes.
        """
        results_df = self.get_results_df()
        if results_df.empty:
            print("No results recorded yet - nothing to plot.")
            return

        # Create a figure with 6 subplots
        fig, axes = plt.subplots(2, 3, figsize=(20, 12))

        # Performance metrics
        metrics_ax = axes[0, 0]
        performance_df = results_df.melt(id_vars=['Model'],
                                         value_vars=self._METRIC_COLUMNS,
                                         var_name='Metric', value_name='Value')
        sns.barplot(x='Model', y='Value', hue='Metric', data=performance_df, ax=metrics_ax)
        metrics_ax.set_title('Performance Metrics Comparison')
        # Zoom into [0.5, 1] only when every metric is inside that range;
        # otherwise bars below 0.5 would be cut off and look like missing data.
        lowest_metric = performance_df['Value'].min()
        metrics_ax.set_ylim(0.5 if lowest_metric >= 0.5 else 0.0, 1.0)
        self._rotate_model_labels(metrics_ax)
        metrics_ax.legend(loc='lower right')

        # Resource panels that are always available
        self._bar_panel(axes[0, 1], results_df, 'Training Time (s)',
                        'Training Time Comparison (seconds)', log_scale=True)
        self._bar_panel(axes[0, 2], results_df, 'Inference Time (ms)',
                        'Inference Time Comparison (milliseconds)', log_scale=True)
        self._bar_panel(axes[1, 0], results_df, 'Memory Usage (MB)',
                        'Memory Usage Comparison (MB)')

        # Optional panels: drawn when the column exists, hidden otherwise
        optional_panels = (
            (axes[1, 1], 'Model Size (MB)', 'Model Size Comparison (MB)'),
            (axes[1, 2], 'Efficiency Score', 'Efficiency Score Comparison'),
        )
        for ax, column, title in optional_panels:
            if column in results_df.columns:
                self._bar_panel(ax, results_df, column, title)
            else:
                ax.set_visible(False)

        plt.tight_layout()
        plt.show()

    def plot_confusion_matrices(self, y_true, y_pred_dict):
        """Plot confusion matrices for all models

        Args:
            y_true: Ground-truth labels.
            y_pred_dict: Mapping of model name -> predicted labels.
        """
        n_models = len(y_pred_dict)
        if n_models == 0:
            # plt.subplots(1, 0) would raise; there is nothing to draw anyway.
            print("No predictions supplied - nothing to plot.")
            return

        fig, axes = plt.subplots(1, n_models, figsize=(5*n_models, 5))

        # With a single subplot matplotlib returns a bare Axes, not an array
        if n_models == 1:
            axes = [axes]

        for ax, (model_name, y_pred) in zip(axes, y_pred_dict.items()):
            cm = confusion_matrix(y_true, y_pred)
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_title(f'Confusion Matrix - {model_name}')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('True')

        plt.tight_layout()
        plt.show()

    def plot_pr_curves(self, y_true, y_prob_dict):
        """Plot precision-recall curves for all models

        Args:
            y_true: Ground-truth labels.
            y_prob_dict: Mapping of model name -> predicted scores.
        """
        if not y_prob_dict:
            print("No scores supplied - nothing to plot.")
            return

        plt.figure(figsize=(10, 8))

        for model_name, y_prob in y_prob_dict.items():
            precision, recall, _ = precision_recall_curve(y_true, y_prob)
            # Area under the PR curve, shown in the legend entry
            pr_auc = auc(recall, precision)
            plt.plot(recall, precision, lw=2, label=f'{model_name} (AUC = {pr_auc:.3f})')

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curves')
        plt.legend(loc='best')
        plt.grid(True)
        plt.show()

# Initialize the evaluator
evaluator = ModelEvaluator()

## 5. Load and Explore Data

In [None]:
# Pick the dataset folder that matches the current environment
data_path = '/content/hai-security-dataset/hai-20.07/' if IN_COLAB else 'hai-security-dataset/hai-20.07/'

# Read the four semicolon-delimited CSV files (two training, two testing)
train1, train2, test1, test2 = [
    pd.read_csv(f'{data_path}{split_name}.csv', sep=';')
    for split_name in ('train1', 'train2', 'test1', 'test2')
]

# Report the (rows, columns) of each file
for title, frame in (
    ('Training Dataset 1', train1),
    ('Training Dataset 2', train2),
    ('Testing Dataset 1', test1),
    ('Testing Dataset 2', test2),
):
    print(f"{title} Shape:", frame.shape)

In [None]:
# Parse the 'time' column of every file into pandas datetimes (in place)
splits = {'train1': train1, 'train2': train2, 'test1': test1, 'test2': test2}
for frame in splits.values():
    frame['time'] = pd.to_datetime(frame['time'])

# Every column that is neither the timestamp nor an attack label is a feature
non_feature_columns = ('time', 'attack', 'attack_P1', 'attack_P2', 'attack_P3')
feature_columns = [col for col in train1.columns if col not in non_feature_columns]
print(f"Number of feature columns: {len(feature_columns)}")
print(f"First few feature columns: {feature_columns[:5]}...")

# Share of each 'attack' value per file, in percent
for split_name, frame in splits.items():
    print(f"\nAttack distribution in {split_name}:")
    print(frame['attack'].value_counts(normalize=True) * 100)

In [None]:
# Visualize the class distribution
# Names and colours are looked up by label VALUE. value_counts() returns its
# rows ordered by frequency, so assigning ['Normal', 'Attack'] by position
# mislabels the slices whenever attacks outnumber normal rows, and fails to
# match when a split contains a single class.
# NOTE(review): assumes 'attack' is encoded 0 = normal, 1 = attack - confirm.
class_names = {0: 'Normal', 1: 'Attack'}
class_colors = {0: 'lightblue', 1: 'salmon'}

train_combined = pd.concat([train1, train2], ignore_index=True)
test_combined = pd.concat([test1, test2], ignore_index=True)
train_attack_counts = train_combined['attack'].value_counts()
test_attack_counts = test_combined['attack'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, counts, title in (
    (axes[0], train_attack_counts, 'Training Data Class Distribution'),
    (axes[1], test_attack_counts, 'Testing Data Class Distribution'),
):
    ax.pie(
        counts.values,
        # Unknown label values fall back to their own text / a neutral colour
        labels=[class_names.get(cls, str(cls)) for cls in counts.index],
        colors=[class_colors.get(cls, 'lightgray') for cls in counts.index],
        autopct='%1.1f%%',
        startangle=90,
    )
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 6. Data Preprocessing

In [None]:
# Stack the two training files and the two testing files
train_combined = pd.concat([train1, train2], ignore_index=True)
test_combined = pd.concat([test1, test2], ignore_index=True)

# Split each combined frame into its feature matrix and 'attack' target
X_train, y_train = train_combined[feature_columns], train_combined['attack']
X_test, y_test = test_combined[feature_columns], test_combined['attack']

# Data-quality report: total missing cells, then total infinite cells
for split_label, frame in (('training', X_train), ('testing', X_test)):
    print(f"Missing values in {split_label} data:")
    print(frame.isnull().sum().sum())

for split_label, frame in (('training', X_train), ('testing', X_test)):
    print(f"Infinite values in {split_label} data:")
    print(np.isinf(frame).sum().sum())

# Turn +/-inf into NaN so a single imputation step handles both problems
non_finite = [np.inf, -np.inf]
X_train = X_train.replace(non_finite, np.nan)
X_test = X_test.replace(non_finite, np.nan)

# Impute with column means; the test set reuses the TRAINING means so that
# no statistic is computed from test data
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_train.mean())

# RobustScaler is preferred over StandardScaler here because it is less
# affected by outliers; it is fitted on the training data only
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for array_name, array in (('X_train_scaled', X_train_scaled), ('X_test_scaled', X_test_scaled)):
    print(f"{array_name} shape:", array.shape)

## 7. Feature Engineering

In [None]:
# Create a function to add time-based features
def add_time_features(df):
    """Return a copy of ``df`` extended with calendar and cyclical time columns.

    Reads the datetime column ``'time'`` and appends, in this order:
    ``hour``, ``minute``, ``second``, ``day_of_week``, then sine/cosine
    encodings of hour (period 24) and minute (period 60). The input frame is
    left untouched.
    """
    enriched = df.copy()

    # Plain calendar components, taken straight from the datetime accessor
    components = (
        ('hour', 'hour'),
        ('minute', 'minute'),
        ('second', 'second'),
        ('day_of_week', 'dayofweek'),
    )
    for column, accessor_attr in components:
        enriched[column] = getattr(enriched['time'].dt, accessor_attr)

    # Cyclical encodings, so that e.g. 23:00 and 00:00 end up close together
    hour_angle = 2 * np.pi * enriched['hour'] / 24
    minute_angle = 2 * np.pi * enriched['minute'] / 60
    enriched['hour_sin'] = np.sin(hour_angle)
    enriched['hour_cos'] = np.cos(hour_angle)
    enriched['minute_sin'] = np.sin(minute_angle)
    enriched['minute_cos'] = np.cos(minute_angle)

    return enriched

# Create a function to add rolling window statistics
def add_rolling_features(df, window_sizes=[5, 10], top_n=3):
    """Return a time-sorted copy of ``df`` with rolling mean/std columns.

    Args:
        df: Frame with a ``'time'`` column; label columns are ignored.
        window_sizes: Rolling window lengths, in rows.
        top_n: Number of feature columns to process. These are simply the
            FIRST ``top_n`` non-label columns, not a ranking by importance.

    Returns:
        The sorted copy with ``<feature>_rolling_mean_<w>`` and
        ``<feature>_rolling_std_<w>`` columns added, after dropping every row
        that contains a NaN in any column (this removes the warm-up rows of
        each window).
    """
    label_columns = ('time', 'attack', 'attack_P1', 'attack_P2', 'attack_P3')

    # Rolling windows are only meaningful on chronologically ordered rows
    ordered = df.copy().sort_values('time')

    candidates = [name for name in ordered.columns if name not in label_columns]
    selected = candidates[:top_n]

    for span in window_sizes:
        for name in selected:
            window = ordered[name].rolling(window=span)
            ordered[f'{name}_rolling_mean_{span}'] = window.mean()
            ordered[f'{name}_rolling_std_{span}'] = window.std()

    return ordered.dropna()

# Create a function to add lag features
def add_lag_features(df, lag_steps=[1, 2], top_n=3):
    """Return a time-sorted copy of ``df`` with lagged feature columns.

    Args:
        df: Frame with a ``'time'`` column; label columns are ignored.
        lag_steps: Row offsets to shift by.
        top_n: Number of feature columns to lag. These are the FIRST
            ``top_n`` non-label columns, not a ranking by importance.

    Returns:
        The sorted copy with ``<feature>_lag_<k>`` columns added, after
        dropping every row that contains a NaN in any column (this removes
        the leading rows that have no predecessor to copy from).
    """
    label_columns = ('time', 'attack', 'attack_P1', 'attack_P2', 'attack_P3')

    # Shifting only yields true "previous" values on chronologically ordered rows
    ordered = df.copy().sort_values('time')

    selected = [name for name in ordered.columns if name not in label_columns][:top_n]

    for offset in lag_steps:
        for name in selected:
            ordered[f'{name}_lag_{offset}'] = ordered[name].shift(offset)

    return ordered.dropna()

# Create a function to add statistical features
def add_statistical_features(df):
    """Return a copy of ``df`` with row-wise statistics per column-name prefix.

    Feature columns (everything except ``time`` and the ``attack*`` labels)
    are grouped by the text before their first underscore, e.g. ``P1_...``
    columns form group ``P1``. For every group with more than one column the
    row-wise ``<prefix>_mean``, ``_std``, ``_max``, ``_min`` and ``_range``
    (max - min) are appended.

    Groups are matched on the exact prefix, so ``P1`` never absorbs
    ``P10_...`` columns, and they are processed in order of first appearance,
    so the layout of the generated columns is identical on every run (set
    iteration order over strings is not).

    NOTE: any multi-column prefix is grouped, including derived columns such
    as ``hour`` / ``hour_sin`` / ``hour_cos`` if they are already present.
    """
    # Make a copy to avoid modifying the original dataframe
    df_copy = df.copy()

    excluded = {'time', 'attack', 'attack_P1', 'attack_P2', 'attack_P3'}

    # prefix -> member columns; dicts keep insertion order, which makes the
    # output column order deterministic
    groups = {}
    for col in df_copy.columns:
        if col in excluded:
            continue
        groups.setdefault(col.split('_')[0], []).append(col)

    for prefix, group_cols in groups.items():
        if len(group_cols) <= 1:
            continue  # statistics over a single column carry no information
        members = df_copy[group_cols]
        df_copy[f'{prefix}_mean'] = members.mean(axis=1)
        df_copy[f'{prefix}_std'] = members.std(axis=1)
        df_copy[f'{prefix}_max'] = members.max(axis=1)
        df_copy[f'{prefix}_min'] = members.min(axis=1)
        df_copy[f'{prefix}_range'] = df_copy[f'{prefix}_max'] - df_copy[f'{prefix}_min']

    return df_copy

In [None]:
# Run the feature-engineering pipeline on the training and testing frames
print("Applying feature engineering...")

# Steps are applied in this order to both frames: time features, per-prefix
# statistics, rolling statistics (small windows, few features) and finally
# lag features (few lags, few features).
engineering_steps = (
    add_time_features,
    add_statistical_features,
    lambda frame: add_rolling_features(frame, window_sizes=[5, 10], top_n=3),
    lambda frame: add_lag_features(frame, lag_steps=[1, 2], top_n=3),
)

train_features, test_features = train_combined, test_combined
for step in engineering_steps:
    train_features = step(train_features)
    test_features = step(test_features)

# Feature columns of the enhanced frames: everything but timestamp and labels
label_columns = ('time', 'attack', 'attack_P1', 'attack_P2', 'attack_P3')
feature_columns_enhanced = [col for col in train_features.columns if col not in label_columns]

# The test frame is indexed with the TRAINING column list so both matrices
# share the same column order
X_train_enhanced, y_train_enhanced = train_features[feature_columns_enhanced], train_features['attack']
X_test_enhanced, y_test_enhanced = test_features[feature_columns_enhanced], test_features['attack']

# Scale with a RobustScaler fitted on the training features only
scaler_enhanced = RobustScaler()
X_train_enhanced_scaled = scaler_enhanced.fit_transform(X_train_enhanced)
X_test_enhanced_scaled = scaler_enhanced.transform(X_test_enhanced)

print("X_train_enhanced_scaled shape:", X_train_enhanced_scaled.shape)
print("X_test_enhanced_scaled shape:", X_test_enhanced_scaled.shape)
print(f"Number of enhanced features: {X_train_enhanced_scaled.shape[1]}")

## 8. Handle Class Imbalance

In [None]:
# Import SMOTE for handling class imbalance
from imblearn.over_sampling import SMOTE

# Check class distribution
print("Class distribution in training data:")
print(y_train_enhanced.value_counts(normalize=True) * 100)

print("\nClass distribution in testing data:")
print(y_test_enhanced.value_counts(normalize=True) * 100)

# Apply SMOTE to handle class imbalance
print("\nApplying SMOTE to balance the training data...")
smote_ratio = 0.3  # target size of the minority class relative to the majority
smote = SMOTE(random_state=RANDOM_SEED, sampling_strategy=smote_ratio)

# SMOTE raises instead of resampling in three situations, all of which can
# occur with this data (e.g. a training set recorded without any attack).
# Detect them up front and fall back to the unbalanced data so the rest of
# the notebook still runs.
train_class_counts = y_train_enhanced.value_counts()
if len(train_class_counts) != 2:
    smote_skip_reason = f"a float ratio needs exactly 2 classes, found {len(train_class_counts)}"
elif train_class_counts.min() <= smote.k_neighbors:
    smote_skip_reason = (f"minority class has only {train_class_counts.min()} samples, "
                         f"more than k_neighbors={smote.k_neighbors} are required")
elif int(train_class_counts.max() * smote_ratio - train_class_counts.min()) <= 0:
    smote_skip_reason = f"minority class already reaches the target ratio of {smote_ratio}"
else:
    smote_skip_reason = None

if smote_skip_reason is None:
    X_train_enhanced_balanced, y_train_enhanced_balanced = smote.fit_resample(
        X_train_enhanced_scaled, y_train_enhanced)
else:
    print(f"Skipping SMOTE: {smote_skip_reason}. Using the training data as is.")
    X_train_enhanced_balanced, y_train_enhanced_balanced = X_train_enhanced_scaled, y_train_enhanced

print("Class distribution after SMOTE:")
print(pd.Series(y_train_enhanced_balanced).value_counts(normalize=True) * 100)
print(f"X_train_enhanced_balanced shape: {X_train_enhanced_balanced.shape}")

## 9. Prepare Data for Sequence Models

In [None]:
# Function to create sequences for time series models
def create_sequences(X, y, time_steps=10, stride=1):
    """Cut aligned sliding windows out of ``X`` and label each by its last row.

    Args:
        X: Array-like of shape ``(n_rows, ...)``; rows must be in time order.
        y: Array-like with one label per row of ``X``. Converted to a NumPy
            array, so it is always indexed by position.
        time_steps: Window length in rows (>= 1).
        stride: Distance in rows between the starts of consecutive windows
            (>= 1).

    Returns:
        ``(Xs, ys)`` where ``Xs`` has shape ``(n_windows, time_steps, ...)``
        and ``ys[k]`` is the label of the final row of window ``k``. Every
        full window is produced, including the one that ends on the last row.
        When the input is shorter than ``time_steps`` both arrays are empty
        but keep their expected rank and dtype.

    Raises:
        ValueError: If ``time_steps`` or ``stride`` is smaller than 1.
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be >= 1, got {time_steps}")
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    X = np.asarray(X)
    y = np.asarray(y)

    # The last valid start is len(X) - time_steps, hence the +1 on the
    # exclusive upper bound; a negative bound simply yields no starts.
    starts = np.arange(0, len(X) - time_steps + 1, stride)
    if starts.size == 0:
        return (np.empty((0, time_steps) + X.shape[1:], dtype=X.dtype),
                np.empty((0,), dtype=y.dtype))

    # Row indices of every window, shape (n_windows, time_steps); indexing X
    # with it gathers all windows in a single vectorised step.
    window_rows = starts[:, None] + np.arange(time_steps)
    return X[window_rows], y[starts + time_steps - 1]

# Window length and step between windows; a stride of 5 reduces the data size
TIME_STEPS = 10
STRIDE = 5

# Build the windows from the original (non-enhanced) scaled features.
# NOTE(review): both inputs are concatenations of two files, so the windows
# that straddle the seam between the files mix rows from both - confirm this
# is acceptable.
X_train_seq, y_train_seq = create_sequences(
    X_train_scaled, y_train.values, time_steps=TIME_STEPS, stride=STRIDE)
X_test_seq, y_test_seq = create_sequences(
    X_test_scaled, y_test.values, time_steps=TIME_STEPS, stride=STRIDE)

for array_name, array in (
    ('X_train_seq', X_train_seq),
    ('y_train_seq', y_train_seq),
    ('X_test_seq', X_test_seq),
    ('y_test_seq', y_test_seq),
):
    print(f"{array_name} shape:", array.shape)

# Share of each window label, in percent
for split_label, window_labels in (('training', y_train_seq), ('testing', y_test_seq)):
    print(f"\nClass distribution in {split_label} sequence data:")
    print(pd.Series(window_labels).value_counts(normalize=True) * 100)

# Apply SMOTE to handle class imbalance in sequence data
print("\nApplying SMOTE to balance the training sequence data...")
# SMOTE works on 2-D input, so each window is flattened into one row
X_train_seq_reshaped = X_train_seq.reshape(X_train_seq.shape[0], -1)
seq_smote_ratio = 0.3  # target size of the minority class relative to the majority
smote_seq = SMOTE(random_state=RANDOM_SEED, sampling_strategy=seq_smote_ratio)

# SMOTE raises instead of resampling when there is only one class, when the
# minority class is too small for its nearest-neighbour search, or when the
# minority already reaches the requested ratio. Detect those cases and keep
# the unbalanced windows so the rest of the notebook still runs.
_, seq_class_counts = np.unique(y_train_seq, return_counts=True)
if len(seq_class_counts) != 2:
    seq_skip_reason = f"a float ratio needs exactly 2 classes, found {len(seq_class_counts)}"
elif seq_class_counts.min() <= smote_seq.k_neighbors:
    seq_skip_reason = (f"minority class has only {seq_class_counts.min()} windows, "
                       f"more than k_neighbors={smote_seq.k_neighbors} are required")
elif int(seq_class_counts.max() * seq_smote_ratio - seq_class_counts.min()) <= 0:
    seq_skip_reason = f"minority class already reaches the target ratio of {seq_smote_ratio}"
else:
    seq_skip_reason = None

if seq_skip_reason is None:
    X_train_seq_balanced_reshaped, y_train_seq_balanced = smote_seq.fit_resample(
        X_train_seq_reshaped, y_train_seq)
    # Restore the window layout from the sequence array's own shape
    X_train_seq_balanced = X_train_seq_balanced_reshaped.reshape((-1,) + X_train_seq.shape[1:])
else:
    print(f"Skipping SMOTE: {seq_skip_reason}. Using the training sequences as is.")
    X_train_seq_balanced, y_train_seq_balanced = X_train_seq, y_train_seq

print("Class distribution after SMOTE:")
print(pd.Series(y_train_seq_balanced).value_counts(normalize=True) * 100)
print(f"X_train_seq_balanced shape: {X_train_seq_balanced.shape}")

## 10. Save Preprocessed Data for Model Training

In [None]:
# Persist the preprocessed arrays so model-training notebooks can load them
import pickle

# Make sure the output folder is there before writing into it
output_dir = 'preprocessed_data'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# One pickle per data flavour: destination path -> payload dictionary.
# Each payload carries the fitted scaler that produced its arrays.
artifacts = {
    'preprocessed_data/tabular_data.pkl': {
        'X_train_enhanced_scaled': X_train_enhanced_scaled,
        'y_train_enhanced': y_train_enhanced,
        'X_test_enhanced_scaled': X_test_enhanced_scaled,
        'y_test_enhanced': y_test_enhanced,
        'X_train_enhanced_balanced': X_train_enhanced_balanced,
        'y_train_enhanced_balanced': y_train_enhanced_balanced,
        'feature_columns_enhanced': feature_columns_enhanced,
        'scaler_enhanced': scaler_enhanced
    },
    'preprocessed_data/sequence_data.pkl': {
        'X_train_seq': X_train_seq,
        'y_train_seq': y_train_seq,
        'X_test_seq': X_test_seq,
        'y_test_seq': y_test_seq,
        'X_train_seq_balanced': X_train_seq_balanced,
        'y_train_seq_balanced': y_train_seq_balanced,
        'TIME_STEPS': TIME_STEPS,
        'STRIDE': STRIDE,
        'scaler': scaler
    },
}

for target_path, payload in artifacts.items():
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)

print("Preprocessed data saved successfully.")