# HAI 21.03 Dataset Analysis

Analysis of version 21.03 of the HAI security dataset, using Polars for efficient data processing.

In [None]:
# Import libraries
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import time
from tqdm.notebook import tqdm
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, auc
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, load_model, save_model
from tensorflow.keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Input, Dropout

# Import custom preprocessing functions
import sys
sys.path.append('.')
from data_preprocessing import (
    lazy_load_csv, get_file_info, process_in_chunks, save_to_efficient_format,
    add_time_features, add_lag_features, add_rolling_features,
    plot_time_series, plot_correlation_matrix, plot_distribution
)

# Global plotting defaults: whitegrid theme, viridis palette, wide figures.
# The figure size is set after the style is applied, as in the original order.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Data Loading and Exploration

In [None]:
# Locations of the raw HAI 21.03 CSV files (train1..train3, test1..test5)
base_path = 'hai-security-dataset/hai-21.03/'
train_files = [f'{base_path}train{idx}.csv' for idx in range(1, 4)]
test_files = [f'{base_path}test{idx}.csv' for idx in range(1, 6)]

# Processed artefacts are written here; create the folder up front
processed_dir = 'processed_data/hai-21.03/'
os.makedirs(processed_dir, exist_ok=True)

In [None]:
# Print a short summary (size, column count, estimated rows) for every raw file
separator = "-" * 50
for csv_path in train_files + test_files:
    file_info = get_file_info(csv_path)
    print(f"File: {file_info['file_name']}")
    print(f"Size: {file_info['file_size_mb']:.2f} MB")
    print(f"Columns: {file_info['num_columns']}")
    print(f"Estimated rows: {file_info['estimated_rows']}")
    print(separator)

In [None]:
# Read only the first 10,000 rows of the first training file for exploration
first_train_file = train_files[0]
sample_df = pl.read_csv(first_train_file, n_rows=10000)

# Report the sample's shape and schema
sample_columns = sample_df.columns
print(f"Number of columns: {len(sample_columns)}")
print(f"Number of rows: {len(sample_df)}")
print(f"Column names: {sample_columns}")

In [None]:
# Split the columns by position: [timestamp | data points ... | 4 attack labels]
all_columns = sample_df.columns
time_column = all_columns[0]
data_columns = all_columns[1:-4]
label_columns = all_columns[-4:]

print(f"Time column: {time_column}")
print(f"Number of data columns: {len(data_columns)}")
print(f"Label columns: {label_columns}")

In [None]:
# Convert timestamp to datetime and display sample.
# `with_columns` replaces `DataFrame.with_column`, which was deprecated and
# later removed from Polars; `with_columns` accepts a single expression too.
# NOTE: re-running this cell after the column is already a Datetime will fail,
# because `.str` operations require a string column.
sample_df = sample_df.with_columns(
    pl.col(time_column).str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")
)
sample_df.head(5)

## 2. Data Visualization

In [None]:
# Identify columns that exist in HAI 21.03 but not in HAI 20.07.
# HAI 20.07 had 59 data points, HAI 21.03 has 78 data points.

# One row is enough here: only the header of the 20.07 file is used
hai_20_07_sample = pl.read_csv('hai-security-dataset/hai-20.07/train1.csv', n_rows=1)
hai_21_03_sample = sample_df

# Column-name sets for both dataset versions
hai_20_07_cols = set(hai_20_07_sample.columns)
hai_21_03_cols = set(hai_21_03_sample.columns)

# Set difference: names present only in the newer version
new_cols = hai_21_03_cols.difference(hai_20_07_cols)
print(f"New columns in HAI 21.03: {sorted(new_cols)}")

In [None]:
# Time-series overview of a few key process variables from the sample
key_vars = [
    'P1_PIT01',
    'P1_TIT01',
    'P1_LIT01',
    'P2_SIT01',
]
plot_time_series(sample_df, time_column, key_vars, title='Key Process Variables')

In [None]:
# Plot time series for new variables.
# `new_cols` is a set, so it is iterated in sorted order: with plain set
# iteration the four variables kept below could differ from run to run.
non_data_columns = set(label_columns) | {time_column}
new_vars = [col for col in sorted(new_cols) if col not in non_data_columns]
new_vars = new_vars[:4]  # Limit to 4 for better visualization

if new_vars:
    plot_time_series(sample_df, time_column, new_vars, title='New Variables in HAI 21.03')
else:
    # Nothing to draw; avoid calling the plotting helper with an empty list
    print("No new data variables found in HAI 21.03; skipping plot.")

In [None]:
# Correlation matrix over the key variables plus three flow-related columns
corr_vars = [*key_vars, 'P1_FT01Z', 'P1_FT02Z', 'P1_FT03Z']
plot_correlation_matrix(sample_df, columns=corr_vars)

In [None]:
# Plot distributions of key variables (uses the 10,000-row training sample
# and the `key_vars` list defined in an earlier cell)
plot_distribution(sample_df, key_vars)

In [None]:
# Load a sample of test data to visualize attacks.
# `with_columns` replaces `DataFrame.with_column`, which was deprecated and
# later removed from Polars.
test_sample_df = pl.read_csv(test_files[0], n_rows=10000).with_columns(
    pl.col(time_column).str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")
)

# Plot time series with attack regions highlighted
plot_time_series(test_sample_df, time_column, key_vars,
                 title='Process Variables During Attacks',
                 attack_column='attack')

## 3. Data Preprocessing and Feature Engineering

In [None]:
def preprocess_dataset(file_path, output_path=None, is_training=True):
    """
    Preprocess a dataset file with feature engineering.

    The file is loaded lazily, time features are added, the frame is
    collected, and lag and rolling-window features are then added for a fixed
    list of key process variables. The result is optionally written to
    ``output_path`` via ``save_to_efficient_format``.

    Args:
        file_path (str): Path to the CSV file
        output_path (str, optional): Path to save the processed file. Nothing
            is saved when this is None or empty.
        is_training (bool): Whether this is a training dataset. Currently
            unused; kept so existing callers keep working.

    Returns:
        The processed frame returned by ``add_rolling_features``
        (expected to be a pl.DataFrame).
    """
    print(f"Processing {file_path}...")

    # Lazy load the dataset and add time features while still lazy
    df_lazy = add_time_features(lazy_load_csv(file_path))

    # Collect the data (the lag features below work on a materialized frame)
    print("Collecting data...")
    df = df_lazy.collect()

    # Select key process variables for feature engineering.
    # Include both common variables and new variables specific to HAI 21.03.
    key_vars = ['P1_PIT01', 'P1_TIT01', 'P1_LIT01', 'P1_FT01Z', 'P1_FT02Z', 'P1_FT03Z', 'P2_SIT01',
                'P1_PP01AR', 'P1_PP01BR', 'P1_PP02R', 'P2_SCO', 'P2_SCST', 'P4_ST_GOV']

    # Add lag features for key variables
    print("Adding lag features...")
    df = add_lag_features(df, key_vars, lags=[1, 5, 10, 30])

    # Add rolling window features for key variables
    print("Adding rolling window features...")
    df = add_rolling_features(df, key_vars, windows=[5, 10, 30])

    # Save to efficient format if output_path is provided
    if output_path:
        save_to_efficient_format(df, output_path)

    return df

In [None]:
# Preprocess every training file, writing trainN.parquet for each one
train_dfs = [
    preprocess_dataset(csv_path, f"{processed_dir}train{file_no}.parquet", is_training=True)
    for file_no, csv_path in enumerate(train_files, start=1)
]

# Stack the per-file frames into a single training frame
train_df = pl.concat(train_dfs)
print(f"Combined training data shape: {train_df.shape}")

In [None]:
# Preprocess every test file, writing testN.parquet for each one
test_dfs = [
    preprocess_dataset(csv_path, f"{processed_dir}test{file_no}.parquet", is_training=False)
    for file_no, csv_path in enumerate(test_files, start=1)
]

# Stack the per-file frames into a single test frame
test_df = pl.concat(test_dfs)
print(f"Combined test data shape: {test_df.shape}")

## 4. Feature Selection and Scaling

In [None]:
def prepare_model_data(train_df, test_df, output_dir=None):
    """
    Prepare data for modeling by selecting features and scaling.

    Numeric feature columns are chosen from ``train_df`` (excluding the time
    column, the time-derived columns and the attack labels), the same columns
    are taken from ``test_df``, and both are standardized with a
    ``StandardScaler`` fitted on the training data only. The fitted scaler and
    the feature names are pickled to ``scaler.pkl`` and ``feature_names.pkl``.

    Args:
        train_df (pl.DataFrame): Training DataFrame
        test_df (pl.DataFrame): Test DataFrame; must contain the selected
            feature columns and an 'attack' column
        output_dir (str, optional): Directory prefix for the pickled scaler
            and feature names. Defaults to the notebook-level
            ``processed_dir``. It is concatenated directly with the file
            name, so it should end with a path separator.

    Returns:
        tuple: (X_train_scaled, X_test_scaled, y_test, scaler, numeric_cols)
    """
    if output_dir is None:
        output_dir = processed_dir

    # Columns that are never model features: the time column (first column),
    # the time-derived columns and the attack labels
    time_column = train_df.columns[0]
    excluded = {
        time_column, 'hour', 'day_of_week', 'day', 'month', 'year', 'is_weekend',
        'time_of_day', 'attack', 'attack_P1', 'attack_P2', 'attack_P3',
    }

    # Keep only numeric columns, preserving the column order of train_df
    numeric_dtypes = [pl.Float32, pl.Float64, pl.Int32, pl.Int64]
    numeric_cols = [
        col for col in train_df.columns
        if col not in excluded and train_df[col].dtype in numeric_dtypes
    ]

    # Convert to numpy arrays
    X_train = train_df.select(numeric_cols).to_numpy()
    X_test = test_df.select(numeric_cols).to_numpy()

    # Get attack labels for test data
    y_test = test_df.select('attack').to_numpy().flatten()

    # Scale the data: fit on training data only, then apply to both sets
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Save the scaler
    with open(f"{output_dir}scaler.pkl", 'wb') as f:
        pickle.dump(scaler, f)

    # Save feature names
    with open(f"{output_dir}feature_names.pkl", 'wb') as f:
        pickle.dump(numeric_cols, f)

    return X_train_scaled, X_test_scaled, y_test, scaler, numeric_cols

In [None]:
# Build the scaled feature matrices and the test labels
X_train, X_test, y_test, scaler, feature_names = prepare_model_data(train_df, test_df)

# Report the resulting shapes
for array_name, array in (("X_train", X_train), ("X_test", X_test), ("y_test", y_test)):
    print(f"{array_name} shape: {array.shape}")
print(f"Number of features: {len(feature_names)}")

## 5. Anomaly Detection Models

### 5.1 Isolation Forest

In [None]:
# Fit an Isolation Forest on the scaled training features
print("Training Isolation Forest model...")
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
    n_jobs=-1,
)
iso_forest.fit(X_train)

# Persist the fitted model next to the other processed artefacts
with open(f"{processed_dir}isolation_forest_model.pkl", 'wb') as model_file:
    pickle.dump(iso_forest, model_file)

# predict() yields 1 for inliers and -1 for outliers; remap to
# 0 for normal, 1 for anomaly
y_pred_if = iso_forest.predict(X_test)
y_pred_if_binary = np.where(y_pred_if != 1, 1, 0)

# score_samples() is lower for more anomalous points, so negate it to get a
# score that increases with how anomalous a sample is
anomaly_scores_if = -iso_forest.score_samples(X_test)

In [None]:
# Report confusion matrix and per-class metrics for the Isolation Forest
print("Isolation Forest Results:")

print("\nConfusion Matrix:")
cm_if = confusion_matrix(y_test, y_pred_if_binary)
print(cm_if)

print("\nClassification Report:")
report_if = classification_report(y_test, y_pred_if_binary)
print(report_if)

In [None]:
# ROC curve for the Isolation Forest, using the negated sample scores
from sklearn.metrics import roc_curve, auc

fpr_if, tpr_if, thresholds_if = roc_curve(y_test, anomaly_scores_if)
roc_auc_if = auc(fpr_if, tpr_if)

roc_label_if = f'ROC curve (area = {roc_auc_if:.2f})'
plt.figure(figsize=(10, 6))
plt.plot(fpr_if, tpr_if, color='darkorange', lw=2, label=roc_label_if)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic - Isolation Forest')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Precision-Recall curve for the Isolation Forest; the area is integrated
# over recall (x) against precision (y)
precision_if, recall_if, _ = precision_recall_curve(y_test, anomaly_scores_if)
pr_auc_if = auc(recall_if, precision_if)

pr_label_if = f'PR curve (area = {pr_auc_if:.2f})'
plt.figure(figsize=(10, 6))
plt.plot(recall_if, precision_if, color='darkorange', lw=2, label=pr_label_if)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - Isolation Forest')
plt.legend(loc="lower left")
plt.grid(True)
plt.show()

### 5.2 LSTM Autoencoder

In [None]:
# Prepare data for autoencoder
def create_sequences(data, seq_length):
    """
    Cut `data` into overlapping windows of `seq_length` consecutive rows.

    Windows start at every index from 0 up to, but not including,
    ``len(data) - seq_length``, so ``len(data) - seq_length`` windows are
    produced and the final row of `data` is never part of a window.

    Args:
        data: Sliceable sequence of rows (e.g. a 2-D numpy array)
        seq_length (int): Number of rows per window

    Returns:
        np.ndarray: The stacked windows; an empty array when there are no
        windows to build.
    """
    window_count = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(window_count)]
    return np.array(windows)

# Window length (in rows) fed to the autoencoder
seq_length = 30

# Windowed versions of the scaled train and test matrices
X_train_seq, X_test_seq = (
    create_sequences(matrix, seq_length) for matrix in (X_train, X_test)
)

# Drop the first `seq_length` labels so that the label count matches the
# number of test windows
y_test_seq = y_test[seq_length:]

for seq_name, seq_array in (
    ("X_train_seq", X_train_seq),
    ("X_test_seq", X_test_seq),
    ("y_test_seq", y_test_seq),
):
    print(f"{seq_name} shape: {seq_array.shape}")

In [None]:
# Build LSTM Autoencoder model
def build_lstm_autoencoder(input_shape):
    """
    Build and compile a sequence-to-sequence LSTM autoencoder.

    Encoder: LSTM(128, returning sequences) -> Dropout(0.2) -> LSTM(64).
    The encoded vector is repeated once per time step, then decoded by
    LSTM(64) -> Dropout(0.2) -> LSTM(128) and a per-time-step Dense layer
    that maps back to the input feature count.

    Args:
        input_shape (tuple): (time_steps, n_features) of one input sequence

    Returns:
        The model, compiled with the Adam optimizer and MSE loss.
    """
    time_steps, n_features = input_shape[0], input_shape[1]

    model = Sequential()
    # Encoder
    model.add(LSTM(128, activation='relu', input_shape=input_shape, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(64, activation='relu', return_sequences=False))
    # Bridge: repeat the encoding for every output time step
    model.add(RepeatVector(time_steps))
    # Decoder
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(n_features)))

    model.compile(optimizer='adam', loss='mse')
    return model

# Instantiate the autoencoder for (time_steps, n_features) windows and show
# its layer summary
n_timesteps, n_features = X_train_seq.shape[1], X_train_seq.shape[2]
input_shape = (n_timesteps, n_features)
lstm_autoencoder = build_lstm_autoencoder(input_shape)
lstm_autoencoder.summary()

In [None]:
# Train the autoencoder to reconstruct its own input. Training stops early
# when the validation loss has not improved for 3 epochs, and the best
# weights are restored.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min',
    restore_best_weights=True,
)

history = lstm_autoencoder.fit(
    X_train_seq, X_train_seq,
    epochs=15,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Persist the trained autoencoder
lstm_autoencoder.save(f"{processed_dir}lstm_autoencoder_model")

# Training vs. validation loss per epoch
plt.figure(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Reconstruct the test windows
X_test_pred = lstm_autoencoder.predict(X_test_seq)

# One reconstruction MSE per window: average the squared error over the
# time-step and feature axes
squared_error = (X_test_seq - X_test_pred) ** 2
mse = np.mean(squared_error, axis=(1, 2))

# Histogram of the per-window reconstruction errors
plt.figure(figsize=(10, 6))
plt.hist(mse, bins=50)
plt.title('MSE Distribution')
plt.xlabel('MSE')
plt.ylabel('Count')
plt.grid(True)
plt.show()

In [None]:
# Anomaly threshold: 95th percentile of the reconstruction errors.
# NOTE(review): `mse` is computed on the test windows, so this threshold is
# derived from test data - confirm that this is intended.
threshold = np.percentile(mse, 95)
print(f"Threshold: {threshold}")

# Windows whose error exceeds the threshold are flagged as anomalies (1)
y_pred_ae = np.where(mse > threshold, 1, 0)

# Report confusion matrix and per-class metrics for the autoencoder
print("Autoencoder Results:")

print("\nConfusion Matrix:")
cm_ae = confusion_matrix(y_test_seq, y_pred_ae)
print(cm_ae)

print("\nClassification Report:")
report_ae = classification_report(y_test_seq, y_pred_ae)
print(report_ae)

In [None]:
# ROC curve for the autoencoder, using reconstruction MSE as the score.
# `roc_curve` and `auc` are the names imported in the Isolation Forest ROC cell.
fpr_ae, tpr_ae, thresholds_ae = roc_curve(y_test_seq, mse)
roc_auc_ae = auc(fpr_ae, tpr_ae)

roc_label_ae = f'ROC curve (area = {roc_auc_ae:.2f})'
plt.figure(figsize=(10, 6))
plt.plot(fpr_ae, tpr_ae, color='darkorange', lw=2, label=roc_label_ae)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic - LSTM Autoencoder')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Precision-Recall curve for the autoencoder; the area is integrated over
# recall (x) against precision (y)
precision_ae, recall_ae, _ = precision_recall_curve(y_test_seq, mse)
pr_auc_ae = auc(recall_ae, precision_ae)

pr_label_ae = f'PR curve (area = {pr_auc_ae:.2f})'
plt.figure(figsize=(10, 6))
plt.plot(recall_ae, precision_ae, color='darkorange', lw=2, label=pr_label_ae)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - LSTM Autoencoder')
plt.legend(loc="lower left")
plt.grid(True)
plt.show()

## 6. Visualize Anomaly Detection Results

In [None]:
# Create a DataFrame with timestamps, actual labels, and predictions.
# The time column is read by name when a 'timestamp' column exists; otherwise
# it falls back to the first column, which is how the rest of the notebook
# identifies the time column.
ts_source_column = 'timestamp' if 'timestamp' in test_df.columns else test_df.columns[0]

# The first `seq_length` rows are dropped everywhere so that all columns have
# the same length as the autoencoder outputs.
results_df = pl.DataFrame({
    'timestamp': test_df.slice(seq_length).select(ts_source_column).to_series(),
    'actual': y_test_seq,
    'pred_isolation_forest': y_pred_if_binary[seq_length:],
    'pred_autoencoder': y_pred_ae,
    'score_isolation_forest': anomaly_scores_if[seq_length:],
    'score_autoencoder': mse
})

# Rows labelled as attacks, computed once and reused by both subplots
attack_rows = results_df.filter(pl.col('actual') > 0)
all_times = results_df['timestamp'].to_numpy()
attack_times = attack_rows['timestamp'].to_numpy()

# Plot time series of anomaly scores
plt.figure(figsize=(15, 10))

# Plot Isolation Forest scores
plt.subplot(2, 1, 1)
plt.plot(all_times, results_df['score_isolation_forest'].to_numpy(), label='Anomaly Score')
plt.scatter(attack_times,
            attack_rows['score_isolation_forest'].to_numpy(),
            color='red', label='Actual Anomaly')
plt.title('Isolation Forest Anomaly Scores')
plt.xlabel('Time')
plt.ylabel('Anomaly Score')
plt.legend()
plt.grid(True)

# Plot Autoencoder scores, with the detection threshold as a horizontal line
plt.subplot(2, 1, 2)
plt.plot(all_times, results_df['score_autoencoder'].to_numpy(), label='Anomaly Score')
plt.scatter(attack_times,
            attack_rows['score_autoencoder'].to_numpy(),
            color='red', label='Actual Anomaly')
plt.axhline(y=threshold, color='green', linestyle='--', label=f'Threshold ({threshold:.4f})')
plt.title('Autoencoder Reconstruction Error (MSE)')
plt.xlabel('Time')
plt.ylabel('MSE')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Compare model performance
plt.figure(figsize=(10, 8))

# Plot ROC curves.
# fpr/tpr arrays hold one entry per ROC threshold, not per test sample, so
# they must be plotted in full: slicing them by `seq_length` would cut off
# the beginning of the Isolation Forest curve.
plt.plot(fpr_if, tpr_if, label=f'Isolation Forest (AUC = {roc_auc_if:.2f})')
plt.plot(fpr_ae, tpr_ae, label=f'LSTM Autoencoder (AUC = {roc_auc_ae:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.grid(True)
plt.show()

## 7. Feature Importance Analysis

In [None]:
# Analyze feature importance for Isolation Forest
def get_feature_importance(model, feature_names):
    """
    Average the per-tree feature importances of a fitted ensemble.

    Args:
        model: Trained Isolation Forest model; each entry of
            ``model.estimators_`` must expose ``feature_importances_``
        feature_names: List of feature names, one per importance value

    Returns:
        DataFrame with columns 'feature' and 'importance', sorted from the
        most to the least important feature

    NOTE(review): how meaningful tree-level importances are for an Isolation
    Forest should be confirmed before drawing conclusions from them.
    """
    # One row per tree, one column per feature
    per_tree_importances = np.array(
        [estimator.feature_importances_ for estimator in model.estimators_]
    )
    importances = per_tree_importances.mean(axis=0)

    ranking = pl.DataFrame({
        'feature': feature_names,
        'importance': importances
    })
    return ranking.sort('importance', descending=True)

# Rank the features by their averaged importance
importance_df = get_feature_importance(iso_forest, feature_names)

# Horizontal bar chart of the 20 highest-ranked features
top_features = importance_df.head(20)
top_names = top_features['feature'].to_numpy()
top_values = top_features['importance'].to_numpy()

plt.figure(figsize=(12, 8))
plt.barh(top_names, top_values)
plt.xlabel('Importance')
plt.title('Top 20 Important Features')
plt.gca().invert_yaxis()  # First (most important) row is drawn at the top
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

## 8. Save Results

In [None]:
# Write the per-row results and the feature ranking as CSV files
results_df.write_csv(f"{processed_dir}anomaly_detection_results.csv")
importance_df.write_csv(f"{processed_dir}feature_importance.csv")

# Collect the headline metrics of both models and pickle them
performance_metrics = dict(
    isolation_forest_roc_auc=roc_auc_if,
    isolation_forest_pr_auc=pr_auc_if,
    autoencoder_roc_auc=roc_auc_ae,
    autoencoder_pr_auc=pr_auc_ae,
    autoencoder_threshold=threshold,
)

with open(f"{processed_dir}model_performance.pkl", 'wb') as metrics_file:
    pickle.dump(performance_metrics, metrics_file)

print("Results saved successfully!")