# HAI-21.03 Dataset Analysis and ResBiLSTM Model

This notebook analyzes the HAI-21.03 dataset, applies feature engineering, and implements a Residual Bidirectional LSTM model for anomaly detection.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Import custom utility modules
from hai_utils import *
from feature_engineering import *

# Seed the TensorFlow and NumPy random number generators so runs are repeatable
tf.random.set_seed(42)
np.random.seed(42)

# Plot defaults: apply the seaborn grid style first, then set figure and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

## 1. Loading Data

First, we load the HAI-21.03 dataset from Parquet files.

In [None]:
# Where the raw Parquet files live and which dataset to read
parquet_dir = 'parquet_data'
dataset_name = 'hai-21.03'

# Read all splits of the dataset into a name -> DataFrame mapping
data_dict = load_dataset(parquet_dir, dataset_name)

# Print a short summary (shape, memory footprint, first columns) for every split
for split_name, split_df in data_dict.items():
    size_mb = split_df.memory_usage().sum() / 1024**2
    leading_columns = split_df.columns.tolist()[:5]
    print(f"Dataset: {split_name}")
    print(f"Shape: {split_df.shape}")
    print(f"Memory usage: {size_mb:.2f} MB")
    print(f"Columns: {leading_columns}...")
    print("\n")

## 2. Data Exploration

Next, we explore the basic characteristics of the training split (`train1`): its time range, summary statistics, missing values, time series plots of a few features, and the feature correlation matrix.

In [None]:
# Find the time column by name (case-insensitive); time_col stays None if there is none
train1_df = data_dict['train1']
time_col = next(
    (name for name in train1_df.columns if name.lower() in ('time', 'timestamp')),
    None,
)

print(f"Time column: {time_col}")
if time_col is not None:
    # Compute both ends of the time axis once and reuse them for range and duration
    first_stamp = train1_df[time_col].min()
    last_stamp = train1_df[time_col].max()
    print(f"Time range: {first_stamp} to {last_stamp}")
    print(f"Total duration: {last_stamp - first_stamp}")

In [None]:
# Summary statistics of train1, transposed so each row is one column of the data;
# only the first 10 rows are shown to keep the output short
train1_df.describe().transpose().head(10)

In [None]:
# Count missing entries per column and report only the columns that have any
missing_values = train1_df.isna().sum()
columns_with_gaps = missing_values[missing_values > 0]
if columns_with_gaps.empty:
    print("No missing values found.")
else:
    print("Columns with missing values:")
    print(columns_with_gaps)

In [None]:
# Plot the first 10 columns that are neither the time column nor the 'attack' label.
# This is a positional pick: no importance ranking has been applied at this point.
non_feature_cols = (time_col, 'attack')
candidate_cols = [col for col in train1_df.columns if col not in non_feature_cols]
feature_cols = candidate_cols[:10]
plot_time_series(train1_df, feature_cols, time_col=time_col)

In [None]:
# Visualize the feature correlation matrix of train1 via the project helper.
# NOTE(review): n_features=20 presumably caps how many features appear in the
# matrix; confirm how the helper chooses them.
plot_correlation_matrix(train1_df, n_features=20)

## 3. Feature Engineering

Now we apply feature engineering to the dataset using our custom module to create new features.

In [None]:
# Directory that receives the engineered dataset; created if it does not exist yet
output_dir = 'engineered_data'
os.makedirs(output_dir, exist_ok=True)

# Options for the feature engineering run, gathered in one place for readability
engineering_options = dict(
    time_col=time_col,
    target_col='attack',
    feature_selection_method='importance',
    n_features=50,
    add_time=True,
    add_stats=True,
    add_lag=True,
    add_diff=True,
    add_interaction=True,
    add_pca=True,
)

# Run feature engineering over the raw dataset directory; the return value is
# kept in `results` and iterated per split by the feature-importance cell below
print("Applying feature engineering to the dataset...")
results = process_dataset_directory(
    parquet_dir,
    output_dir,
    dataset_name,
    **engineering_options,
)

In [None]:
# Read the engineered splits back from the output directory
engineered_dataset_name = f"{dataset_name}_engineered"
engineered_data_dict = load_dataset(output_dir, engineered_dataset_name)

# Print a short summary (shape, memory footprint, first columns) for every engineered split
for split_name, split_df in engineered_data_dict.items():
    size_mb = split_df.memory_usage().sum() / 1024**2
    leading_columns = split_df.columns.tolist()[:5]
    print(f"Dataset: {split_name}")
    print(f"Shape: {split_df.shape}")
    print(f"Memory usage: {size_mb:.2f} MB")
    print(f"Columns: {leading_columns}...")
    print("\n")

In [None]:
# Draw one feature-importance chart per entry of the feature engineering results;
# entries whose importance table is empty are skipped
for split_name, split_importance in results.items():
    if split_importance.empty:
        continue
    plt.figure(figsize=(12, 10))
    plot_feature_importance(split_importance, n_features=20)
    plt.title(f"Feature Importance - {split_name}")
    plt.show()

## 4. Model Preparation

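Now we prepare the data for model training: we scale the features, slice them into fixed-length sequences, and hold out 20% of the training sequences for validation.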

In [None]:
# Pick the engineered splits used for training and testing
train_df = engineered_data_dict['train1']  # Use train1 as training data
test_df = engineered_data_dict['test1']    # Use test1 as test data

# The training split may come without labels; in that case label every row as normal (0)
if 'attack' not in train_df.columns:
    print("Warning: No 'attack' column in training data, which is expected as training data has no attack labels.")
    # assign() returns a new DataFrame, so the frame stored in engineered_data_dict
    # is left untouched and re-running this cell (or any earlier cell that reads
    # the dict) gives the same result every time.
    # NOTE(review): in this branch y_train is all zeros, so a supervised classifier
    # sees no positive examples during training. Confirm this is intended.
    train_df = train_df.assign(attack=0)

# Scale the features and separate inputs from labels via the project helper
X_train, X_test, y_train, y_test, feature_names, scaler = preprocess_data(
    train_df, test_df,
    target_col='attack',
    time_col=time_col,
    feature_selection=False,  # We already performed feature selection during feature engineering
    scaler_type='standard'
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Number of features: {len(feature_names)}")

In [None]:
# Windowing parameters for the sequence model
time_steps = 100  # Number of time steps in each sequence
step = 1          # Step size between sequences

# Build (sequence, label) pairs for both splits with identical window settings
window_args = dict(time_steps=time_steps, step=step)
X_train_seq, y_train_seq = create_sequences(X_train, y_train, **window_args)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, **window_args)

# Report the resulting array shapes
for label, array in (
    ("X_train_seq", X_train_seq),
    ("X_test_seq", X_test_seq),
    ("y_train_seq", y_train_seq),
    ("y_test_seq", y_test_seq),
):
    print(f"{label} shape: {array.shape}")

In [None]:
# Hold out the LAST 20% of the training sequences for validation (chronological split).
# The sequences are built with step=1 and time_steps=100, so neighbouring sequences
# presumably share almost all of their rows. A shuffled split would then place
# near-identical sequences on both sides, leaking training data into the validation
# set and making validation loss / early stopping overly optimistic.
# shuffle=False keeps the original order, so no random_state is needed.
# NOTE(review): sequences right at the boundary can still overlap by up to
# time_steps - 1 rows; drop a gap of that size if a strict separation is required.
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_seq, y_train_seq, test_size=0.2, shuffle=False
)

print(f"X_train_final shape: {X_train_final.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"Attack ratio in training set: {np.mean(y_train_final):.4f}")
print(f"Attack ratio in validation set: {np.mean(y_val):.4f}")

## 5. ResBiLSTM Model

Now we create and train the Residual Bidirectional LSTM model.

In [None]:
# Model input is one sequence: (time steps, features per step)
num_input_features = X_train_final.shape[2]
input_shape = (time_steps, num_input_features)

# Build the residual bidirectional LSTM with the chosen layer sizes and dropout
model = create_residual_bilstm_model(
    input_shape=input_shape,
    lstm_units=64,
    dense_units=32,
    dropout_rate=0.3,
)

# Print the layer-by-layer summary
model.summary()

In [None]:
# Training hyperparameters and the path passed to train_model as model_path
training_config = dict(
    batch_size=32,
    epochs=50,
    patience=10,
    model_path='best_resbilstm_model_hai_21_03.h5',
)

# Fit on the training sequences, validating on the held-out sequences;
# train_model returns the training history together with the model
history, model = train_model(
    model=model,
    X_train=X_train_final,
    y_train=y_train_final,
    X_val=X_val,
    y_val=y_val,
    **training_config,
)

In [None]:
# Plot the training history returned by train_model via the project helper.
# NOTE(review): presumably per-epoch loss/metric curves; confirm in the helper.
plot_training_history(history)

## 6. Model Evaluation

Now we evaluate the model on the test set.

In [None]:
# Evaluate the trained model on the test sequences.
# Note: this rebinds `results`, which until now held the feature engineering
# output; from here on `results` is the evaluation dictionary.
results = evaluate_model(model, X_test_seq, y_test_seq)

# Print the headline metrics with four decimals
for label, key in (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1 Score", 'f1'),
    ("AUC", 'auc'),
):
    print(f"{label}: {results[key]:.4f}")

In [None]:
# Pass the evaluation dictionary returned by evaluate_model to the project
# plotting helper.
plot_evaluation_results(results)

In [None]:
# Plot the top 15 features for the trained model.
# NOTE(review): the feature-importance cell in section 3 calls this same name as
# plot_feature_importance(importance_df, n_features=20), i.e. with a different
# signature. Both wildcard imports (hai_utils, feature_engineering) may export a
# function with this name; if so, the later import wins and one of the two calls
# will raise a TypeError. Confirm which module this call needs and reference it
# explicitly (e.g. module.plot_feature_importance).
plot_feature_importance(model, feature_names, n_top=15)

## 7. Visualizing Predictions

Finally, we visualize the model's predictions on the test set.

In [None]:
# Hard predictions and predicted probabilities from the evaluation results
y_pred = results['y_pred']
y_pred_proba = results['y_pred_proba']

# Align actual labels, predicted labels and predicted probabilities in one table
pred_df = pd.DataFrame({
    'Actual': y_test_seq,
    'Predicted': y_pred.flatten(),
    'Probability': y_pred_proba.flatten(),
})

# Overlay the actual labels and the predicted probability, with the 0.5 threshold as reference
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(pred_df.index, pred_df['Actual'], label='Actual', marker='o', markersize=3, linestyle='-', alpha=0.7)
ax.plot(pred_df.index, pred_df['Probability'], label='Predicted Probability', marker=None, linestyle='-', alpha=0.7)
ax.axhline(y=0.5, color='r', linestyle='--', alpha=0.3, label='Threshold (0.5)')
ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Value')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Shorthand for the two label columns and the sample count
actual = pred_df['Actual']
predicted = pred_df['Predicted']
total = len(pred_df)

# All misclassified samples
errors = pred_df[actual != predicted]
print(f"Number of errors: {len(errors)} out of {total} samples ({len(errors)/total*100:.2f}%)")

# Split the errors by type: predicted 1 on actual 0, and predicted 0 on actual 1
false_positives = pred_df[(actual == 0) & (predicted == 1)]
false_negatives = pred_df[(actual == 1) & (predicted == 0)]

print(f"False positives: {len(false_positives)} ({len(false_positives)/total*100:.2f}%)")
print(f"False negatives: {len(false_negatives)} ({len(false_negatives)/total*100:.2f}%)")

# Histogram of predicted probabilities for each error type, with the decision threshold marked
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(false_positives['Probability'], color='red', label='False Positives', alpha=0.5, bins=20, ax=ax)
sns.histplot(false_negatives['Probability'], color='blue', label='False Negatives', alpha=0.5, bins=20, ax=ax)
ax.axvline(x=0.5, color='black', linestyle='--', alpha=0.7, label='Threshold (0.5)')
ax.set_title('Probability Distribution for Errors')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Count')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Annotated heatmap of the confusion matrix from the evaluation results
cm = results['confusion_matrix']
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# ROC curve from the evaluation results, with the diagonal as a reference line
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(results['fpr'], results['tpr'], label=f'ROC Curve (AUC = {results["auc"]:.3f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)
plt.show()