# HAI Dataset Analysis and ResBiLSTM Model Template

This notebook provides a template for analyzing any HAI dataset and implementing a Residual Bidirectional LSTM model for anomaly detection.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Import custom utility module
from hai_utils import *

# Seed NumPy and TensorFlow with the same fixed value so reruns are repeatable
np.random.seed(42)
tf.random.set_seed(42)

# Notebook-wide plotting defaults: seaborn grid theme, figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

## 1. Configuration

Set the dataset parameters below.

In [None]:
# --- Dataset selection -------------------------------------------------------
DATASET_NAME = "hai-20.07"    # Dataset to analyze; change this to switch datasets
PARQUET_DIR = "parquet_data"  # Directory passed to load_dataset()
TRAIN_FILE = "train2"         # Key of the training split (must carry attack labels)
TEST_FILE = "test2"           # Key of the test split (must carry attack labels)

# --- Sequence construction ---------------------------------------------------
TIME_STEPS = 100              # Time steps per sequence
STEP_SIZE = 1                 # Step between consecutive sequences

# --- Feature selection -------------------------------------------------------
N_FEATURES = 30               # Number of features to select
FEATURE_SELECTION = True      # Whether to perform feature selection

## 2. Loading Data

Load the dataset from Parquet files.

In [None]:
# Read every table of the selected dataset; the result is iterated as a
# name -> DataFrame mapping below.
data_dict = load_dataset(PARQUET_DIR, DATASET_NAME)

# One summary per table: shape, memory footprint in MB, first five columns
for name, df in data_dict.items():
    summary_lines = [
        f"Dataset: {name}",
        f"Shape: {df.shape}",
        f"Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB",
        f"Columns: {df.columns.tolist()[:5]}...",
        "\n",  # trailing blank lines separate consecutive tables
    ]
    print("\n".join(summary_lines))

In [None]:
# Pick the configured train and test tables out of the loaded dataset
train_df = data_dict[TRAIN_FILE]
test_df = data_dict[TEST_FILE]

# Report, for each split, whether the 'attack' label column is present.
# This only prints a warning; it does not stop execution when the column is missing.
for split_label, split_df in (("training", train_df), ("test", test_df)):
    if 'attack' in split_df.columns:
        print(f"Attack column found in {split_label} data with {split_df['attack'].sum()} attack samples")
    else:
        print(f"Warning: 'attack' column not found in {split_label} data!")
        print(f"Available columns: {split_df.columns.tolist()}")

## 3. Data Exploration

Explore the basic characteristics of the dataset.

In [None]:
# Plot the attack label distribution, but only when the label column exists
# (membership on a DataFrame tests its column labels).
if 'attack' in train_df:
    plot_attack_distribution(train_df)

In [None]:
# Use the first column whose lower-cased name is 'time' or 'timestamp';
# time_col stays None when no such column exists.
time_col = next(
    (col for col in train_df.columns if col.lower() in ['time', 'timestamp']),
    None,
)

print(f"Time column: {time_col}")
if time_col:
    # Compute the bounds once and reuse them for both messages
    t_first = train_df[time_col].min()
    t_last = train_df[time_col].max()
    print(f"Time range: {t_first} to {t_last}")
    print(f"Total duration: {t_last - t_first}")

In [None]:
# Summary statistics, transposed so each described column is a row; show the first 10
train_df.describe().transpose().head(10)

In [None]:
# Count nulls per column, then keep only the columns that have any
missing_values = train_df.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]

if columns_with_gaps.empty:
    print("No missing values found.")
else:
    print("Columns with missing values:")
    print(columns_with_gaps)

In [None]:
# Plot the first 10 columns that are neither the time column nor the label
non_feature_cols = {time_col, 'attack'}
feature_cols = [col for col in train_df.columns if col not in non_feature_cols][:10]

plot_time_series(train_df, feature_cols, time_col=time_col)

In [None]:
# Correlation matrix of the training data, with the helper's n_features set to 20
plot_correlation_matrix(
    train_df,
    n_features=20,
)

In [None]:
# Compare the per-feature value distributions of normal vs attack rows.
# Also guarded on feature_cols, because plt.subplots() rejects zero rows.
if 'attack' in train_df.columns and feature_cols:
    # Select a few features to compare
    features_to_compare = feature_cols[:5]

    # squeeze=False always returns a 2-D array of Axes. Without it a single
    # feature yields a bare Axes object and indexing it fails.
    fig, axes = plt.subplots(
        len(features_to_compare), 1,
        figsize=(15, 4 * len(features_to_compare)),
        squeeze=False,
    )

    # Build the row masks once instead of re-filtering the frame per feature
    is_normal = train_df['attack'] == 0
    is_attack = train_df['attack'] == 1

    for ax, feature in zip(axes.ravel(), features_to_compare):
        sns.kdeplot(train_df.loc[is_normal, feature], label='Normal', ax=ax)
        sns.kdeplot(train_df.loc[is_attack, feature], label='Attack', ax=ax)

        ax.set_title(f'Distribution of {feature}')
        ax.legend()

    plt.tight_layout()
    plt.show()

## 4. Feature Engineering

Prepare the data for model training.

In [None]:
# Options forwarded to preprocess_data(); collected here so they can be read at a glance
preprocess_options = {
    'target_col': 'attack',
    'time_col': time_col,
    'feature_selection': FEATURE_SELECTION,
    'n_features': N_FEATURES,
    'scaler_type': 'standard',
}

# The helper returns the feature/label arrays for both splits, the names of
# the features it kept, and the scaler object (saved at the end of the notebook).
X_train, X_test, y_train, y_test, feature_names, scaler = preprocess_data(
    train_df, test_df, **preprocess_options
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Selected features: {feature_names[:10]}...")

In [None]:
# Build the sequence arrays for both splits with identical window settings
X_train_seq, y_train_seq = create_sequences(
    X_train, y_train, time_steps=TIME_STEPS, step=STEP_SIZE
)
X_test_seq, y_test_seq = create_sequences(
    X_test, y_test, time_steps=TIME_STEPS, step=STEP_SIZE
)

# Report the resulting shapes
for array_name, array in (
    ("X_train_seq", X_train_seq),
    ("X_test_seq", X_test_seq),
    ("y_train_seq", y_train_seq),
    ("y_test_seq", y_test_seq),
):
    print(f"{array_name} shape: {array.shape}")

In [None]:
# Hold out 20% of the training sequences for validation, stratified on the
# label so both parts keep the same attack ratio.
# NOTE(review): this split shuffles. If the sequences overlap in time (likely
# when STEP_SIZE < TIME_STEPS), near-identical windows can land in both sets —
# confirm this is acceptable for the validation metrics.
split_parts = train_test_split(
    X_train_seq,
    y_train_seq,
    test_size=0.2,
    random_state=42,
    stratify=y_train_seq,
)
X_train_final, X_val, y_train_final, y_val = split_parts

print(f"X_train_final shape: {X_train_final.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"Attack ratio in training set: {np.mean(y_train_final):.4f}")
print(f"Attack ratio in validation set: {np.mean(y_val):.4f}")

## 5. ResBiLSTM Model

Create and train the Residual Bidirectional LSTM model.

In [None]:
# --- Architecture (passed to create_residual_bilstm_model) -------------------
LSTM_UNITS = 64
DENSE_UNITS = 32
DROPOUT_RATE = 0.3

# --- Training (passed to train_model) ----------------------------------------
BATCH_SIZE = 32
EPOCHS = 50
PATIENCE = 10

# Model file name is derived from the dataset name, with '-' replaced by '_'
MODEL_PATH = 'best_resbilstm_' + DATASET_NAME.replace("-", "_") + '.h5'

In [None]:
# Input shape is (time steps, features); the feature count is read from the
# last axis of the training sequences.
n_sequence_features = X_train_final.shape[2]
input_shape = (TIME_STEPS, n_sequence_features)

model = create_residual_bilstm_model(
    input_shape=input_shape,
    lstm_units=LSTM_UNITS,
    dense_units=DENSE_UNITS,
    dropout_rate=DROPOUT_RATE,
)

# Print the layer-by-layer summary
model.summary()

In [None]:
# Training settings taken from the model configuration cell
fit_settings = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    patience=PATIENCE,
    model_path=MODEL_PATH,
)

# train_model returns the training history and the model; `model` is rebound
# to the returned object.
history, model = train_model(
    model=model,
    X_train=X_train_final,
    y_train=y_train_final,
    X_val=X_val,
    y_val=y_val,
    **fit_settings,
)

In [None]:
# Visualize the history object returned by train_model
plot_training_history(
    history,
)

## 6. Model Evaluation

Evaluate the model on the test set.

In [None]:
# Score the trained model on the test sequences; `results` is indexed by
# metric name below and reused by later cells.
results = evaluate_model(model, X_test_seq, y_test_seq)

# Print each headline metric to four decimal places
metric_labels = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1 Score", 'f1'),
    ("AUC", 'auc'),
)
for display_name, metric_key in metric_labels:
    print(f"{display_name}: {results[metric_key]:.4f}")

In [None]:
# Visualize the evaluation results produced by evaluate_model
plot_evaluation_results(
    results,
)

In [None]:
# Feature importance plot, with the helper's n_top set to 15
plot_feature_importance(
    model,
    feature_names,
    n_top=15,
)

## 7. Visualizing Predictions

Visualize the model's predictions on the test set.

In [None]:
# Get predictions
y_pred = results['y_pred']
y_pred_proba = results['y_pred_proba']

# Create a DataFrame with actual and predicted values.
# All three columns are flattened the same way: DataFrame columns must be 1-D,
# so a column-vector label array would otherwise make the constructor fail.
pred_df = pd.DataFrame({
    'Actual': np.ravel(y_test_seq),
    'Predicted': np.ravel(y_pred),
    'Probability': np.ravel(y_pred_proba),
})

# Plot the true labels against the predicted probability, with the 0.5
# decision threshold drawn as a reference line
plt.figure(figsize=(15, 6))
plt.plot(pred_df.index, pred_df['Actual'], label='Actual', marker='o', markersize=3, linestyle='-', alpha=0.7)
plt.plot(pred_df.index, pred_df['Probability'], label='Predicted Probability', marker=None, linestyle='-', alpha=0.7)
plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.3, label='Threshold (0.5)')
plt.title('Actual vs Predicted Values')
plt.xlabel('Sample Index')
plt.ylabel('Value')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Rows where the predicted label disagrees with the actual label
n_samples = len(pred_df)
is_mismatch = pred_df['Actual'] != pred_df['Predicted']
errors = pred_df[is_mismatch]
print(f"Number of errors: {len(errors)} out of {n_samples} samples ({len(errors)/n_samples*100:.2f}%)")

# Split the errors by direction: predicted 1 on an actual 0, and the reverse
actual_is_negative = pred_df['Actual'] == 0
actual_is_positive = pred_df['Actual'] == 1
false_positives = pred_df[actual_is_negative & (pred_df['Predicted'] == 1)]
false_negatives = pred_df[actual_is_positive & (pred_df['Predicted'] == 0)]

print(f"False positives: {len(false_positives)} ({len(false_positives)/n_samples*100:.2f}%)")
print(f"False negatives: {len(false_negatives)} ({len(false_negatives)/n_samples*100:.2f}%)")

# Histogram of predicted probabilities for each error type, with the 0.5 threshold marked
plt.figure(figsize=(12, 6))
for error_rows, colour, legend_label in (
    (false_positives, 'red', 'False Positives'),
    (false_negatives, 'blue', 'False Negatives'),
):
    sns.histplot(error_rows['Probability'], color=colour, label=legend_label, alpha=0.5, bins=20)
plt.axvline(x=0.5, color='black', linestyle='--', alpha=0.7, label='Threshold (0.5)')
plt.title('Probability Distribution for Errors')
plt.xlabel('Predicted Probability')
plt.ylabel('Count')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 8. Save Model and Results

Save the model, scaler, and results for future use.

In [None]:
# Imports used only by this cell, grouped at its top (stdlib first, then third-party)
import json

import joblib

# Create results directory if it doesn't exist
results_dir = f'results_{DATASET_NAME.replace("-", "_")}'
os.makedirs(results_dir, exist_ok=True)

# Save model if not already saved during training
if not os.path.exists(MODEL_PATH):
    model.save(MODEL_PATH)

# Save scaler
scaler_path = os.path.join(results_dir, 'scaler.pkl')
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to {scaler_path}")

# Save feature names, one per line. The encoding is explicit so the file does
# not depend on the platform default.
feature_names_path = os.path.join(results_dir, 'feature_names.txt')
with open(feature_names_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{feature}\n" for feature in feature_names)
print(f"Feature names saved to {feature_names_path}")

# Save model configuration together with the headline test metrics
config = {
    'dataset_name': DATASET_NAME,
    'train_file': TRAIN_FILE,
    'test_file': TEST_FILE,
    'time_steps': TIME_STEPS,
    'step_size': STEP_SIZE,
    'n_features': N_FEATURES,
    'feature_selection': FEATURE_SELECTION,
    'lstm_units': LSTM_UNITS,
    'dense_units': DENSE_UNITS,
    'dropout_rate': DROPOUT_RATE,
    'batch_size': BATCH_SIZE,
    'epochs': EPOCHS,
    'patience': PATIENCE,
    'model_path': MODEL_PATH,
    # Cast to built-in float: json cannot serialize NumPy scalar types such as
    # float32, which metric computations may return.
    **{
        metric: float(results[metric])
        for metric in ('accuracy', 'precision', 'recall', 'f1', 'auc')
    },
}

config_path = os.path.join(results_dir, 'config.json')
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=4)
print(f"Configuration saved to {config_path}")

## 9. Conclusion

In this notebook, we analyzed the HAI dataset and built a Residual Bidirectional LSTM model for anomaly detection. How well the model detects attacks should be judged from the evaluation metrics reported in Section 6 (accuracy, precision, recall, F1 score, and AUC).

Key steps in the analysis:
1. Data loading and exploration
2. Feature engineering and sequence creation
3. Model creation and training
4. Model evaluation and visualization
5. Saving model and results for future use

This template can be adapted for any HAI dataset by changing the configuration parameters at the beginning of the notebook.