# Advanced Deep Learning Models for HAI Security Dataset Anomaly Detection

This notebook implements advanced deep learning architectures for anomaly detection on the HAI security dataset. We'll explore CNN-LSTM hybrid models and Transformer-based approaches, which can capture both spatial and temporal patterns in the data.

In [None]:
# Import necessary libraries
import glob
import os
import time
import warnings
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, roc_curve, auc
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Conv1D, MaxPooling1D
from tensorflow.keras.layers import LSTM, Bidirectional, TimeDistributed, Flatten, RepeatVector, Attention
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.layers import Reshape, Cropping1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Silence all Python warnings to keep the notebook output readable
warnings.filterwarnings('ignore')

# Plot appearance: ggplot base style, then the seaborn whitegrid theme,
# plus a larger default figure size and font size
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 12})

# Report which accelerator TensorFlow can see
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
if not gpus:
    print("No GPU available, using CPU for training")
else:
    print("GPU is available for training")
    # Turn on memory growth for every GPU (intended to avoid memory allocation errors)
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

## 1. Load Preprocessed Data

First, let's load the preprocessed data created in the preprocessing notebook.

In [None]:
def load_processed_data(file_path):
    """
    Load one processed data file (NPZ archive) into a DataFrame.

    The archive is expected to hold two arrays: ``data`` (the table values)
    and ``columns`` (the matching column names).

    Args:
        file_path: Path to the NPZ file

    Returns:
        DataFrame: Loaded data

    Raises:
        KeyError: If the archive lacks the ``data`` or ``columns`` entry
    """
    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, which
    # can execute arbitrary code. Only load archives produced by a trusted source.
    # The context manager closes the underlying file handle once the arrays are read.
    with np.load(file_path, allow_pickle=True) as npz_data:
        df = pd.DataFrame(npz_data['data'], columns=npz_data['columns'])

    return df

In [None]:
# Read the serialized preprocessor bundle (a dict) from disk
preprocessor_path = './models/hai_hai_20_07_standard_preprocessor.joblib'
preprocessor_dict = joblib.load(preprocessor_path)

# Pull the column metadata out of the bundle in one step
feature_columns, attack_columns, timestamp_col = (
    preprocessor_dict[key]
    for key in ('feature_columns', 'attack_columns', 'timestamp_col')
)

# Summarize what was loaded
print(f"Number of features: {len(feature_columns)}")
print(f"Attack columns: {attack_columns}")
print(f"Timestamp column: {timestamp_col}")

In [None]:
# Directories holding the processed NPZ files for each split
train_data_dir = './processed_data/hai-20.07/train'
test_data_dir = './processed_data/hai-20.07/test'

# Collect the NPZ files of each split, ordered by path
train_files = glob.glob(f'{train_data_dir}/*.npz')
train_files.sort()
test_files = glob.glob(f'{test_data_dir}/*.npz')
test_files.sort()

# Show the bare file names that were found
print(f"Training files: {[os.path.basename(f) for f in train_files]}")
print(f"Test files: {[os.path.basename(f) for f in test_files]}")

## 2. Prepare Data for Advanced Models

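We'll prepare sequence data for our advanced deep learning models by slicing each file's time series into overlapping, fixed-length windows (one window per start position).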

In [None]:
def create_sequences(data, feature_cols, target_col=None, seq_length=100):
    """
    Create overlapping fixed-length sequences for deep learning models.

    A window of ``seq_length`` consecutive rows is taken at every possible
    start position (stride 1), so ``len(data) - seq_length + 1`` windows are
    produced. Each window is labelled with the target value of its own last
    row, so a window never receives the label of a row it does not contain.

    Args:
        data: DataFrame containing the data
        feature_cols: List of feature column names
        target_col: Target column name (None for unsupervised learning)
        seq_length: Length of each sequence

    Returns:
        tuple: (X, y) - X has shape (n_windows, seq_length, n_features); y has
        shape (n_windows,) or is None when target_col is None. When the data
        has fewer than seq_length rows, X and y are empty but keep their rank.
    """
    features = data[feature_cols].values

    # +1 so the final full window (ending on the last row) is included
    n_windows = max(len(features) - seq_length + 1, 0)

    # Pre-allocate the output instead of building a list and copying it again
    X = np.empty((n_windows, seq_length) + features.shape[1:], dtype=features.dtype)
    for start in range(n_windows):
        X[start] = features[start:start + seq_length]

    if target_col is None:
        return X, None

    targets = data[target_col].values
    # Window starting at i covers rows i .. i+seq_length-1; label it with the last of them
    last_row = seq_length - 1
    y = np.array(targets[last_row:last_row + n_windows])

    return X, y

In [None]:
def load_and_prepare_sequence_data(file_paths, feature_cols, target_col=None, seq_length=100, max_files=None):
    """
    Load several processed files and stack their sequences into one dataset.

    Sequences are built per file, so no window spans two files. Files that
    are too short to yield a single sequence are skipped with a message
    instead of breaking the final stacking step.

    Args:
        file_paths: List of file paths
        feature_cols: List of feature column names
        target_col: Target column name (None for unsupervised learning)
        seq_length: Length of each sequence
        max_files: Maximum number of files to load (None for all files)

    Returns:
        tuple: (X, y) - Combined sequences and targets. X is an empty array
        when nothing was loaded; y is None when target_col is None or when
        nothing was loaded.
    """
    want_targets = target_col is not None
    all_X = []
    all_y = [] if want_targets else None

    # Limit the number of files if specified
    if max_files is not None:
        file_paths = file_paths[:max_files]

    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        print(f"Processing {file_name}...")

        df = load_processed_data(file_path)
        X, y = create_sequences(df, feature_cols, target_col, seq_length)

        # An empty result cannot be stacked with populated 3-D arrays
        # (np.vstack would raise on the mismatched shapes), so leave it out.
        if len(X) == 0:
            print(f"Skipping {file_name}: no complete sequences of length {seq_length}")
            continue

        all_X.append(X)
        if want_targets:
            all_y.append(y)

    # Combine data from all files
    combined_X = np.vstack(all_X) if all_X else np.array([])
    combined_y = np.concatenate(all_y) if all_y else None

    return combined_X, combined_y

In [None]:
# Configuration: label column (only when attack columns exist) and window length
target_col = 'attack' if attack_columns else None
seq_length = 100

# Training windows are loaded without labels (only the first two files)
print("Loading and preparing training data...")
X_train, _ = load_and_prepare_sequence_data(
    train_files,
    feature_columns,
    target_col=None,
    seq_length=seq_length,
    max_files=2,
)

# Test windows are loaded together with their labels (only the first two files)
print("\nLoading and preparing test data...")
X_test, y_test = load_and_prepare_sequence_data(
    test_files,
    feature_columns,
    target_col=target_col,
    seq_length=seq_length,
    max_files=2,
)

# Report the resulting array shapes
print(f"\nTraining data shape: {X_train.shape}")
if y_test is None:
    print(f"Test data shape: {X_test.shape}")
else:
    print(f"Test data shape: {X_test.shape}, Test labels shape: {y_test.shape}")

In [None]:
# Cast both sequence arrays to float32 before feeding them to the neural networks
X_train, X_test = X_train.astype('float32'), X_test.astype('float32')

## 3. CNN-LSTM Hybrid Model

First, let's implement a CNN-LSTM hybrid model that combines convolutional layers for feature extraction with LSTM layers for temporal modeling.

In [None]:
def build_cnn_lstm_autoencoder(input_shape, encoding_dim=32, filters=(64, 128), kernel_size=3, lstm_units=(64, 32)):
    """
    Build a CNN-LSTM autoencoder model.

    Encoder: Conv1D/BatchNorm/MaxPooling stages (each halves the sequence
    length, rounding up), then bidirectional LSTMs, then a dense bottleneck.
    Decoder: the bottleneck vector is repeated over the pooled length, passed
    through bidirectional LSTMs, then doubled in length once per CNN stage
    (a per-timestep Dense followed by a Reshape) until the original length is
    reached. If the doubled length overshoots the input length (input length
    not divisible by 2 ** len(filters)), the surplus timesteps are cropped so
    the output always has the same shape as the input.

    Args:
        input_shape: Shape of input data (seq_length, num_features)
        encoding_dim: Dimension of the encoded representation
        filters: Sequence of filter sizes for CNN layers
        kernel_size: Kernel size for CNN layers
        lstm_units: Sequence of units for LSTM layers

    Returns:
        tuple: (model, encoder, decoder) - Full model, encoder part, and decoder part
    """
    seq_length = input_shape[0]

    # Encoder
    encoder_inputs = Input(shape=input_shape, name='encoder_input')

    # CNN layers for feature extraction
    x = encoder_inputs
    pooled_seq_length = seq_length
    for i, filter_size in enumerate(filters):
        x = Conv1D(filters=filter_size, kernel_size=kernel_size, activation='relu', padding='same', name=f'encoder_conv_{i+1}')(x)
        x = BatchNormalization(name=f'encoder_bn_conv_{i+1}')(x)
        x = MaxPooling1D(pool_size=2, padding='same', name=f'encoder_pool_{i+1}')(x)
        # padding='same' with pool_size=2 rounds the halved length up, not down
        pooled_seq_length = (pooled_seq_length + 1) // 2

    # LSTM layers for temporal modeling
    for i, units in enumerate(lstm_units[:-1]):
        x = Bidirectional(LSTM(units, return_sequences=True, name=f'encoder_lstm_{i+1}'))(x)
        x = BatchNormalization(name=f'encoder_bn_lstm_{i+1}')(x)

    # Final LSTM layer collapses the sequence into a single vector
    x = Bidirectional(LSTM(lstm_units[-1], return_sequences=False, name=f'encoder_lstm_{len(lstm_units)}'))(x)
    x = BatchNormalization(name=f'encoder_bn_lstm_{len(lstm_units)}')(x)

    # Bottleneck layer
    encoder_output = Dense(encoding_dim, activation='relu', name='encoder_output')(x)

    # Create encoder model
    encoder = Model(encoder_inputs, encoder_output, name='encoder')

    # Decoder
    decoder_inputs = Input(shape=(encoding_dim,), name='decoder_input')

    # Dense layer to match LSTM input dimensions
    x = Dense(lstm_units[-1] * 2, activation='relu', name='decoder_dense_1')(decoder_inputs)  # *2 for bidirectional
    x = BatchNormalization(name='decoder_bn_dense_1')(x)

    # Repeat vector to create a sequence of the pooled length
    x = RepeatVector(pooled_seq_length, name='decoder_repeat')(x)

    # LSTM layers
    for i, units in enumerate(reversed(lstm_units)):
        x = Bidirectional(LSTM(units, return_sequences=True, name=f'decoder_lstm_{i+1}'))(x)
        x = BatchNormalization(name=f'decoder_bn_lstm_{i+1}')(x)

    # Upsample back towards the original length: each stage emits two feature
    # vectors per timestep (Dense) and unrolls them along time (Reshape)
    current_length = pooled_seq_length
    for i, filter_size in enumerate(reversed(filters)):
        x = TimeDistributed(Dense(filter_size * 2, activation='relu'), name=f'decoder_upsample_{i+1}')(x)
        current_length *= 2
        x = Reshape((current_length, filter_size), name=f'decoder_reshape_{i+1}')(x)

        # Conv1D for smoothing
        x = Conv1D(filters=filter_size, kernel_size=kernel_size, activation='relu', padding='same', name=f'decoder_conv_{i+1}')(x)
        x = BatchNormalization(name=f'decoder_bn_conv_{i+1}')(x)

    # Output layer
    decoder_output = Conv1D(filters=input_shape[1], kernel_size=1, activation='linear', padding='same', name='decoder_output')(x)

    # Rounding up while pooling can overshoot the input length; trim the tail so
    # the reconstruction matches the input. No layer is added when lengths agree.
    surplus = current_length - seq_length
    if surplus > 0:
        decoder_output = Cropping1D(cropping=(0, surplus), name='decoder_crop')(decoder_output)

    # Create decoder model
    decoder = Model(decoder_inputs, decoder_output, name='decoder')

    # Create autoencoder model
    autoencoder_input = Input(shape=input_shape, name='autoencoder_input')
    encoded = encoder(autoencoder_input)
    decoded = decoder(encoded)
    autoencoder = Model(autoencoder_input, decoded, name='cnn_lstm_autoencoder')

    return autoencoder, encoder, decoder

In [None]:
# Hyperparameters for the CNN-LSTM autoencoder
input_shape = tuple(X_train.shape[axis] for axis in (1, 2))  # (seq_length, num_features)
encoding_dim = 32      # size of the bottleneck vector
filters = [32, 64]     # Conv1D filter counts, one per CNN stage
kernel_size = 3        # Conv1D kernel width
lstm_units = [64, 32]  # LSTM units, one per recurrent layer

# Assemble the full model together with its encoder and decoder halves
cnn_lstm_autoencoder, cnn_lstm_encoder, cnn_lstm_decoder = build_cnn_lstm_autoencoder(
    input_shape,
    encoding_dim,
    filters,
    kernel_size,
    lstm_units,
)

# Reconstruction objective: mean squared error optimized with Adam
cnn_lstm_autoencoder.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
)

# Show the layer overview
cnn_lstm_autoencoder.summary()

## 4. Train CNN-LSTM Model

Now we'll train the CNN-LSTM model on normal data to learn the normal behavior patterns.

In [None]:
# Training hyperparameters
batch_size = 64
epochs = 50
validation_split = 0.1

# Keep only the weights with the lowest validation loss on disk
os.makedirs('./models', exist_ok=True)
checkpoint_path = './models/cnn_lstm_autoencoder_hai_20_07.h5'
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Stop after 10 epochs without validation improvement and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    mode='min',
    restore_best_weights=True,
)

# Halve the learning rate after 5 stagnant epochs, never going below 1e-4
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
    min_lr=0.0001,
)

# TensorBoard logs go to a timestamped directory so runs do not overwrite each other
log_dir = './logs/cnn_lstm_autoencoder_' + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_graph=True,
    write_images=True,
)

# Callback list handed to fit()
callbacks = [checkpoint, early_stopping, reduce_lr, tensorboard]

In [None]:
# Fit the CNN-LSTM autoencoder and time the run
start_time = time.time()

# An autoencoder reconstructs its own input, so X_train is both input and target
cnn_lstm_history = cnn_lstm_autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
    callbacks=callbacks,
    verbose=1,
)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

In [None]:
# Loss curves: full history on the left, last epochs zoomed in on the right
plt.figure(figsize=(12, 4))

last_epochs = min(20, len(cnn_lstm_history.history['loss']))  # Last 20 epochs or all if less

panels = [
    (1, 'CNN-LSTM Model Loss', slice(None)),
    (2, f'CNN-LSTM Model Loss (Last {last_epochs} Epochs)', slice(-last_epochs, None)),
]
for position, panel_title, window in panels:
    plt.subplot(1, 2, position)
    plt.plot(cnn_lstm_history.history['loss'][window])
    plt.plot(cnn_lstm_history.history['val_loss'][window])
    plt.title(panel_title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

plt.tight_layout()
plt.show()

## 5. Transformer-Based Model

Now let's implement a Transformer-based model for anomaly detection.

In [None]:
def transformer_encoder_block(inputs, head_size, num_heads, ff_dim, dropout=0):
    """
    Apply one post-norm Transformer encoder block to a sequence tensor.

    The block is self-attention followed by a two-layer feed-forward network;
    each sub-layer is wrapped in a residual connection and layer normalization.
    The output keeps the feature width of ``inputs``.

    Args:
        inputs: Input tensor
        head_size: Size of each attention head
        num_heads: Number of attention heads
        ff_dim: Hidden layer size in feed forward network
        dropout: Dropout rate (applied inside the attention layer only)

    Returns:
        tensor: Output tensor
    """
    # Self-attention: the sequence serves as both query and value
    self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=head_size, dropout=dropout)
    attended = self_attention(inputs, inputs)

    # First residual connection + normalization
    normed_attention = LayerNormalization(epsilon=1e-6)(inputs + attended)

    # Feed-forward network: expand to ff_dim, then project back to the input width
    model_width = inputs.shape[-1]
    hidden = Dense(ff_dim, activation="relu")(normed_attention)
    projected = Dense(model_width)(hidden)

    # Second residual connection + normalization
    block_output = LayerNormalization(epsilon=1e-6)(normed_attention + projected)
    return block_output

In [None]:
def build_transformer_autoencoder(input_shape, head_size=64, num_heads=4, ff_dim=256, num_transformer_blocks=2, encoding_dim=32):
    """
    Build a Transformer-based autoencoder model.

    Encoder: a kernel-size-1 Conv1D projects every timestep to ``ff_dim``
    features, Transformer blocks follow, the sequence is averaged over time
    and a dense layer produces the bottleneck vector.
    Decoder: the bottleneck vector is expanded to ``ff_dim``, repeated once
    per timestep, passed through Transformer blocks and projected back to the
    input feature count at every timestep.

    Args:
        input_shape: Shape of input data (seq_length, num_features)
        head_size: Size of each attention head
        num_heads: Number of attention heads
        ff_dim: Hidden layer size in feed forward network
        num_transformer_blocks: Number of transformer blocks
        encoding_dim: Dimension of the encoded representation

    Returns:
        tuple: (model, encoder, decoder) - Full model, encoder part, and decoder part
    """
    # NOTE(review): no positional encoding is added anywhere. The decoder feeds
    # the same vector to every timestep (RepeatVector), and the attention, Dense
    # and LayerNormalization layers used here treat all positions alike, so the
    # reconstruction appears to be identical at every timestep. Confirm, and
    # consider injecting positional information.
    seq_length, num_features = input_shape[0], input_shape[1]

    # ----- Encoder -----
    encoder_inputs = Input(shape=input_shape, name='encoder_input')

    # Per-timestep projection to the working width
    hidden = Conv1D(filters=ff_dim, kernel_size=1, activation='relu', padding='same')(encoder_inputs)

    # Stack of Transformer blocks
    block_index = 0
    while block_index < num_transformer_blocks:
        hidden = transformer_encoder_block(hidden, head_size, num_heads, ff_dim)
        block_index += 1

    # Average over the time axis, then compress to the bottleneck
    pooled = GlobalAveragePooling1D()(hidden)
    encoder_output = Dense(encoding_dim, activation='relu', name='encoder_output')(pooled)

    encoder = Model(encoder_inputs, encoder_output, name='encoder')

    # ----- Decoder -----
    decoder_inputs = Input(shape=(encoding_dim,), name='decoder_input')

    # Expand the code to the working width and copy it to every timestep
    expanded = Dense(ff_dim, activation='relu')(decoder_inputs)
    hidden = RepeatVector(seq_length)(expanded)

    # Stack of Transformer blocks
    block_index = 0
    while block_index < num_transformer_blocks:
        hidden = transformer_encoder_block(hidden, head_size, num_heads, ff_dim)
        block_index += 1

    # Project each timestep back to the input feature count
    decoder_output = TimeDistributed(Dense(num_features))(hidden)

    decoder = Model(decoder_inputs, decoder_output, name='decoder')

    # ----- Full autoencoder: encoder followed by decoder -----
    autoencoder_input = Input(shape=input_shape, name='autoencoder_input')
    reconstruction = decoder(encoder(autoencoder_input))
    autoencoder = Model(autoencoder_input, reconstruction, name='transformer_autoencoder')

    return autoencoder, encoder, decoder

In [None]:
# Hyperparameters for the Transformer autoencoder
head_size = 64              # width of each attention head
num_heads = 4               # attention heads per block
ff_dim = 256                # working width / feed-forward hidden size
num_transformer_blocks = 2  # blocks in the encoder and in the decoder
encoding_dim = 32           # size of the bottleneck vector

# Assemble the full model together with its encoder and decoder halves
transformer_autoencoder, transformer_encoder, transformer_decoder = build_transformer_autoencoder(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    encoding_dim,
)

# Reconstruction objective: mean squared error optimized with Adam
transformer_autoencoder.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
)

# Show the layer overview
transformer_autoencoder.summary()

## 6. Train Transformer Model

Now we'll train the Transformer model on normal data.

In [None]:
# Training hyperparameters
batch_size = 64
epochs = 50
validation_split = 0.1

# Keep only the weights with the lowest validation loss on disk
checkpoint_path = './models/transformer_autoencoder_hai_20_07.h5'
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Stop after 10 epochs without validation improvement and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    mode='min',
    restore_best_weights=True,
)

# Halve the learning rate after 5 stagnant epochs, never going below 1e-4
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
    min_lr=0.0001,
)

# TensorBoard logs go to a timestamped directory so runs do not overwrite each other
log_dir = './logs/transformer_autoencoder_' + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_graph=True,
    write_images=True,
)

# Callback list handed to fit()
callbacks = [checkpoint, early_stopping, reduce_lr, tensorboard]

In [None]:
# Fit the Transformer autoencoder and time the run
start_time = time.time()

# An autoencoder reconstructs its own input, so X_train is both input and target
transformer_history = transformer_autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
    callbacks=callbacks,
    verbose=1,
)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

In [None]:
# Loss curves: full history on the left, last epochs zoomed in on the right
plt.figure(figsize=(12, 4))

last_epochs = min(20, len(transformer_history.history['loss']))  # Last 20 epochs or all if less

panels = [
    (1, 'Transformer Model Loss', slice(None)),
    (2, f'Transformer Model Loss (Last {last_epochs} Epochs)', slice(-last_epochs, None)),
]
for position, panel_title, window in panels:
    plt.subplot(1, 2, position)
    plt.plot(transformer_history.history['loss'][window])
    plt.plot(transformer_history.history['val_loss'][window])
    plt.title(panel_title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

plt.tight_layout()
plt.show()

## 7. Evaluate Models and Detect Anomalies

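Now we'll evaluate both models on the test data and detect anomalies. Each test window is scored by its reconstruction error (MSE), and the error is compared against a threshold chosen from the ROC curve.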

In [None]:
def find_optimal_threshold(mse, y_true):
    """
    Pick the anomaly threshold that maximizes the G-mean on the ROC curve.

    The G-mean is sqrt(TPR * (1 - FPR)), i.e. the geometric mean of
    sensitivity and specificity. The function also prints the chosen
    operating point and the ROC AUC, and displays the ROC curve.

    Args:
        mse: Reconstruction error values
        y_true: True labels (0 for normal, 1 for anomaly)

    Returns:
        float: Optimal threshold value
    """
    # ROC operating points, one per candidate threshold
    false_pos_rate, true_pos_rate, candidates = roc_curve(y_true, mse)

    # Geometric mean of sensitivity and specificity at every candidate
    g_scores = np.sqrt(true_pos_rate * (1 - false_pos_rate))

    # Candidate with the highest G-mean
    best = np.argmax(g_scores)
    optimal_threshold = candidates[best]
    best_fpr, best_tpr, best_score = false_pos_rate[best], true_pos_rate[best], g_scores[best]

    print(f"Optimal threshold: {optimal_threshold:.6f}")
    print(f"At this threshold - TPR: {best_tpr:.4f}, FPR: {best_fpr:.4f}, G-mean: {best_score:.4f}")

    # ROC curve with the chance diagonal and the selected point highlighted
    plt.figure(figsize=(10, 6))
    plt.plot(false_pos_rate, true_pos_rate, marker='.')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.scatter(
        best_fpr,
        best_tpr,
        marker='o',
        color='red',
        label=f'Optimal (Threshold = {optimal_threshold:.6f})',
    )
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Area under the ROC curve
    roc_auc = auc(false_pos_rate, true_pos_rate)
    print(f"ROC AUC: {roc_auc:.4f}")

    return optimal_threshold

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """
    Evaluate an autoencoder on labelled test data.

    Each test window is scored by the mean squared error between the window
    and its reconstruction. A threshold is chosen from the ROC curve, windows
    whose error is at or above it are flagged as anomalies, and the error
    distribution, confusion matrix and classification report are shown.

    Args:
        model: Trained model
        X_test: Test data, shape (n_samples, seq_length, num_features)
        y_test: Test labels (0 for normal, 1 for anomaly)
        model_name: Name of the model

    Returns:
        tuple: (mse, threshold, y_pred) - Reconstruction errors, threshold, and predictions

    Raises:
        ValueError: If y_test is None
    """
    if y_test is None:
        raise ValueError("y_test is required: threshold selection and metrics need ground-truth labels")

    print(f"\nEvaluating {model_name}...")

    # Predict on test data
    X_test_pred = model.predict(X_test)

    # Calculate reconstruction error (MSE for each sample, averaged over time and features)
    mse = np.mean(np.square(X_test - X_test_pred), axis=(1, 2))

    print(f"Reconstruction error statistics:")
    print(f"Min: {np.min(mse):.6f}")
    print(f"Max: {np.max(mse):.6f}")
    print(f"Mean: {np.mean(mse):.6f}")
    print(f"Std: {np.std(mse):.6f}")

    # Plot reconstruction error distribution (linear and log frequency scale)
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.hist(mse, bins=50)
    plt.title(f'{model_name} Reconstruction Error Distribution')
    plt.xlabel('Reconstruction Error (MSE)')
    plt.ylabel('Frequency')

    plt.subplot(1, 2, 2)
    plt.hist(mse, bins=50, log=True)
    plt.title(f'{model_name} Reconstruction Error Distribution (Log Scale)')
    plt.xlabel('Reconstruction Error (MSE)')
    plt.ylabel('Frequency (Log Scale)')

    plt.tight_layout()
    plt.show()

    # Find optimal threshold
    # NOTE: the threshold is tuned on the same labelled data that is scored
    # below, so the reported metrics are optimistic estimates.
    threshold = find_optimal_threshold(mse, y_test)

    # Classify as anomaly if reconstruction error >= threshold. roc_curve defines
    # each threshold's TPR/FPR for "score >= threshold", so using >= keeps the
    # predictions consistent with the operating point that was selected.
    y_pred = (mse >= threshold).astype(int)

    # Calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Print classification report
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))

    return mse, threshold, y_pred

In [None]:
# Reload the checkpoints written by ModelCheckpoint during training
best_cnn_lstm_model = load_model('./models/cnn_lstm_autoencoder_hai_20_07.h5')
best_transformer_model = load_model('./models/transformer_autoencoder_hai_20_07.h5')

# Score the CNN-LSTM autoencoder on the test windows
cnn_lstm_mse, cnn_lstm_threshold, cnn_lstm_pred = evaluate_model(
    best_cnn_lstm_model,
    X_test,
    y_test,
    "CNN-LSTM Model",
)

# Score the Transformer autoencoder on the test windows
transformer_mse, transformer_threshold, transformer_pred = evaluate_model(
    best_transformer_model,
    X_test,
    y_test,
    "Transformer Model",
)

## 8. Compare Model Performance

Let's compare the performance of the CNN-LSTM and Transformer models.

In [None]:
# Summary metrics for both models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# One entry per model: thresholded predictions feed the classification metrics,
# raw reconstruction errors feed the ROC AUC
metrics = {
    label: {
        'accuracy': accuracy_score(y_test, predictions),
        'precision': precision_score(y_test, predictions),
        'recall': recall_score(y_test, predictions),
        'f1': f1_score(y_test, predictions),
        'auc': auc(*roc_curve(y_test, errors)[:2]),
    }
    for label, predictions, errors in (
        ('CNN-LSTM', cnn_lstm_pred, cnn_lstm_mse),
        ('Transformer', transformer_pred, transformer_mse),
    )
}

# Models as rows, metrics as columns, rounded to four decimals
comparison_df = pd.DataFrame(metrics).T.round(4)

# Show the table
print("Model Performance Comparison:")
display(comparison_df)

In [None]:
# Grouped bar chart of the key metrics
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'auc']

# Long-format records: one row per (model, metric) pair
plot_data = [
    {'Model': model, 'Metric': metric.capitalize(), 'Value': model_metrics[metric]}
    for model, model_metrics in metrics.items()
    for metric in metrics_to_plot
]
plot_df = pd.DataFrame(plot_data)

# Draw the chart
plt.figure(figsize=(14, 8))
sns.barplot(data=plot_df, x='Metric', y='Value', hue='Model')
plt.title('Model Performance Comparison')
plt.ylim(0, 1.05)  # Metrics are between 0 and 1
plt.grid(axis='y')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

## 9. Save Models and Results

Let's save the models and results for future use.

In [None]:
# Everything needed to reuse the CNN-LSTM model: threshold, hyperparameters,
# feature list, metrics and loss curves
cnn_lstm_metadata = {
    'threshold': cnn_lstm_threshold,
    'encoding_dim': encoding_dim,
    'filters': filters,
    'kernel_size': kernel_size,
    'lstm_units': lstm_units,
    'feature_columns': feature_columns,
    'metrics': metrics['CNN-LSTM'],
    'training_history': {
        curve: cnn_lstm_history.history[curve] for curve in ('loss', 'val_loss')
    },
}

# Same bundle for the Transformer model
transformer_metadata = {
    'threshold': transformer_threshold,
    'encoding_dim': encoding_dim,
    'head_size': head_size,
    'num_heads': num_heads,
    'ff_dim': ff_dim,
    'num_transformer_blocks': num_transformer_blocks,
    'feature_columns': feature_columns,
    'metrics': metrics['Transformer'],
    'training_history': {
        curve: transformer_history.history[curve] for curve in ('loss', 'val_loss')
    },
}

# Write both bundles next to the model checkpoints
joblib.dump(cnn_lstm_metadata, './models/cnn_lstm_metadata_hai_20_07.joblib')
joblib.dump(transformer_metadata, './models/transformer_metadata_hai_20_07.joblib')

print("Models and metadata saved successfully.")

## 10. Conclusion

In this notebook, we've implemented and compared advanced deep learning models for anomaly detection on the HAI security dataset. Key steps included:

1. Loading and preparing sequence data
2. Building and training a CNN-LSTM hybrid model
3. Building and training a Transformer-based model
4. Evaluating and comparing model performance
5. Saving models and results

The advanced deep learning models are intended to improve anomaly detection performance compared to simpler models through their ability to capture complex spatial and temporal patterns in the data; note that this notebook only compares the two advanced models with each other, so that improvement should be confirmed against separately measured baseline results. The CNN-LSTM hybrid model combines the feature extraction capabilities of CNNs with the temporal modeling of LSTMs, while the Transformer model leverages self-attention mechanisms to capture long-range dependencies in the data.