# LSTM Model for HAI Security Dataset Anomaly Detection

This notebook implements a Long Short-Term Memory (LSTM) neural network for anomaly detection on the HAI security dataset. LSTM networks are well-suited for time series anomaly detection due to their ability to capture temporal dependencies in the data.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import time
import joblib
from datetime import datetime
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, roc_curve, auc
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, LSTM, Dropout, BatchNormalization, Input, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.optimizers import Adam
import warnings

# Suppress warnings (keeps notebook output readable; note this hides ALL warnings)
warnings.filterwarnings('ignore')

# Set plot style
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Check for GPU availability (query the device list once and reuse it)
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
if gpus:
    print("GPU is available for training")
    # Set memory growth to avoid memory allocation errors
    for gpu in gpus:
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as err:
            # Memory growth can only be configured before the GPU is initialized;
            # re-running this cell in a live kernel would otherwise abort here.
            print(f"Could not set memory growth for {gpu.name}: {err}")
else:
    print("No GPU available, using CPU for training")

## 1. Load Preprocessed Data

First, let's load the preprocessed data created in the preprocessing notebook.

In [None]:
def load_processed_data(file_path):
    """
    Load processed data from NPZ file.

    The archive is expected to hold two arrays: 'data' (the table values)
    and 'columns' (the column names for those values).

    Args:
        file_path: Path to the NPZ file

    Returns:
        DataFrame: Loaded data
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the archive open;
    # the context manager guarantees the file handle is released.
    with np.load(file_path) as npz_data:
        # Indexing materializes each array in memory, so both remain valid
        # after the archive is closed.
        values = npz_data['data']
        column_names = npz_data['columns']

    # Convert to DataFrame
    return pd.DataFrame(values, columns=column_names)

In [None]:
# Load the preprocessor bundle saved by the preprocessing notebook
preprocessor_path = './models/hai_hai_20_07_standard_preprocessor.joblib'
preprocessor_dict = joblib.load(preprocessor_path)

# Pull out the entries the rest of the notebook relies on
feature_columns = preprocessor_dict['feature_columns']
attack_columns = preprocessor_dict['attack_columns']
timestamp_col = preprocessor_dict['timestamp_col']

# Report what was loaded
for summary_line in (
    f"Number of features: {len(feature_columns)}",
    f"Attack columns: {attack_columns}",
    f"Timestamp column: {timestamp_col}",
):
    print(summary_line)

In [None]:
# Directories holding the processed NPZ files
train_data_dir = './processed_data/hai-20.07/train'
test_data_dir = './processed_data/hai-20.07/test'

# Collect the NPZ files in a deterministic (sorted) order
train_files = glob.glob(f'{train_data_dir}/*.npz')
train_files.sort()
test_files = glob.glob(f'{test_data_dir}/*.npz')
test_files.sort()

# Show only the file names, not the full paths
train_names = list(map(os.path.basename, train_files))
test_names = list(map(os.path.basename, test_files))
print(f"Training files: {train_names}")
print(f"Test files: {test_names}")

## 2. Prepare Data for LSTM

LSTM models require sequences of data as input. We'll create sequences of fixed length from our time series data.

In [None]:
def create_sequences(data, feature_cols, target_col=None, seq_length=100):
    """
    Create sliding-window sequences for LSTM input.

    Window ``i`` covers rows ``i .. i + seq_length - 1`` (stride 1). When a
    target column is given, each window is labelled with the target value of
    its LAST row, so the label always refers to a timestep the window
    actually contains.

    Args:
        data: DataFrame containing the data
        feature_cols: List of feature column names
        target_col: Target column name (None for unsupervised learning)
        seq_length: Length of each sequence (must be >= 1)

    Returns:
        tuple: (X, y) - X has shape (n_windows, seq_length, n_features) with
        n_windows = max(len(data) - seq_length + 1, 0); y has shape
        (n_windows,) or is None when target_col is None

    Raises:
        ValueError: If seq_length is smaller than 1
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    # Extract features
    features = data[feature_cols].values

    # Extract target if provided
    targets = data[target_col].values if target_col is not None else None

    # Number of complete windows, including the one ending on the final row
    n_windows = len(features) - seq_length + 1

    if n_windows <= 0:
        # Not enough rows for a single window: return correctly shaped empty
        # arrays so callers can still stack results from several files.
        X = np.empty((0, seq_length) + features.shape[1:], dtype=features.dtype)
        y = np.empty((0,), dtype=targets.dtype) if targets is not None else None
        return X, y

    # Create sequences
    X = np.stack([features[start:start + seq_length] for start in range(n_windows)])

    if targets is None:
        return X, None

    # Label of the last timestep in each window (row start + seq_length - 1);
    # copy so the result does not alias the DataFrame's underlying buffer.
    y = targets[seq_length - 1:].copy()

    return X, y

In [None]:
def load_and_prepare_data(file_paths, feature_cols, target_col=None, seq_length=100, max_files=None):
    """
    Load and prepare data from multiple files.

    Sequences are created per file and then concatenated, so no window ever
    spans the boundary between two files. Files that do not yield any
    sequence (too few rows for seq_length) are skipped with a message
    instead of breaking the final concatenation.

    Args:
        file_paths: List of file paths
        feature_cols: List of feature column names
        target_col: Target column name (None for unsupervised learning)
        seq_length: Length of each sequence
        max_files: Maximum number of files to load (None for all files)

    Returns:
        tuple: (X, y) - Combined sequences and targets. X is an empty array
        when no sequences were produced; y is None when target_col is None
        or no sequences were produced.
    """
    all_X = []
    all_y = [] if target_col is not None else None

    # Limit the number of files if specified
    if max_files is not None:
        file_paths = file_paths[:max_files]

    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        print(f"Processing {file_name}...")

        # Load data
        df = load_processed_data(file_path)

        # Create sequences
        X, y = create_sequences(df, feature_cols, target_col, seq_length)

        # An empty result cannot be stacked with 3-D sequence arrays
        if len(X) == 0:
            print(f"Skipping {file_name}: not enough rows for seq_length={seq_length}")
            continue

        all_X.append(X)
        if target_col is not None:
            all_y.append(y)

    # Combine data from all files along the sample axis
    combined_X = np.concatenate(all_X, axis=0) if all_X else np.array([])
    combined_y = np.concatenate(all_y) if all_y else None

    return combined_X, combined_y

In [None]:
# Windowing parameters
seq_length = 100  # number of time steps per sequence

# Use the 'attack' column as label only when the preprocessor reported attack columns
if attack_columns:
    target_col = 'attack'
else:
    target_col = None

# Training set: features only (unsupervised), limited to the first two files
print("Loading and preparing training data...")
X_train, _ = load_and_prepare_data(
    train_files,
    feature_columns,
    target_col=None,
    seq_length=seq_length,
    max_files=2,
)

# Test set: features plus labels (when available), limited to the first two files
print("\nLoading and preparing test data...")
X_test, y_test = load_and_prepare_data(
    test_files,
    feature_columns,
    target_col=target_col,
    seq_length=seq_length,
    max_files=2,
)

# Summarize the resulting array shapes
print(f"\nTraining data shape: {X_train.shape}")
test_summary = f"Test data shape: {X_test.shape}"
if y_test is not None:
    test_summary += f", Test labels shape: {y_test.shape}"
print(test_summary)

## 3. Build LSTM Autoencoder Model

We'll use an LSTM autoencoder for anomaly detection. The autoencoder will learn to reconstruct normal behavior, and anomalies will have higher reconstruction errors.

In [None]:
def build_lstm_autoencoder(input_shape, encoding_dim=32):
    """
    Assemble an LSTM autoencoder together with its two halves.

    The encoder stacks LSTM(128) -> LSTM(64) (each followed by batch
    normalization) and projects to `encoding_dim` with a Dense layer. The
    decoder expands the code with Dense(64), repeats it once per timestep,
    runs LSTM(64) -> LSTM(128) (again with batch normalization) and maps
    every timestep back to the feature dimension.

    Args:
        input_shape: Shape of input data (seq_length, num_features)
        encoding_dim: Dimension of the encoded representation

    Returns:
        tuple: (model, encoder, decoder) - Full model, encoder part, and decoder part
    """
    # --- Encoder: sequence -> fixed-size code ---
    sequence_in = Input(shape=input_shape)
    enc = LSTM(128, return_sequences=True)(sequence_in)
    enc = BatchNormalization()(enc)
    enc = LSTM(64, return_sequences=False)(enc)
    enc = BatchNormalization()(enc)
    code = Dense(encoding_dim)(enc)

    # --- Decoder: fixed-size code -> reconstructed sequence ---
    code_in = Input(shape=(encoding_dim,))
    dec = Dense(64)(code_in)
    dec = BatchNormalization()(dec)
    dec = RepeatVector(input_shape[0])(dec)  # one copy of the code per timestep
    dec = LSTM(64, return_sequences=True)(dec)
    dec = BatchNormalization()(dec)
    dec = LSTM(128, return_sequences=True)(dec)
    dec = BatchNormalization()(dec)
    reconstruction = TimeDistributed(Dense(input_shape[1]))(dec)

    # Stand-alone halves
    encoder = Model(sequence_in, code, name='encoder')
    decoder = Model(code_in, reconstruction, name='decoder')

    # End-to-end model: decoder applied to the encoder's output
    autoencoder = Model(sequence_in, decoder(encoder(sequence_in)), name='autoencoder')

    return autoencoder, encoder, decoder

In [None]:
# Model hyperparameters
encoding_dim = 32  # size of the encoded representation
input_shape = (X_train.shape[1], X_train.shape[2])  # (seq_length, num_features)

# Instantiate the autoencoder and its encoder/decoder halves
autoencoder, encoder, decoder = build_lstm_autoencoder(
    input_shape=input_shape,
    encoding_dim=encoding_dim,
)

# Reconstruction objective: mean squared error, optimized with Adam
autoencoder.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
)

# Show the layer/parameter overview
autoencoder.summary()

## 4. Train LSTM Autoencoder Model

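Now we'll train the autoencoder on normal data (without attacks) to learn the normal behavior patterns. Note that the training sequences are not filtered by label in this notebook, so this step assumes the training files contain only normal operation.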

In [None]:
# Training hyperparameters
batch_size = 64
epochs = 50
validation_split = 0.1

# Checkpoint: keep only the weights with the lowest validation loss
os.makedirs('./models', exist_ok=True)
checkpoint_path = './models/lstm_autoencoder_hai_20_07.h5'
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Early stopping: halt after 10 epochs without improvement and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    mode='min',
    restore_best_weights=True,
)

# TensorBoard: one timestamped log directory per run
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = './logs/lstm_autoencoder_' + run_stamp
tensorboard = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_graph=True,
    write_images=True,
)

# Callbacks handed to fit()
callbacks = [checkpoint, early_stopping, tensorboard]

In [None]:
# Fit the autoencoder and measure wall-clock training time
start_time = time.time()

# The input doubles as the target: the network learns to reproduce its input
history = autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
    callbacks=callbacks,
    verbose=1,
)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

In [None]:
# Plot training history
train_loss = history.history['loss']
val_loss = history.history['val_loss']
total_epochs = len(train_loss)

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(train_loss)
plt.plot(val_loss)
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

# Plot zoomed-in view of the last epochs
plt.subplot(1, 2, 2)
last_epochs = min(20, total_epochs)  # Last 20 epochs or all if less
# Plot against the real epoch indices so the x-axis matches the left panel
# instead of restarting at 0.
zoom_epochs = range(total_epochs - last_epochs, total_epochs)
plt.plot(zoom_epochs, train_loss[-last_epochs:])
plt.plot(zoom_epochs, val_loss[-last_epochs:])
plt.title(f'Model Loss (Last {last_epochs} Epochs)')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.tight_layout()
plt.show()

## 5. Evaluate Model and Detect Anomalies

Now we'll use the trained autoencoder to detect anomalies in the test data.

In [None]:
# Load the best model.
# compile=False: the model is only used for inference here, so the optimizer
# and loss do not need to be restored from the checkpoint.
best_model = load_model(checkpoint_path, compile=False)

# Predict on test data
X_test_pred = best_model.predict(X_test)

# Calculate reconstruction error: one MSE per sequence, averaged over
# time steps (axis 1) and features (axis 2)
mse = np.mean(np.square(X_test - X_test_pred), axis=(1, 2))

print("Reconstruction error statistics:")
print(f"Min: {np.min(mse):.6f}")
print(f"Max: {np.max(mse):.6f}")
print(f"Mean: {np.mean(mse):.6f}")
print(f"Std: {np.std(mse):.6f}")

In [None]:
# Reconstruction error histograms: linear counts (left) and log-scaled counts (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panel_specs = [
    (False, 'Reconstruction Error Distribution', 'Frequency'),
    (True, 'Reconstruction Error Distribution (Log Scale)', 'Frequency (Log Scale)'),
]

for ax, (use_log, panel_title, y_label) in zip(axes, panel_specs):
    ax.hist(mse, bins=50, log=use_log)
    ax.set_title(panel_title)
    ax.set_xlabel('Reconstruction Error (MSE)')
    ax.set_ylabel(y_label)

fig.tight_layout()
plt.show()

### 5.1 Determine Anomaly Threshold

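We need to determine a threshold for the reconstruction error to classify data points as normal or anomalous. When labels are available, we pick the point on the ROC curve that maximizes the geometric mean of sensitivity and specificity; otherwise we fall back to the mean plus three standard deviations of the reconstruction error. Because the label-based threshold is tuned on the same test data it is later evaluated on, the reported metrics should be read as optimistic.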

In [None]:
def find_optimal_threshold(mse, y_true, plot=True):
    """
    Find the optimal threshold for anomaly detection using ROC curve.

    The ROC point maximizing the geometric mean of sensitivity (TPR) and
    specificity (1 - FPR) is selected. ROC thresholds follow the convention
    ``score >= threshold``, whereas this notebook classifies with
    ``mse > threshold``; the returned value is therefore the largest float
    strictly below the ROC threshold, so that ``mse > returned`` selects
    exactly the samples the reported TPR/FPR refer to.

    If y_true contains fewer than two classes the ROC curve is undefined; in
    that case the statistical threshold mean(mse) + 3 * std(mse) is returned.

    Args:
        mse: Reconstruction error values
        y_true: True labels (0 for normal, 1 for anomaly)
        plot: Whether to draw the ROC curve (default True)

    Returns:
        float: Optimal threshold value, to be applied as ``mse > threshold``
    """
    mse = np.asarray(mse)
    y_true = np.asarray(y_true)

    # ROC needs both classes; otherwise TPR or FPR is NaN and the argmax
    # below would silently pick a meaningless (possibly infinite) threshold.
    if np.unique(y_true).size < 2:
        fallback_threshold = np.mean(mse) + 3 * np.std(mse)
        print("Only one class present in y_true; ROC curve is undefined.")
        print(f"Using statistical threshold: {fallback_threshold:.6f}")
        return fallback_threshold

    # Calculate ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, mse)

    # Calculate the geometric mean of sensitivity and specificity
    gmeans = np.sqrt(tpr * (1 - fpr))

    # Find the optimal threshold
    ix = np.argmax(gmeans)
    # Step one representable float down so that a strict '>' comparison is
    # equivalent to the ROC's '>=' comparison at thresholds[ix].
    optimal_threshold = np.nextafter(thresholds[ix], -np.inf)

    print(f"Optimal threshold: {optimal_threshold:.6f}")
    print(f"At this threshold - TPR: {tpr[ix]:.4f}, FPR: {fpr[ix]:.4f}, G-mean: {gmeans[ix]:.4f}")

    if plot:
        # Plot ROC curve
        plt.figure(figsize=(10, 6))
        plt.plot(fpr, tpr, marker='.')
        plt.plot([0, 1], [0, 1], linestyle='--')
        plt.scatter(fpr[ix], tpr[ix], marker='o', color='red', label=f'Optimal (Threshold = {optimal_threshold:.6f})')
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend()
        plt.grid(True)
        plt.show()

    return optimal_threshold

In [None]:
# Choose the anomaly threshold
if y_test is None:
    # No labels: flag everything beyond mean + 3 standard deviations
    threshold = np.mean(mse) + 3 * np.std(mse)
    print(f"Using statistical threshold: {threshold:.6f}")
else:
    # Labels available: derive the threshold from the ROC curve
    threshold = find_optimal_threshold(mse, y_test)

In [None]:
# A sequence is flagged (1) when its reconstruction error exceeds the threshold
y_pred = np.greater(mse, threshold).astype(int)

# Metrics require ground-truth labels
if y_test is not None:
    # Confusion matrix as an annotated heatmap
    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Per-class precision / recall / F1
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Area under the ROC curve, using the raw reconstruction error as the score
    roc_points = roc_curve(y_test, mse)
    roc_auc = auc(roc_points[0], roc_points[1])
    print(f"ROC AUC: {roc_auc:.4f}")

### 5.2 Visualize Anomalies

Let's visualize some examples of normal and anomalous sequences.

In [None]:
def plot_reconstruction_examples(X, X_pred, mse, threshold, y_true=None, num_examples=3):
    """
    Show randomly chosen sequences next to their reconstructions.

    Sequences are grouped by outcome (TP / FP / TN / FN when labels are
    given, otherwise predicted anomaly / predicted normal). For every
    non-empty group up to `num_examples` sequences are drawn at random, and
    for each of them one randomly chosen feature is plotted together with
    its absolute reconstruction error.

    Args:
        X: Original sequences
        X_pred: Reconstructed sequences
        mse: Reconstruction errors
        threshold: Anomaly threshold
        y_true: True labels (optional)
        num_examples: Number of examples to plot for each category
    """
    # 1 where the reconstruction error exceeds the threshold, else 0
    flagged = (mse > threshold).astype(int)

    if y_true is None:
        # Without ground truth only the predictions can be grouped
        groups = [
            ('Predicted Anomaly', np.nonzero(flagged == 1)[0]),
            ('Predicted Normal', np.nonzero(flagged == 0)[0]),
        ]
    else:
        # Confusion-matrix cells, in the order TP, FP, TN, FN
        groups = [
            ('True Positive (Correctly Detected Anomaly)',
             np.nonzero((y_true == 1) & (flagged == 1))[0]),
            ('False Positive (False Alarm)',
             np.nonzero((y_true == 0) & (flagged == 1))[0]),
            ('True Negative (Correctly Identified Normal)',
             np.nonzero((y_true == 0) & (flagged == 0))[0]),
            ('False Negative (Missed Anomaly)',
             np.nonzero((y_true == 1) & (flagged == 0))[0]),
        ]

    for group_label, members in groups:
        if len(members) == 0:
            print(f"No examples found for category: {group_label}")
            continue

        # Draw distinct samples, never more than the group holds
        n_picked = min(num_examples, len(members))
        picked = np.random.choice(members, size=n_picked, replace=False)

        for example_no, sample in enumerate(picked, start=1):
            # One random feature per example keeps the figure readable
            feature = np.random.randint(0, X.shape[2])
            original_series = X[sample, :, feature]
            rebuilt_series = X_pred[sample, :, feature]

            plt.figure(figsize=(12, 6))

            # Top panel: original vs. reconstruction
            plt.subplot(2, 1, 1)
            plt.plot(original_series, label='Original')
            plt.plot(rebuilt_series, label='Reconstruction')
            plt.title(f"{group_label} - Example {example_no} (Feature {feature}) - MSE: {mse[sample]:.6f}")
            plt.legend()
            plt.grid(True)

            # Bottom panel: point-wise absolute error
            plt.subplot(2, 1, 2)
            plt.plot(np.abs(original_series - rebuilt_series), color='red')
            plt.title('Absolute Reconstruction Error')
            plt.grid(True)

            plt.tight_layout()
            plt.show()

In [None]:
# Show two random examples per outcome category
plot_reconstruction_examples(
    X=X_test,
    X_pred=X_test_pred,
    mse=mse,
    threshold=threshold,
    y_true=y_test,
    num_examples=2,
)

### 5.3 Analyze Anomaly Patterns

Let's analyze the patterns of detected anomalies to understand their characteristics.

In [None]:
# Calculate feature-wise reconstruction error
feature_mse = np.mean(np.square(X_test - X_test_pred), axis=1)  # Average over time steps

# Get indices of anomalies
anomaly_indices = np.where(y_pred == 1)[0]

if len(anomaly_indices) > 0:
    # Calculate average feature-wise error for anomalies
    anomaly_feature_mse = feature_mse[anomaly_indices].mean(axis=0)

    # Get top 10 features with highest reconstruction error (ascending order)
    top_features_idx = np.argsort(anomaly_feature_mse)[-10:]
    top_features_error = anomaly_feature_mse[top_features_idx]

    # Rank from highest to lowest error and resolve indices to column names;
    # the last axis of X_test follows the order of feature_columns.
    ranked_idx = top_features_idx[::-1]
    ranked_error = top_features_error[::-1]
    ranked_names = [str(feature_columns[idx]) for idx in ranked_idx]
    n_top = len(ranked_idx)  # can be below 10 when there are fewer features

    # Plot top features
    plt.figure(figsize=(12, 6))
    plt.barh(range(n_top), ranked_error)
    plt.yticks(range(n_top), ranked_names)
    plt.title(f'Top {n_top} Features with Highest Reconstruction Error in Anomalies')
    plt.xlabel('Mean Squared Error')
    plt.tight_layout()
    plt.show()

    print(f"Top {n_top} features with highest reconstruction error in anomalies:")
    for rank, (idx, name) in enumerate(zip(ranked_idx, ranked_names), start=1):
        print(f"{rank}. {name} (feature {idx}): {anomaly_feature_mse[idx]:.6f}")
else:
    print("No anomalies detected.")

## 6. Save Model and Results

Finally, let's save the model and results for future use.

In [None]:
# Persist the encoder and decoder halves as separate HDF5 files
os.makedirs('./models', exist_ok=True)
encoder.save('./models/lstm_encoder_hai_20_07.h5')
decoder.save('./models/lstm_decoder_hai_20_07.h5')

# Everything needed to reuse the detector: threshold, shapes, columns, loss curves
training_history = {
    curve: history.history[curve] for curve in ('loss', 'val_loss')
}
model_metadata = dict(
    threshold=threshold,
    encoding_dim=encoding_dim,
    seq_length=seq_length,
    feature_columns=feature_columns,
    training_history=training_history,
)

joblib.dump(model_metadata, './models/lstm_model_metadata_hai_20_07.joblib')
print("Model and metadata saved successfully.")

## 7. Conclusion

In this notebook, we've implemented an LSTM autoencoder for anomaly detection on the HAI security dataset. The model learns to reconstruct normal behavior patterns and identifies anomalies based on reconstruction error. Key steps included:

1. Loading and preparing preprocessed data
2. Building an LSTM autoencoder model
3. Training the model on normal data
4. Detecting anomalies using reconstruction error
5. Evaluating the model's performance
6. Analyzing anomaly patterns

The LSTM autoencoder provides an effective approach for detecting anomalies in industrial control system data, which can help identify potential security threats or system malfunctions.