# Improved Model Training with Adaptive Thresholds for HAI-21.03 Dataset

This notebook implements an improved LSTM autoencoder model with adaptive thresholds for anomaly detection on the HAI-21.03 industrial control system security dataset.

In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, RepeatVector, TimeDistributed, Dropout, Bidirectional, Attention, GRU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, auc, roc_curve, precision_score, recall_score, f1_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set plot style
# Both calls change global matplotlib styling; the seaborn call runs last.
plt.style.use('ggplot')
sns.set(style="darkgrid")

# Set random seeds for reproducibility
# Only NumPy's and TensorFlow's global generators are seeded here.
np.random.seed(42)
tf.random.set_seed(42)

## 1. Global Variables Setup

In [None]:
# Set paths
# All three are relative paths, so they resolve against the current working directory.
OUTPUT_DIR = 'hai-security-dataset/processed'   # processed train/test CSV files are read from here
FEATURE_DIR = 'hai-security-dataset/features'   # pickled feature-selection artifacts are read from here
MODEL_DIR = 'hai-security-dataset/models'       # model, parameters, logs and results are written here

# Create model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Set model parameters
# STRIDE < SEQ_LENGTH, so consecutive windows overlap (by SEQ_LENGTH - STRIDE rows).
SEQ_LENGTH = 128  # Sequence length for LSTM
STRIDE = 10       # Stride for sliding window
BATCH_SIZE = 64   # Batch size for training
EPOCHS = 50       # Maximum number of epochs
PATIENCE = 10     # Patience for early stopping

## 2. Load Selected Features

In [None]:
# Load feature selection results
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only load
# artifacts that were produced by this project's own feature-selection step.
with open(os.path.join(FEATURE_DIR, 'feature_selection_results.pkl'), 'rb') as f:
    feature_selection_results = pickle.load(f)

# Unpack the objects reused throughout the rest of the notebook.
ensemble_features = feature_selection_results['ensemble_features']
ensemble_indices = feature_selection_results['ensemble_indices']  # used later to index columns of the scaled matrix
scaler = feature_selection_results['scaler']  # presumably fitted upstream; only transform() is called in this notebook
pca = feature_selection_results['pca']        # presumably fitted upstream; only transform() is called in this notebook

print(f"Loaded {len(ensemble_features)} selected features")
print(f"PCA components: {pca.n_components_}")
print(f"PCA explained variance: {np.sum(pca.explained_variance_ratio_):.4f}")

In [None]:
# Load processed training data
train_df = pd.read_csv(os.path.join(OUTPUT_DIR, 'train_processed_enhanced.csv'))
print(f"Loaded enhanced training data: {train_df.shape[0]} rows, {train_df.shape[1]} columns")

# Extract feature columns and target
# Every column except 'time' and 'attack' is treated as a model input.
# NOTE(review): assumes the processed CSV carries no other label/metadata columns — confirm.
feature_cols = [col for col in train_df.columns if col not in ['time', 'attack']]
target_col = 'attack'

# Extract features and target
# y_train is extracted for completeness; the autoencoder training below does not use labels.
X_train = train_df[feature_cols].values
y_train = train_df[target_col].values

# Scale features
# The scaler loaded from the feature-selection pickle is applied as-is (no re-fitting here).
X_train_scaled = scaler.transform(X_train)

# Select ensemble features
# Column selection happens after scaling, so ensemble_indices index into the scaled matrix.
X_train_ensemble = X_train_scaled[:, ensemble_indices]

# Apply PCA
X_train_pca = pca.transform(X_train_ensemble)

print(f"Training data shape: X={X_train.shape} -> X_ensemble={X_train_ensemble.shape} -> X_pca={X_train_pca.shape}")

## 3. Create Sequences for Time Series Modeling

In [None]:
def create_sequences(data, seq_length=128, stride=10):
    """
    Create overlapping fixed-length windows for time series models.

    Windows start at rows 0, stride, 2*stride, ... and only complete windows
    are produced, so trailing rows that do not fill a whole window are dropped.

    Args:
        data (np.array): Input data, time along the first axis
        seq_length (int): Sequence length (rows per window), must be positive
        stride (int): Stride for sliding window, must be positive

    Returns:
        np.array: Sequences with shape (n_windows, seq_length, *data.shape[1:]).
            When the input is shorter than ``seq_length`` the result is an
            empty array that still has the trailing window dimensions.

    Raises:
        ValueError: If ``seq_length`` or ``stride`` is not positive.
    """
    if seq_length <= 0 or stride <= 0:
        raise ValueError(
            f"seq_length and stride must be positive, got seq_length={seq_length}, stride={stride}"
        )

    data = np.asarray(data)
    last_start = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(0, last_start + 1, stride)]

    if not windows:
        # Keep the rank consistent so downstream code can rely on .shape[1:]
        # even when there is not enough data for a single window.
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)

    return np.stack(windows)

In [None]:
# Create sequences
X_train_seq = create_sequences(X_train_pca, SEQ_LENGTH, STRIDE)
print(f"Training sequences shape: {X_train_seq.shape}")

# Split into training and validation sets
# The windows overlap heavily (STRIDE < SEQ_LENGTH), so a shuffled split would place
# near-duplicate windows on both sides and make val_loss (and every threshold derived
# from the validation errors) look better than it is. Hold out the last 20% of the
# windows in time order instead. model.fit still shuffles batches within the training set.
X_train_seq, X_val_seq = train_test_split(X_train_seq, test_size=0.2, shuffle=False)
print(f"Training set: {X_train_seq.shape}, Validation set: {X_val_seq.shape}")

## 4. Build Improved LSTM Autoencoder Model

In [None]:
def build_lstm_autoencoder(seq_length, n_features, latent_dim=64, dropout_rate=0.2, use_bidirectional=True, use_attention=True):
    """
    Build an improved LSTM autoencoder model.

    Architecture: two recurrent encoder layers (128 then 64 units) -> dense
    bottleneck of ``latent_dim`` -> dense(64) repeated over time -> two
    recurrent decoder layers (64 then 128 units) -> per-timestep dense output
    with ``n_features`` units. The model is compiled with Adam (lr=0.001) and
    an MSE loss.

    Args:
        seq_length (int): Sequence length
        n_features (int): Number of features
        latent_dim (int): Dimension of latent space
        dropout_rate (float): Dropout rate
        use_bidirectional (bool): Whether to use bidirectional LSTM
        use_attention (bool): Whether to apply self-attention over the
            timestep outputs of the first encoder layer

    Returns:
        Model: Keras model
    """
    def recurrent(units, return_sequences):
        # Single place that decides between a plain and a bidirectional LSTM.
        layer = LSTM(units, return_sequences=return_sequences)
        return Bidirectional(layer) if use_bidirectional else layer

    # Input layer
    inputs = Input(shape=(seq_length, n_features))

    # Encoder
    x = recurrent(128, return_sequences=True)(inputs)
    x = Dropout(dropout_rate)(x)

    if use_attention:
        # Self-attention: query and value are both the encoder's per-timestep
        # outputs, so the tensor keeps its (batch, seq_length, units) shape.
        x = Attention()([x, x])

    x = recurrent(64, return_sequences=False)(x)
    x = Dropout(dropout_rate)(x)

    # Latent space
    encoded = Dense(latent_dim)(x)

    # Decoder
    x = Dense(64)(encoded)
    x = RepeatVector(seq_length)(x)  # one copy of the latent vector per timestep

    x = recurrent(64, return_sequences=True)(x)
    x = Dropout(dropout_rate)(x)
    x = recurrent(128, return_sequences=True)(x)

    # Output layer
    outputs = TimeDistributed(Dense(n_features))(x)

    # Create model
    model = Model(inputs=inputs, outputs=outputs)

    # Compile model
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    return model

In [None]:
# Build model
# n_features is the PCA output width, i.e. the model reconstructs PCA components.
# latent_dim=32 overrides the builder's default of 64.
model = build_lstm_autoencoder(
    seq_length=SEQ_LENGTH,
    n_features=X_train_pca.shape[1],
    latent_dim=32,
    dropout_rate=0.2,
    use_bidirectional=True,
    use_attention=True
)

# Print model summary
model.summary()

## 5. Train Model

In [None]:
# Define callbacks
# All callbacks watch val_loss:
#  - early stopping waits PATIENCE epochs and is asked to restore the best weights,
#  - the checkpoint keeps only the best model on disk (reloaded in the next section),
#  - the LR scheduler multiplies the learning rate by 0.5 after 5 stagnant epochs, down to 1e-4.
early_stopping = EarlyStopping(monitor='val_loss', patience=PATIENCE, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(
    os.path.join(MODEL_DIR, 'improved_lstm_autoencoder.h5'),
    monitor='val_loss',
    save_best_only=True
)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.0001)
tensorboard = TensorBoard(log_dir=os.path.join(MODEL_DIR, 'logs'))

# Train model
# Autoencoder setup: the input sequences are also the reconstruction targets.
history = model.fit(
    X_train_seq, X_train_seq,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val_seq, X_val_seq),
    callbacks=[early_stopping, model_checkpoint, reduce_lr, tensorboard],
    verbose=1
)

In [None]:
# Plot training history
# One line per recorded metric: training loss and validation loss per epoch.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_title('Model Training History')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True)
plt.show()

## 6. Calculate Reconstruction Errors

In [None]:
# Load the best model
# compile=False: from here on the model is only used for inference, so the optimizer and
# loss do not need to be restored. This also avoids deserialization failures that newer
# Keras versions can raise for legacy H5 files saved with a string loss such as 'mse'.
model = load_model(os.path.join(MODEL_DIR, 'improved_lstm_autoencoder.h5'), compile=False)

# Get predictions for validation set
X_val_pred = model.predict(X_val_seq)

# Calculate reconstruction errors (MSE)
# Averaging over the time and feature axes yields one error value per validation window.
mse = np.mean(np.square(X_val_seq - X_val_pred), axis=(1, 2))

# Plot reconstruction error distribution
plt.figure(figsize=(12, 6))
plt.hist(mse, bins=50, alpha=0.7)
plt.title('Reconstruction Error Distribution')
plt.xlabel('Reconstruction Error (MSE)')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

## 7. Implement Adaptive Thresholding Strategies

In [None]:
def calculate_thresholds(mse, methods=('percentile', 'iqr', 'dynamic')):
    """
    Calculate different thresholds for anomaly detection.

    Supported methods and the keys they add to the result:
        'percentile' -> 'percentile_95', 'percentile_99'
        'iqr'        -> 'iqr'      (Q3 + 1.5 * IQR)
        'dynamic'    -> 'dynamic'  (mean + 3 * standard deviation)
        'kmeans'     -> 'kmeans'   (midpoint between two 1-D cluster centres)

    Unknown method names are ignored.

    Args:
        mse (array-like): Reconstruction errors; flattened to 1-D
        methods (iterable of str): Threshold methods to use

    Returns:
        dict: Dictionary mapping threshold name to threshold value

    Raises:
        ValueError: If ``mse`` is empty.
    """
    # Accept lists as well as arrays; the k-means branch needs ndarray methods.
    mse = np.asarray(mse, dtype=float).ravel()
    if mse.size == 0:
        raise ValueError("mse must contain at least one reconstruction error")

    thresholds = {}

    if 'percentile' in methods:
        # Percentile-based thresholds
        p95, p99 = np.percentile(mse, [95, 99])
        thresholds['percentile_95'] = p95
        thresholds['percentile_99'] = p99

    if 'iqr' in methods:
        # IQR-based threshold (upper outlier fence)
        q1, q3 = np.percentile(mse, [25, 75])
        thresholds['iqr'] = q3 + 1.5 * (q3 - q1)

    if 'dynamic' in methods:
        # Dynamic threshold based on mean and standard deviation
        thresholds['dynamic'] = mse.mean() + 3 * mse.std()

    if 'kmeans' in methods:
        # K-means clustering to separate normal and anomalous errors.
        # Imported lazily so the other methods work without scikit-learn's cluster module.
        from sklearn.cluster import KMeans
        # n_init is pinned so the result does not depend on the installed
        # scikit-learn version's default.
        kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
        kmeans.fit(mse.reshape(-1, 1))
        centers = kmeans.cluster_centers_.ravel()
        thresholds['kmeans'] = (centers.max() + centers.min()) / 2

    return thresholds

In [None]:
# Calculate thresholds
# All four strategies are evaluated on the validation reconstruction errors.
thresholds = calculate_thresholds(mse, methods=['percentile', 'iqr', 'dynamic', 'kmeans'])

# Print thresholds
for method, threshold in thresholds.items():
    print(f"{method} threshold: {threshold:.6f}")

# Plot reconstruction error distribution with thresholds
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(mse, bins=50, alpha=0.7)

# One dashed vertical line per threshold; the palette wraps around if it runs out.
colors = ['r', 'g', 'b', 'm', 'c']
for i, (method, threshold) in enumerate(thresholds.items()):
    ax.axvline(x=threshold, color=colors[i % len(colors)], linestyle='--', label=f'{method} threshold: {threshold:.6f}')

ax.set_title('Reconstruction Error Distribution with Thresholds')
ax.set_xlabel('Reconstruction Error (MSE)')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(True)
plt.show()

## 8. Implement Multi-Threshold Strategy

In [None]:
class MultiThresholdDetector:
    """
    Multi-threshold anomaly detector that can adapt to different operating points.

    Bundles a trained autoencoder with the preprocessing objects it needs
    (scaler, feature indices, PCA) so raw feature matrices can be scored
    directly. ``fit`` derives several thresholds from reconstruction errors,
    ``predict`` turns per-window errors into per-row scores and labels, and
    ``post_process_anomalies`` cleans up the resulting label series.
    """
    def __init__(self, model, scaler, pca, ensemble_indices, seq_length, stride):
        """
        Initialize the detector.

        Args:
            model: Trained autoencoder model
            scaler: Feature scaler
            pca: PCA transformer
            ensemble_indices: Indices of selected features
            seq_length: Sequence length
            stride: Stride for sliding window
        """
        self.model = model
        self.scaler = scaler
        self.pca = pca
        self.ensemble_indices = ensemble_indices
        self.seq_length = seq_length
        self.stride = stride
        self.thresholds = {}        # filled by fit(): threshold name -> value
        self.operating_points = {}  # filled by fit(): operating point name -> threshold value

    def _reconstruction_errors(self, X):
        """
        Run the full scoring pipeline and return one MSE value per window.

        Steps: scale -> select ensemble features -> PCA -> sliding windows ->
        autoencoder reconstruction -> mean squared error over time and features.
        Returns an empty array when ``X`` is too short for a single window.
        """
        X_scaled = self.scaler.transform(X)
        X_ensemble = X_scaled[:, self.ensemble_indices]
        X_pca = self.pca.transform(X_ensemble)
        X_seq = create_sequences(X_pca, self.seq_length, self.stride)

        if len(X_seq) == 0:
            # Nothing to reconstruct; avoid calling the model on an empty batch.
            return np.zeros(0)

        X_pred = self.model.predict(X_seq)
        return np.mean(np.square(X_seq - X_pred), axis=(1, 2))

    @staticmethod
    def _runs(labels, value):
        """
        Return (start, end) index pairs, end exclusive, of every maximal run of
        ``value`` in the 1-D array ``labels``.
        """
        padded = np.concatenate(([False], labels == value, [False]))
        # Positions where the mask flips; they alternate run start, run end.
        edges = np.flatnonzero(padded[1:] != padded[:-1])
        return [(int(start), int(end)) for start, end in zip(edges[::2], edges[1::2])]

    def fit(self, X_train, y_train=None):
        """
        Fit the detector by calculating thresholds.

        Args:
            X_train: Training data (raw feature matrix; preprocessing is applied here)
            y_train: Training labels (optional, currently unused)

        Returns:
            MultiThresholdDetector: ``self``, to allow call chaining

        Raises:
            ValueError: If ``X_train`` has fewer rows than ``seq_length``.
        """
        mse = self._reconstruction_errors(X_train)
        if mse.size == 0:
            raise ValueError(
                f"X_train has {len(X_train)} rows, fewer than seq_length={self.seq_length}; "
                "cannot compute thresholds"
            )

        # Calculate thresholds
        self.thresholds = calculate_thresholds(mse, methods=['percentile', 'iqr', 'dynamic', 'kmeans'])

        # Define operating points
        self.operating_points = {
            'high_precision': self.thresholds['percentile_99'],  # Fewer false positives
            'balanced': self.thresholds['percentile_95'],        # Balanced precision and recall
            'high_recall': self.thresholds['iqr'],               # Fewer false negatives
            'adaptive': self.thresholds['dynamic']               # Adaptive to data distribution
        }

        return self

    def predict(self, X, operating_point='balanced'):
        """
        Predict anomalies.

        Each row's score is the average reconstruction error of all windows
        that cover it. Rows not covered by any window (for example a tail
        shorter than one stride) keep a score of 0.

        Args:
            X: Input data (raw feature matrix)
            operating_point: Operating point to use ('high_precision', 'balanced', 'high_recall', 'adaptive').
                Unknown names fall back to 'balanced'.

        Returns:
            tuple: (anomaly_scores, anomaly_labels, threshold), where the first two
                have one entry per row of ``X`` and ``threshold`` is the value applied
        """
        mse = self._reconstruction_errors(X)

        # Accumulate window errors onto the rows each window covers
        n_rows = len(X)
        anomaly_scores = np.zeros(n_rows)
        count = np.zeros(n_rows)

        for i, error in enumerate(mse):
            idx = i * self.stride
            if idx + self.seq_length <= n_rows:
                anomaly_scores[idx:idx + self.seq_length] += error
                count[idx:idx + self.seq_length] += 1

        # Normalize scores by count (rows with no covering window stay at 0)
        anomaly_scores = np.divide(anomaly_scores, count, out=np.zeros_like(anomaly_scores), where=count != 0)

        # Get threshold for the specified operating point.
        # The 'balanced' lookup is evaluated eagerly, so this raises KeyError before fit().
        threshold = self.operating_points.get(operating_point, self.operating_points['balanced'])

        # Apply threshold to get binary labels
        anomaly_labels = (anomaly_scores > threshold).astype(int)

        return anomaly_scores, anomaly_labels, threshold

    def post_process_anomalies(self, anomaly_labels, min_anomaly_length=30, gap_threshold=3):
        """
        Apply post-processing to reduce false positives and false negatives.

        Two passes, in this order:
          1. Anomalous runs shorter than ``min_anomaly_length`` are cleared.
          2. Normal gaps of at most ``gap_threshold`` rows that lie between two
             remaining anomalous runs are filled, merging the neighbours.
             Gaps touching the start or end of the series are never filled.

        Args:
            anomaly_labels (np.array): Binary anomaly labels
            min_anomaly_length (int): Minimum length of anomalies to keep
            gap_threshold (int): Maximum gap between anomalies to merge

        Returns:
            np.array: Processed binary anomaly labels (the input is not modified)
        """
        # Work on a copy to avoid modifying the caller's array
        processed_labels = np.array(anomaly_labels)

        # Remove short anomalies (likely false positives)
        for start, end in self._runs(processed_labels, 1):
            if end - start < min_anomaly_length:
                processed_labels[start:end] = 0

        # Merge nearby anomalies by filling short interior gaps
        n_labels = len(processed_labels)
        for start, end in self._runs(processed_labels, 0):
            is_interior = start > 0 and end < n_labels
            if is_interior and end - start <= gap_threshold:
                processed_labels[start:end] = 1

        return processed_labels

In [None]:
# Initialize multi-threshold detector
# The detector receives the same preprocessing objects and windowing settings used for training.
detector = MultiThresholdDetector(
    model=model,
    scaler=scaler,
    pca=pca,
    ensemble_indices=ensemble_indices,
    seq_length=SEQ_LENGTH,
    stride=STRIDE
)

# Fit detector
# X_train is the raw (unscaled) feature matrix; the detector applies scaling, feature
# selection and PCA itself. Thresholds are therefore derived from the whole training
# file, not only from the validation windows used in section 7.
detector.fit(X_train)

# Print operating points
print("Operating points:")
for point, threshold in detector.operating_points.items():
    print(f"  {point}: {threshold:.6f}")

## 9. Save Model and Detector

In [None]:
# Save model parameters
# Plain-data snapshot of the settings and thresholds needed to score new data.
model_params = {
    'seq_length': SEQ_LENGTH,
    'stride': STRIDE,
    'n_components': pca.n_components_,
    'thresholds': detector.thresholds,
    'operating_points': detector.operating_points,
    'ensemble_features': ensemble_features,
    'ensemble_indices': ensemble_indices
}

with open(os.path.join(MODEL_DIR, 'improved_model_params.pkl'), 'wb') as f:
    pickle.dump(model_params, f)

print(f"Saved model parameters to {os.path.join(MODEL_DIR, 'improved_model_params.pkl')}")

# Save detector
# NOTE(review): the detector holds a reference to the Keras model, so this pickles the
# model too. Whether Keras models pickle cleanly depends on the TensorFlow version —
# confirm, or rebuild the detector from the .h5 file plus improved_model_params.pkl.
# Unpickling also requires the MultiThresholdDetector class to be importable/defined.
with open(os.path.join(MODEL_DIR, 'multi_threshold_detector.pkl'), 'wb') as f:
    pickle.dump(detector, f)

print(f"Saved detector to {os.path.join(MODEL_DIR, 'multi_threshold_detector.pkl')}")

## 10. Test on Sample Data

In [None]:
# Load test data
# NOTE(review): pickle.load can execute arbitrary code; only load project-generated files.
with open(os.path.join(FEATURE_DIR, 'test_data_selected.pkl'), 'rb') as f:
    test_data_selected = pickle.load(f)

# Get first test file
# NOTE(review): test_data is loaded but not used by the evaluation below, which reads
# the processed CSV instead — confirm whether it is still needed.
test_name = list(test_data_selected.keys())[0]
test_data = test_data_selected[test_name]

# Load original test data to get features
# os.listdir returns entries in arbitrary order, so sort to make "the first test file"
# deterministic across machines and runs.
test_files = sorted(
    f for f in os.listdir(OUTPUT_DIR)
    if f.startswith('test') and f.endswith('_processed_enhanced.csv')
)
if not test_files:
    raise FileNotFoundError(f"No 'test*_processed_enhanced.csv' files found in {OUTPUT_DIR}")
test_df = pd.read_csv(os.path.join(OUTPUT_DIR, test_files[0]))

# Extract features and target
# The same feature columns as in training are used so the scaler/PCA see matching inputs.
X_test = test_df[feature_cols].values
y_test = test_df['attack'].values

# Test detector with different operating points
operating_points = ['high_precision', 'balanced', 'high_recall', 'adaptive']
results = {}

for point in operating_points:
    print(f"\nTesting with operating point: {point}")

    # Predict anomalies
    anomaly_scores, anomaly_labels, threshold = detector.predict(X_test, operating_point=point)

    # Apply post-processing
    processed_labels = detector.post_process_anomalies(anomaly_labels, min_anomaly_length=30, gap_threshold=3)

    # Calculate metrics
    # zero_division=0 makes the "no positives predicted / present" case an explicit 0
    # instead of relying on a (globally silenced) warning.
    precision = precision_score(y_test, processed_labels, zero_division=0)
    recall = recall_score(y_test, processed_labels, zero_division=0)
    f1 = f1_score(y_test, processed_labels, zero_division=0)

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Metrics describe the post-processed labels; the scores are the raw per-row scores.
    results[point] = {
        'threshold': threshold,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'anomaly_scores': anomaly_scores,
        'anomaly_labels': processed_labels
    }

In [None]:
# Visualize results for different operating points
# One stacked panel for the ground truth plus one per operating point.
fig, axes = plt.subplots(len(operating_points) + 1, 1, figsize=(15, 12))

# Plot ground truth
truth_ax = axes[0]
truth_ax.plot(y_test, 'b-', label='Ground Truth')
truth_ax.set_title('Ground Truth')
truth_ax.set_ylabel('Anomaly')
truth_ax.set_yticks([0, 1])
truth_ax.grid(True)
truth_ax.legend()

# Plot predictions for each operating point
for i, point in enumerate(operating_points):
    panel = axes[i + 1]
    panel.plot(results[point]['anomaly_labels'], 'g-', label=f'Predictions ({point})')
    panel.set_title(f'Predictions - {point} (Precision: {results[point]["precision"]:.4f}, Recall: {results[point]["recall"]:.4f}, F1: {results[point]["f1_score"]:.4f})')
    panel.set_ylabel('Anomaly')
    panel.set_yticks([0, 1])
    panel.grid(True)
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Plot ROC curve
plt.figure(figsize=(10, 8))

# Calculate ROC curve
# The raw scores do not depend on the operating point, so the 'balanced' entry is used
# for the curve. The curve's thresholds get their own name so they do not overwrite the
# global `thresholds` dict created in the thresholding section.
fpr, tpr, roc_thresholds = roc_curve(y_test, results['balanced']['anomaly_scores'])
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Mark operating points
# Markers sit on the raw-score curve at the nearest curve threshold; they do not
# include the effect of post-processing.
for point in operating_points:
    # Find closest threshold
    idx = np.argmin(np.abs(roc_thresholds - results[point]['threshold']))
    plt.plot(fpr[idx], tpr[idx], 'o', markersize=10, label=f'{point} (threshold={results[point]["threshold"]:.4f})')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Plot Precision-Recall curve
plt.figure(figsize=(10, 8))

# Calculate Precision-Recall curve
# The curve's thresholds get their own name so they do not overwrite the global
# `thresholds` dict created in the thresholding section.
precision_values, recall_values, pr_thresholds = precision_recall_curve(y_test, results['balanced']['anomaly_scores'])
pr_auc = auc(recall_values, precision_values)

# Plot Precision-Recall curve
plt.plot(recall_values, precision_values, color='darkorange', lw=2, label=f'PR curve (area = {pr_auc:.4f})')

# Mark operating points
# These markers use the metrics of the post-processed labels, so they need not lie
# exactly on the raw-score curve.
for point in operating_points:
    plt.plot(results[point]['recall'], results[point]['precision'], 'o', markersize=10, label=f'{point} (threshold={results[point]["threshold"]:.4f})')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.grid(True)
plt.show()

## 11. Save Results

In [None]:
# Save results
# The full per-operating-point results (including score and label arrays) go into a pickle.
results_file = os.path.join(MODEL_DIR, 'operating_point_results.pkl')
with open(results_file, 'wb') as f:
    pickle.dump(results, f)

print(f"Saved operating point results to {os.path.join(MODEL_DIR, 'operating_point_results.pkl')}")

# Create a summary DataFrame
# One row per operating point with only the scalar metrics.
summary = [
    {
        'Operating Point': point,
        'Threshold': results[point]['threshold'],
        'Precision': results[point]['precision'],
        'Recall': results[point]['recall'],
        'F1 Score': results[point]['f1_score']
    }
    for point in operating_points
]

summary_df = pd.DataFrame(summary)
summary_file = os.path.join(MODEL_DIR, 'operating_point_summary.csv')
summary_df.to_csv(summary_file, index=False)

print(f"Saved operating point summary to {os.path.join(MODEL_DIR, 'operating_point_summary.csv')}")
summary_df