# HAI-20.07 LSTM Autoencoder Model

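Anomaly detection using an LSTM autoencoder: the model is trained to reconstruct fixed-length windows of the training data, and test windows whose reconstruction error exceeds a threshold derived from the training errors are flagged as attacks.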

In [None]:
import sys
sys.path.append('..')

import polars as pl
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from pathlib import Path
from sklearn.preprocessing import StandardScaler

from utils.model_utils import ModelManager
from utils.evaluation import Evaluator
from utils.visualization import Visualizer

## 1. Load and Prepare Data

In [None]:
# Load the preprocessed splits: two training files and two test files.
processed_dir = Path('processed_data')

train_df1 = pl.read_parquet(processed_dir / 'train1.parquet')
train_df2 = pl.read_parquet(processed_dir / 'train2.parquet')
test_df1 = pl.read_parquet(processed_dir / 'test1.parquet')
test_df2 = pl.read_parquet(processed_dir / 'test2.parquet')

# Combine training data
train_data = pl.concat([train_df1, train_df2])

# Separate features and labels.
# Columns starting with 'attack' are labels. The 'time' column is only used
# as the time axis for the detection-delay metric, so it must not be fed to
# the model either; if the frames carry no 'time' column this check is a no-op.
feature_cols = [
    col for col in train_data.columns
    if col != 'time' and not col.startswith('attack')
]
X_train = train_data.select(feature_cols).to_numpy()
X_test1 = test_df1.select(feature_cols).to_numpy()
X_test2 = test_df2.select(feature_cols).to_numpy()

# Per-row labels of the two test sets, flattened to 1-D arrays.
y_test1 = test_df1.select('attack').to_numpy().ravel()
y_test2 = test_df2.select('attack').to_numpy().ravel()

# Create sequences
def create_sequences(data, seq_length):
    """Slice ``data`` into overlapping windows of ``seq_length`` consecutive rows.

    Window ``i`` covers rows ``i`` through ``i + seq_length - 1``, and
    consecutive windows are shifted by exactly one row.

    Args:
        data: Array-like whose first axis is the one to window over.
        seq_length: Number of rows per window; must be at least 1.

    Returns:
        A new, writable array of shape
        ``(len(data) - seq_length + 1, seq_length, *data.shape[1:])``.
        When ``data`` has fewer than ``seq_length`` rows the first dimension
        is 0 but the remaining dimensions are kept, so downstream code still
        sees the expected rank.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    # Local import so the function does not depend on the notebook's import cell.
    from numpy.lib.stride_tricks import sliding_window_view

    if seq_length < 1:
        raise ValueError(f"seq_length must be at least 1, got {seq_length}")

    data = np.asarray(data)
    if len(data) < seq_length:
        # sliding_window_view rejects windows longer than the axis.
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)

    # The view has the window axis last: (n_windows, *features, seq_length).
    windows = sliding_window_view(data, seq_length, axis=0)
    # Move the window axis next to the batch axis and materialise a copy, so
    # the result is an independent C-contiguous array rather than a read-only view.
    return np.array(np.moveaxis(windows, -1, 1), order='C')

# Windowing parameters
sequence_length = 60  # 1 minute of data (assuming 1 Hz sampling)
n_features = X_train.shape[1]  # width of the feature matrix

# Cut every split into overlapping windows of `sequence_length` rows.
# NOTE(review): X_train is the concatenation of two files, so the windows that
# straddle the join mix rows from both files - confirm this is acceptable.
X_train_seq, X_test1_seq, X_test2_seq = (
    create_sequences(split, sequence_length)
    for split in (X_train, X_test1, X_test2)
)

# Copy the windows into float32 PyTorch tensors.
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
X_test1_tensor = torch.tensor(X_test1_seq, dtype=torch.float32)
X_test2_tensor = torch.tensor(X_test2_seq, dtype=torch.float32)

# Each dataset pairs a window with itself, because the autoencoder's target
# is its own input. Only the training loader is shuffled; the test loaders
# keep row order.
train_dataset = TensorDataset(X_train_tensor, X_train_tensor)
test1_dataset = TensorDataset(X_test1_tensor, X_test1_tensor)
test2_dataset = TensorDataset(X_test2_tensor, X_test2_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test1_loader = DataLoader(test1_dataset, batch_size=32, shuffle=False)
test2_loader = DataLoader(test2_dataset, batch_size=32, shuffle=False)

## 2. Model Definition

In [None]:
class LSTMAutoencoder(nn.Module):
    """Autoencoder made of two stacked, batch-first LSTMs.

    The encoder maps every time step from ``input_size`` features to
    ``hidden_size`` features; the decoder maps that per-step encoding back
    to ``input_size`` features. Both LSTMs use ``num_layers`` layers.

    NOTE(review): the reconstruction is the decoder LSTM's hidden state, which
    an LSTM bounds to (-1, 1). Inputs outside that range cannot be reproduced
    exactly - confirm the preprocessed features are scaled accordingly.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()

        # Keep the configuration readable from the instance.
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Options common to both recurrent stacks.
        shared = {'num_layers': num_layers, 'batch_first': True}

        # Encoder: input_size -> hidden_size
        self.encoder = nn.LSTM(input_size, hidden_size, **shared)
        # Decoder: hidden_size -> input_size
        self.decoder = nn.LSTM(hidden_size, input_size, **shared)

    def forward(self, x):
        """Return the reconstruction of ``x``; the final LSTM states are discarded."""
        latent, _ = self.encoder(x)
        reconstruction, _ = self.decoder(latent)
        return reconstruction

# Build the autoencoder: one input per feature column, 64 hidden units,
# two stacked LSTM layers.
model = LSTMAutoencoder(n_features, hidden_size=64, num_layers=2)

# Reconstruction quality is scored with mean squared error; Adam updates
# all model parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

## 3. Model Training

In [None]:
# Training setup: number of passes over the data, and the device to run on
# (GPU when one is available, CPU otherwise).
n_epochs = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Mean training loss of every epoch, in order.
history = {'train_loss': []}

for epoch in range(n_epochs):
    model.train()
    batch_losses = []

    for inputs, _ in train_loader:
        inputs = inputs.to(device)

        # Clear the gradients left over from the previous step, then
        # reconstruct the batch and score it against itself.
        optimizer.zero_grad()
        reconstruction = model(inputs)
        batch_loss = criterion(reconstruction, inputs)

        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    # Mean of the per-batch losses for this epoch.
    epoch_loss = sum(batch_losses) / len(train_loader)
    history['train_loss'].append(epoch_loss)

    # Report progress every tenth epoch.
    epoch_number = epoch + 1
    if epoch_number % 10 == 0:
        print(f'Epoch [{epoch_number}/{n_epochs}], Loss: {epoch_loss:.4f}')

## 4. Anomaly Detection

In [None]:
def get_reconstruction_errors(model, dataloader, device):
    """Compute one mean-squared reconstruction error per sequence.

    Puts ``model`` into evaluation mode and runs every batch of
    ``dataloader`` through it with gradient tracking disabled.

    Args:
        model: Module that maps a batch to a reconstruction of the same shape.
        dataloader: Iterable of ``(batch, _)`` pairs; the second item is ignored.
        device: Device each batch is moved to before the forward pass.

    Returns:
        1-D NumPy array with one error per sequence, in iteration order.
        Empty when ``dataloader`` yields no batches.
    """
    model.eval()
    per_batch = []

    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            residual = inputs - model(inputs)

            # Average the squared residual over axes 1 and 2 so that a
            # single value remains per sequence.
            batch_errors = (residual ** 2).mean(dim=(1, 2))
            per_batch.append(batch_errors.cpu().numpy())

    if not per_batch:
        return np.array([])
    return np.concatenate(per_batch)

# Per-sequence reconstruction errors for the training set and both test sets.
train_errors, test1_errors, test2_errors = (
    get_reconstruction_errors(model, loader, device)
    for loader in (train_loader, test1_loader, test2_loader)
)

# Decision threshold: the 95th percentile of the training errors.
threshold = np.percentile(train_errors, 95)

# A sequence is predicted as an attack (1) when its error is above the
# threshold, otherwise as normal (0).
y_pred1 = np.greater(test1_errors, threshold).astype(int)
y_pred2 = np.greater(test2_errors, threshold).astype(int)

# There is one prediction per window, so drop the first
# `sequence_length - 1` labels; each window is then paired with the label
# of its last row.
label_offset = sequence_length - 1
y_test1_adj = y_test1[label_offset:]
y_test2_adj = y_test2[label_offset:]

## 5. Performance Evaluation

In [None]:
# Evaluator shared by both test sets.
evaluator = Evaluator()


def _score_split(frame, y_true, y_pred):
    """Return (basic metrics, eTaPR metrics, detection delay) for one test set.

    The 'time' column of ``frame`` is trimmed by ``sequence_length - 1`` rows
    so that it lines up with the window-level labels and predictions.
    """
    basic = evaluator.calculate_basic_metrics(y_true, y_pred)
    etapr = evaluator.calculate_etapr(y_true, y_pred)
    delay = evaluator.calculate_detection_delay(
        y_true,
        y_pred,
        frame.select('time').to_numpy().ravel()[sequence_length-1:],
    )
    return basic, etapr, delay


def _report(heading, basic, etapr, delay):
    """Print the three result groups of one test set under ``heading``."""
    print(heading)
    print(f"Basic Metrics: {basic}")
    print(f"eTaPR Metrics: {etapr}")
    print(f"Detection Delay: {delay}")


# Score test set 1, then test set 2.
metrics1, etapr1, delay1 = _score_split(test_df1, y_test1_adj, y_pred1)
metrics2, etapr2, delay2 = _score_split(test_df2, y_test2_adj, y_pred2)

# Print results
_report("Test Set 1 Results:", metrics1, etapr1, delay1)
_report("\nTest Set 2 Results:", metrics2, etapr2, delay2)

## 6. Visualization

In [None]:
# `go.Figure` and `go.Scatter` below come from Plotly's graph-objects API.
# It is imported in this cell because the notebook's import cell does not
# provide `go`; without it the first plot raises NameError.
import plotly.graph_objects as go

# Initialize visualizer; 'figures' is passed as its save directory.
visualizer = Visualizer(save_dir='figures')

# Plot training history: mean training loss per epoch.
fig = go.Figure()
fig.add_trace(go.Scatter(y=history['train_loss'], name='Training Loss'))
fig.update_layout(title='Training History', xaxis_title='Epoch', yaxis_title='Loss')
fig.show()

# Plot confusion matrix for test set 1 (window-level labels vs. predictions).
cm1 = evaluator.calculate_confusion_matrix(y_test1_adj, y_pred1)
fig = visualizer.plot_confusion_matrix(cm1, ['Normal', 'Attack'], title='Test Set 1 Confusion Matrix')
fig.show()

# Plot reconstruction errors of test set 1, with the decision threshold
# drawn as a dashed red horizontal line.
fig = go.Figure()
fig.add_trace(go.Scatter(y=test1_errors, name='Reconstruction Error'))
fig.add_hline(y=threshold, line_dash='dash', line_color='red', name='Threshold')
fig.update_layout(title='Reconstruction Errors (Test Set 1)', showlegend=True)
fig.show()

## 7. Save Model

In [None]:
# Initialize model manager; 'models' is passed as its base directory.
model_manager = ModelManager(base_dir='models')

# Prepare metadata.
# The architecture values are read back from the model instance rather than
# repeated as literals, so the saved record always matches the model that
# is actually being saved.
metadata = {
    'model_type': 'lstm_autoencoder',
    'dataset_version': '20.07',
    'parameters': {
        'input_size': model.input_size,
        'hidden_size': model.hidden_size,
        'num_layers': model.num_layers,
        'sequence_length': sequence_length,
        # Stored as a plain Python float instead of a NumPy scalar.
        'threshold': float(threshold)
    },
    'performance': {
        'test1': {
            'basic_metrics': metrics1,
            'etapr_metrics': etapr1,
            'detection_delay': delay1
        },
        'test2': {
            'basic_metrics': metrics2,
            'etapr_metrics': etapr2,
            'detection_delay': delay2
        }
    },
    'training_history': history
}

# Save model together with its metadata under name 'lstm_autoencoder', version 'v1'.
model_manager.save_torch_model(
    model=model,
    model_name='lstm_autoencoder',
    version='v1',
    metadata=metadata
)