# HAI Dataset Test Analysis and Anomaly Detection

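This notebook trains two anomaly detection models, an Isolation Forest and an LSTM autoencoder, on the processed HAI training data and analyzes their anomaly scores against the attack labels in the test data.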

In [None]:
import polars as pl
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import joblib

# Set plot style.
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and removed
# in 3.8; the same style sheet is now shipped as 'seaborn-v0_8'. Prefer the
# new name and fall back to the old one so the notebook runs on either side
# of that change. If neither is available the default style is kept.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Set random seeds for NumPy's global RNG and for PyTorch
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# Input and output locations, given relative to the current working directory:
# PROCESSED_DIR holds the parquet files that are read, MODELS_DIR receives the
# fitted model artifacts.
PROCESSED_DIR = Path('..') / 'data' / 'processed'
MODELS_DIR = Path('..', 'models')

In [None]:
# Load processed data.
# Path.glob does not guarantee any ordering, so the files are sorted to make
# the row order of the concatenated frames reproducible from run to run.
# NOTE(review): sorting is lexicographic on the path; this assumes file-name
# order matches the intended chronological order - confirm against the names
# produced by the preprocessing step.
train_files = sorted(PROCESSED_DIR.glob('train_*.parquet'))
test_files = sorted(PROCESSED_DIR.glob('test_*.parquet'))

# Fail early with a message that names the directory instead of letting
# pl.concat fail on an empty list.
for _split_name, _split_files in (('train', train_files), ('test', test_files)):
    if not _split_files:
        raise FileNotFoundError(
            f"No {_split_name}_*.parquet files found in {PROCESSED_DIR.resolve()}"
        )

train_dfs = [pl.read_parquet(f) for f in train_files]
test_dfs = [pl.read_parquet(f) for f in test_files]

train_df = pl.concat(train_dfs)
test_df = pl.concat(test_dfs)

In [None]:
# LSTM Autoencoder Model
class LSTMAutoencoder(nn.Module):
    """Autoencoder for sequences built from two stacked LSTMs.

    The encoder LSTM maps every timestep from ``input_size`` features to
    ``hidden_size`` features. The decoder LSTM consumes the encoder's
    per-timestep outputs and maps them back to ``input_size`` features.
    Both LSTMs are created with ``batch_first=True``, so tensors are laid
    out as (batch, time, features).

    Args:
        input_size: Number of features per timestep.
        hidden_size: Hidden width of the encoder (and input width of the
            decoder).
        num_layers: Number of stacked layers in each of the two LSTMs.

    NOTE(review): the reconstruction is the decoder LSTM's raw output, which
    is a gated tanh and therefore lies strictly inside (-1, 1). Inputs
    outside that range cannot be reconstructed exactly.
    """

    def __init__(self, input_size, hidden_size, num_layers=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Options common to both halves of the autoencoder.
        lstm_options = dict(num_layers=num_layers, batch_first=True)

        # Creation order (encoder, then decoder) is kept so that seeded
        # weight initialisation draws the same random numbers.
        self.encoder = nn.LSTM(input_size, hidden_size, **lstm_options)
        self.decoder = nn.LSTM(hidden_size, input_size, **lstm_options)

    def forward(self, x):
        """Encode ``x`` and decode it again; the result has the shape of ``x``."""
        latent_sequence = self.encoder(x)[0]   # drop the (h_n, c_n) state
        reconstruction = self.decoder(latent_sequence)[0]
        return reconstruction

In [None]:
# TCN Model
class TemporalBlock(nn.Module):
    """Causal dilated 1-D convolution followed by ReLU and dropout.

    ``nn.Conv1d`` applies ``padding`` to BOTH ends of the time axis. Without
    correction the output is longer than the input and every output step can
    see future input steps. This block therefore trims the last ``padding``
    steps of the convolution output (the "chomp" of the reference TCN
    implementation). With ``stride=1`` and
    ``padding=(kernel_size - 1) * dilation`` the output then has the same
    length as the input and step ``t`` depends only on inputs ``<= t``.

    Args:
        n_inputs: Number of input channels.
        n_outputs: Number of output channels.
        kernel_size: Convolution kernel width.
        stride: Convolution stride. The trimming is only length-preserving
            for ``stride=1``.
        dilation: Spacing between kernel taps.
        padding: Zero padding handed to ``nn.Conv1d``; also the number of
            trailing steps removed from its output.

    Input and output are laid out as (batch, channels, time).
    """

    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding):
        super().__init__()
        self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size,
                              stride=stride, padding=padding, dilation=dilation)
        # Number of right-hand steps to drop after the convolution. This is a
        # plain attribute, so the module's state_dict is unchanged.
        self.chomp_size = padding
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Apply convolution, trim the look-ahead steps, then ReLU and dropout."""
        out = self.conv1(x)
        # Guard against chomp_size == 0: ``out[..., :-0]`` would be empty.
        if self.chomp_size > 0:
            out = out[:, :, :-self.chomp_size].contiguous()
        return self.dropout(self.relu(out))

class TCN(nn.Module):
    """Stack of ``TemporalBlock`` layers followed by a per-timestep linear head.

    Level ``i`` of the stack uses dilation ``2 ** i`` and padding
    ``(kernel_size - 1) * 2 ** i``. Its input width is ``input_size`` for the
    first level and the previous level's channel count otherwise.

    Args:
        input_size: Number of input channels.
        output_size: Number of features produced per timestep by the linear
            head.
        num_channels: Output channel count of each level, in order; its
            length sets the number of levels.
        kernel_size: Kernel width shared by all levels.
    """

    def __init__(self, input_size, output_size, num_channels, kernel_size=2):
        super().__init__()
        # widths[i] -> widths[i + 1] is the channel mapping of level i.
        widths = [input_size, *num_channels]
        blocks = [
            TemporalBlock(
                widths[level], widths[level + 1], kernel_size,
                stride=1,
                dilation=2 ** level,
                padding=(kernel_size - 1) * 2 ** level,
            )
            for level in range(len(num_channels))
        ]
        self.network = nn.Sequential(*blocks)
        self.linear = nn.Linear(num_channels[-1], output_size)

    def forward(self, x):
        """Run the convolution stack, then apply the linear head per timestep.

        The stack works on (batch, channels, time). Axes 1 and 2 are swapped
        before the linear layer, so the result is (batch, time, output_size).
        """
        features = self.network(x)
        return self.linear(features.transpose(1, 2))

In [None]:
# Prepare data for models
def prepare_sequences(df, sequence_length=60, scaler=None):
    """Scale the feature columns of ``df`` and cut them into sliding windows.

    Feature columns are those whose name contains ``'_error'``,
    ``'_PV_mean_'`` or ``'_PV_std_'``. Consecutive windows are offset by one
    row, so ``len(df) - sequence_length + 1`` windows are produced.

    Args:
        df: Frame exposing ``columns`` and ``select(cols).to_numpy()``.
        sequence_length: Number of consecutive rows per window.
        scaler: Optional already-fitted scaler. When given, it is applied with
            ``transform`` and returned unchanged, so that evaluation data can
            be scaled with the statistics learned from the training data.
            When ``None`` (the default, and the original behaviour) a new
            ``StandardScaler`` is fitted on ``df`` itself.

    Returns:
        ``(sequences, scaler)`` where ``sequences`` has shape
        ``(n_windows, sequence_length, n_features)``. If ``df`` has fewer than
        ``sequence_length`` rows, ``n_windows`` is 0 but the array is still
        three-dimensional.

    Raises:
        ValueError: If no column of ``df`` matches the feature patterns.
    """
    # Select features
    feature_cols = [
        col for col in df.columns
        if any(x in col for x in ['_error', '_PV_mean_', '_PV_std_'])
    ]
    if not feature_cols:
        raise ValueError(
            "No feature columns found: expected column names containing "
            "'_error', '_PV_mean_' or '_PV_std_'"
        )

    # Convert to numpy and scale
    data = df.select(feature_cols).to_numpy()
    if scaler is None:
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(data)
    else:
        # Reuse the caller's statistics instead of re-fitting on this data.
        scaled_data = scaler.transform(data)

    # Create sequences
    n_windows = len(scaled_data) - sequence_length + 1
    if n_windows <= 0:
        # Keep the result 3-D so callers can still read ``.shape[2]``.
        return np.empty((0, sequence_length, len(feature_cols))), scaler

    sequences = np.array([
        scaled_data[start:start + sequence_length]
        for start in range(n_windows)
    ])
    return sequences, scaler

In [None]:
# Train models
# 1. Isolation Forest
# NOTE(review): pl.all() feeds every column of train_df to the forest. test_df
# exposes 'attack' label columns further down this notebook; if train_df has
# the same columns they are being used as features here - confirm.
isolation_forest = IsolationForest(random_state=42, contamination=0.1)
isolation_forest.fit(train_df.select(pl.all()).to_numpy())

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
if_model_dir = MODELS_DIR / 'isolation_forest'
if_model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(isolation_forest, if_model_dir / 'model.joblib')

# 2. LSTM Autoencoder
sequences, scaler = prepare_sequences(train_df)
if sequences.ndim != 3 or len(sequences) == 0:
    # Without at least one full window there is nothing to train on, and
    # sequences.shape[2] below would fail with a less helpful message.
    raise ValueError(
        "train_df is too short to build a single training sequence"
    )
input_size = sequences.shape[2]
hidden_size = 32

lstm_autoencoder = LSTMAutoencoder(input_size, hidden_size)
lstm_autoencoder.train()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(lstm_autoencoder.parameters())

# Convert to a float32 torch tensor
train_sequences = torch.as_tensor(sequences, dtype=torch.float32)

# Training loop: full passes over the windows in their original order
n_epochs = 50
batch_size = 32

for epoch in range(n_epochs):
    epoch_loss_sum = 0.0
    for i in range(0, len(train_sequences), batch_size):
        batch = train_sequences[i:i+batch_size]

        optimizer.zero_grad()
        outputs = lstm_autoencoder(batch)
        loss = criterion(outputs, batch)
        loss.backward()
        optimizer.step()

        # Weight by batch length so the (possibly shorter) last batch does
        # not distort the epoch average.
        epoch_loss_sum += loss.item() * len(batch)

    if (epoch + 1) % 10 == 0:
        # Report the mean loss over the whole epoch rather than the loss of
        # whichever batch happened to come last.
        epoch_loss = epoch_loss_sum / len(train_sequences)
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {epoch_loss:.4f}')

# torch.save / joblib.dump need the target folder to exist.
lstm_model_dir = MODELS_DIR / 'lstm_autoencoder'
lstm_model_dir.mkdir(parents=True, exist_ok=True)
torch.save(lstm_autoencoder.state_dict(), lstm_model_dir / 'model.pth')
joblib.dump(scaler, lstm_model_dir / 'scaler.joblib')

In [None]:
# Evaluate models on test data
def get_anomaly_scores(model, data, batch_size=1024):
    """Return one anomaly score per sample; larger means more anomalous.

    Args:
        model: A fitted ``IsolationForest`` or an ``LSTMAutoencoder``.
        data: For ``IsolationForest`` the 2-D sample matrix passed to
            ``score_samples``. For ``LSTMAutoencoder`` an array of sequences
            laid out as (n_sequences, time, features).
        batch_size: Number of sequences reconstructed per forward pass of the
            autoencoder. Only affects memory use, not the returned values
            beyond floating-point rounding.

    Returns:
        A 1-D NumPy array. For ``IsolationForest`` it is the negated
        ``score_samples`` output. For ``LSTMAutoencoder`` it is the mean
        squared reconstruction error of each sequence.

    Raises:
        TypeError: If ``model`` is neither of the supported types (the
            previous behaviour was to return ``None`` silently).
    """
    if isinstance(model, IsolationForest):
        # score_samples is higher for normal points; flip the sign so that
        # higher always means "more anomalous".
        return -model.score_samples(data)

    if isinstance(model, LSTMAutoencoder):
        # Convert once instead of once per use.
        inputs = torch.as_tensor(data, dtype=torch.float32)
        model.eval()
        chunk_errors = []
        with torch.no_grad():
            # Reconstruct in chunks so a large test set does not have to be
            # pushed through the LSTM in a single pass.
            for start in range(0, len(inputs), batch_size):
                chunk = inputs[start:start + batch_size]
                reconstructed = model(chunk)
                chunk_errors.append(
                    torch.mean((chunk - reconstructed) ** 2, dim=(1, 2))
                )
        if not chunk_errors:
            return np.empty(0, dtype=np.float32)
        return torch.cat(chunk_errors).numpy()

    raise TypeError(
        f"Unsupported model type for anomaly scoring: {type(model).__name__}"
    )

# Get anomaly scores for both models on the test split.
# NOTE(review): pl.all() selects every column of test_df, and test_df carries
# 'attack' label columns (they are read later in this notebook), so the labels
# are part of the Isolation Forest's input here - confirm that is intended.
test_matrix = test_df.select(pl.all()).to_numpy()
if_scores = get_anomaly_scores(isolation_forest, test_matrix)

# NOTE(review): prepare_sequences is called without the scaler from training,
# so the scaling applied here is derived from test_df alone. Also note that it
# returns one sequence per sliding window, so lstm_scores has fewer entries
# than test_df has rows.
test_sequences, _ = prepare_sequences(test_df)
lstm_scores = get_anomaly_scores(lstm_autoencoder, test_sequences)

In [None]:
# Plot results: one panel per model, stacked vertically.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# (scores, panel title, y-axis label) for each panel, top to bottom.
score_panels = (
    (if_scores, 'Isolation Forest Anomaly Scores', 'Anomaly Score'),
    (lstm_scores, 'LSTM Autoencoder Reconstruction Error', 'Reconstruction Error'),
)

for panel_ax, (panel_scores, panel_title, panel_ylabel) in zip(axes, score_panels):
    panel_ax.plot(panel_scores)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Time')
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Analyze attack detection performance
def plot_attack_detection(scores, attack_labels, title):
    """Plot anomaly scores over time with attack periods shaded in red.

    Args:
        scores: 1-D sequence of anomaly scores, one per time step.
        attack_labels: 1-D sequence of the same length as ``scores``;
            entries equal to 1 mark attack time steps.
        title: Figure title.

    Raises:
        ValueError: If ``scores`` is empty or its length differs from that of
            ``attack_labels``. This is checked before any figure is created,
            so a mismatch does not leave a half-drawn plot behind.
    """
    scores = np.asarray(scores)
    attack_mask = np.asarray(attack_labels) == 1

    if scores.size == 0:
        raise ValueError("scores is empty; nothing to plot")
    if len(scores) != len(attack_mask):
        raise ValueError(
            f"scores has {len(scores)} entries but attack_labels has "
            f"{len(attack_mask)}; they must be aligned one-to-one"
        )

    plt.figure(figsize=(15, 6))

    # Plot scores
    plt.plot(scores, label='Anomaly Score', alpha=0.7)

    # Highlight attack periods: shade the full score range wherever the mask
    # is set.
    plt.fill_between(np.arange(len(scores)),
                     scores.min(), scores.max(),
                     where=attack_mask,
                     color='red', alpha=0.3,
                     label='Attack Period')

    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel('Anomaly Score')
    plt.legend()
    plt.show()

# Get attack labels (one per row of test_df)
attack_labels = test_df.select('attack').to_numpy().flatten()

# lstm_scores holds one value per sliding window rather than one per row, so
# it is shorter than attack_labels and cannot be plotted against it directly.
# Window i ends at row i + (window length - 1), so dropping the leading labels
# pairs every window with the label of its last row.
lstm_attack_labels = attack_labels[len(attack_labels) - len(lstm_scores):]

# Plot detection results
plot_attack_detection(if_scores, attack_labels, 'Isolation Forest Attack Detection')
plot_attack_detection(lstm_scores, lstm_attack_labels, 'LSTM Autoencoder Attack Detection')

In [None]:
# Analyze detection performance by control loop: one scatter plot of the
# Isolation Forest scores per loop, coloured by that loop's attack column.
control_loops = ['P1-PC', 'P1-LC', 'P1-FC', 'P1-TC']

for loop in control_loops:
    # The label column is the lower-cased loop name plus '_attack'.
    label_column = f'{loop.lower()}_attack'
    loop_attacks = test_df.select(label_column).to_numpy().flatten()

    time_index = range(len(if_scores))

    plt.figure(figsize=(15, 6))
    plt.scatter(
        time_index,
        if_scores,
        c=loop_attacks,
        cmap='coolwarm',
        alpha=0.6,
    )

    plt.title(f'Anomaly Detection Performance for {loop}')
    plt.xlabel('Time')
    plt.ylabel('Anomaly Score')
    plt.colorbar(label='Attack')
    plt.show()