# HAI-22.04 Anomaly Detection Model Training

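This notebook trains anomaly detection models on the HAI-22.04 dataset. Each HAI-22.04 record has 88 columns, 86 of which are process data points. The training data does not contain attack labels, while the test data includes them, so every model is fitted on the training data without labels and evaluated against the labelled test data.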

We will train the following models:
1. Isolation Forest
2. PCA Reconstruction Error
3. LSTM Autoencoder
4. Variational Autoencoder (VAE)
5. Ensemble Model

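The deep learning models (LSTM Autoencoder and VAE) use GPU acceleration when a CUDA device is available. Each trained model is saved under `models/`; the ensemble is a vote over the other models' predictions and has no saved artifact of its own.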

## 1. Import Required Libraries

In [None]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os
import time
import pickle
from datetime import datetime
from tqdm import tqdm

# Machine learning libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc

# Deep learning libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Reproducibility: fix the PyTorch and NumPy random seeds
torch.manual_seed(42)
np.random.seed(42)

# Plot appearance. Order matters: the seaborn theme is applied after the
# matplotlib style, and the explicit rcParams below are applied last.
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

## 2. Check GPU Availability

In [None]:
# Select the compute device: the first CUDA GPU when one is visible, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Report the GPU model and its total memory (in GiB) when running on CUDA
if device.type == 'cuda':
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {gpu_properties.total_memory / 1024**3:.2f} GB")

## 3. Load Dataset

In [None]:
# Location of the HAI-22.04 CSV files, relative to the notebook
data_path = '../hai-security-dataset/hai-22.04/'

# Trained models are written to ./models; create it up front
os.makedirs('models', exist_ok=True)

# Discover the training and test CSVs; sorting keeps the load order stable
train_pattern = os.path.join(data_path, 'train*.csv')
test_pattern = os.path.join(data_path, 'test*.csv')
train_files = sorted(glob(train_pattern))
test_files = sorted(glob(test_pattern))

train_names = [os.path.basename(path) for path in train_files]
test_names = [os.path.basename(path) for path in test_files]
print(f"Training files: {train_names}")
print(f"Test files: {test_names}")

In [None]:
# Load training data
# Fail early with an actionable message: with no files, pd.concat([]) would
# otherwise raise an opaque "No objects to concatenate" error further down.
if not train_files:
    raise FileNotFoundError(
        f"No training files matching 'train*.csv' were found under {data_path!r}"
    )

train_dfs = []
for csv_path in train_files:
    print(f"Loading {os.path.basename(csv_path)}...")
    # HAI-22.04 uses comma as separator, which is read_csv's default
    df = pd.read_csv(csv_path)
    train_dfs.append(df)
    print(f"Shape: {df.shape}")

# Stack all files row-wise into one frame with a fresh 0..n-1 index
train_df = pd.concat(train_dfs, axis=0, ignore_index=True)
print(f"\nCombined training data shape: {train_df.shape}")

In [None]:
# Load test data
# Fail early with an actionable message: with no files, pd.concat([]) would
# otherwise raise an opaque "No objects to concatenate" error further down.
if not test_files:
    raise FileNotFoundError(
        f"No test files matching 'test*.csv' were found under {data_path!r}"
    )

test_dfs = []
for csv_path in test_files:
    print(f"Loading {os.path.basename(csv_path)}...")
    # HAI-22.04 uses comma as separator, which is read_csv's default
    df = pd.read_csv(csv_path)
    test_dfs.append(df)
    print(f"Shape: {df.shape}")

# Stack all files row-wise into one frame with a fresh 0..n-1 index
test_df = pd.concat(test_dfs, axis=0, ignore_index=True)
print(f"\nCombined test data shape: {test_df.shape}")

## 4. Data Exploration and Preprocessing

In [None]:
# List every column present in the combined training frame
train_column_names = list(train_df.columns)
print("Training data column names:")
print(train_column_names)

In [None]:
# List every column present in the combined test frame
test_column_names = list(test_df.columns)
print("Test data column names:")
print(test_column_names)

In [None]:
# Total number of missing cells across all columns of each frame
n_missing_train = int(train_df.isna().to_numpy().sum())
n_missing_test = int(test_df.isna().to_numpy().sum())

print("Missing values in training data:")
print(n_missing_train)

print("\nMissing values in test data:")
print(n_missing_test)

In [None]:
# Parse the 'timestamp' column of both frames into datetime values
for frame in (train_df, test_df):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])

# Report the period covered by each split
train_times = train_df['timestamp']
test_times = test_df['timestamp']
print(f"Training data time range: {train_times.min()} to {train_times.max()}")
print(f"Test data time range: {test_times.min()} to {test_times.max()}")

In [None]:
# Label columns are the test columns whose name contains "attack" (any case)
attack_columns = [name for name in test_df.columns if 'attack' in name.lower()]
print(f"Attack label columns: {attack_columns}")

# For each label column, report the flagged row count and its share of the test set
n_test_rows = len(test_df)
for col in attack_columns:
    attack_count = test_df[col].sum()
    attack_percentage = (attack_count / n_test_rows) * 100
    print(f"{col}: {attack_count} attacks ({attack_percentage:.2f}% of data)")

In [None]:
# Draw the 'Attack' label over time to show where attacks fall in the test set
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(test_df['timestamp'], test_df['Attack'])
ax.set_title('Attack Distribution')
ax.set_xlabel('Time')
ax.set_ylabel('Attack (1) / Normal (0)')
plt.show()

In [None]:
# Model inputs are every training column except the timestamp and any
# column whose name contains "attack" (case-insensitive)
feature_columns = [
    col for col in train_df.columns
    if not (col == 'timestamp' or 'attack' in col.lower())
]

# Feature matrices for both splits, using the same column order
X_train = train_df[feature_columns].to_numpy()
X_test = test_df[feature_columns].to_numpy()

# Ground-truth labels exist only for the test split
y_test = test_df['Attack'].to_numpy()

print(f"Training features shape: {X_train.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both splits so the test set never influences the statistics
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler alongside the models
with open('models/hai_22_04_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## 5. Model Training and Evaluation

### 5.1 Isolation Forest

In [None]:
# Fit an Isolation Forest on the scaled training features and time the fit
print("Training Isolation Forest model...")
start_time = time.time()

iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.1,
    random_state=42,
    n_jobs=-1,
)
iso_forest.fit(X_train_scaled)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Persist the fitted model
with open('models/hai_22_04_isolation_forest.pkl', 'wb') as model_file:
    pickle.dump(iso_forest, model_file)
print("Model saved")

In [None]:
# Predict on the test set.
# IsolationForest.predict returns +1 for inliers and -1 for outliers; remap
# to the notebook's convention of 0 = normal, 1 = anomaly.
is_outlier = iso_forest.predict(X_test_scaled) == -1
y_pred_iso = is_outlier.astype(int)

# Score the predictions against the ground-truth attack labels
precision = precision_score(y_test, y_pred_iso)
recall = recall_score(y_test, y_pred_iso)
f1 = f1_score(y_test, y_pred_iso)

print(f"Isolation Forest model evaluation:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion matrix heatmap (rows: actual class, columns: predicted class)
class_names = ['Normal', 'Anomaly']
cm = confusion_matrix(y_test, y_pred_iso)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Isolation Forest Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### 5.2 PCA Reconstruction Error

In [None]:
# Fit PCA on the scaled training features and time the fit
print("Training PCA model...")
start_time = time.time()

# A fractional n_components keeps as many components as are needed to
# explain 95% of the variance
pca = PCA(n_components=0.95, random_state=42).fit(X_train_scaled)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")
print(f"Number of principal components selected: {pca.n_components_}")

# Persist the fitted model
with open('models/hai_22_04_pca.pkl', 'wb') as model_file:
    pickle.dump(pca, model_file)
print("Model saved")

In [None]:
# Calculate reconstruction error
def reconstruction_error(pca, X):
    """Return the per-row mean squared error of reconstructing ``X``.

    Each row is projected with ``pca.transform`` and mapped back with
    ``pca.inverse_transform``; the error is the mean of the squared
    differences across that row's features.

    Args:
        pca: object exposing ``transform`` and ``inverse_transform``.
        X: 2-D array of samples (rows) by features (columns).

    Returns:
        1-D array with one error value per row of ``X``.
    """
    X_reconstructed = pca.inverse_transform(pca.transform(X))
    residual = X - X_reconstructed
    return np.square(residual).mean(axis=1)

# Reconstruction error of every training and test sample
train_error = reconstruction_error(pca, X_train_scaled)
test_error = reconstruction_error(pca, X_test_scaled)

# Anomaly threshold: the 95th percentile of the training errors
threshold = np.percentile(train_error, 95)
print(f"Threshold: {threshold:.6f}")

# A test sample is flagged (1) when its error exceeds the threshold
exceeds_threshold = test_error > threshold
y_pred_pca = exceeds_threshold.astype(int)

# Score the predictions against the ground-truth attack labels
precision = precision_score(y_test, y_pred_pca)
recall = recall_score(y_test, y_pred_pca)
f1 = f1_score(y_test, y_pred_pca)

print(f"PCA Reconstruction Error model evaluation:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion matrix heatmap (rows: actual class, columns: predicted class)
class_names = ['Normal', 'Anomaly']
cm = confusion_matrix(y_test, y_pred_pca)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('PCA Reconstruction Error Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### 5.3 LSTM Autoencoder

In [None]:
# Define LSTM Autoencoder model
class LSTMAutoencoder(nn.Module):
    """LSTM autoencoder that reconstructs a window of feature vectors.

    The encoder LSTM summarises the input window into the final hidden state
    of its last layer. That vector is repeated once per time step and fed to
    the decoder LSTM, whose outputs are mapped back to feature space by a
    linear layer.

    The linear head is needed because an LSTM emits ``o * tanh(c)``, which is
    confined to (-1, 1); using the decoder LSTM's output directly as the
    reconstruction cannot match standardized features, which are unbounded.

    NOTE: the state dict contains ``output_layer.*`` entries, so checkpoints
    saved from a variant without the linear head will not load into this class.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state (the latent vector).
        num_layers: number of stacked LSTM layers in the encoder and decoder.
        dropout: dropout between stacked LSTM layers; ignored when
            ``num_layers`` is 1, where nn.LSTM would only warn about it.
    """

    def __init__(self, input_dim, hidden_dim, num_layers=1, dropout=0.2):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # nn.LSTM applies dropout only between layers, so it is meaningless
        # for a single-layer network
        lstm_dropout = dropout if num_layers > 1 else 0

        # Encoder: features -> hidden representation
        self.encoder = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout
        )

        # Decoder: stays in hidden space; projection to features happens below
        self.decoder = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout
        )

        # Unbounded linear projection from hidden space back to feature space
        self.output_layer = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Encode: keep only the final hidden states, one per layer
        _, (hidden, _) = self.encoder(x)

        # Last layer's hidden state, repeated for every time step of the window
        latent = hidden[-1]
        decoder_input = latent.unsqueeze(1).repeat(1, x.size(1), 1)

        # Decode and project back to feature space
        decoded, _ = self.decoder(decoder_input)
        return self.output_layer(decoded)

In [None]:
# Prepare time series data
def create_sequences(data, seq_length):
    """Slice ``data`` into overlapping windows of ``seq_length`` consecutive rows.

    Window ``i`` equals ``data[i:i + seq_length]``; consecutive windows are
    shifted by one row.

    Args:
        data: array-like whose first axis is time.
        seq_length: number of rows per window; must be at least 1.

    Returns:
        Array of shape ``(len(data) - seq_length + 1, seq_length, *data.shape[1:])``
        with the dtype of ``data``. When ``data`` has fewer than ``seq_length``
        rows the result has zero windows but keeps the trailing dimensions.

    Raises:
        ValueError: if ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    data = np.asarray(data)
    n_windows = data.shape[0] - seq_length + 1
    if n_windows <= 0:
        # Keep the trailing dimensions so downstream shape handling still works
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)

    # Row i of window_index holds the row numbers i, i+1, ..., i+seq_length-1;
    # indexing with it gathers all windows in one vectorized step
    window_index = np.arange(n_windows)[:, None] + np.arange(seq_length)
    return data[window_index]

# Number of consecutive samples per LSTM input window
seq_length = 10

# Sliding windows over the scaled training features
X_train_seq = create_sequences(X_train_scaled, seq_length)
print(f"Training sequences shape: {X_train_seq.shape}")

# Sliding windows over the scaled test features. Each window takes the label
# of its last sample, so the first seq_length-1 labels have no window.
X_test_seq = create_sequences(X_test_scaled, seq_length)
first_labelled = seq_length - 1
y_test_seq = y_test[first_labelled:]
print(f"Test sequences shape: {X_test_seq.shape}")
print(f"Test labels shape: {y_test_seq.shape}")

In [None]:
# Float32 copies of the sequence arrays, placed on the selected device
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32).to(device)

# The autoencoder's target is its own input, so each dataset item is (x, x)
train_dataset = TensorDataset(X_train_tensor, X_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Model hyperparameters
input_dim = X_train_scaled.shape[1]
hidden_dim = 32
num_layers = 2

# Model, reconstruction loss and optimizer
model = LSTMAutoencoder(input_dim, hidden_dim, num_layers)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Show the layer structure
print(model)

In [None]:
# Train the LSTM autoencoder to reconstruct its input windows
print("Training LSTM Autoencoder...")
start_time = time.time()

num_epochs = 50
train_losses = []  # mean batch loss per epoch, plotted afterwards

for epoch in range(num_epochs):
    model.train()
    train_loss = 0

    # The target equals the input, so the second dataset element is unused
    for data, _ in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(data), data)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Average over the number of batches in the epoch
    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    # Progress report every fifth epoch
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.6f}")

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Persist the learned weights
torch.save(model.state_dict(), 'models/hai_22_04_lstm_autoencoder.pt')
print("Model saved")

In [None]:
# Per-epoch training loss of the LSTM autoencoder
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_losses)
ax.set_title('LSTM Autoencoder Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
plt.show()

In [None]:
# Evaluate LSTM Autoencoder
def _lstm_reconstruction_mse(model, sequences, batch_size=1024):
    """Return the reconstruction MSE of each sequence, computed in batches.

    Running the whole dataset through the model in one forward pass, and
    copying full-size reconstructions back to the host, needs memory
    proportional to the dataset. Scoring batch by batch keeps only one batch
    of activations alive and moves a single scalar per sequence to the CPU.

    Args:
        model: module mapping a (batch, seq_len, features) tensor to a
            reconstruction of the same shape.
        sequences: tensor of shape (n_sequences, seq_len, features).
        batch_size: number of sequences scored per forward pass.

    Returns:
        1-D float array with one mean squared error per sequence.
    """
    model.eval()
    batch_errors = []
    with torch.no_grad():
        for start in range(0, len(sequences), batch_size):
            # .to(device) is a no-op when the tensor already lives there
            batch = sequences[start:start + batch_size].to(device)
            reconstruction = model(batch)
            # Average over time steps and features -> one value per sequence
            batch_errors.append(((reconstruction - batch) ** 2).mean(dim=(1, 2)).cpu())
    if not batch_errors:
        return np.empty(0, dtype=np.float32)
    return torch.cat(batch_errors).numpy()


model.eval()

# Calculate reconstruction error on training and test sequences
train_mse = _lstm_reconstruction_mse(model, X_train_tensor)
test_mse = _lstm_reconstruction_mse(model, X_test_tensor)

# Set threshold (using 95th percentile of training errors)
threshold = np.percentile(train_mse, 95)
print(f"Threshold: {threshold:.6f}")

# Make predictions based on threshold
y_pred_lstm = (test_mse > threshold).astype(int)

# Calculate evaluation metrics against the per-window labels
precision = precision_score(y_test_seq, y_pred_lstm)
recall = recall_score(y_test_seq, y_pred_lstm)
f1 = f1_score(y_test_seq, y_pred_lstm)

print(f"LSTM Autoencoder model evaluation:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Plot confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test_seq, y_pred_lstm)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
plt.title('LSTM Autoencoder Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### 5.4 Variational Autoencoder (VAE)

In [None]:
# Define VAE model
class VAE(nn.Module):
    """Fully connected variational autoencoder over flat feature vectors.

    Args:
        input_dim: number of input features.
        hidden_dim: width of the hidden layers in the encoder and decoder.
        latent_dim: dimensionality of the latent space.
    """

    def __init__(self, input_dim, hidden_dim=64, latent_dim=20):
        super().__init__()

        # Encoder: features -> hidden representation
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )

        # Posterior heads. fc_var predicts the LOG-variance (see encode /
        # reparameterize); the attribute name is kept so the state-dict keys
        # stay the same.
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_var = nn.Linear(hidden_dim, latent_dim)

        # Decoder: latent vector -> reconstructed features (linear output)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

    def encode(self, x):
        """Return the posterior mean and log-variance for ``x``."""
        hidden = self.encoder(x)
        mu = self.fc_mu(hidden)
        log_var = self.fc_var(hidden)
        return mu, log_var

    def reparameterize(self, mu, log_var):
        """Draw a latent vector from the posterior.

        In training mode this returns ``mu + eps * std`` with
        ``eps ~ N(0, I)`` so gradients flow through ``mu`` and ``log_var``.
        In eval mode it returns ``mu`` itself: sampling there would inject
        random noise into the reconstruction error used as the anomaly score.
        """
        if not self.training:
            return mu
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors back to feature space."""
        return self.decoder(z)

    def forward(self, x):
        """Return ``(reconstruction, mu, log_var)`` for the batch ``x``."""
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        x_recon = self.decode(z)
        return x_recon, mu, log_var

In [None]:
# The VAE works on individual (non-windowed) samples: float32 copies of the
# scaled feature matrices, placed on the selected device
X_train_tensor_flat = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
X_test_tensor_flat = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# The autoencoder's target is its own input, so each dataset item is (x, x)
train_dataset_vae = TensorDataset(X_train_tensor_flat, X_train_tensor_flat)
train_loader_vae = DataLoader(train_dataset_vae, batch_size=64, shuffle=True)

# Model hyperparameters
input_dim = X_train_scaled.shape[1]
hidden_dim = 64
latent_dim = 20

# Model and optimizer (this rebinds `optimizer` to the VAE's parameters)
vae_model = VAE(input_dim, hidden_dim, latent_dim)
vae_model = vae_model.to(device)
optimizer = optim.Adam(vae_model.parameters(), lr=0.001)

# Show the layer structure
print(vae_model)

In [None]:
# VAE loss function
def vae_loss_function(recon_x, x, mu, log_var):
    """Return the VAE training loss, summed over the whole batch.

    The loss is the summed squared reconstruction error plus the KL
    divergence term ``-0.5 * sum(1 + log_var - mu^2 - exp(log_var))``.

    Args:
        recon_x: reconstructed batch.
        x: original batch, same shape as ``recon_x``.
        mu: posterior means.
        log_var: posterior log-variances, same shape as ``mu``.

    Returns:
        Scalar tensor: reconstruction term plus KL term.
    """
    # KL divergence term, summed over batch and latent dimensions
    kl_terms = 1 + log_var - mu.pow(2) - log_var.exp()
    kl_loss = -0.5 * torch.sum(kl_terms)

    # Summed (not averaged) squared reconstruction error
    recon_loss = nn.functional.mse_loss(recon_x, x, reduction='sum')

    return recon_loss + kl_loss

In [None]:
# Train the VAE on individual scaled samples
print("Training VAE...")
start_time = time.time()

num_epochs = 50
train_losses = []  # mean per-sample loss per epoch, plotted afterwards

for epoch in range(num_epochs):
    vae_model.train()
    train_loss = 0

    # The target equals the input, so the second dataset element is unused
    for data, _ in train_loader_vae:
        optimizer.zero_grad()
        recon_batch, mu, log_var = vae_model(data)
        loss = vae_loss_function(recon_batch, data, mu, log_var)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # The loss is summed within each batch, so normalise by the sample count
    train_loss /= len(train_loader_vae.dataset)
    train_losses.append(train_loss)

    # Progress report every fifth epoch
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.6f}")

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Persist the learned weights
torch.save(vae_model.state_dict(), 'models/hai_22_04_vae.pt')
print("Model saved")

In [None]:
# Per-epoch training loss of the VAE
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_losses)
ax.set_title('VAE Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
plt.show()

In [None]:
# Evaluate VAE
vae_model.eval()

# Calculate reconstruction error on training and test data.
# The posterior mean is decoded directly instead of calling the model's
# forward pass: forward() draws a random latent sample, which would add noise
# to every score and make the threshold and predictions vary between runs.
with torch.no_grad():
    train_mu, _ = vae_model.encode(X_train_tensor_flat)
    X_train_recon = vae_model.decode(train_mu)
    train_mse = nn.MSELoss(reduction='none')(X_train_recon, X_train_tensor_flat).mean(dim=1).cpu().numpy()

    test_mu, _ = vae_model.encode(X_test_tensor_flat)
    X_test_recon = vae_model.decode(test_mu)
    test_mse = nn.MSELoss(reduction='none')(X_test_recon, X_test_tensor_flat).mean(dim=1).cpu().numpy()

# Set threshold (using 95th percentile of training errors)
threshold = np.percentile(train_mse, 95)
print(f"Threshold: {threshold:.6f}")

# Make predictions based on threshold
y_pred_vae = (test_mse > threshold).astype(int)

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred_vae)
recall = recall_score(y_test, y_pred_vae)
f1 = f1_score(y_test, y_pred_vae)

print(f"VAE model evaluation:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Plot confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_vae)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
plt.title('VAE Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### 5.5 Ensemble Model

In [None]:
# Create ensemble predictions
# The LSTM scores windows, so it has no prediction for the first seq_length-1
# samples; pad those positions with 0 to align with the per-sample models.
y_pred_lstm_aligned = np.zeros_like(y_test)
y_pred_lstm_aligned[seq_length-1:] = y_pred_lstm

# Vectorized vote (replaces a per-sample Python loop).
# The zero padding above adds nothing to the count for the first
# seq_length-1 samples, where only three models vote.
anomaly_votes = y_pred_iso + y_pred_pca + y_pred_vae + y_pred_lstm_aligned
n_voters = np.full(len(y_test), 4)
n_voters[:seq_length-1] = 3

# A sample is flagged when at least half of its voters flag it; with four
# voters a 2-2 tie therefore counts as an anomaly.
ensemble_pred = (anomaly_votes >= n_voters / 2).astype(y_test.dtype)

# Calculate evaluation metrics
precision = precision_score(y_test, ensemble_pred)
recall = recall_score(y_test, ensemble_pred)
f1 = f1_score(y_test, ensemble_pred)

print(f"Ensemble model evaluation:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Plot confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, ensemble_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
plt.title('Ensemble Model Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 6. Model Comparison

In [None]:
# Model names paired, in order, with their per-sample test predictions.
# The LSTM entry uses the zero-padded predictions aligned to y_test.
models = ['Isolation Forest', 'PCA', 'LSTM Autoencoder', 'VAE', 'Ensemble']
predictions = [y_pred_iso, y_pred_pca, y_pred_lstm_aligned, y_pred_vae, ensemble_pred]

# One row per model: name, precision, recall, F1 against the test labels
results = [
    [
        model_name,
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    ]
    for model_name, y_pred in zip(models, predictions)
]

# Tabulate; the bare expression on the last line renders the table in the notebook
results_df = pd.DataFrame(results, columns=['Model', 'Precision', 'Recall', 'F1 Score'])
results_df

In [None]:
# Grouped bar chart: three bars (precision, recall, F1) per model
plt.figure(figsize=(12, 8))

x = np.arange(len(models))
width = 0.25

# One bar series per metric, shifted left / centred / shifted right
metric_offsets = (('Precision', -width), ('Recall', 0), ('F1 Score', width))
for metric, offset in metric_offsets:
    plt.bar(x + offset, results_df[metric], width, label=metric)

plt.xlabel('Models')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
plt.xticks(x, models, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()