# HAI 20.07 Dataset Analysis

This notebook analyzes the HAI 20.07 dataset using various anomaly detection methods.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, roc_curve, auc
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import warnings

# Set up GPU
# Use the CUDA device when PyTorch reports one, otherwise fall back to the CPU.
# `device` is read by later cells when models and tensors are moved with `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    # Report GPU index 0 and its current memory counters, converted from bytes to GB (/ 1e9).
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory allocated: {torch.cuda.memory_allocated(0) / 1e9} GB")
    print(f"Memory cached: {torch.cuda.memory_reserved(0) / 1e9} GB")

# Set random seeds for reproducibility
# Seeds NumPy's global RNG and PyTorch's generator; CUDA generators are seeded only when CUDA is present.
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Ignore warnings
# NOTE(review): this filter hides every warning category for the rest of the session,
# including any deprecation or numerical warnings raised by later cells.
warnings.filterwarnings('ignore')

# Set plotting style
# Defaults applied to every figure that does not pass its own figsize / font size.
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

## 1. Data Loading and Exploration

In [None]:
# Define paths
# Relative directory that the file names below are joined onto when loading.
data_path = 'hai-security-dataset/hai-20.07/'
# Training files are concatenated into one frame later; test files are kept separate.
train_files = ['train1.csv', 'train2.csv']
test_files = ['test1.csv', 'test2.csv']

# Function to load data
def load_data(file_path, **read_csv_kwargs):
    """Read one CSV file and index it by its first column parsed as datetimes.

    Args:
        file_path: Path of the CSV file to read.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (for example ``sep`` or ``compression``).
            Omitting them reproduces a plain ``pd.read_csv(file_path)``.

    Returns:
        A new DataFrame whose index is the first CSV column converted with
        ``pd.to_datetime`` and whose columns are the remaining CSV columns.
    """
    df = pd.read_csv(file_path, **read_csv_kwargs)
    timestamp_col = df.columns[0]
    # Replace the column by label instead of writing through ``df.iloc[:, 0] = ...``:
    # a positional write goes into the existing (object-dtype) column and, depending
    # on the pandas version, can leave it as object dtype instead of datetime64.
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])
    # Return the re-indexed frame rather than mutating with inplace=True.
    return df.set_index(timestamp_col)

# Load training data
# Every training file is read with load_data and collected before concatenation.
train_dfs = []
for file in train_files:
    df = load_data(os.path.join(data_path, file))
    train_dfs.append(df)
    print(f"Loaded {file} with shape {df.shape}")

# Load test data
# Test files stay as separate DataFrames so each one can be evaluated on its own later.
test_dfs = []
for file in test_files:
    df = load_data(os.path.join(data_path, file))
    test_dfs.append(df)
    print(f"Loaded {file} with shape {df.shape}")

# Concatenate training data
# Row-wise concatenation in train_files order; the original index values are kept.
train_df = pd.concat(train_dfs)
print(f"Combined training data shape: {train_df.shape}")

# Display the first few rows of the training data
train_df.head()

In [None]:
# Check for missing values
# Per-column count of null entries in the combined training frame.
missing_values = train_df.isnull().sum()
print("Columns with missing values:")
# Only columns with at least one null are printed; an empty Series means none were found.
print(missing_values[missing_values > 0])

# Get basic statistics
# Last expression of the cell, so the summary table is rendered as the cell output.
train_df.describe()

In [None]:
# Identify feature and label columns
# Assuming the last 4 columns are labels
# Everything before the last four columns is treated as a feature.
feature_cols = train_df.columns[:-4]
label_cols = train_df.columns[-4:]

print(f"Number of features: {len(feature_cols)}")
print(f"Label columns: {label_cols.tolist()}")

# Check label distribution in test data
# One figure per test file; the 2x2 grid has one panel for each of the four label columns.
for i, test_df in enumerate(test_dfs):
    plt.figure(figsize=(10, 6))
    for j, col in enumerate(label_cols):
        plt.subplot(2, 2, j+1)
        # Bar height = number of rows carrying each distinct label value.
        test_df[col].value_counts().plot(kind='bar')
        plt.title(f'Label Distribution - {col}')
        plt.ylabel('Count')
    plt.tight_layout()
    plt.suptitle(f'Label Distribution in test{i+1}.csv')
    # Re-open head-room at the top so the figure-level title does not overlap the panels.
    plt.subplots_adjust(top=0.9)
    plt.show()

In [None]:
# Visualize the time series data for a few selected features
selected_features = feature_cols[:5]  # Select first 5 features for visualization

# One stacked panel per selected feature, sharing a single figure.
plt.figure(figsize=(15, 10))
for i, feature in enumerate(selected_features):
    plt.subplot(len(selected_features), 1, i+1)
    plt.plot(train_df.index, train_df[feature])
    plt.title(feature)
# Lay the figure out once, after every panel exists; recomputing the layout on each
# loop iteration is redundant work because only the final call determines the result.
plt.tight_layout()
plt.show()

# Visualize correlations between features
plt.figure(figsize=(20, 16))
corr_matrix = train_df[feature_cols].corr()
# Mask the upper triangle (diagonal included) so each feature pair is drawn only once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, cmap='coolwarm', annot=False, square=True)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Function to preprocess data
def preprocess_data(train_df, test_dfs, feature_cols, label_cols, scaler_type='standard'):
    """Scale the feature columns and pull the label columns out of the test frames.

    The scaler is fitted on the training features only and then applied to
    each test frame, so test data never influences the scaling statistics.

    Args:
        train_df: Training DataFrame containing ``feature_cols``.
        test_dfs: Iterable of test DataFrames containing ``feature_cols`` and
            ``label_cols``.
        feature_cols: Columns used as model inputs.
        label_cols: Columns returned as labels for every test frame.
        scaler_type: ``'standard'`` selects ``StandardScaler``; any other
            value selects ``MinMaxScaler``.

    Returns:
        Tuple ``(X_train_scaled, X_tests_scaled, y_tests, scaler)`` where the
        two list elements hold one array per test frame, in input order.
    """
    train_matrix = train_df[feature_cols].values

    # Collect raw feature and label arrays per test frame, in input order.
    test_matrices, y_tests = [], []
    for frame in test_dfs:
        test_matrices.append(frame[feature_cols].values)
        y_tests.append(frame[label_cols].values)

    # Anything other than 'standard' falls through to min-max scaling.
    scaler = StandardScaler() if scaler_type == 'standard' else MinMaxScaler()

    X_train_scaled = scaler.fit_transform(train_matrix)
    X_tests_scaled = list(map(scaler.transform, test_matrices))

    return X_train_scaled, X_tests_scaled, y_tests, scaler

# Preprocess the data
# scaler_type is left at its default ('standard'); the fitted scaler is kept for later reuse.
X_train_scaled, X_tests_scaled, y_tests, scaler = preprocess_data(train_df, test_dfs, feature_cols, label_cols)

print(f"Training data shape: {X_train_scaled.shape}")
# One scaled array per test file, in test_files order.
for i, X_test_scaled in enumerate(X_tests_scaled):
    print(f"Test{i+1} data shape: {X_test_scaled.shape}")

In [None]:
# Function to create sequences for time series models
def create_sequences(data, seq_length):
    """Slice ``data`` into overlapping windows of ``seq_length`` consecutive rows.

    Window ``k`` holds rows ``k`` .. ``k + seq_length - 1``. Exactly
    ``len(data) - seq_length`` windows are produced, so the window that would
    end on the final row is not included. When that count is zero or negative
    the result is an empty array.

    Args:
        data: Sliceable sequence of rows (for example a 2-D NumPy array).
        seq_length: Number of consecutive rows in every window.

    Returns:
        ``np.ndarray`` stacking the windows along a new leading axis.
    """
    window_count = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(window_count)]
    return np.array(windows)

# Create sequences for training and testing
seq_length = 10  # Sequence length for time series models

X_train_seq = create_sequences(X_train_scaled, seq_length)
X_tests_seq = [create_sequences(X_test_scaled, seq_length) for X_test_scaled in X_tests_scaled]
# Dropping the first seq_length label rows leaves one label row per window.
# NOTE(review): window i spans rows i..i+seq_length-1 but is paired here with the label
# of row i+seq_length (the row just after the window) — confirm this offset is intended.
y_tests_seq = [y_test[seq_length:] for y_test in y_tests]

print(f"Training sequences shape: {X_train_seq.shape}")
for i, X_test_seq in enumerate(X_tests_seq):
    print(f"Test{i+1} sequences shape: {X_test_seq.shape}")

In [None]:
# Convert data to PyTorch tensors
# The complete windowed arrays are converted to float32 tensors and moved to `device`
# up front, so batches drawn from the loader are already on that device.
X_train_tensor = torch.FloatTensor(X_train_seq).to(device)
X_tests_tensor = [torch.FloatTensor(X_test_seq).to(device) for X_test_seq in X_tests_seq]

# Create DataLoader for training
batch_size = 64
# The dataset wraps a single tensor, so every batch is a 1-tuple containing the inputs.
train_dataset = TensorDataset(X_train_tensor)
# Windows are drawn in shuffled order on every pass over the loader.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

print(f"Training tensor shape: {X_train_tensor.shape}")
for i, X_test_tensor in enumerate(X_tests_tensor):
    print(f"Test{i+1} tensor shape: {X_test_tensor.shape}")

## 3. Model Implementation

### 3.1 Traditional Machine Learning Methods

In [None]:
# Isolation Forest
def train_isolation_forest(X_train, contamination=0.01):
    """Fit an Isolation Forest on ``X_train`` and print how long fitting took.

    Args:
        X_train: Training samples passed straight to ``IsolationForest.fit``.
        contamination: Forwarded to the estimator's ``contamination`` argument.

    Returns:
        The fitted ``IsolationForest`` (100 trees, ``random_state=42``,
        ``n_jobs=-1``).
    """
    print("Training Isolation Forest...")
    began = time.time()
    # ``fit`` returns the estimator itself, so construction and fitting can be chained.
    fitted_forest = IsolationForest(
        n_estimators=100,
        contamination=contamination,
        random_state=42,
        n_jobs=-1,
    ).fit(X_train)
    training_time = time.time() - began
    print(f"Training completed in {training_time:.2f} seconds")
    return fitted_forest

# One-Class SVM
def train_one_class_svm(X_train, nu=0.01):
    """Fit an RBF-kernel One-Class SVM on ``X_train`` and print the fit time.

    Args:
        X_train: Training samples passed straight to ``OneClassSVM.fit``.
        nu: Forwarded to the estimator's ``nu`` argument.

    Returns:
        The fitted ``OneClassSVM`` (``kernel='rbf'``, ``gamma='auto'``).
    """
    print("Training One-Class SVM...")
    began = time.time()
    # ``fit`` returns the estimator itself, so construction and fitting can be chained.
    fitted_svm = OneClassSVM(
        kernel='rbf',
        gamma='auto',
        nu=nu,
    ).fit(X_train)
    training_time = time.time() - began
    print(f"Training completed in {training_time:.2f} seconds")
    return fitted_svm

# PCA-based anomaly detection
def train_pca(X_train, n_components=0.95):
    """Fit a PCA model on ``X_train`` and print how long fitting took.

    Args:
        X_train: Training samples passed straight to ``PCA.fit``.
        n_components: Forwarded to the estimator's ``n_components`` argument.

    Returns:
        The fitted ``PCA`` instance (``random_state=42``).
    """
    print("Training PCA...")
    began = time.time()
    # ``fit`` returns the estimator itself, so construction and fitting can be chained.
    fitted_pca = PCA(n_components=n_components, random_state=42).fit(X_train)
    training_time = time.time() - began
    print(f"Training completed in {training_time:.2f} seconds")
    return fitted_pca

# Function to detect anomalies using PCA reconstruction error
def detect_anomalies_pca(model, X, threshold_factor=3, threshold=None):
    """Flag rows whose reconstruction error through ``model`` exceeds a threshold.

    Each row is projected with ``model.transform`` and mapped back with
    ``model.inverse_transform``; its score is the mean squared difference
    between the row and its reconstruction.

    Args:
        model: Object exposing ``transform`` and ``inverse_transform``.
        X: 2-D array of samples (rows) by features (columns).
        threshold_factor: Number of standard deviations above the mean error
            used for the data-driven threshold. Ignored when ``threshold`` is
            given.
        threshold: Optional fixed cut-off, for example one calibrated on
            training errors. When ``None`` (the default) the threshold is
            ``mean + threshold_factor * std`` of the errors of ``X`` itself,
            which makes the decision depend on the batch being scored.

    Returns:
        Tuple ``(is_anomaly, reconstruction_error)``: a boolean array and the
        per-row error array, both with one entry per row of ``X``.
    """
    X_reconstructed = model.inverse_transform(model.transform(X))
    reconstruction_error = np.mean(np.square(X - X_reconstructed), axis=1)
    if threshold is None:
        threshold = np.mean(reconstruction_error) + threshold_factor * np.std(reconstruction_error)
    return reconstruction_error > threshold, reconstruction_error

### 3.2 Deep Learning Methods

In [None]:
# LSTM Autoencoder
class LSTMAutoencoder(nn.Module):
    """Sequence autoencoder: LSTM encoder, repeated latent vector, LSTM decoder.

    The encoder's final hidden state (top layer) is the latent code. It is
    repeated once per time step, decoded by a second LSTM, and mapped back to
    the input feature dimension by a linear layer.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the latent code and of the decoder's hidden state.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        dropout: Inter-layer LSTM dropout; only applied when ``num_layers > 1``.
    """

    def __init__(self, input_dim, hidden_dim, num_layers=1, dropout=0.2):
        super(LSTMAutoencoder, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # nn.LSTM only applies dropout between stacked layers, so pass 0 for one layer.
        lstm_dropout = dropout if num_layers > 1 else 0

        # Encoder
        self.encoder = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout
        )

        # Decoder
        self.decoder = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout
        )

        # An LSTM's output is o_t * tanh(c_t) and therefore lies in (-1, 1). Using it
        # directly as the reconstruction cannot reproduce inputs outside that range,
        # so a linear layer maps the decoder state to unbounded feature values.
        self.output_layer = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Reconstruct ``x`` of shape [batch, seq_length, input_dim]; output has the same shape."""
        # Encode
        _, (hidden, _) = self.encoder(x)
        # Use the last layer's final hidden state as the encoded representation,
        # repeated once per time step so the decoder sees it at every position.
        hidden_repeat = hidden[-1].unsqueeze(1).repeat(1, x.size(1), 1)
        # Decode
        decoded, _ = self.decoder(hidden_repeat)
        return self.output_layer(decoded)

# CNN Autoencoder
class CNNAutoencoder(nn.Module):
    """1-D convolutional autoencoder over the time axis.

    The encoder halves the sequence length twice with max pooling; the
    decoder doubles it twice with transposed convolutions. Pooling uses
    ``ceil_mode=True`` and the decoder output is cropped, so the
    reconstruction always has the same number of time steps as the input,
    including lengths that are not multiples of 4.

    Args:
        input_dim: Number of features per time step (Conv1d channels).
        seq_length: Nominal sequence length; stored for reference only.
    """

    def __init__(self, input_dim, seq_length):
        super(CNNAutoencoder, self).__init__()
        self.input_dim = input_dim
        self.seq_length = seq_length

        # Encoder
        # ceil_mode keeps the trailing partial pooling window, so after both pools the
        # length is ceil(ceil(L / 2) / 2) and the decoder output is never shorter than L.
        self.encoder = nn.Sequential(
            nn.Conv1d(input_dim, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2, ceil_mode=True),
            nn.Conv1d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2, ceil_mode=True)
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(16, 32, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose1d(32, input_dim, kernel_size=2, stride=2),
        )

    def forward(self, x):
        """Reconstruct ``x`` of shape [batch, seq_length, input_dim]; output has the same shape."""
        n_steps = x.size(1)
        # Input shape: [batch, seq_length, input_dim]
        # Reshape for Conv1d: [batch, input_dim, seq_length]
        x = x.permute(0, 2, 1)
        # Encode
        encoded = self.encoder(x)
        # Decode
        decoded = self.decoder(encoded)
        # The decoder emits 4 * ceil(ceil(L / 2) / 2) >= L steps; drop the surplus so
        # the reconstruction lines up with the input (a no-op when L is a multiple of 4).
        decoded = decoded[:, :, :n_steps]
        # Reshape back: [batch, seq_length, input_dim]
        decoded = decoded.permute(0, 2, 1)
        return decoded

# Transformer Autoencoder
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence-first tensor.

    Even feature indices hold ``sin(pos * w_k)`` and odd indices hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / d_model)``. The table is
    stored as a non-trainable buffer ``pe`` of shape [max_len, 1, d_model].

    Args:
        d_model: Feature dimension of the tensors passed to ``forward``
            (even or odd).
        max_len: Number of positions pre-computed; ``forward`` inputs must
            not be longer than this along their first axis.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd-indexed column than even-indexed ones,
        # so trim the cosine terms to fit instead of failing with a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # [max_len, d_model] -> [max_len, 1, d_model] so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions.

        ``x`` is indexed by position along axis 0 ([seq_length, batch, d_model]).
        """
        x = x + self.pe[:x.size(0), :]
        return x

class TransformerAutoencoder(nn.Module):
    """Transformer-encoder reconstruction model for windows of sensor readings.

    Inputs are projected to ``hidden_dim``, given positional encodings, passed
    through a stack of Transformer encoder layers, and projected back to
    ``input_dim``.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Model width used inside the Transformer.
        nhead: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate passed to every encoder layer.
    """

    def __init__(self, input_dim, hidden_dim, nhead=4, num_layers=2, dropout=0.1):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Feature dimension -> model width.
        self.input_projection = nn.Linear(input_dim, hidden_dim)

        # Sinusoidal position information added after the projection.
        self.pos_encoder = PositionalEncoding(hidden_dim)

        # Encoder stack; the feed-forward width is four times the model width.
        encoder_block = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=nhead,
            dim_feedforward=hidden_dim*4,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_block, num_layers=num_layers)

        # Model width -> feature dimension.
        self.output_projection = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Reconstruct ``x`` of shape [batch, seq_length, input_dim]; output has the same shape."""
        # The encoder layers here consume sequence-first tensors: [seq_length, batch, features].
        seq_first = x.permute(1, 0, 2)
        embedded = self.pos_encoder(self.input_projection(seq_first))
        encoded = self.transformer_encoder(embedded)
        reconstructed = self.output_projection(encoded)
        # Back to batch-first for the caller.
        return reconstructed.permute(1, 0, 2)

In [None]:
# Function to train deep learning models
def train_dl_model(model, train_loader, num_epochs=50, learning_rate=0.001):
    """Train ``model`` to reconstruct its inputs and plot the loss curve.

    Uses Adam with mean-squared-error between the model output and the batch
    itself. Progress is printed every tenth epoch, and the per-epoch average
    loss is plotted once training ends.

    Args:
        model: ``nn.Module`` mapping a batch to a reconstruction of that batch.
        train_loader: Iterable yielding 1-tuples ``(batch,)``; batches are fed
            to the model as-is, so they must already be on the model's device.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Adam learning rate.

    Returns:
        Tuple ``(model, train_losses)`` where ``train_losses`` holds the mean
        batch loss of every epoch.
    """
    print(f"Training {model.__class__.__name__}...")
    began = time.time()

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_losses = []
    model.train()

    for epoch in range(num_epochs):
        running_loss = 0
        for (batch,) in train_loader:
            optimizer.zero_grad()
            # Reconstruction objective: the batch is both the input and the target.
            loss = criterion(model(batch), batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_loss = running_loss / len(train_loader)
        train_losses.append(avg_loss)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}")

    training_time = time.time() - began
    print(f"Training completed in {training_time:.2f} seconds")

    # Plot training loss
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses)
    plt.title(f'{model.__class__.__name__} Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.show()

    return model, train_losses

# Function to detect anomalies using reconstruction error
def detect_anomalies_dl(model, X, threshold_factor=3, threshold=None, batch_size=1024):
    """Flag windows whose reconstruction error through ``model`` exceeds a threshold.

    The model is switched to eval mode and ``X`` is reconstructed in chunks of
    ``batch_size`` windows with gradients disabled, so memory use is bounded
    by the chunk size instead of the whole data set. Each window's score is
    the mean squared difference between the window and its reconstruction.

    Args:
        model: ``nn.Module`` mapping [batch, seq_length, features] to a tensor
            of the same shape.
        X: Array of windows with shape [n_windows, seq_length, features].
        threshold_factor: Number of standard deviations above the mean error
            used for the data-driven threshold. Ignored when ``threshold`` is
            given.
        threshold: Optional fixed cut-off, for example one calibrated on
            training errors. When ``None`` (the default) the threshold is
            ``mean + threshold_factor * std`` of the errors of ``X`` itself.
        batch_size: Number of windows sent through the model per forward pass.

    Returns:
        Tuple ``(is_anomaly, reconstruction_error)``: a boolean array and the
        per-window error array, both with one entry per window of ``X``.
    """
    model.eval()

    # Run on whatever device the model's weights live on; a parameter-free
    # module has no device of its own, so inputs stay on the CPU in that case.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    chunk_errors = []
    with torch.no_grad():
        for start in range(0, len(X), batch_size):
            chunk = X[start:start + batch_size]
            chunk_tensor = torch.FloatTensor(chunk).to(model_device)
            chunk_reconstructed = model(chunk_tensor).cpu().numpy()
            chunk_errors.append(np.mean(np.square(chunk - chunk_reconstructed), axis=(1, 2)))

    reconstruction_error = np.concatenate(chunk_errors) if chunk_errors else np.empty(0)
    if threshold is None:
        threshold = np.mean(reconstruction_error) + threshold_factor * np.std(reconstruction_error)
    return reconstruction_error > threshold, reconstruction_error

## 4. Model Training and Evaluation

In [None]:
# Train traditional ML models
# Use non-sequential data for these models
# Each helper is called with its default hyperparameter (contamination / nu / n_components)
# and is fitted on the scaled, un-windowed training rows.
isolation_forest = train_isolation_forest(X_train_scaled)
one_class_svm = train_one_class_svm(X_train_scaled)
pca_model = train_pca(X_train_scaled)

In [None]:
# Train deep learning models
# Dimensions are read back from the windowed training array ([n_windows, seq_length, features]).
input_dim = X_train_seq.shape[2]  # Number of features
seq_length = X_train_seq.shape[1]  # Sequence length
hidden_dim = 64  # Hidden dimension

# Initialize models
# All three models are moved to `device` before training.
lstm_autoencoder = LSTMAutoencoder(input_dim, hidden_dim, num_layers=2).to(device)
cnn_autoencoder = CNNAutoencoder(input_dim, seq_length).to(device)
transformer_autoencoder = TransformerAutoencoder(input_dim, hidden_dim).to(device)

# Train models
# Each call returns the trained model together with its per-epoch loss history.
lstm_autoencoder, lstm_losses = train_dl_model(lstm_autoencoder, train_loader, num_epochs=50)
cnn_autoencoder, cnn_losses = train_dl_model(cnn_autoencoder, train_loader, num_epochs=50)
transformer_autoencoder, transformer_losses = train_dl_model(transformer_autoencoder, train_loader, num_epochs=50)

In [None]:
# Function to evaluate models
def evaluate_model(model_name, y_true, y_pred):
    """Print binary precision/recall/F1 for one model and plot its confusion matrix.

    Args:
        model_name: Label used in the printed line and the plot title.
        y_true: 2-D label array; only column 0 is used, and any value greater
            than zero counts as an anomaly.
        y_pred: 1-D array of predictions (bool or 0/1), one per row of
            ``y_true``.

    Returns:
        Tuple ``(precision, recall, f1)`` for the anomaly class.
    """
    # Convert to binary (0 for normal, 1 for anomaly)
    y_true_binary = (y_true[:, 0] > 0).astype(int)
    y_pred_binary = y_pred.astype(int)

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_true_binary, y_pred_binary, average='binary')

    # Confusion matrix
    # Pin both classes so the matrix is always 2x2; without explicit labels it shrinks
    # to 1x1 when labels and predictions contain only a single class.
    cm = confusion_matrix(y_true_binary, y_pred_binary, labels=[0, 1])

    # Print metrics
    print(f"{model_name} - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return precision, recall, f1

In [None]:
# Evaluate traditional ML models
# `results` gets one dict per test file; later cells add more keys to these same dicts,
# so re-running this cell resets everything they have added.
results = []

for test_idx, (X_test_scaled, y_test) in enumerate(zip(X_tests_scaled, y_tests)):
    print(f"\nEvaluating on Test{test_idx+1}:")
    
    # Isolation Forest
    y_pred_if = isolation_forest.predict(X_test_scaled)
    # Convert from -1/1 to 0/1 (1 for anomaly)
    y_pred_if = (y_pred_if == -1).astype(int)
    precision_if, recall_if, f1_if = evaluate_model("Isolation Forest", y_test, y_pred_if)
    
    # One-Class SVM
    y_pred_svm = one_class_svm.predict(X_test_scaled)
    # Convert from -1/1 to 0/1 (1 for anomaly)
    y_pred_svm = (y_pred_svm == -1).astype(int)
    precision_svm, recall_svm, f1_svm = evaluate_model("One-Class SVM", y_test, y_pred_svm)
    
    # PCA
    # The threshold is derived from this test set's own reconstruction errors (default factor 3).
    y_pred_pca, _ = detect_anomalies_pca(pca_model, X_test_scaled)
    precision_pca, recall_pca, f1_pca = evaluate_model("PCA", y_test, y_pred_pca)
    
    # Store results
    # test_idx is stored 1-based to match the "Test1"/"Test2" naming used in the output.
    results.append({
        'test_idx': test_idx + 1,
        'isolation_forest': {'precision': precision_if, 'recall': recall_if, 'f1': f1_if},
        'one_class_svm': {'precision': precision_svm, 'recall': recall_svm, 'f1': f1_svm},
        'pca': {'precision': precision_pca, 'recall': recall_pca, 'f1': f1_pca}
    })

In [None]:
# Evaluate deep learning models
# Uses the windowed test arrays together with the label rows aligned to those windows.
for test_idx, (X_test_seq, y_test_seq) in enumerate(zip(X_tests_seq, y_tests_seq)):
    print(f"\nEvaluating deep learning models on Test{test_idx+1}:")
    
    # LSTM Autoencoder
    y_pred_lstm, _ = detect_anomalies_dl(lstm_autoencoder, X_test_seq)
    precision_lstm, recall_lstm, f1_lstm = evaluate_model("LSTM Autoencoder", y_test_seq, y_pred_lstm)
    
    # CNN Autoencoder
    y_pred_cnn, _ = detect_anomalies_dl(cnn_autoencoder, X_test_seq)
    precision_cnn, recall_cnn, f1_cnn = evaluate_model("CNN Autoencoder", y_test_seq, y_pred_cnn)
    
    # Transformer Autoencoder
    y_pred_transformer, _ = detect_anomalies_dl(transformer_autoencoder, X_test_seq)
    precision_transformer, recall_transformer, f1_transformer = evaluate_model("Transformer Autoencoder", y_test_seq, y_pred_transformer)
    
    # Update results
    # Adds the deep-learning entries to the dict created for this test file by the
    # traditional-model evaluation, which therefore has to run first.
    results[test_idx].update({
        'lstm_autoencoder': {'precision': precision_lstm, 'recall': recall_lstm, 'f1': f1_lstm},
        'cnn_autoencoder': {'precision': precision_cnn, 'recall': recall_cnn, 'f1': f1_cnn},
        'transformer_autoencoder': {'precision': precision_transformer, 'recall': recall_transformer, 'f1': f1_transformer}
    })

## 5. Results Visualization and Comparison

In [None]:
# Visualize F1 scores for all models
# model_names (display labels) and model_keys (keys into each results dict) are parallel lists.
model_names = ['Isolation Forest', 'One-Class SVM', 'PCA', 'LSTM Autoencoder', 'CNN Autoencoder', 'Transformer Autoencoder']
model_keys = ['isolation_forest', 'one_class_svm', 'pca', 'lstm_autoencoder', 'cnn_autoencoder', 'transformer_autoencoder']

# One bar chart per test file.
for test_idx, result in enumerate(results):
    f1_scores = [result[key]['f1'] for key in model_keys]
    
    plt.figure(figsize=(12, 6))
    bars = plt.bar(model_names, f1_scores)
    plt.title(f'F1 Scores on Test{test_idx+1}')
    plt.ylabel('F1 Score')
    plt.ylim(0, 1)
    plt.xticks(rotation=45, ha='right')
    
    # Add value labels
    # Each label is centred over its bar and placed 0.01 above the bar's top.
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.3f}', ha='center', va='bottom', rotation=0)
    
    plt.tight_layout()
    plt.show()

In [None]:
# Visualize precision-recall trade-off
# One scatter plot per test file; every model contributes a single (recall, precision) point.
for test_idx, result in enumerate(results):
    plt.figure(figsize=(10, 8))
    
    for i, key in enumerate(model_keys):
        # model_names[i] is the display label belonging to model_keys[i].
        plt.scatter(result[key]['recall'], result[key]['precision'], s=100, label=model_names[i])
    
    plt.title(f'Precision-Recall Trade-off on Test{test_idx+1}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Visualize anomaly detection over time for the best performing model
# Find the best model based on average F1 score
# The average is taken over all test files for each model key.
avg_f1_scores = {}
for key in model_keys:
    avg_f1_scores[key] = np.mean([result[key]['f1'] for result in results])
    
# Key with the highest average F1; on a tie the first such key in model_keys order is kept.
best_model_key = max(avg_f1_scores, key=avg_f1_scores.get)
best_model_name = model_names[model_keys.index(best_model_key)]
print(f"Best performing model: {best_model_name} with average F1 score: {avg_f1_scores[best_model_key]:.4f}")

# Visualize anomaly detection over time for the best model
# Predictions are recomputed here instead of being reused from the evaluation cells.
for test_idx, (X_test_scaled, y_test) in enumerate(zip(X_tests_scaled, y_tests)):
    # Get predictions from the best model
    if best_model_key == 'isolation_forest':
        y_pred = (isolation_forest.predict(X_test_scaled) == -1).astype(int)
    elif best_model_key == 'one_class_svm':
        y_pred = (one_class_svm.predict(X_test_scaled) == -1).astype(int)
    elif best_model_key == 'pca':
        y_pred, _ = detect_anomalies_pca(pca_model, X_test_scaled)
    else:
        # For deep learning models, use the sequence data
        X_test_seq = X_tests_seq[test_idx]
        # Rebind y_test to the window-aligned labels so it matches the window predictions.
        y_test = y_tests_seq[test_idx]
        if best_model_key == 'lstm_autoencoder':
            y_pred, _ = detect_anomalies_dl(lstm_autoencoder, X_test_seq)
        elif best_model_key == 'cnn_autoencoder':
            y_pred, _ = detect_anomalies_dl(cnn_autoencoder, X_test_seq)
        elif best_model_key == 'transformer_autoencoder':
            y_pred, _ = detect_anomalies_dl(transformer_autoencoder, X_test_seq)
    
    # Convert true labels to binary
    # Same rule as evaluate_model: column 0 of the labels, anything above zero is an anomaly.
    y_true_binary = (y_test[:, 0] > 0).astype(int)
    
    # Plot true vs predicted anomalies over time
    # The x axis is the positional step within the arrays, not the timestamp index.
    plt.figure(figsize=(15, 6))
    plt.plot(y_true_binary, label='True Anomalies', color='blue', alpha=0.7)
    plt.plot(y_pred, label='Predicted Anomalies', color='red', alpha=0.7)
    plt.title(f'{best_model_name} - Anomaly Detection Over Time (Test{test_idx+1})')
    plt.xlabel('Time Step')
    plt.ylabel('Anomaly (1) / Normal (0)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

In [None]:
# Ensemble method: Majority voting
def ensemble_majority_voting(predictions):
    """Combine per-model anomaly flags by strict majority vote.

    Args:
        predictions: Sequence of equally long 0/1 (or boolean) arrays, one
            per model.

    Returns:
        Boolean array that is True where more than half of the models voted
        anomaly; an exact tie counts as normal.
    """
    votes_for_anomaly = np.sum(predictions, axis=0)
    half_of_models = len(predictions) / 2
    return votes_for_anomaly > half_of_models

# Evaluate ensemble method
def _tail(values, length):
    """Return the last ``length`` entries of ``values`` (empty when ``length`` is 0)."""
    return values[len(values) - length:]

# Display names / result keys including the ensemble; identical for every test file.
model_names_with_ensemble = model_names + ['Ensemble (Majority Voting)']
model_keys_with_ensemble = model_keys + ['ensemble']

for test_idx, (X_test_scaled, X_test_seq, y_test, y_test_seq) in enumerate(zip(X_tests_scaled, X_tests_seq, y_tests, y_tests_seq)):
    print(f"\nEvaluating ensemble method on Test{test_idx+1}:")

    # Get predictions from all models
    y_pred_if = (isolation_forest.predict(X_test_scaled) == -1).astype(int)
    y_pred_svm = (one_class_svm.predict(X_test_scaled) == -1).astype(int)
    y_pred_pca, _ = detect_anomalies_pca(pca_model, X_test_scaled)

    # For deep learning models, use the sequence data
    y_pred_lstm, _ = detect_anomalies_dl(lstm_autoencoder, X_test_seq)
    y_pred_cnn, _ = detect_anomalies_dl(cnn_autoencoder, X_test_seq)
    y_pred_transformer, _ = detect_anomalies_dl(transformer_autoencoder, X_test_seq)

    # Ensure all predictions have the same length
    # Window predictions are scored against y_test_seq, which is the tail of y_test
    # (the leading seq_length rows are dropped). The point-wise predictions cover every
    # row, so they are trimmed from the FRONT to refer to the same rows as the windows.
    # Truncating from the end would shift them against the window votes by seq_length.
    min_length = min(len(y_pred_if), len(y_pred_lstm))
    y_pred_if = _tail(y_pred_if, min_length)
    y_pred_svm = _tail(y_pred_svm, min_length)
    y_pred_pca = _tail(y_pred_pca, min_length)
    y_pred_lstm = _tail(y_pred_lstm, min_length)
    y_pred_cnn = _tail(y_pred_cnn, min_length)
    y_pred_transformer = _tail(y_pred_transformer, min_length)
    # Ground truth uses the same window-aligned labels as the individual deep models.
    y_true = _tail(y_test_seq, min_length)

    # Ensemble predictions using majority voting
    predictions = [y_pred_if, y_pred_svm, y_pred_pca, y_pred_lstm, y_pred_cnn, y_pred_transformer]
    y_pred_ensemble = ensemble_majority_voting(predictions)

    # Evaluate ensemble method
    precision_ensemble, recall_ensemble, f1_ensemble = evaluate_model("Ensemble (Majority Voting)", y_true, y_pred_ensemble)

    # Update results
    results[test_idx]['ensemble'] = {'precision': precision_ensemble, 'recall': recall_ensemble, 'f1': f1_ensemble}

    # Compare ensemble with individual models
    f1_scores = [results[test_idx][key]['f1'] for key in model_keys_with_ensemble]

    plt.figure(figsize=(14, 6))
    bars = plt.bar(model_names_with_ensemble, f1_scores)
    plt.title(f'F1 Scores with Ensemble on Test{test_idx+1}')
    plt.ylabel('F1 Score')
    plt.ylim(0, 1)
    plt.xticks(rotation=45, ha='right')

    # Add value labels
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.3f}', ha='center', va='bottom', rotation=0)

    plt.tight_layout()
    plt.show()