# HAI-23.05 Dataset Anomaly Detection - Ensemble Method

This notebook uses an ensemble of multiple anomaly detection methods for the HAI-23.05 dataset.

The HAI dataset contains data from industrial control systems (ICS). The training data does not include attack labels, so the models are fitted in an unsupervised way; the test data comes with attack labels, which are used here only to evaluate the detectors.

## 1. Import Necessary Libraries

In [None]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import pickle
import time
from scipy.stats import chi2

# Import machine learning libraries
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.decomposition import PCA
from sklearn.covariance import EmpiricalCovariance
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, auc, roc_curve, f1_score, precision_score, recall_score

# Import deep learning libraries
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Plot appearance: start from the ggplot style, then apply seaborn's whitegrid theme
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Seed NumPy and TensorFlow with the same value so stochastic steps are repeatable
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(42)

## 2. Load Dataset

In [None]:
# Set data path
data_path = "../../hai-security-dataset/hai-23.05/"

# Load training data.
# os.listdir() returns entries in arbitrary order; sorting makes the row order of
# the concatenated frame (and anything that depends on it, such as a tail-based
# validation split) identical from run to run.
train_files = sorted(f for f in os.listdir(data_path) if f.startswith('hai-train'))
if not train_files:
    raise FileNotFoundError(f"No 'hai-train*' files found in {data_path!r}")

train_dfs = []
for file in train_files:
    print(f"Loading training file: {file}")
    df = pd.read_csv(os.path.join(data_path, file))
    train_dfs.append(df)

# Combine training data into a single frame with a fresh 0..n-1 index
train_df = pd.concat(train_dfs, ignore_index=True)
print(f"Training data shape: {train_df.shape}")

In [None]:
# Load test data.
# os.listdir() returns entries in arbitrary order; sorting keeps the concatenation
# order of the test files deterministic across runs and machines.
test_files = sorted(f for f in os.listdir(data_path) if f.startswith('hai-test'))
if not test_files:
    raise FileNotFoundError(f"No 'hai-test*' files found in {data_path!r}")

test_dfs = []
for file in test_files:
    print(f"Loading test file: {file}")
    df = pd.read_csv(os.path.join(data_path, file))
    test_dfs.append(df)

# Combine test data into a single frame with a fresh 0..n-1 index
test_df = pd.concat(test_dfs, ignore_index=True)
print(f"Test data shape: {test_df.shape}")

In [None]:
# Load label data
label_files = [f for f in os.listdir(data_path) if f.startswith('label-test')]

# Labels must be concatenated in exactly the order the test files were loaded.
# Two independent directory listings give no such guarantee, so pair each test
# file with its label file by name ('hai-test*' -> 'label-test*').
expected_label_files = [f.replace('hai-test', 'label-test', 1) for f in test_files]
if all(name in label_files for name in expected_label_files):
    label_files = expected_label_files
else:
    # Naming convention not recognised: keep positional pairing, but say so,
    # because the pairing then depends on directory listing order.
    print("WARNING: could not match label files to test files by name; "
          "pairing them by position instead.")

label_dfs = []
for file, test_file in zip(label_files, test_files):
    print(f"Loading label file: {file} for test file: {test_file}")
    df = pd.read_csv(os.path.join(data_path, file))
    label_dfs.append(df)

# Combine label data into a single frame with a fresh 0..n-1 index
label_df = pd.concat(label_dfs, ignore_index=True)
print(f"Label data shape: {label_df.shape}")

## 3. Data Preprocessing

In [None]:
# List the columns available in the training split
print("Column names in training dataset:")
print(train_df.columns.tolist())

# Total count of missing cells in each split (a single number per split)
for split_name, frame in (("training", train_df), ("test", test_df)):
    print(f"\nMissing values in {split_name} dataset:")
    print(frame.isna().sum().sum())

In [None]:
# Convert timestamp to datetime
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])

# Check label data structure
print("Label data columns:")
print(label_df.columns.tolist())

# Merge test data with labels
if 'timestamp' in label_df.columns:
    label_df['timestamp'] = pd.to_datetime(label_df['timestamp'])
    # A left merge repeats a test row once per matching label row, so a timestamp
    # that appears twice in the labels would make y_test longer than X_test.
    # Keeping one label row per timestamp preserves the test row count and order.
    unique_labels = label_df.drop_duplicates(subset='timestamp', keep='first')
    test_with_labels = pd.merge(test_df, unique_labels, on='timestamp', how='left')
else:
    # If there's no timestamp in label_df, assume the order matches test_df
    if len(label_df) != len(test_df):
        raise ValueError(
            f"Label rows ({len(label_df)}) do not match test rows ({len(test_df)}); "
            "cannot align labels by position."
        )
    test_with_labels = test_df.copy()
    for col in label_df.columns:
        test_with_labels[col] = label_df[col].values

# Extract attack labels
attack_columns = [col for col in test_with_labels.columns if 'attack' in col.lower()]
if not attack_columns:
    # NOTE(review): falls back to columns named like 'label' when nothing contains
    # 'attack' -- confirm against the header of the label CSV files.
    attack_columns = [col for col in test_with_labels.columns if 'label' in col.lower()]
if not attack_columns:
    raise ValueError(
        "No attack/label column found after merging labels; available columns: "
        f"{test_with_labels.columns.tolist()}"
    )
print(f"Attack label columns: {attack_columns}")

# Create a combined attack label (if multiple attack columns exist):
# a row counts as an attack when any of the attack columns flags it.
if len(attack_columns) > 1:
    test_with_labels['attack_combined'] = test_with_labels[attack_columns].max(axis=1)
    y_test = test_with_labels['attack_combined']
else:
    y_test = test_with_labels[attack_columns[0]]

# Unlabelled rows would only surface later as an obscure metric error; fail here.
missing_labels = int(y_test.isna().sum())
if missing_labels:
    raise ValueError(
        f"{missing_labels} test rows have no label after merging; "
        "check that label timestamps cover the test data."
    )

# Print attack distribution
print(f"\nAttack distribution in test data:\n{y_test.value_counts()}")

In [None]:
# Feature columns = every training column except the timestamp and the label columns
excluded_columns = ['timestamp'] + attack_columns
feature_columns = [col for col in train_df.columns if col not in excluded_columns]
print(f"Number of features: {len(feature_columns)}")

# Raw feature matrices for both splits (same column order in each)
X_train = train_df[feature_columns].to_numpy()
X_test = test_df[feature_columns].to_numpy()

# Standardise using statistics learned from the training split only,
# then apply the same transformation to the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training data shape after preprocessing: {X_train_scaled.shape}")
print(f"Testing data shape after preprocessing: {X_test_scaled.shape}")

## 4. Implement Individual Models

### 4.1 Isolation Forest

In [None]:
# Train Isolation Forest model
print("Training Isolation Forest model...")
start_time = time.time()

# Fit on the scaled training data only
iso_forest = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination='auto',
    random_state=42,
    n_jobs=-1,
).fit(X_train_scaled)

# decision_function returns a continuous score per sample (not -1/1 labels)
iso_forest_scores_train, iso_forest_scores_test = (
    iso_forest.decision_function(split) for split in (X_train_scaled, X_test_scaled)
)

# Flip the sign so that a higher value means "more anomalous"
iso_forest_anomaly_scores_train = np.negative(iso_forest_scores_train)
iso_forest_anomaly_scores_test = np.negative(iso_forest_scores_test)

training_time = time.time() - start_time
print(f"Isolation Forest training completed in {training_time:.2f} seconds")

### 4.2 Local Outlier Factor

In [None]:
# Train Local Outlier Factor model
print("Training Local Outlier Factor model...")
start_time = time.time()

lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination='auto',
    novelty=True,  # Enable scoring of unseen data via decision_function
    n_jobs=-1
)

lof.fit(X_train_scaled)

# Training scores: in novelty mode decision_function must only be given new data,
# because a training point passed to it counts itself among its own neighbours.
# The scores computed during fit are exposed as negative_outlier_factor_;
# subtracting offset_ puts them on the same scale as decision_function.
lof_scores_train = lof.negative_outlier_factor_ - lof.offset_

# Test scores: continuous values (not -1/1 labels)
lof_scores_test = lof.decision_function(X_test_scaled)

# Convert to anomaly scores (higher score = more anomalous)
lof_anomaly_scores_train = -lof_scores_train
lof_anomaly_scores_test = -lof_scores_test

training_time = time.time() - start_time
print(f"Local Outlier Factor training completed in {training_time:.2f} seconds")

### 4.3 One-Class SVM

In [None]:
# Train One-Class SVM model
print("Training One-Class SVM model...")
start_time = time.time()

# RBF-kernel one-class SVM; nu=0.01 is the assumed approximate outlier fraction
ocsvm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.01).fit(X_train_scaled)

# decision_function returns a continuous score per sample (not -1/1 labels)
ocsvm_scores_train, ocsvm_scores_test = (
    ocsvm.decision_function(split) for split in (X_train_scaled, X_test_scaled)
)

# Flip the sign so that a higher value means "more anomalous"
ocsvm_anomaly_scores_train = np.negative(ocsvm_scores_train)
ocsvm_anomaly_scores_test = np.negative(ocsvm_scores_test)

training_time = time.time() - start_time
print(f"One-Class SVM training completed in {training_time:.2f} seconds")

### 4.4 PCA with Mahalanobis Distance

In [None]:
# Apply PCA
print("Applying PCA...")
start_time = time.time()

# Step 1: full PCA, used only to read off the explained-variance spectrum
pca_all = PCA()
pca_all.fit(X_train_scaled)

# Step 2: smallest number of leading components whose cumulative explained
# variance reaches 95% (argmax gives the first True position, hence the +1)
cumulative_variance = np.cumsum(pca_all.explained_variance_ratio_)
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of components for 95% variance: {n_components_95}")

# Step 3: refit PCA with that many components and project both splits
pca = PCA(n_components=n_components_95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Covariance estimate and mean of the projected training data
cov = EmpiricalCovariance().fit(X_train_pca)
mean_vec = X_train_pca.mean(axis=0)

# Function to calculate Mahalanobis distance
def mahalanobis_distance(x, mean, cov):
    """Return the Mahalanobis distance of each row of ``x`` from ``mean``.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features) or (n_features,)
        Samples to score. A single 1-D sample is treated as one row.
    mean : array-like of shape (n_features,)
        Centre of the reference distribution.
    cov : object with a ``covariance_`` attribute
        Fitted covariance estimator; ``covariance_`` must be an invertible
        (n_features, n_features) matrix.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Non-negative distances, one per row of ``x``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``cov.covariance_`` is singular.
    """
    inv_cov = np.linalg.inv(cov.covariance_)
    x_minus_mean = np.atleast_2d(x) - mean
    left = np.dot(x_minus_mean, inv_cov)
    # Row-wise quadratic form d_i^T * inv_cov * d_i. Computing only these n values
    # avoids building the full (n_samples x n_samples) product just for its diagonal.
    squared = np.einsum('ij,ij->i', left, x_minus_mean)
    # Round-off can push an exact zero slightly negative; clamp before the sqrt.
    return np.sqrt(np.maximum(squared, 0.0))

# Distance of every projected sample from the training mean, for both splits
mahal_scores_train, mahal_scores_test = (
    mahalanobis_distance(projected, mean_vec, cov)
    for projected in (X_train_pca, X_test_pca)
)

# A larger distance already means "more anomalous", so no sign flip is needed
mahal_anomaly_scores_train = mahal_scores_train
mahal_anomaly_scores_test = mahal_scores_test

pca_time = time.time() - start_time
print(f"PCA and Mahalanobis distance calculation completed in {pca_time:.2f} seconds")

### 4.5 Autoencoder

In [None]:
# Build and train Autoencoder model
print("Building and training Autoencoder model...")
start_time = time.time()

# Layer widths: input -> 64 -> 32 (bottleneck) -> 64 -> input
input_dim = X_train_scaled.shape[1]
encoding_dim = 32
hidden_dim = 64

# Encoder: two ReLU layers narrowing down to the bottleneck
input_layer = Input(shape=(input_dim,))
encoded = input_layer
for units in (hidden_dim, encoding_dim):
    encoded = Dense(units, activation='relu')(encoded)

# Decoder: widen again; the linear output matches the standardised inputs
decoded = Dense(hidden_dim, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

# Assemble and compile with a mean-squared reconstruction loss
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train to reproduce the input; stop early once validation loss stalls for
# 5 epochs and roll back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = autoencoder.fit(
    X_train_scaled,
    X_train_scaled,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=0,
)

# Reconstruct both splits
X_train_reconstructed = autoencoder.predict(X_train_scaled)
X_test_reconstructed = autoencoder.predict(X_test_scaled)

# Per-sample reconstruction error: mean squared difference across features
autoencoder_mse_train = ((X_train_scaled - X_train_reconstructed) ** 2).mean(axis=1)
autoencoder_mse_test = ((X_test_scaled - X_test_reconstructed) ** 2).mean(axis=1)

# A larger reconstruction error already means "more anomalous"
autoencoder_anomaly_scores_train = autoencoder_mse_train
autoencoder_anomaly_scores_test = autoencoder_mse_test

training_time = time.time() - start_time
print(f"Autoencoder training completed in {training_time:.2f} seconds")

## 5. Normalize Anomaly Scores

In [None]:
# Function to normalize scores to [0, 1] range
def normalize_scores(scores):
    """Min-max scale ``scores`` so the smallest value maps to 0 and the largest to 1.

    Parameters
    ----------
    scores : array-like
        Non-empty collection of numeric scores.

    Returns
    -------
    Same container type as ``scores - min`` (e.g. numpy array or pandas Series)
        Scaled scores. If every score is identical there is no spread to scale,
        so all zeros are returned instead of NaN from a 0/0 division.

    Raises
    ------
    ValueError
        If ``scores`` is empty (raised by ``np.min``).
    """
    min_val = np.min(scores)
    max_val = np.max(scores)
    score_range = max_val - min_val
    if score_range == 0:
        # Constant input: dividing by 1 keeps the (all-zero) numerator as float zeros
        score_range = 1.0
    return (scores - min_val) / score_range

# Rescale every detector's scores so they are comparable inside the ensemble.
# NOTE(review): each split is scaled with its own min/max, so train and test
# scores of the same detector do not share a scale -- confirm this is intended.
iso_forest_norm_train, iso_forest_norm_test = (
    normalize_scores(iso_forest_anomaly_scores_train),
    normalize_scores(iso_forest_anomaly_scores_test),
)

lof_norm_train, lof_norm_test = (
    normalize_scores(lof_anomaly_scores_train),
    normalize_scores(lof_anomaly_scores_test),
)

ocsvm_norm_train, ocsvm_norm_test = (
    normalize_scores(ocsvm_anomaly_scores_train),
    normalize_scores(ocsvm_anomaly_scores_test),
)

mahal_norm_train, mahal_norm_test = (
    normalize_scores(mahal_anomaly_scores_train),
    normalize_scores(mahal_anomaly_scores_test),
)

autoencoder_norm_train, autoencoder_norm_test = (
    normalize_scores(autoencoder_anomaly_scores_train),
    normalize_scores(autoencoder_anomaly_scores_test),
)

## 6. Evaluate Individual Models

In [None]:
# Function to calculate ROC AUC for a model
def calculate_roc_auc(y_true, scores):
    """Return the area under the ROC curve for ``scores`` against ``y_true``.

    ``scores`` must be oriented so that higher values indicate the positive
    (anomalous) class.
    """
    false_positive_rate, true_positive_rate = roc_curve(y_true, scores)[:2]
    return auc(false_positive_rate, true_positive_rate)

# Function to calculate PR AUC for a model
def calculate_pr_auc(y_true, scores):
    """Return the area under the precision-recall curve for ``scores``.

    Recall is used as the x-axis and precision as the y-axis of the integral.
    """
    precision_values, recall_values = precision_recall_curve(y_true, scores)[:2]
    return auc(recall_values, precision_values)

# Normalized test scores of each detector, keyed by display name
individual_test_scores = {
    'Isolation Forest': iso_forest_norm_test,
    'Local Outlier Factor': lof_norm_test,
    'One-Class SVM': ocsvm_norm_test,
    'PCA-Mahalanobis': mahal_norm_test,
    'Autoencoder': autoencoder_norm_test,
}

# ROC AUC per detector (dict order is the order listed above)
roc_auc_by_model = {
    name: calculate_roc_auc(y_test, scores)
    for name, scores in individual_test_scores.items()
}
(iso_forest_roc_auc, lof_roc_auc, ocsvm_roc_auc,
 mahal_roc_auc, autoencoder_roc_auc) = roc_auc_by_model.values()

# PR AUC per detector
pr_auc_by_model = {
    name: calculate_pr_auc(y_test, scores)
    for name, scores in individual_test_scores.items()
}
(iso_forest_pr_auc, lof_pr_auc, ocsvm_pr_auc,
 mahal_pr_auc, autoencoder_pr_auc) = pr_auc_by_model.values()

# Print results
print("ROC AUC Scores:")
for name, value in roc_auc_by_model.items():
    print(f"{name}: {value:.4f}")

print("\nPR AUC Scores:")
for name, value in pr_auc_by_model.items():
    print(f"{name}: {value:.4f}")

In [None]:
# One ROC curve per detector on a shared figure
plt.figure(figsize=(12, 10))

individual_roc_inputs = [
    ('Isolation Forest', iso_forest_norm_test, iso_forest_roc_auc),
    ('Local Outlier Factor', lof_norm_test, lof_roc_auc),
    ('One-Class SVM', ocsvm_norm_test, ocsvm_roc_auc),
    ('PCA-Mahalanobis', mahal_norm_test, mahal_roc_auc),
    ('Autoencoder', autoencoder_norm_test, autoencoder_roc_auc),
]
for curve_name, curve_scores, curve_auc in individual_roc_inputs:
    fpr, tpr, _ = roc_curve(y_test, curve_scores)
    plt.plot(fpr, tpr, label=f'{curve_name} (AUC = {curve_auc:.4f})')

# Diagonal = performance of a random scorer, for reference
plt.plot([0, 1], [0, 1], 'k--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Individual Models')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

## 7. Create Ensemble Model

In [None]:
# Collect the normalized scores of all detectors, one column per detector,
# with identical column names and order for the train and test frames
ensemble_train_df = pd.DataFrame(dict(
    iso_forest=iso_forest_norm_train,
    lof=lof_norm_train,
    ocsvm=ocsvm_norm_train,
    mahalanobis=mahal_norm_train,
    autoencoder=autoencoder_norm_train,
))

ensemble_test_df = pd.DataFrame(dict(
    iso_forest=iso_forest_norm_test,
    lof=lof_norm_test,
    ocsvm=ocsvm_norm_test,
    mahalanobis=mahal_norm_test,
    autoencoder=autoencoder_norm_test,
))

### 7.1 Simple Averaging Ensemble

In [None]:
# Row-wise mean across detectors: every detector contributes equally
ensemble_avg_train = ensemble_train_df.mean(axis=1)
ensemble_avg_test = ensemble_test_df.mean(axis=1)

# Score the averaged test output against the true labels
ensemble_avg_roc_auc = calculate_roc_auc(y_test, ensemble_avg_test)
ensemble_avg_pr_auc = calculate_pr_auc(y_test, ensemble_avg_test)

for metric_name, metric_value in (("ROC AUC", ensemble_avg_roc_auc),
                                  ("PR AUC", ensemble_avg_pr_auc)):
    print(f"Average Ensemble {metric_name}: {metric_value:.4f}")

### 7.2 Weighted Ensemble

In [None]:
# Each detector is weighted by its own ROC AUC.
# NOTE(review): these AUCs were computed on the labelled test data, so the
# weighted ensemble's test metrics are optimistic -- confirm this is acceptable.
weights = dict(
    iso_forest=iso_forest_roc_auc,
    lof=lof_roc_auc,
    ocsvm=ocsvm_roc_auc,
    mahalanobis=mahal_roc_auc,
    autoencoder=autoencoder_roc_auc,
)

# Rescale so the weights sum to 1
total_weight = sum(weights.values())
normalized_weights = {}
for model_name, raw_weight in weights.items():
    normalized_weights[model_name] = raw_weight / total_weight

print("Normalized weights:")
for model, weight in normalized_weights.items():
    print(f"{model}: {weight:.4f}")

# Weighted sum of the per-detector score columns, accumulated column by column
ensemble_weighted_train = 0
ensemble_weighted_test = 0
for model, weight in normalized_weights.items():
    ensemble_weighted_train = ensemble_weighted_train + ensemble_train_df[model] * weight
    ensemble_weighted_test = ensemble_weighted_test + ensemble_test_df[model] * weight

# Score the weighted test output against the true labels
ensemble_weighted_roc_auc = calculate_roc_auc(y_test, ensemble_weighted_test)
ensemble_weighted_pr_auc = calculate_pr_auc(y_test, ensemble_weighted_test)

print(f"\nWeighted Ensemble ROC AUC: {ensemble_weighted_roc_auc:.4f}")
print(f"Weighted Ensemble PR AUC: {ensemble_weighted_pr_auc:.4f}")

### 7.3 Maximum Ensemble

In [None]:
# Row-wise maximum: a sample is as anomalous as its most alarmed detector says
ensemble_max_train = ensemble_train_df.max(axis=1)
ensemble_max_test = ensemble_test_df.max(axis=1)

# Score the max-combined test output against the true labels
ensemble_max_roc_auc = calculate_roc_auc(y_test, ensemble_max_test)
ensemble_max_pr_auc = calculate_pr_auc(y_test, ensemble_max_test)

for metric_name, metric_value in (("ROC AUC", ensemble_max_roc_auc),
                                  ("PR AUC", ensemble_max_pr_auc)):
    print(f"Maximum Ensemble {metric_name}: {metric_value:.4f}")

## 8. Compare Ensemble Methods

In [None]:
# ROC curves of the three ensembles plus the strongest single detector
plt.figure(figsize=(12, 10))

ensemble_roc_inputs = [
    ('Average Ensemble', ensemble_avg_test, ensemble_avg_roc_auc),
    ('Weighted Ensemble', ensemble_weighted_test, ensemble_weighted_roc_auc),
    ('Maximum Ensemble', ensemble_max_test, ensemble_max_roc_auc),
]
for curve_name, curve_scores, curve_auc in ensemble_roc_inputs:
    fpr, tpr, _ = roc_curve(y_test, curve_scores)
    plt.plot(fpr, tpr, label=f'{curve_name} (AUC = {curve_auc:.4f})')

# Best individual model = highest ROC AUC; tuple layout is (name, auc, scores)
individual_candidates = [
    ('Isolation Forest', iso_forest_roc_auc, iso_forest_norm_test),
    ('Local Outlier Factor', lof_roc_auc, lof_norm_test),
    ('One-Class SVM', ocsvm_roc_auc, ocsvm_norm_test),
    ('PCA-Mahalanobis', mahal_roc_auc, mahal_norm_test),
    ('Autoencoder', autoencoder_roc_auc, autoencoder_norm_test),
]
best_model = max(individual_candidates, key=lambda candidate: candidate[1])

fpr, tpr, _ = roc_curve(y_test, best_model[2])
plt.plot(fpr, tpr, label=f'Best Individual: {best_model[0]} (AUC = {best_model[1]:.4f})')

# Diagonal = performance of a random scorer, for reference
plt.plot([0, 1], [0, 1], 'k--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Ensemble Methods')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# Candidate ensembles as (display name, ROC AUC, test scores)
ensemble_methods = [
    ('Average Ensemble', ensemble_avg_roc_auc, ensemble_avg_test),
    ('Weighted Ensemble', ensemble_weighted_roc_auc, ensemble_weighted_test),
    ('Maximum Ensemble', ensemble_max_roc_auc, ensemble_max_test),
]

# Winner = highest ROC AUC (the first one listed wins a tie)
best_ensemble = max(ensemble_methods, key=lambda candidate: candidate[1])
print(f"Best ensemble method: {best_ensemble[0]} with ROC AUC = {best_ensemble[1]:.4f}")

# Scores of the winning ensemble, reused by the following sections
best_ensemble_scores = best_ensemble[2]

## 9. Threshold Optimization for Best Ensemble

In [None]:
# Sweep 100 evenly spaced cut-offs across the observed score range.
# NOTE(review): the cut-off is chosen using y_test, so the metrics reported at
# the chosen threshold are optimistic -- confirm this is acceptable.
thresholds = np.linspace(min(best_ensemble_scores), max(best_ensemble_scores), 100)
f1_scores, precision_scores, recall_scores = [], [], []

for threshold in thresholds:
    # Flag a sample as anomalous (1) when its score reaches the cut-off
    y_pred_threshold = np.where(best_ensemble_scores >= threshold, 1, 0)
    for metric_fn, collected in ((f1_score, f1_scores),
                                 (precision_score, precision_scores),
                                 (recall_score, recall_scores)):
        collected.append(metric_fn(y_test, y_pred_threshold))

# Keep the cut-off with the highest F1 (the first one if several tie)
best_threshold_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_idx]
best_f1 = f1_scores[best_threshold_idx]
best_precision = precision_scores[best_threshold_idx]
best_recall = recall_scores[best_threshold_idx]

print(f"Best threshold: {best_threshold:.4f}")
print(f"Best F1 score: {best_f1:.4f}")
print(f"Precision at best threshold: {best_precision:.4f}")
print(f"Recall at best threshold: {best_recall:.4f}")

In [None]:
# F1, precision and recall as functions of the cut-off, with the chosen one marked
plt.figure(figsize=(12, 8))
for metric_values, line_format, line_label in (
    (f1_scores, 'b-', 'F1 Score'),
    (precision_scores, 'g-', 'Precision'),
    (recall_scores, 'r-', 'Recall'),
):
    plt.plot(thresholds, metric_values, line_format, label=line_label)

plt.axvline(
    x=best_threshold,
    color='k',
    linestyle='--',
    label=f'Best Threshold: {best_threshold:.4f}',
)
plt.title('Performance Metrics vs. Threshold')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Binary predictions at the chosen cut-off (1 = anomaly, 0 = normal)
y_pred_optimized = np.where(best_ensemble_scores >= best_threshold, 1, 0)

# Confusion matrix: rows are actual classes, columns are predicted classes
print("Confusion Matrix with Optimized Threshold:")
cm_optimized = confusion_matrix(y_test, y_pred_optimized)
print(cm_optimized)

# Same matrix as an annotated heatmap
class_names = ['Normal', 'Anomaly']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_optimized,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix with Optimized Threshold')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Per-class precision / recall / F1 summary
print("\nClassification Report with Optimized Threshold:")
print(classification_report(y_test, y_pred_optimized))

## 10. Analyze Anomalies Over Time

In [None]:
# One row per test sample: when it happened, the true label, the prediction
# and the ensemble score that produced the prediction
results_df = pd.DataFrame(dict(
    timestamp=test_df['timestamp'],
    actual=y_test,
    predicted=y_pred_optimized,
    anomaly_score=best_ensemble_scores,
))

# Thin the data to at most 10,000 evenly spaced rows so the plot stays readable
sample_size = min(10000, len(results_df))
sample_indices = np.linspace(0, len(results_df) - 1, sample_size, dtype=int)
sample_df = results_df.iloc[sample_indices]

# Overlay the actual and predicted anomaly flags along the time axis
plt.figure(figsize=(16, 8))
for column_name, line_format, line_label in (
    ('actual', 'b-', 'Actual'),
    ('predicted', 'r-', 'Predicted'),
):
    plt.plot(sample_df['timestamp'], sample_df[column_name], line_format,
             alpha=0.5, label=line_label)
plt.title('Actual vs Predicted Anomalies Over Time')
plt.xlabel('Time')
plt.ylabel('Anomaly (1) / Normal (0)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Scatter of ensemble scores over time, coloured by the true label
plt.figure(figsize=(16, 8))

# Red for labelled attacks, blue for everything else
colors = np.where(sample_df['actual'] == 1, 'red', 'blue')

plt.scatter(
    sample_df['timestamp'],
    sample_df['anomaly_score'],
    c=colors,
    alpha=0.5,
    s=10,
)
plt.title('Anomaly Scores Over Time')
plt.xlabel('Time')
plt.ylabel('Anomaly Score')
plt.grid(True)

# Horizontal line at the chosen decision threshold
plt.axhline(y=best_threshold, color='g', linestyle='--')

# Hand-built legend: the scatter colours come from an array, so matplotlib
# cannot derive legend entries from the plotted artists
from matplotlib.lines import Line2D
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=face_color,
           markersize=10, label=marker_label)
    for face_color, marker_label in (('red', 'Actual Anomaly'), ('blue', 'Normal'))
]
legend_elements.append(Line2D([0], [0], color='g', linestyle='--', label='Threshold'))
plt.legend(handles=legend_elements)

plt.show()

## 11. Save the Models and Ensemble

In [None]:
# Output directory for all artifacts (created if it does not exist yet)
model_dir = "./"
os.makedirs(model_dir, exist_ok=True)

# Pickle the fitted scikit-learn objects and the PCA-space mean vector
pickled_artifacts = [
    ("isolation_forest_model.pkl", iso_forest),
    ("lof_model.pkl", lof),
    ("ocsvm_model.pkl", ocsvm),
    ("pca_model.pkl", pca),
    ("covariance_model.pkl", cov),
    ("mean_vector.pkl", mean_vec),
]
for artifact_name, artifact in pickled_artifacts:
    with open(os.path.join(model_dir, artifact_name), 'wb') as file:
        pickle.dump(artifact, file)

# The Keras autoencoder is written with its own serializer
autoencoder.save(os.path.join(model_dir, "autoencoder_model.h5"))

# The scaler is needed to preprocess new data the same way as the training data
with open(os.path.join(model_dir, "scaler.pkl"), 'wb') as file:
    pickle.dump(scaler, file)

# Ensemble settings: winning method, its threshold, and the weights
# (weights are stored only when the weighted ensemble won)
if best_ensemble[0] == 'Weighted Ensemble':
    saved_weights = normalized_weights
else:
    saved_weights = None

ensemble_info = {
    'best_ensemble_method': best_ensemble[0],
    'best_threshold': best_threshold,
    'normalized_weights': saved_weights,
}

with open(os.path.join(model_dir, "ensemble_info.pkl"), 'wb') as file:
    pickle.dump(ensemble_info, file)

print("All models and ensemble information saved successfully.")

## 12. Conclusion

In this notebook, we have:

1. Loaded and preprocessed the HAI-23.05 dataset
2. Implemented five different anomaly detection models:
   - Isolation Forest
   - Local Outlier Factor
   - One-Class SVM
   - PCA with Mahalanobis Distance
   - Autoencoder
3. Evaluated each model's performance using ROC AUC and PR AUC
4. Created three ensemble methods:
   - Simple Averaging
   - Weighted Ensemble
   - Maximum Ensemble
5. Identified the best ensemble method and optimized its threshold
6. Analyzed anomaly detection results over time
7. Saved all models and ensemble information for future use

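The ensemble approach is intended to improve on the individual models by leveraging the strengths of different anomaly detection techniques; whether it does so on this dataset should be read from the ROC AUC and PR AUC comparison in Section 8 rather than assumed. Note that the ensemble weights and the decision threshold are selected using the labelled test data, so the reported metrics are optimistic estimates. This kind of approach can be particularly useful for industrial control systems, where different types of anomalies may be better detected by different methods.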