# HAI-20.07 Dataset Anomaly Detection - Isolation Forest Model

This notebook uses the Isolation Forest algorithm for anomaly detection on the HAI-20.07 dataset.

The HAI dataset contains data from industrial control systems (ICS), where training data does not include attack labels, while test data includes attack labels.

## 1. Import Necessary Libraries

In [None]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import pickle
import time

# Import machine learning libraries
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, auc, roc_curve, f1_score, precision_score, recall_score

# Set plot style
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

## 2. Load Dataset

In [None]:
# Set data path
data_path = "../../hai-security-dataset/hai-20.07/"

# Load training data
train_files = [f for f in os.listdir(data_path) if f.startswith('train')]
train_dfs = []

for file in train_files:
    print(f"Loading training file: {file}")
    df = pd.read_csv(f"{data_path}{file}", sep=";")
    train_dfs.append(df)
    
# Combine training data
train_df = pd.concat(train_dfs, ignore_index=True)
print(f"Training data shape: {train_df.shape}")

In [None]:
# Load test data
test_files = [f for f in os.listdir(data_path) if f.startswith('test')]
test_dfs = []

for file in test_files:
    print(f"Loading test file: {file}")
    df = pd.read_csv(f"{data_path}{file}", sep=";")
    test_dfs.append(df)
    
# Combine test data
test_df = pd.concat(test_dfs, ignore_index=True)
print(f"Test data shape: {test_df.shape}")

## 3. Data Preprocessing

In [None]:
# Check basic information of the dataset
print("Column names in training dataset:")
print(train_df.columns.tolist())

# Check for missing values
print("\nMissing values in training dataset:")
print(train_df.isnull().sum().sum())

print("\nMissing values in test dataset:")
print(test_df.isnull().sum().sum())

In [None]:
# Convert timestamp to datetime
train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

# Extract attack labels from test data
attack_columns = [col for col in test_df.columns if 'attack' in col.lower()]
print(f"Attack label columns: {attack_columns}")

# Create a combined attack label (if multiple attack columns exist)
if len(attack_columns) > 1:
    test_df['attack_combined'] = test_df[attack_columns].max(axis=1)
    y_test = test_df['attack_combined']
else:
    y_test = test_df[attack_columns[0]]

# Print attack distribution
print(f"\nAttack distribution in test data:\n{y_test.value_counts()}")

In [None]:
# Select features for training and testing
# Exclude timestamp and attack labels
feature_columns = [col for col in train_df.columns if col not in ['time'] + attack_columns]
print(f"Number of features: {len(feature_columns)}")

# Prepare training and testing data
X_train = train_df[feature_columns].values
X_test = test_df[feature_columns].values

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training data shape after preprocessing: {X_train_scaled.shape}")
print(f"Testing data shape after preprocessing: {X_test_scaled.shape}")

## 4. Train Isolation Forest Model

In [None]:
# Define hyperparameters
n_estimators = 100
max_samples = 'auto'
contamination = 'auto'
random_state = 42

# Create and train the model
print("Training Isolation Forest model...")
start_time = time.time()

model = IsolationForest(
    n_estimators=n_estimators,
    max_samples=max_samples,
    contamination=contamination,
    random_state=random_state,
    n_jobs=-1
)

model.fit(X_train_scaled)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

## 5. Save the Model

In [None]:
# Create a directory to save the model if it doesn't exist
model_dir = "./"
os.makedirs(model_dir, exist_ok=True)

# Save the model
model_filename = os.path.join(model_dir, "isolation_forest_model.pkl")
scaler_filename = os.path.join(model_dir, "scaler.pkl")

with open(model_filename, 'wb') as file:
    pickle.dump(model, file)
    
with open(scaler_filename, 'wb') as file:
    pickle.dump(scaler, file)
    
print(f"Model saved to {model_filename}")
print(f"Scaler saved to {scaler_filename}")

## 6. Evaluate the Model

In [None]:
# Predict anomalies on test data
# Isolation Forest returns -1 for anomalies and 1 for normal data
print("Predicting anomalies on test data...")
start_time = time.time()

y_pred_raw = model.predict(X_test_scaled)
# Convert to binary format (0 for normal, 1 for anomaly)
y_pred = np.where(y_pred_raw == -1, 1, 0)

prediction_time = time.time() - start_time
print(f"Prediction completed in {prediction_time:.2f} seconds")

In [None]:
# Calculate anomaly scores
# Lower scores (more negative) indicate higher anomaly probability
anomaly_scores = model.decision_function(X_test_scaled)

# Plot anomaly score distribution
plt.figure(figsize=(12, 6))
plt.hist(anomaly_scores, bins=50, alpha=0.7)
plt.title('Anomaly Score Distribution')
plt.xlabel('Anomaly Score')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
# Evaluate the model
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Normal', 'Anomaly'],
            yticklabels=['Normal', 'Anomaly'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Calculate metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
# ROC Curve
# For Isolation Forest, we need to use negative of anomaly scores as the decision function
# since lower scores indicate higher anomaly probability
fpr, tpr, thresholds = roc_curve(y_test, -anomaly_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Precision-Recall Curve
precision_curve, recall_curve, _ = precision_recall_curve(y_test, -anomaly_scores)
pr_auc = auc(recall_curve, precision_curve)

plt.figure(figsize=(10, 8))
plt.plot(recall_curve, precision_curve, color='blue', lw=2, label=f'PR curve (area = {pr_auc:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()

## 7. Analyze Anomalies Over Time

In [None]:
# Create a DataFrame with timestamps, actual labels, and predictions
results_df = pd.DataFrame({
    'timestamp': test_df['time'],
    'actual': y_test,
    'predicted': y_pred,
    'anomaly_score': anomaly_scores
})

# Plot actual vs predicted anomalies over time
plt.figure(figsize=(16, 8))

# Sample data for better visualization if dataset is large
sample_size = min(10000, len(results_df))
sample_indices = np.linspace(0, len(results_df)-1, sample_size, dtype=int)
sample_df = results_df.iloc[sample_indices]

plt.plot(sample_df['timestamp'], sample_df['actual'], 'b-', alpha=0.5, label='Actual')
plt.plot(sample_df['timestamp'], sample_df['predicted'], 'r-', alpha=0.5, label='Predicted')
plt.title('Actual vs Predicted Anomalies Over Time')
plt.xlabel('Time')
plt.ylabel('Anomaly (1) / Normal (0)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Plot anomaly scores over time with actual labels
plt.figure(figsize=(16, 8))

# Create a colormap based on actual labels
colors = np.where(sample_df['actual'] == 1, 'red', 'blue')

plt.scatter(sample_df['timestamp'], sample_df['anomaly_score'], c=colors, alpha=0.5, s=10)
plt.title('Anomaly Scores Over Time')
plt.xlabel('Time')
plt.ylabel('Anomaly Score')
plt.grid(True)

# Add a horizontal line at the threshold (typically 0 for Isolation Forest)
plt.axhline(y=0, color='g', linestyle='--')

# Add a legend
from matplotlib.lines import Line2D
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=10, label='Actual Anomaly'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='blue', markersize=10, label='Normal'),
    Line2D([0], [0], color='g', linestyle='--', label='Threshold')
]
plt.legend(handles=legend_elements)

plt.show()

## 8. Threshold Optimization

In [None]:
# Try different thresholds for anomaly detection
thresholds = np.linspace(min(anomaly_scores), max(anomaly_scores), 100)
f1_scores = []
precision_scores = []
recall_scores = []

for threshold in thresholds:
    y_pred_threshold = np.where(anomaly_scores <= threshold, 1, 0)
    f1_scores.append(f1_score(y_test, y_pred_threshold))
    precision_scores.append(precision_score(y_test, y_pred_threshold))
    recall_scores.append(recall_score(y_test, y_pred_threshold))

# Find the threshold that maximizes F1 score
best_threshold_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_idx]
best_f1 = f1_scores[best_threshold_idx]
best_precision = precision_scores[best_threshold_idx]
best_recall = recall_scores[best_threshold_idx]

print(f"Best threshold: {best_threshold:.4f}")
print(f"Best F1 score: {best_f1:.4f}")
print(f"Precision at best threshold: {best_precision:.4f}")
print(f"Recall at best threshold: {best_recall:.4f}")

In [None]:
# Plot F1, precision, and recall scores for different thresholds
plt.figure(figsize=(12, 8))
plt.plot(thresholds, f1_scores, 'b-', label='F1 Score')
plt.plot(thresholds, precision_scores, 'g-', label='Precision')
plt.plot(thresholds, recall_scores, 'r-', label='Recall')
plt.axvline(x=best_threshold, color='k', linestyle='--', label=f'Best Threshold: {best_threshold:.4f}')
plt.title('Performance Metrics vs. Threshold')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Apply the optimized threshold
y_pred_optimized = np.where(anomaly_scores <= best_threshold, 1, 0)

# Evaluate with optimized threshold
print("Confusion Matrix with Optimized Threshold:")
cm_optimized = confusion_matrix(y_test, y_pred_optimized)
print(cm_optimized)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_optimized, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Normal', 'Anomaly'],
            yticklabels=['Normal', 'Anomaly'])
plt.title('Confusion Matrix with Optimized Threshold')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Classification report
print("\nClassification Report with Optimized Threshold:")
print(classification_report(y_test, y_pred_optimized))

## 9. Save the Optimized Model

In [None]:
# Save the optimized threshold
threshold_filename = os.path.join(model_dir, "optimized_threshold.pkl")
with open(threshold_filename, 'wb') as file:
    pickle.dump(best_threshold, file)
    
print(f"Optimized threshold saved to {threshold_filename}")

## 10. Conclusion

In this notebook, we have:

1. Loaded and preprocessed the HAI-20.07 dataset
2. Trained an Isolation Forest model for anomaly detection
3. Evaluated the model's performance using various metrics
4. Optimized the anomaly detection threshold to maximize F1 score
5. Saved the model, scaler, and optimized threshold for future use

The Isolation Forest algorithm has demonstrated its ability to detect anomalies in the industrial control system data without requiring labeled training data, making it suitable for real-world scenarios where attack data may not be available during training.