# Random Forest Model for HAI Security Dataset Anomaly Detection

This notebook implements a Random Forest classifier for anomaly detection on the HAI security dataset. Random Forests are effective for this task due to their ability to handle high-dimensional data and capture complex relationships between features.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import time
import joblib
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
import warnings

# Silence all warnings so the cell outputs stay readable
warnings.filterwarnings('ignore')

# Plot appearance: ggplot base style, then seaborn's whitegrid theme,
# then the default figure size and font size used for every figure below
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 12})

## 1. Load Preprocessed Data

First, let's load the preprocessed data created in the preprocessing notebook.

In [None]:
def load_processed_data(file_path):
    """
    Load processed data from an NPZ file into a DataFrame.

    The archive is read through its ``data`` entry (the table values) and its
    ``columns`` entry (one name per column of ``data``).

    Args:
        file_path: Path to the NPZ file

    Returns:
        DataFrame: Loaded data, with columns named after the ``columns`` entry

    Raises:
        KeyError: If the archive has no ``data`` or ``columns`` entry
    """
    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, which
    # can execute arbitrary code. Only load archives from a trusted source.
    #
    # np.load returns a lazy NpzFile that keeps the file open; the context
    # manager closes it as soon as both arrays have been read into memory.
    with np.load(file_path, allow_pickle=True) as npz_data:
        data = npz_data['data']
        columns = npz_data['columns']

    # Convert to DataFrame
    return pd.DataFrame(data, columns=columns)

In [None]:
# Load the saved preprocessor dictionary from disk
preprocessor_path = './models/hai_hai_20_07_standard_preprocessor.joblib'
preprocessor_dict = joblib.load(preprocessor_path)

# Pull out the column metadata the rest of the notebook relies on
feature_columns, attack_columns, timestamp_col = (
    preprocessor_dict[entry]
    for entry in ('feature_columns', 'attack_columns', 'timestamp_col')
)

# Summarize what was loaded
n_features = len(feature_columns)
print(f"Number of features: {n_features}")
print(f"Attack columns: {attack_columns}")
print(f"Timestamp column: {timestamp_col}")

In [None]:
# Directories that hold the processed NPZ files for each split
train_data_dir = './processed_data/hai-20.07/train'
test_data_dir = './processed_data/hai-20.07/test'

# Collect every NPZ file per split, sorted so the order is deterministic
train_pattern = f'{train_data_dir}/*.npz'
test_pattern = f'{test_data_dir}/*.npz'
train_files = sorted(glob.glob(train_pattern))
test_files = sorted(glob.glob(test_pattern))

# Show only the file names, not the full paths
print(f"Training files: {list(map(os.path.basename, train_files))}")
print(f"Test files: {list(map(os.path.basename, test_files))}")

## 2. Prepare Data for Random Forest

Unlike an LSTM, a Random Forest doesn't require sequence data, so we'll treat each individual time point as one sample.

In [None]:
def load_and_prepare_tabular_data(file_paths, feature_cols, target_col=None, max_files=None, sample_fraction=None):
    """
    Load and prepare tabular data from multiple files.

    Each file is loaded with ``load_processed_data``, optionally subsampled,
    and its feature (and target) columns are stacked with those of the other
    files. The original row index of every file is kept, so the combined
    index may contain duplicates; use positional access on the result.

    Args:
        file_paths: List of file paths
        feature_cols: List of feature column names
        target_col: Target column name (None for unsupervised learning)
        max_files: Maximum number of files to load (None for all files)
        sample_fraction: Fraction of each file's rows to sample (None or
            >= 1.0 keeps all rows); sampling uses a fixed seed

    Returns:
        tuple: (X, y) - Features and targets. ``y`` is None when no target
        column was requested or when none of the files contains it; ``X`` is
        an empty DataFrame when no file was loaded.

    Raises:
        ValueError: If ``target_col`` is present in some files but missing
            from others, because X and y could not be kept row-aligned.
    """
    # Limit the number of files if specified
    if max_files is not None:
        file_paths = file_paths[:max_files]

    wants_target = target_col is not None
    all_X = []
    all_y = []
    files_without_target = []

    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        print(f"Processing {file_name}...")

        # Load data
        df = load_processed_data(file_path)

        # Sample data if specified (fixed seed keeps runs reproducible)
        if sample_fraction is not None and sample_fraction < 1.0:
            df = df.sample(frac=sample_fraction, random_state=42)

        # Extract features
        all_X.append(df[feature_cols])

        # Extract target if requested, remembering files that lack it
        if wants_target:
            if target_col in df.columns:
                all_y.append(df[target_col])
            else:
                files_without_target.append(file_name)

    # Features are appended for every file, so labels from only a subset of
    # the files would silently produce a y that is shorter than X.
    if all_y and files_without_target:
        raise ValueError(
            f"Target column '{target_col}' is missing from: {files_without_target}; "
            "features and labels would be misaligned."
        )

    # Combine data from all files
    combined_X = pd.concat(all_X) if all_X else pd.DataFrame()
    combined_y = pd.concat(all_y) if all_y else None

    return combined_X, combined_y

In [None]:
# Parameters for data loading
# Use the 'attack' column as the target only when the preprocessor lists attack columns
target_col = 'attack' if attack_columns else None
# Keep 10% of each file's rows to reduce memory usage
sample_fraction = 0.1

# Training split: features only, no labels requested
print("Loading and preparing training data...")
X_train, _ = load_and_prepare_tabular_data(
    train_files,
    feature_columns,
    target_col=None,
    max_files=2,
    sample_fraction=sample_fraction,
)

# Test split: features plus labels when a target column is configured
print("\nLoading and preparing test data...")
X_test, y_test = load_and_prepare_tabular_data(
    test_files,
    feature_columns,
    target_col=target_col,
    max_files=2,
    sample_fraction=sample_fraction,
)

# Report the resulting shapes
print(f"\nTraining data shape: {X_train.shape}")
if y_test is None:
    print(f"Test data shape: {X_test.shape}")
else:
    print(f"Test data shape: {X_test.shape}, Test labels shape: {y_test.shape}")

## 3. Feature Selection

Let's perform feature selection to reduce dimensionality and improve model performance.

In [None]:
def select_features(X_train, X_test, n_estimators=100, max_features=20):
    """
    Choose features with a Random Forest fitted on Isolation Forest pseudo-labels.

    No ground-truth labels are used. An Isolation Forest first labels the
    training rows, a Random Forest is fitted to those labels, and
    SelectFromModel then picks features from the forest's importances.

    Args:
        X_train: Training features (DataFrame; its column names are used)
        X_test: Test features
        n_estimators: Number of trees in the forest
        max_features: Maximum number of features to select

    Returns:
        tuple: (X_train_selected, X_test_selected, selected_features,
        importances, indices) - the reduced train/test data, the names of the
        kept features, the importance of every input feature, and the feature
        positions ordered from most to least important
    """
    print("Performing feature selection...")
    started_at = time.time()

    # Pseudo-labels from an Isolation Forest, remapped to
    # 1 for normal and 0 for anomaly
    outlier_detector = IsolationForest(random_state=42, n_jobs=-1)
    raw_labels = outlier_detector.fit_predict(X_train)
    pseudo_labels = np.where(raw_labels == 1, 1, 0)

    # Random Forest fitted on the pseudo-labels supplies the importances
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=42, n_jobs=-1)
    forest.fit(X_train, pseudo_labels)

    # Importances and their descending ranking
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Reduce both splits with the already-fitted forest
    selector = SelectFromModel(forest, max_features=max_features, prefit=True)
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Map the boolean support mask back to column names
    selected_features = X_train.columns[selector.get_support()].tolist()

    print(f"Feature selection completed in {time.time() - started_at:.2f} seconds")
    print(f"Selected {len(selected_features)} features out of {X_train.shape[1]}")

    return X_train_selected, X_test_selected, selected_features, importances, indices

In [None]:
# Run feature selection, then unpack the reduced data, the kept feature names,
# and the importance values with their ranking
selection_result = select_features(X_train, X_test, max_features=20)
X_train_selected, X_test_selected, selected_features, importances, indices = selection_result

print(f"Selected features: {selected_features}")

In [None]:
# Bar chart of the (at most) 20 highest-ranked feature importances
top_indices = indices[:20]
n_bars = min(20, len(indices))
bar_positions = range(n_bars)
bar_labels = [X_train.columns[i] for i in top_indices]

plt.figure(figsize=(12, 8))
plt.title("Feature Importances")
plt.bar(bar_positions, importances[top_indices], align="center")
plt.xticks(bar_positions, bar_labels, rotation=90)
plt.xlim([-1, n_bars])
plt.tight_layout()
plt.show()

## 4. Build and Train Random Forest Model

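Now we'll build and train a Random Forest model for anomaly detection. Because the training data is loaded without labels, the forest is trained on pseudo-labels (normal vs. anomaly) generated by an Isolation Forest.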

In [None]:
def train_random_forest(X_train, y_train=None, param_grid=None, cv=3, n_jobs=-1):
    """
    Fit a Random Forest, tuning its hyperparameters with a randomized search.

    When no labels are given, pseudo-labels are produced by an Isolation
    Forest fitted on ``X_train`` (1 for normal, 0 for anomaly).

    Args:
        X_train: Training features
        y_train: Training labels (None for unsupervised learning)
        param_grid: Hyperparameter values to sample from (None for defaults)
        cv: Number of cross-validation folds
        n_jobs: Number of parallel jobs

    Returns:
        RandomForestClassifier: Best estimator found by the search
    """
    print("Training Random Forest model...")
    started_at = time.time()

    # Fall back to the default search space
    if param_grid is None:
        param_grid = dict(
            n_estimators=[100, 200],
            max_depth=[None, 10, 20],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
            bootstrap=[True, False],
        )

    # Without labels, derive pseudo-labels from an Isolation Forest
    # (1 for normal, 0 for anomaly)
    if y_train is None:
        outlier_detector = IsolationForest(random_state=42, n_jobs=n_jobs)
        y_train = np.where(outlier_detector.fit_predict(X_train) == 1, 1, 0)

    # Randomized search samples 10 candidates instead of trying the full grid
    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions=param_grid,
        n_iter=10,
        cv=cv,
        verbose=1,
        random_state=42,
        n_jobs=n_jobs,
    )
    search.fit(X_train, y_train)

    print(f"Training completed in {time.time() - started_at:.2f} seconds")
    print(f"Best parameters: {search.best_params_}")

    return search.best_estimator_

In [None]:
# Reduced search space so training finishes faster
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 20],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
    bootstrap=[True],
)

# Fit the model on the selected features; no labels are passed, so the
# training function generates pseudo-labels itself
rf_model = train_random_forest(
    X_train_selected,
    y_train=None,
    param_grid=param_grid,
    cv=3,
)

## 5. Evaluate Model and Detect Anomalies

Now we'll evaluate the model's performance on the test data.

In [None]:
# Score the test data: column 1 of predict_proba is the probability of the normal class
class_probabilities = rf_model.predict_proba(X_test_selected)
y_pred_proba = class_probabilities[:, 1]

# Flip it so that a higher score means a more anomalous sample
anomaly_scores = 1 - y_pred_proba

# Summary statistics of the scores
print("Anomaly score statistics:")
print(f"Min: {np.min(anomaly_scores):.6f}")
print(f"Max: {np.max(anomaly_scores):.6f}")
print(f"Mean: {np.mean(anomaly_scores):.6f}")
print(f"Std: {np.std(anomaly_scores):.6f}")

In [None]:
# Histogram of the anomaly scores: linear counts on the left, log counts on the right
plt.figure(figsize=(12, 6))

histogram_panels = (
    (1, False, 'Anomaly Score Distribution', 'Frequency'),
    (2, True, 'Anomaly Score Distribution (Log Scale)', 'Frequency (Log Scale)'),
)
for panel_position, use_log_counts, panel_title, y_axis_label in histogram_panels:
    plt.subplot(1, 2, panel_position)
    plt.hist(anomaly_scores, bins=50, log=use_log_counts)
    plt.title(panel_title)
    plt.xlabel('Anomaly Score')
    plt.ylabel(y_axis_label)

plt.tight_layout()
plt.show()

### 5.1 Determine Anomaly Threshold

We need to determine a threshold for the anomaly score to classify data points as normal or anomalous.

In [None]:
def find_optimal_threshold(scores, y_true):
    """
    Pick the anomaly-score threshold that maximizes the G-mean on the ROC curve.

    The G-mean is the geometric mean of sensitivity (TPR) and specificity
    (1 - FPR). The function also prints the chosen operating point and the
    ROC AUC, and plots the ROC curve with the chosen point highlighted.

    Args:
        scores: Anomaly scores
        y_true: True labels (0 for normal, 1 for anomaly)

    Returns:
        float: Optimal threshold value
    """
    # ROC curve over all candidate thresholds
    fpr, tpr, thresholds = roc_curve(y_true, scores)

    # Geometric mean of sensitivity and specificity at each threshold
    specificity = 1 - fpr
    gmeans = np.sqrt(tpr * specificity)

    # Operating point with the highest G-mean
    best = np.argmax(gmeans)
    optimal_threshold = thresholds[best]
    best_fpr = fpr[best]
    best_tpr = tpr[best]

    print(f"Optimal threshold: {optimal_threshold:.6f}")
    print(f"At this threshold - TPR: {best_tpr:.4f}, FPR: {best_fpr:.4f}, G-mean: {gmeans[best]:.4f}")

    # ROC curve, chance diagonal, and the chosen operating point
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, marker='.')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.scatter(best_fpr, best_tpr, marker='o', color='red', label=f'Optimal (Threshold = {optimal_threshold:.6f})')
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Area under the ROC curve
    print(f"ROC AUC: {auc(fpr, tpr):.4f}")

    return optimal_threshold

In [None]:
# Choose the decision threshold for the anomaly scores
if y_test is None:
    # No labels available: fall back to mean + 2 standard deviations of the scores
    score_mean = np.mean(anomaly_scores)
    score_std = np.std(anomaly_scores)
    threshold = score_mean + 2 * score_std
    print(f"Using statistical threshold: {threshold:.6f}")
else:
    # Labels available: pick the threshold from the ROC curve
    threshold = find_optimal_threshold(anomaly_scores, y_test)

In [None]:
# Classify as anomaly if anomaly score >= threshold.
# The comparison is inclusive because roc_curve counts a sample as positive when
# its score is >= the threshold. A strict ">" would drop every sample tied at a
# ROC-derived threshold, so the predictions would no longer match the TPR/FPR
# reported for that threshold.
y_pred = (anomaly_scores >= threshold).astype(int)

# Evaluate if labels are available
if y_test is not None:
    # Calculate confusion matrix (true labels vs. predicted labels)
    cm = confusion_matrix(y_test, y_pred)

    # Plot confusion matrix as an annotated heatmap of raw counts
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Print per-class precision, recall and F1
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

### 5.2 Feature Importance Analysis

Let's analyze which features are most important for anomaly detection.

In [None]:
# Importances reported by the trained model, one per selected feature
feature_importances = rf_model.feature_importances_

# Pair each selected feature with its importance, most important first
importance_table = pd.DataFrame({
    'Feature': selected_features,
    'Importance': feature_importances
})
importance_df = importance_table.sort_values('Importance', ascending=False)

# Horizontal bar chart of the importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances for Anomaly Detection')
plt.tight_layout()
plt.show()

### 5.3 Analyze Anomaly Patterns

Let's analyze the patterns of detected anomalies to understand their characteristics.

In [None]:
# Positional indices of the test samples predicted as anomalies
anomaly_indices = np.where(y_pred == 1)[0]

if len(anomaly_indices) > 0:
    # Get anomalous samples (y_pred follows the row order of X_test, so
    # positional indexing is used)
    anomaly_samples = X_test.iloc[anomaly_indices]

    # Calculate statistics for each feature in anomalous samples
    anomaly_stats = anomaly_samples.describe()

    # Statistics over ALL test samples (anomalous ones included) as the baseline
    overall_stats = X_test.describe()

    # Difference in means, expressed in units of the overall standard deviation
    mean_diff = (anomaly_stats.loc['mean'] - overall_stats.loc['mean']) / overall_stats.loc['std']

    # Sort features by absolute difference
    sorted_features = mean_diff.abs().sort_values(ascending=False)
    top_features = sorted_features.index[:10]

    # Plot top 10 features with largest differences (signed values)
    plt.figure(figsize=(12, 8))
    sns.barplot(x=mean_diff[top_features], y=top_features)
    plt.title('Top 10 Features with Largest Differences in Anomalies (Normalized)')
    plt.xlabel('Difference in Means (normalized by std)')
    plt.axvline(x=0, color='r', linestyle='--')
    plt.tight_layout()
    plt.show()

    print("Top 10 features with largest differences in anomalies:")
    for feature in top_features:
        # The baseline is the mean over the whole test set, so it is labelled
        # "Overall" rather than "Normal"
        print(f"{feature}: Overall mean = {overall_stats.loc['mean', feature]:.4f}, "
              f"Anomaly mean = {anomaly_stats.loc['mean', feature]:.4f}, "
              f"Difference = {mean_diff[feature]:.4f} std")
else:
    print("No anomalies detected.")

## 6. Save Model and Results

Finally, let's save the model and results for future use.

In [None]:
# Make sure the output directory exists, then persist the trained model
os.makedirs('./models', exist_ok=True)
joblib.dump(rf_model, './models/random_forest_hai_20_07.joblib')

# Persist what is needed to reuse the model: the decision threshold,
# the selected feature names, and the model's feature importances
model_metadata = dict(
    threshold=threshold,
    selected_features=selected_features,
    feature_importances=feature_importances.tolist(),
)
joblib.dump(model_metadata, './models/random_forest_metadata_hai_20_07.joblib')

print("Model and metadata saved successfully.")

## 7. Conclusion

In this notebook, we've implemented a Random Forest model for anomaly detection on the HAI security dataset. The model is trained on Isolation Forest pseudo-labels derived from the unlabeled training data and scores each test point by its predicted probability of being anomalous. Key steps included:

1. Loading and preparing preprocessed data
2. Performing feature selection to identify the most important features
3. Building and training a Random Forest model
4. Detecting anomalies using anomaly scores
5. Evaluating the model's performance
6. Analyzing feature importance and anomaly patterns

The Random Forest approach provides an effective method for detecting anomalies in industrial control system data, with the added benefit of interpretability through feature importance analysis.