# HAI-20.07 Isolation Forest Model

Anomaly detection using the Isolation Forest algorithm.

In [None]:
import sys
sys.path.append('..')

import polars as pl
import numpy as np
from pathlib import Path
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV

from utils.model_utils import ModelManager
from utils.evaluation import Evaluator
from utils.visualization import Visualizer

## 1. Load Preprocessed Data

In [None]:
# Load the preprocessed train/test splits (parquet files under processed_data/)
processed_dir = Path('processed_data')

train_df1 = pl.read_parquet(processed_dir / 'train1.parquet')
train_df2 = pl.read_parquet(processed_dir / 'train2.parquet')
test_df1 = pl.read_parquet(processed_dir / 'test1.parquet')
test_df2 = pl.read_parquet(processed_dir / 'test2.parquet')

# Combine both training files into a single frame for fitting
train_data = pl.concat([train_df1, train_df2])

# Separate features and labels.
# Columns starting with 'attack' are labels. The 'time' column (read further
# down for the detection-delay metric; presumably a timestamp) is not a
# feature either: leaving it in would feed it to the model or break the
# numeric conversion, so it is excluded here. If a frame has no 'time'
# column the extra condition is simply a no-op.
feature_cols = [
    col for col in train_data.columns
    if col != 'time' and not col.startswith('attack')
]
X_train = train_data.select(feature_cols).to_numpy()
X_test1 = test_df1.select(feature_cols).to_numpy()
X_test2 = test_df2.select(feature_cols).to_numpy()

# Ground-truth labels for each test set, flattened to 1-D
y_test1 = test_df1.select('attack').to_numpy().ravel()
y_test2 = test_df2.select('attack').to_numpy().ravel()

## 2. Model Training

In [None]:
# Hyperparameters for the forest, kept together so they are easy to tweak.
# random_state is pinned so repeated runs build the same trees.
forest_params = {
    'n_estimators': 100,
    'max_samples': 'auto',
    'contamination': 'auto',
    'random_state': 42,
}
model = IsolationForest(**forest_params)

# Fit on the training features only (no labels are passed)
model.fit(X_train)

## 3. Anomaly Detection

In [None]:
# Anomaly scores: score_samples() is negated so that a higher value
# means "more anomalous"
scores_test1 = np.negative(model.score_samples(X_test1))
scores_test2 = np.negative(model.score_samples(X_test2))

# Binary predictions: predict() flags outliers with -1, which is mapped
# here to 1 (anomaly); everything else becomes 0 (normal)
y_pred1 = np.equal(model.predict(X_test1), -1).astype(int)
y_pred2 = np.equal(model.predict(X_test2), -1).astype(int)

## 4. Performance Evaluation

In [None]:
# One evaluator instance is shared by both test sets
evaluator = Evaluator()

# --- Test set 1: basic metrics, eTaPR, and detection delay ---
metrics1 = evaluator.calculate_basic_metrics(y_test1, y_pred1)
etapr1 = evaluator.calculate_etapr(y_test1, y_pred1)
# The delay metric additionally receives the flattened 'time' column
delay1 = evaluator.calculate_detection_delay(
    y_test1, y_pred1, test_df1.select('time').to_numpy().ravel()
)

# --- Test set 2: same three measurements ---
metrics2 = evaluator.calculate_basic_metrics(y_test2, y_pred2)
etapr2 = evaluator.calculate_etapr(y_test2, y_pred2)
delay2 = evaluator.calculate_detection_delay(
    y_test2, y_pred2, test_df2.select('time').to_numpy().ravel()
)

# Report both test sets in the same three-line layout; the second header
# carries a leading newline so the two reports are separated by a blank line
for report_title, basic_report, etapr_report, delay_report in (
    ("Test Set 1 Results:", metrics1, etapr1, delay1),
    ("\nTest Set 2 Results:", metrics2, etapr2, delay2),
):
    print(report_title)
    print(f"Basic Metrics: {basic_report}")
    print(f"eTaPR Metrics: {etapr_report}")
    print(f"Detection Delay: {delay_report}")

## 5. Visualization

In [None]:
# Initialize visualizer (configured with save_dir='figures')
visualizer = Visualizer(save_dir='figures')

# Plot confusion matrix for test set 1 (class order: Normal, Attack)
cm1 = evaluator.calculate_confusion_matrix(y_test1, y_pred1)
fig = visualizer.plot_confusion_matrix(cm1, ['Normal', 'Attack'], title='Test Set 1 Confusion Matrix')
fig.show()

# Plot time series with detected anomalies.
# The model's predictions are attached as a 'predicted_attack' column so the
# plot can mark them. `with_columns` is used because the singular
# `DataFrame.with_column` was deprecated and later removed from polars.
key_features = ['P1_PIT01', 'P1_LIT01', 'P1_FT01', 'P2_SIT01']
test_df1_with_pred = test_df1.with_columns(pl.Series('predicted_attack', y_pred1))
fig = visualizer.plot_time_series(
    test_df1_with_pred,
    key_features,
    attack_col='predicted_attack',
    title='Detected Anomalies (Test Set 1)'
)
fig.show()

## 6. Save Model

In [None]:
# Model manager configured with base_dir='models'
model_manager = ModelManager(base_dir='models')

# Hyperparameters are read back from the fitted estimator
model_parameters = {
    'n_estimators': model.n_estimators,
    'max_samples': model.max_samples,
    'contamination': model.contamination,
}

# Evaluation results for each test set, in the same three-key layout
performance_summary = {
    'test1': {
        'basic_metrics': metrics1,
        'etapr_metrics': etapr1,
        'detection_delay': delay1,
    },
    'test2': {
        'basic_metrics': metrics2,
        'etapr_metrics': etapr2,
        'detection_delay': delay2,
    },
}

# Metadata stored alongside the model: identity, parameters, and results
metadata = dict(
    model_type='isolation_forest',
    dataset_version='20.07',
    parameters=model_parameters,
    performance=performance_summary,
)

# Persist the fitted model together with its metadata
model_manager.save_sklearn_model(
    model=model,
    model_name='isolation_forest',
    version='v1',
    metadata=metadata,
)