# eTaPR Evaluation for HAI-21.03 Dataset

This notebook evaluates anomaly detection results on the HAI-21.03 industrial control system security dataset using eTaPR (enhanced Time-series Aware Precision and Recall) metrics. eTaPR is designed specifically for evaluating time-series anomaly detection.

In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sys
from tqdm import tqdm
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')

# Set plot style
# The seaborn theme is applied after the matplotlib style sheet; presumably it
# takes precedence where the two overlap — TODO confirm the intended look.
plt.style.use('ggplot')
sns.set(style="darkgrid")

## 1. Global Variables Setup

In [None]:
# Set paths
# All locations are relative to the current working directory.
OUTPUT_DIR = 'hai-security-dataset/processed'  # scanned for test*_processed_enhanced.csv files
MODEL_DIR = 'hai-security-dataset/models'  # pickled detector and operating-point results
RESULTS_DIR = 'hai-security-dataset/results'  # existing metric CSVs are read here; new outputs are written here
ETAPR_DIR = 'eTaPR'  # appended to sys.path; the eTaPR wheel is also looked up here

# Create results directory if it doesn't exist
os.makedirs(RESULTS_DIR, exist_ok=True)

## 2. Import eTaPR Package

In [None]:
# Add eTaPR directory to path
# so that a package located under ETAPR_DIR can satisfy the import below.
sys.path.append(ETAPR_DIR)

# Import eTaPR
try:
    from eTaPR_pkg import etapr
    print("Successfully imported eTaPR package")
except ImportError:
    print("Failed to import eTaPR package. Installing from wheel file...")
    
    # Install eTaPR package
    # pip is run through the current interpreter (sys.executable) so the wheel
    # is installed into the environment this code is running in.
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", os.path.join(ETAPR_DIR, "eTaPR-22.6.1-py3-none-any.whl")])
    
    # Try importing again
    try:
        from eTaPR_pkg import etapr
        print("Successfully installed and imported eTaPR package")
    except ImportError:
        # NOTE(review): `etapr` stays unbound on this path, so any later
        # reference to it raises NameError; only a message is printed here.
        print("Failed to install and import eTaPR package. Please install manually.")

## 3. Load Model Results

In [None]:
# Load the pickled operating-point results (a mapping keyed by operating point).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
operating_point_path = os.path.join(MODEL_DIR, 'operating_point_results.pkl')
with open(operating_point_path, 'rb') as results_file:
    operating_point_results = pickle.load(results_file)

print(f"Loaded operating point results for {len(operating_point_results)} operating points")

# Traditional per-file metrics are optional: test_results is None when the CSV
# is absent, and later comparison steps check for that.
test_results_path = os.path.join(RESULTS_DIR, 'test_results.csv')
test_results = pd.read_csv(test_results_path) if os.path.exists(test_results_path) else None

if test_results is None:
    print("Test results file not found. Will generate results from scratch.")
else:
    print(f"Loaded test results for {len(test_results)} test files")

In [None]:
# Restore the previously saved detector object from MODEL_DIR.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
detector_path = os.path.join(MODEL_DIR, 'multi_threshold_detector.pkl')
with open(detector_path, 'rb') as detector_file:
    detector = pickle.load(detector_file)

print("Loaded multi-threshold detector")

In [None]:
# Load test data
# os.listdir returns names in arbitrary order; sorting makes the order of
# test_data (and of every table and plot derived from it) reproducible.
test_files = sorted(
    f for f in os.listdir(OUTPUT_DIR)
    if f.startswith('test') and f.endswith('_processed_enhanced.csv')
)
test_data = {}

for file in test_files:
    file_path = os.path.join(OUTPUT_DIR, file)
    # The part before the first underscore is the test name (e.g., 'test1').
    file_name = file.split('_')[0]
    df = pd.read_csv(file_path)
    test_data[file_name] = df
    print(f"Loaded {file_name}: {df.shape[0]} rows, {df.shape[1]} columns")

## 4. Prepare Data for eTaPR Evaluation

In [None]:
def _labels_to_ranges(labels):
    """Return the inclusive (start, end) index pairs of each run of 1s in *labels*."""
    # np.asarray makes indexing positional, so a pandas Series with a
    # non-default index is handled the same way as a list or array.
    mask = np.asarray(labels) == 1
    if mask.size == 0:
        return []

    # Pad with False on both sides so runs touching either boundary still
    # produce a rising (+1) and a falling (-1) edge in the difference.
    padded = np.concatenate(([False], mask, [False])).astype(np.int8)
    edges = np.diff(padded)
    starts = np.flatnonzero(edges == 1)
    ends = np.flatnonzero(edges == -1) - 1  # falling edge is one past the run

    # tolist() yields plain Python ints, matching what callers write to disk.
    return list(zip(starts.tolist(), ends.tolist()))


def prepare_etapr_data(ground_truth, predictions, test_name, operating_point):
    """
    Prepare data for eTaPR evaluation.

    Converts two binary label sequences into lists of inclusive index ranges,
    one range per maximal run of consecutive 1s, and prints a short summary.

    Args:
        ground_truth (array-like): Ground truth labels (1 marks an anomaly)
        predictions (array-like): Predicted labels (1 marks an anomaly)
        test_name (str): Name of the test file (used in the summary only)
        operating_point (str): Operating point used (used in the summary only)

    Returns:
        tuple: (ground_truth_ranges, prediction_ranges), each a list of
        (start, end) tuples of ints with both ends inclusive
    """
    ground_truth_ranges = _labels_to_ranges(ground_truth)
    prediction_ranges = _labels_to_ranges(predictions)

    # Print summary
    print(f"Test: {test_name}, Operating Point: {operating_point}")
    print(f"Ground Truth: {len(ground_truth_ranges)} anomaly ranges")
    print(f"Predictions: {len(prediction_ranges)} anomaly ranges")

    return ground_truth_ranges, prediction_ranges

In [None]:
def generate_etapr_files(ground_truth_ranges, prediction_ranges, test_name, operating_point):
    """
    Write the ground-truth and prediction ranges to CSV files.

    Both files are created under ``RESULTS_DIR/etapr_files`` (the directory is
    created if missing) and contain one ``start,end`` line per range.
    NOTE(review): whether this layout is what the eTaPR loader expects is not
    visible here — confirm against the package's file format.

    Args:
        ground_truth_ranges (list): List of (start, end) tuples for ground truth
        prediction_ranges (list): List of (start, end) tuples for predictions
        test_name (str): Name of the test file (part of both file names)
        operating_point (str): Operating point used (part of the prediction file name)

    Returns:
        tuple: (ground_truth_file, prediction_file) paths
    """
    target_dir = os.path.join(RESULTS_DIR, 'etapr_files')
    os.makedirs(target_dir, exist_ok=True)

    ground_truth_file = os.path.join(target_dir, f"{test_name}_ground_truth.csv")
    prediction_file = os.path.join(target_dir, f"{test_name}_{operating_point}_predictions.csv")

    # Ground truth is written first, then predictions; each file is rewritten.
    outputs = (
        (ground_truth_file, ground_truth_ranges),
        (prediction_file, prediction_ranges),
    )
    for path, ranges in outputs:
        with open(path, 'w') as out:
            out.writelines(f"{start},{end}\n" for start, end in ranges)

    print(f"Generated eTaPR files: {ground_truth_file}, {prediction_file}")

    return ground_truth_file, prediction_file

In [None]:
# Build range lists and range files for every operating point.
etapr_data = {}

# The test name stored in the first entry is used for all operating points.
test_name = list(operating_point_results.values())[0]['test_name']
print(f"Generating eTaPR data for test file: {test_name}")

for operating_point, result in operating_point_results.items():
    # Binary label sequences -> inclusive (start, end) ranges
    ground_truth_ranges, prediction_ranges = prepare_etapr_data(
        result['ground_truth'],
        result['anomaly_labels'],
        test_name,
        operating_point,
    )

    # Ranges -> CSV files on disk
    ground_truth_file, prediction_file = generate_etapr_files(
        ground_truth_ranges,
        prediction_ranges,
        test_name,
        operating_point,
    )

    # Keep both the in-memory ranges and the file paths for this operating point
    etapr_data[operating_point] = dict(
        test_name=test_name,
        ground_truth_ranges=ground_truth_ranges,
        prediction_ranges=prediction_ranges,
        ground_truth_file=ground_truth_file,
        prediction_file=prediction_file,
    )

## 5. Run eTaPR Evaluation

In [None]:
def run_etapr_evaluation(ground_truth_file, prediction_file, theta=0.5):
    """
    Run eTaPR on a pair of range files and return the scores.

    Any exception raised while evaluating is caught, reported with a printed
    message, and turned into a ``None`` return value.

    NOTE(review): this relies on ``etapr.eTaPR()`` taking no arguments and
    exposing ``load_anomalies``, ``load_predictions``, ``set_theta``,
    ``calculate`` and the ``precision``/``recall``/``f1`` attributes. Confirm
    against the installed eTaPR version; a mismatch would only show up as the
    printed error below.

    Args:
        ground_truth_file (str): Path to ground truth file
        prediction_file (str): Path to prediction file
        theta (float): Theta parameter for eTaPR

    Returns:
        dict or None: ``{'precision', 'recall', 'f1_score', 'theta'}`` on
        success, ``None`` if anything failed
    """
    try:
        evaluator = etapr.eTaPR()

        # Inputs must be loaded and theta set before the metrics are computed.
        evaluator.load_anomalies(ground_truth_file)
        evaluator.load_predictions(prediction_file)
        evaluator.set_theta(theta)
        evaluator.calculate()

        scores = {
            'precision': evaluator.precision,
            'recall': evaluator.recall,
            'f1_score': evaluator.f1,
            'theta': theta,
        }

        # Reporting stays inside the try so a formatting failure is handled
        # like any other evaluation error.
        print(f"eTaPR Results (theta={theta})")
        print(f"Precision: {scores['precision']:.4f}")
        print(f"Recall: {scores['recall']:.4f}")
        print(f"F1 Score: {scores['f1_score']:.4f}")
    except Exception as e:
        print(f"Error running eTaPR evaluation: {e}")
        return None

    return scores

In [None]:
# Evaluate every operating point over a sweep of theta values.
# etapr_results maps operating point -> {theta: result dict}.
etapr_results = {}

for operating_point, data in etapr_data.items():
    print(f"\nRunning eTaPR evaluation for operating point: {operating_point}")

    theta_results = {}
    for theta in (0.1, 0.3, 0.5, 0.7, 0.9):
        result = run_etapr_evaluation(
            data['ground_truth_file'],
            data['prediction_file'],
            theta=theta,
        )
        # Failed evaluations return None and are left out of the sweep.
        if not result:
            continue
        theta_results[theta] = result

    etapr_results[operating_point] = theta_results

## 6. Visualize eTaPR Results

In [None]:
# Create DataFrame with eTaPR results (one row per operating point and theta)
etapr_rows = []

for operating_point, theta_results in etapr_results.items():
    for theta, result in theta_results.items():
        etapr_rows.append({
            'Operating Point': operating_point,
            'Theta': theta,
            'Precision': result['precision'],
            'Recall': result['recall'],
            'F1 Score': result['f1_score']
        })

# Explicit columns keep the schema stable even when no evaluation succeeded;
# without them an empty frame has no columns and later column lookups such as
# etapr_df['Theta'] raise KeyError.
etapr_df = pd.DataFrame(
    etapr_rows,
    columns=['Operating Point', 'Theta', 'Precision', 'Recall', 'F1 Score'],
)
etapr_df

In [None]:
# Bar charts of the eTaPR scores per operating point, restricted to theta=0.5
theta_05_df = etapr_df[etapr_df['Theta'] == 0.5]

plt.figure(figsize=(15, 6))

# One panel each for precision, recall, and F1 score
metrics = ['Precision', 'Recall', 'F1 Score']
for panel, metric in enumerate(metrics, start=1):
    plt.subplot(1, 3, panel)
    scores = theta_05_df[metric]
    plt.bar(theta_05_df['Operating Point'], scores)
    plt.title(f'{metric} (eTaPR, theta=0.5)')
    plt.xticks(rotation=45)
    plt.ylim(0, 1)

    # Annotate each bar with its value, slightly above the bar top
    for bar_index, score in enumerate(scores):
        plt.text(bar_index, score + 0.02, f"{score:.4f}", ha='center')

plt.tight_layout()
plt.show()

In [None]:
# Line charts of the eTaPR scores against theta for the 'balanced' operating point
balanced_df = etapr_df[etapr_df['Operating Point'] == 'balanced']

plt.figure(figsize=(15, 6))

# One panel each for precision, recall, and F1 score
metrics = ['Precision', 'Recall', 'F1 Score']
for panel, metric in enumerate(metrics, start=1):
    plt.subplot(1, 3, panel)
    plt.plot(balanced_df['Theta'], balanced_df[metric], marker='o')
    plt.title(f'{metric} vs. Theta (eTaPR, balanced)')
    plt.xlabel('Theta')
    plt.ylabel(metric)
    plt.grid(True)
    plt.ylim(0, 1)

plt.tight_layout()
plt.show()

## 7. Compare eTaPR with Traditional Metrics

In [None]:
# Traditional (point-wise) metrics: read the saved comparison CSV if present,
# otherwise derive the table from the loaded operating point results.
threshold_comparison_path = os.path.join(RESULTS_DIR, 'threshold_comparison.csv')

if not os.path.exists(threshold_comparison_path):
    print("Threshold comparison file not found. Using operating point results.")

    # One row per operating point, using the metrics stored with each result
    traditional_df = pd.DataFrame([
        {
            'Operating Point': operating_point,
            'Precision': result['precision'],
            'Recall': result['recall'],
            'F1 Score': result['f1_score'],
        }
        for operating_point, result in operating_point_results.items()
    ])
else:
    traditional_df = pd.read_csv(threshold_comparison_path)
    print(f"Loaded threshold comparison results for {len(traditional_df)} operating points")

traditional_df

In [None]:
# Join traditional and eTaPR (theta=0.5) metrics per operating point; columns
# present in both frames get the '_Traditional' / '_eTaPR' suffixes.
comparison_df = traditional_df.merge(
    theta_05_df,
    on='Operating Point',
    suffixes=('_Traditional', '_eTaPR'),
)
comparison_df

In [None]:
# Grouped bar charts: traditional vs. eTaPR metrics for each operating point
plt.figure(figsize=(15, 15))

positions = np.arange(len(comparison_df))
# (column suffix, horizontal offset of the bar group, legend label)
series_specs = (
    ('_Traditional', -0.2, 'Traditional'),
    ('_eTaPR', 0.2, 'eTaPR (theta=0.5)'),
)

# One stacked panel each for precision, recall, and F1 score
metrics = ['Precision', 'Recall', 'F1 Score']
for panel, metric in enumerate(metrics, start=1):
    plt.subplot(3, 1, panel)

    # Traditional bars sit left of each tick, eTaPR bars right of it
    for suffix, offset, label in series_specs:
        plt.bar(positions + offset, comparison_df[f'{metric}{suffix}'], width=0.4, label=label)

    plt.title(f'Comparison of {metric}')
    plt.xticks(positions, comparison_df['Operating Point'], rotation=45)
    plt.ylabel(metric)
    plt.ylim(0, 1)
    plt.legend()
    plt.grid(True)

    # Annotate every bar with its value, slightly above the bar top
    for suffix, offset, _ in series_specs:
        for bar_index, score in enumerate(comparison_df[f'{metric}{suffix}']):
            plt.text(bar_index + offset, score + 0.02, f"{score:.2f}", ha='center')

plt.tight_layout()
plt.show()

## 8. Evaluate All Test Files with eTaPR

In [None]:
def evaluate_test_file_etapr(test_name, test_df, operating_point='balanced', theta=0.5):
    """
    Score one test dataframe with the module-level detector and eTaPR.

    Every column except 'time' and 'attack' is fed to the detector as a
    feature; 'attack' is used as the ground truth. The detector's labels are
    post-processed, converted to ranges, written to files and evaluated.

    Args:
        test_name (str): Name of the test file
        test_df (pd.DataFrame): Test dataframe containing an 'attack' column
        operating_point (str): Operating point to use
        theta (float): Theta parameter for eTaPR

    Returns:
        dict or None: Evaluation results (scores plus both range lists), or
        None when the eTaPR evaluation produced no result
    """
    print(f"\nEvaluating {test_name} with operating point: {operating_point}, theta: {theta}")

    # Split the frame into the feature matrix and the ground-truth labels
    excluded_cols = ['time', 'attack']
    feature_cols = [col for col in test_df.columns if col not in excluded_cols]
    X_test = test_df[feature_cols].values
    y_test = test_df['attack'].values

    # predict returns three values; only the labels are needed here
    _, raw_labels, _ = detector.predict(X_test, operating_point=operating_point)
    processed_labels = detector.post_process_anomalies(raw_labels)

    # Labels -> ranges -> files -> eTaPR scores
    ground_truth_ranges, prediction_ranges = prepare_etapr_data(
        y_test, processed_labels, test_name, operating_point
    )
    ground_truth_file, prediction_file = generate_etapr_files(
        ground_truth_ranges, prediction_ranges, test_name, operating_point
    )
    etapr_result = run_etapr_evaluation(ground_truth_file, prediction_file, theta=theta)

    if not etapr_result:
        return None

    summary = {
        'test_name': test_name,
        'operating_point': operating_point,
        'theta': theta,
    }
    for key in ('precision', 'recall', 'f1_score'):
        summary[key] = etapr_result[key]
    summary['ground_truth_ranges'] = ground_truth_ranges
    summary['prediction_ranges'] = prediction_ranges
    return summary

In [None]:
# Score every loaded test file at a single operating point and theta
best_operating_point = 'balanced'  # Use the balanced operating point
theta = 0.5  # Use theta=0.5

# Lazily evaluated in test_data order; files whose evaluation returned None
# are dropped by the filter below.
candidate_results = (
    evaluate_test_file_etapr(name, frame, operating_point=best_operating_point, theta=theta)
    for name, frame in test_data.items()
)
all_etapr_results = [outcome for outcome in candidate_results if outcome]

In [None]:
# Create DataFrame with all eTaPR results (one row per evaluated test file)
# Explicit columns keep the schema stable even when no file was evaluated
# successfully; without them an empty frame has no columns and later lookups
# such as all_etapr_df['Test File'] raise KeyError.
all_etapr_df = pd.DataFrame(
    [
        {
            'Test File': result['test_name'],
            'Operating Point': result['operating_point'],
            'Theta': result['theta'],
            'Precision': result['precision'],
            'Recall': result['recall'],
            'F1 Score': result['f1_score']
        }
        for result in all_etapr_results
    ],
    columns=['Test File', 'Operating Point', 'Theta', 'Precision', 'Recall', 'F1 Score'],
)

all_etapr_df

In [None]:
# Bar charts of the eTaPR scores for each test file
plt.figure(figsize=(15, 6))

# One panel each for precision, recall, and F1 score
metrics = ['Precision', 'Recall', 'F1 Score']
for panel, metric in enumerate(metrics, start=1):
    plt.subplot(1, 3, panel)
    scores = all_etapr_df[metric]
    plt.bar(all_etapr_df['Test File'], scores)
    plt.title(f'{metric} (eTaPR, theta={theta})')
    plt.xticks(rotation=45)
    plt.ylim(0, 1)

    # Annotate each bar with its value, slightly above the bar top
    for bar_index, score in enumerate(scores):
        plt.text(bar_index, score + 0.02, f"{score:.4f}", ha='center')

plt.tight_layout()
plt.show()

In [None]:
# Unweighted mean of each eTaPR metric over the evaluated test files
avg_etapr_metrics = {
    column: all_etapr_df[column].mean()
    for column in ('Precision', 'Recall', 'F1 Score')
}

# Report the averages
print(f"Average eTaPR Metrics Across All Test Files (theta={theta}):")
for metric, value in avg_etapr_metrics.items():
    print(f"{metric}: {value:.4f}")

## 9. Compare eTaPR with Traditional Metrics Across Test Files

In [None]:
# Compare traditional and eTaPR metrics per test file (only when the
# traditional results CSV was found earlier).
if test_results is not None:
    # Join on the shared 'Test File' column; metric columns present in both
    # frames get the '_Traditional' / '_eTaPR' suffixes.
    test_comparison = pd.merge(test_results, all_etapr_df, on='Test File', suffixes=('_Traditional', '_eTaPR'))

    # Display comparison
    # A bare expression inside an `if` block is not rendered by the notebook
    # (only a trailing top-level expression is), so print the table explicitly.
    print(test_comparison[['Test File', 'Precision_Traditional', 'Precision_eTaPR', 'Recall_Traditional', 'Recall_eTaPR', 'F1 Score_Traditional', 'F1 Score_eTaPR']].to_string(index=False))

In [None]:
# Grouped bar charts: traditional vs. eTaPR metrics for each test file
# (skipped when no traditional results were loaded)
if test_results is not None:
    plt.figure(figsize=(15, 15))

    positions = np.arange(len(test_comparison))
    # (column suffix, horizontal offset of the bar group, legend label)
    series_specs = (
        ('_Traditional', -0.2, 'Traditional'),
        ('_eTaPR', 0.2, f'eTaPR (theta={theta})'),
    )

    # One stacked panel each for precision, recall, and F1 score
    metrics = ['Precision', 'Recall', 'F1 Score']
    for panel, metric in enumerate(metrics, start=1):
        plt.subplot(3, 1, panel)

        # Traditional bars sit left of each tick, eTaPR bars right of it
        for suffix, offset, label in series_specs:
            plt.bar(positions + offset, test_comparison[f'{metric}{suffix}'], width=0.4, label=label)

        plt.title(f'Comparison of {metric} Across Test Files')
        plt.xticks(positions, test_comparison['Test File'], rotation=45)
        plt.ylabel(metric)
        plt.ylim(0, 1)
        plt.legend()
        plt.grid(True)

        # Annotate every bar with its value, slightly above the bar top
        for suffix, offset, _ in series_specs:
            for bar_index, score in enumerate(test_comparison[f'{metric}{suffix}']):
                plt.text(bar_index + offset, score + 0.02, f"{score:.2f}", ha='center')

    plt.tight_layout()
    plt.show()

## 10. Save eTaPR Results

In [None]:
# Persist the per-operating-point eTaPR table
etapr_results_path = os.path.join(RESULTS_DIR, 'etapr_results.csv')
etapr_df.to_csv(etapr_results_path, index=False)
print(f"Saved eTaPR results to {etapr_results_path}")

# Persist the per-test-file eTaPR table
etapr_test_results_path = os.path.join(RESULTS_DIR, 'etapr_test_results.csv')
all_etapr_df.to_csv(etapr_test_results_path, index=False)
print(f"Saved eTaPR test results to {etapr_test_results_path}")

# The comparison table only exists when traditional test results were loaded
if test_results is not None:
    metrics_comparison_path = os.path.join(RESULTS_DIR, 'metrics_comparison.csv')
    test_comparison.to_csv(metrics_comparison_path, index=False)
    print(f"Saved metrics comparison to {metrics_comparison_path}")