# HAI Dataset Preprocessing

This notebook preprocesses the HAI security dataset, using Polars for efficient data processing.

In [None]:
import polars as pl
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set plot style
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later
# removed the old names, so prefer the new name and fall back only on old versions.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

In [None]:
# Define paths
# All paths are relative, so they resolve against the current working directory.
# Root of the raw dataset; load_dataset() looks for one sub-directory per version here.
DATA_DIR = Path('../hai-security-dataset')
# Default output directory used by save_processed_data().
PROCESSED_DIR = Path('../data/processed')
# Location for intermediate artifacts (not referenced by the cells shown here).
INTERIM_DIR = Path('../data/interim')

In [None]:
def load_dataset(version='hai-22.04'):
    """Lazily scan the train and test CSV files of one HAI dataset version.

    Args:
        version: Name of the sub-directory of ``DATA_DIR`` to read from.

    Returns:
        A ``(train_dfs, test_dfs)`` tuple. Each element is a list with one
        ``pl.LazyFrame`` per file matching ``train*.csv`` / ``test*.csv``,
        ordered by file path. Both lists are empty if nothing matches.
    """
    version_dir = DATA_DIR / version

    # glob() yields files in an unspecified, filesystem-dependent order.
    # Sorting keeps the position of each frame (and therefore any numbering
    # derived from it by the caller) stable across runs and machines.
    train_files = sorted(version_dir.glob('train*.csv'))
    test_files = sorted(version_dir.glob('test*.csv'))

    # scan_csv is lazy: nothing is read until the frames are collected.
    train_dfs = [pl.scan_csv(f) for f in train_files]
    test_dfs = [pl.scan_csv(f) for f in test_files]

    return train_dfs, test_dfs

In [None]:
def preprocess_dataframe(df):
    """Parse the timestamp column and forward-fill missing values.

    Args:
        df: A Polars frame with a string column named ``timestamp``.

    Returns:
        The frame with ``timestamp`` converted to a datetime column and nulls
        in every column replaced by the previous non-null value. Nulls at the
        very start of a column have no previous value and stay null.
    """
    # These must be two separate steps: inside a single with_columns() call both
    # expressions would emit a column named 'timestamp', which Polars rejects
    # as a duplicate output name.
    return df.with_columns(
        # Convert timestamp to datetime
        pl.col('timestamp').str.to_datetime()
    ).with_columns(
        # Handle missing values
        pl.all().fill_null(strategy='forward')
    )

In [None]:
def extract_features(df, window_sizes=(10, 30, 60)):
    """Add control-error and rolling-statistic features for each control loop.

    For every loop in the table below this adds:

    * ``<loop>_error``: set-point (SP) minus process variable (PV);
    * ``<loop>_PV_mean_<w>`` / ``<loop>_PV_std_<w>``: rolling mean and standard
      deviation of the PV for each window size ``w``;
    * ``<cv>_mean_<w>`` / ``<cv>_std_<w>``: the same statistics for each
      control-variable (CV) column of the loop.

    Args:
        df: A Polars frame containing the SP, PV and CV columns listed below.
        window_sizes: Rolling window lengths, in rows. Defaults to the
            original fixed set ``(10, 30, 60)``.

    Returns:
        The input frame with the feature columns appended.
    """
    control_loops = {
        'P1-PC': {
            'SP': 'P1_B2016',
            'PV': 'P1_PIT01',
            'CV': ['P1_PCV01D', 'P1_PCV02D']
        },
        'P1-LC': {
            'SP': 'P1_B3004',
            'PV': 'P1_LIT01',
            'CV': ['P1_LCV01D']
        },
        'P1-FC': {
            'SP': 'P1_B3005',
            'PV': 'P1_FT03',
            'CV': ['P1_FCV03D']
        },
        'P1-TC': {
            'SP': 'P1_B4022',
            'PV': 'P1_TIT01',
            'CV': ['P1_FCV01D', 'P1_FCV02D']
        }
    }

    features = []

    # 'signals' rather than 'vars' so the builtin vars() is not shadowed.
    for loop_name, signals in control_loops.items():
        pv = pl.col(signals['PV'])

        # Calculate control error
        features.append(
            (pl.col(signals['SP']) - pv).alias(f'{loop_name}_error')
        )

        # Calculate moving statistics
        for size in window_sizes:
            features.extend([
                pv.rolling_mean(size).alias(f'{loop_name}_PV_mean_{size}'),
                pv.rolling_std(size).alias(f'{loop_name}_PV_std_{size}')
            ])

            # Calculate CV statistics
            for cv in signals['CV']:
                features.extend([
                    pl.col(cv).rolling_mean(size).alias(f'{cv}_mean_{size}'),
                    pl.col(cv).rolling_std(size).alias(f'{cv}_std_{size}')
                ])

    return df.with_columns(features)

In [None]:
def save_processed_data(df, filename, directory=PROCESSED_DIR):
    """Collect a lazy frame and write it to ``<directory>/<filename>.parquet``.

    Args:
        df: The lazy frame to materialize and save.
        filename: Output file name without the ``.parquet`` extension.
        directory: Target directory (``Path`` or ``str``); created, including
            missing parents, if it does not exist yet.
    """
    output_dir = Path(directory)
    # Without this the write fails on a fresh checkout where the output
    # directory has never been created.
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / f'{filename}.parquet'
    df.collect().write_parquet(output_path)
    print(f'Saved to {output_path}')

In [None]:
# Build the lazy frames for every train/test file of the default dataset version
train_dfs, test_dfs = load_dataset()

# Training files: clean, add features, write train_<n>.parquet (n starts at 1)
for i, df in enumerate(train_dfs):
    cleaned_df = preprocess_dataframe(df)
    processed_df = extract_features(cleaned_df)
    save_processed_data(processed_df, f'train_{i+1}')

# Test files: same pipeline, written as test_<n>.parquet
for i, df in enumerate(test_dfs):
    cleaned_df = preprocess_dataframe(df)
    processed_df = extract_features(cleaned_df)
    save_processed_data(processed_df, f'test_{i+1}')

In [None]:
# Load a processed file to verify
sample_df = pl.read_parquet(PROCESSED_DIR / 'train_1.parquet')

loop_names = ['P1-PC', 'P1-LC', 'P1-FC', 'P1-TC']
error_cols = [f'{loop_name}_error' for loop_name in loop_names]

# Draw one random subset for all four panels.
# - Cap the sample size at the frame height: sampling more rows than exist
#   (without replacement) raises an error.
# - Sort by timestamp afterwards: sampled rows are not guaranteed to be in time
#   order, and a line plot over unordered timestamps zigzags back and forth.
n_points = min(1000, sample_df.height)
plot_df = (
    sample_df.select(['timestamp', *error_cols])
    .sample(n_points)
    .sort('timestamp')
    .to_pandas()
)

# Plot control errors, one loop per panel (row-major order in the 2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
for ax, loop_name, error_col in zip(axes.flat, loop_names, error_cols):
    plot_df.plot(
        x='timestamp', y=error_col, ax=ax, title=f'{loop_name} Control Error'
    )

plt.tight_layout()
plt.show()