# HAI-20.07 Dataset Preprocessing

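This notebook preprocesses the HAI-20.07 dataset: it loads the two training and two test sets, plots a few key signals, adds time-based and windowed statistical features, visualizes feature correlations and the attack-label distribution, standardizes the features with a scaler fitted on the first training set, and saves the results as Parquet files in `processed_data/`.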

In [None]:
import sys
sys.path.append('..')

import polars as pl
import numpy as np
import plotly.graph_objects as go
from pathlib import Path

from utils.data_loader import HAIDataLoader
from utils.feature_engineering import FeatureEngineer
from utils.visualization import Visualizer

## 1. Data Loading

In [None]:
# Loader rooted at the dataset folder one level above the working directory
data_loader = HAIDataLoader(base_path='../hai-security-dataset')

# Fetch the two numbered 20.07 training sets, then the two test sets.
# The calls run in the order train 1, train 2, test 1, test 2.
train_df1, train_df2 = (
    data_loader.load_dataset('20.07', 'train', part) for part in (1, 2)
)
test_df1, test_df2 = (
    data_loader.load_dataset('20.07', 'test', part) for part in (1, 2)
)

## 2. Data Exploration

In [None]:
# Plotting helper; 'figures' is handed over as its save directory
visualizer = Visualizer(save_dir='figures')

# Signals to inspect on the first training set
key_features = [
    'P1_PIT01',
    'P1_LIT01',
    'P1_FT01',
    'P2_SIT01',
]

# Build the time-series figure for those signals and display it
fig = visualizer.plot_time_series(train_df1, key_features)
fig.show()

## 3. Feature Engineering

In [None]:
# Feature-engineering helper shared by the cells below
feature_engineer = FeatureEngineer()

# Window size forwarded to the statistical feature extraction
window_size = 10

# NOTE: every frame is rebound to its transformed version, so re-running this
# cell without first re-running the loading cell feeds already-transformed
# frames back into the extractors.

# Stage 1: time features for all four frames (train 1, train 2, test 1, test 2)
train_df1, train_df2, test_df1, test_df2 = (
    feature_engineer.extract_time_features(frame)
    for frame in (train_df1, train_df2, test_df1, test_df2)
)

# Stage 2: statistical features on the stage-1 output, same frame order
train_df1, train_df2, test_df1, test_df2 = (
    feature_engineer.extract_statistical_features(frame, window_size)
    for frame in (train_df1, train_df2, test_df1, test_df2)
)

## 4. Data Visualization

In [None]:
# Attack columns handed to the attack-distribution plot
attack_cols = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']

# Correlation heatmap for the first training set.
# NOTE(review): the second argument (presumably the axis labels) is taken from
# train_df1.columns -- confirm that calculate_feature_correlations keeps every
# column, in the same order; otherwise the labels will not line up with the matrix.
corr_matrix = feature_engineer.calculate_feature_correlations(train_df1)
fig = visualizer.plot_correlation_heatmap(
    corr_matrix.to_numpy(),
    train_df1.columns,
)
fig.show()

# Attack distribution, drawn from the first test set
fig = visualizer.plot_attack_distribution(test_df1, attack_cols)
fig.show()

## 5. Data Preprocessing

In [None]:
# Columns passed as exclusions to the scaler fit: 'time' plus the attack columns
exclude_cols = ['time', *attack_cols]

# The scaler is fitted on the first training set only.
# NOTE(review): train_df2 is not part of the fit -- confirm this is intended.
feature_engineer.fit_standard_scaler(train_df1, exclude_cols)

# Apply the fitted scaler to every frame (train 1, train 2, test 1, test 2)
train_df1_scaled, train_df2_scaled, test_df1_scaled, test_df2_scaled = (
    feature_engineer.transform_with_scaler(frame)
    for frame in (train_df1, train_df2, test_df1, test_df2)
)

## 6. Save Processed Data

In [None]:
# Output folder for the processed files; an already existing folder is fine
processed_dir = Path('processed_data')
processed_dir.mkdir(exist_ok=True)

# Materialise each scaled frame with .collect() and write it as Parquet
# under its target file name
for _file_name, _scaled in (
    ('train1.parquet', train_df1_scaled),
    ('train2.parquet', train_df2_scaled),
    ('test1.parquet', test_df1_scaled),
    ('test2.parquet', test_df2_scaled),
):
    _scaled.collect().write_parquet(processed_dir / _file_name)