In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from glob import glob
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import preprocessing functions
from data_preprocessing import load_hai_dataset, convert_to_parquet, engineer_features, visualize_dataset_overview

# Global look-and-feel shared by every figure in this notebook
plt.style.use('ggplot')
sns.set(style="whitegrid")
# Default canvas size (inches) and base font size, applied in one update
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 12,
})

In [None]:
# Dataset location: the version string selects the sub-directory to read
HAI_VERSION = 'hai-20.07'
DATA_DIR = f'hai-security-dataset/{HAI_VERSION}'

# Collect the train/test CSV files, each list sorted for a stable processing order
train_files, test_files = (
    sorted(glob(f'{DATA_DIR}/{prefix}*.csv')) for prefix in ('train', 'test')
)

print(f"Found {len(train_files)} training files and {len(test_files)} test files")

In [None]:
# Function to analyze attack periods
def analyze_attack_period(df, attack_start, attack_end, attack_col, context=100):
    """Plot the attack label and control-loop signals around one attack period.

    One figure is drawn per control loop listed below. The first panel shows
    the attack label, and each following panel shows one of the loop's
    variables that is present in ``df``. Loops with no matching columns are
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'timestamp'`` column, the ``attack_col`` column and
        (some of) the control-loop variable columns.
    attack_start : int
        Positional index (as used by ``iloc``) of the first attack row.
    attack_end : int
        Positional index of the last attack row (inclusive).
    attack_col : str
        Name of the attack label column to plot in the top panel.
    context : int, default 100
        Number of rows shown before and after the attack period.

    Returns
    -------
    None
        The figures are displayed with ``plt.show()`` and then closed.
    """
    # attack_end is inclusive, so the exclusive slice bound needs +1 to keep
    # as many rows after the attack as before it.
    start_idx = max(0, attack_start - context)
    end_idx = min(len(df), attack_end + context + 1)

    # The window is only read from, so no defensive copy is needed
    attack_df = df.iloc[start_idx:end_idx]

    # Key variables to monitor during attack
    control_loops = {
        'P1-PC': ['P1_B2016', 'P1_PIT01', 'P1_PCV01D', 'P1_PCV02D'],
        'P1-LC': ['P1_B3004', 'P1_LIT01', 'P1_LCV01D'],
        'P1-FC': ['P1_B3005', 'P1_FT03', 'P1_FCV03D'],
        'P1-TC': ['P1_B4022', 'P1_TIT01', 'P1_FCV01D', 'P1_FCV02D']
    }

    # Plot attack period for each control loop
    for loop_name, candidate_vars in control_loops.items():
        # Keep only the variables that exist in this dataset
        loop_vars = [var for var in candidate_vars if var in attack_df.columns]
        if not loop_vars:
            continue

        # One panel for the attack label plus one per variable (always >= 2,
        # so `axes` is always an array); a shared x axis keeps panels aligned.
        n_panels = len(loop_vars) + 1
        fig, axes = plt.subplots(n_panels, 1, figsize=(14, 3 * n_panels), sharex=True)

        # Plot attack label
        axes[0].plot(attack_df['timestamp'], attack_df[attack_col], 'r-')
        axes[0].set_title(f'{loop_name} Variables During Attack')
        axes[0].set_ylabel('Attack')

        # Plot each variable
        for ax, var in zip(axes[1:], loop_vars):
            ax.plot(attack_df['timestamp'], attack_df[var], label=var)
            ax.set_ylabel('Value')
            ax.legend()

        plt.tight_layout()
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures
        plt.close(fig)

In [None]:
# Process training data
PLOT_STRIDE = 100  # plot every 100th sample to keep the error plots light

for train_file in train_files:
    print(f"\nProcessing {os.path.basename(train_file)}")

    # Load data
    train_df = load_hai_dataset(train_file)
    print(f"Shape: {train_df.shape}")
    print(f"Time range: {train_df['timestamp'].min()} to {train_df['timestamp'].max()}")

    # Visualize control loops
    visualize_dataset_overview(train_df)

    # Engineer features and visualize
    train_df_engineered = engineer_features(train_df)

    # Plot control errors (any engineered column whose name contains 'error')
    error_cols = [col for col in train_df_engineered.columns if 'error' in col]
    if error_cols:
        # squeeze=False keeps `axes` a 2-D array even when there is a single
        # error column; otherwise a bare Axes is returned and indexing fails.
        fig, axes = plt.subplots(
            len(error_cols), 1, figsize=(14, 4 * len(error_cols)), squeeze=False
        )

        for ax, col in zip(axes[:, 0], error_cols):
            ax.plot(
                train_df_engineered['timestamp'][::PLOT_STRIDE],
                train_df_engineered[col][::PLOT_STRIDE],
            )
            ax.set_title(f'{col} Over Time')
            ax.set_xlabel('Time')
            ax.set_ylabel('Error')

        plt.tight_layout()
        plt.show()
        # One figure per file: close it so figures do not pile up across the loop
        plt.close(fig)

In [None]:
# Process test data with attacks
MAX_ATTACK_PERIODS_TO_PLOT = 3  # only the first few periods per label are plotted


def _find_attack_periods(labels):
    """Return positional (starts, ends) of the attack runs in a label column.

    A run starts at a row equal to 1 and ends on the row before the next row
    equal to 0 (or on the last row if the data ends during an attack). Values
    other than 0 and 1 keep the previous state. Both lists hold 0-based row
    positions suitable for ``iloc``; ``ends`` is inclusive.
    """
    values = pd.Series(labels).reset_index(drop=True)
    # Only 1 switches the attack on and only 0 switches it off; anything else
    # (e.g. NaN) is masked and forward-filled so it inherits the prior state.
    state = values.where(values.isin([0, 1])).ffill().fillna(0).astype(int).to_numpy()
    # Pad with 0 on both sides so runs touching either end still yield an edge
    edges = np.diff(np.concatenate(([0], state, [0])))
    starts = np.flatnonzero(edges == 1)
    ends = np.flatnonzero(edges == -1) - 1
    return starts.tolist(), ends.tolist()


for test_file in test_files:
    print(f"\nProcessing {os.path.basename(test_file)}")

    # Load data
    test_df = load_hai_dataset(test_file)
    print(f"Shape: {test_df.shape}")
    print(f"Time range: {test_df['timestamp'].min()} to {test_df['timestamp'].max()}")

    # Find attack columns (any column whose name contains 'attack')
    attack_cols = [col for col in test_df.columns if 'attack' in col.lower()]
    print(f"Attack columns: {attack_cols}")

    if attack_cols:
        # Plot overall attack distribution
        fig, ax = plt.subplots(figsize=(14, 6))
        for col in attack_cols:
            ax.plot(test_df['timestamp'], test_df[col], label=col)
        ax.set_title('Attack Distribution Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Attack (1) / Normal (0)')
        ax.legend()
        plt.show()
        plt.close(fig)

        # Find attack periods
        for attack_col in attack_cols:
            # Vectorised and positional: avoids the slow row-by-row iterrows()
            # scan and stays correct when the frame's index is not 0..n-1,
            # because analyze_attack_period slices with iloc.
            attack_starts, attack_ends = _find_attack_periods(test_df[attack_col])

            print(f"\nFound {len(attack_starts)} attack periods for {attack_col}")

            # Analyze first few attack periods
            n_to_plot = min(MAX_ATTACK_PERIODS_TO_PLOT, len(attack_starts))
            for period_no in range(n_to_plot):
                print(f"\nAnalyzing attack period {period_no+1}")
                analyze_attack_period(
                    test_df, attack_starts[period_no], attack_ends[period_no], attack_col
                )

In [None]:
# Analyze control loop behavior during attacks
for test_file in test_files:
    test_df = load_hai_dataset(test_file)
    attack_cols = [col for col in test_df.columns if 'attack' in col.lower()]

    # Files without any attack label column have nothing to compare
    if not attack_cols:
        continue

    # Engineer features
    test_df_engineered = engineer_features(test_df)

    # Plot control errors during attacks (columns whose name contains 'error')
    error_cols = [col for col in test_df_engineered.columns if 'error' in col]

    if error_cols:
        for attack_col in attack_cols:
            # Get attack and non-attack periods
            attack_data = test_df_engineered[test_df_engineered[attack_col] == 1]
            normal_data = test_df_engineered[test_df_engineered[attack_col] == 0]

            # Plot error distributions.
            # squeeze=False keeps `axes` a 2-D array even when there is a
            # single error column; otherwise a bare Axes is returned and
            # indexing it fails.
            fig, axes = plt.subplots(
                len(error_cols), 1, figsize=(14, 4 * len(error_cols)), squeeze=False
            )

            for ax, col in zip(axes[:, 0], error_cols):
                sns.kdeplot(data=normal_data[col], ax=ax, label='Normal', color='blue')
                sns.kdeplot(data=attack_data[col], ax=ax, label='Attack', color='red')
                ax.set_title(f'{col} Distribution: Normal vs Attack')
                ax.legend()

            plt.tight_layout()
            plt.show()
            # One figure per (file, attack column): close it to avoid pile-up
            plt.close(fig)