In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from glob import glob
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's 'ggplot' style first, then seaborn's
# "whitegrid" theme applied on top of it.
plt.style.use('ggplot')
sns.set(style="whitegrid")
# Notebook-wide defaults, set after the style calls above: figure size and
# base font size.
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 12,
})

In [None]:
# Function to detect separator and load dataset
def load_hai_dataset(file_path, chunksize=None):
    """Read a HAI CSV file, guessing the delimiter from its header line.

    The first line of the file is inspected: a semicolon anywhere in it
    selects ';' as the separator, otherwise a comma selects ','. If neither
    character is present, ``sep=None`` is handed to ``pd.read_csv``.

    Args:
        file_path: Path of the CSV file to read.
        chunksize: When truthy, the file is not loaded eagerly; the chunked
            reader from ``pd.read_csv`` is returned as-is and NO timestamp
            handling is applied to its chunks.

    Returns:
        The chunked reader when ``chunksize`` is truthy, otherwise a
        DataFrame. In the DataFrame case, the first column whose lower-cased
        name is 'time' or 'timestamp' is parsed with ``pd.to_datetime`` into
        a column named 'timestamp'; the source column is dropped when its
        name differs from 'timestamp'.
    """
    # Only the header line is needed to decide on the delimiter.
    with open(file_path, 'r') as handle:
        header = handle.readline().strip()

    # Semicolon takes precedence over comma; None when neither is found.
    separator = next((char for char in (';', ',') if char in header), None)

    # Chunked mode: hand back the raw reader untouched.
    if chunksize:
        return pd.read_csv(file_path, sep=separator, chunksize=chunksize)

    df = pd.read_csv(file_path, sep=separator)

    # Locate a time-like column (case-insensitive match on the name).
    candidates = [name for name in df.columns if name.lower() in ('time', 'timestamp')]
    if not candidates:
        return df

    source_col = candidates[0]
    df['timestamp'] = pd.to_datetime(df[source_col])
    if source_col != 'timestamp':
        df = df.drop(source_col, axis=1)
    return df

In [None]:
# Function to convert dataset to parquet format
def convert_to_parquet(csv_path, parquet_path=None, chunksize=100000):
    """Convert a CSV dataset to a single Parquet file and return its path.

    The CSV is read through ``load_hai_dataset`` in chunks of ``chunksize``
    rows. Every chunk gets the same timestamp normalisation (the first column
    named 'time' or 'timestamp', case-insensitive, is parsed into a
    'timestamp' column and the source column is dropped if named otherwise),
    so all rows share one schema. The chunks are then concatenated and
    written in one ``to_parquet`` call.

    Why not append chunk by chunk: ``DataFrame.to_parquet`` with
    ``engine='pyarrow'`` forwards unknown keyword arguments to pyarrow, which
    does not accept ``append``; the second chunk therefore raised
    ``TypeError`` and left a file containing only the first chunk behind.
    Trade-off of the single write: the whole dataset is held in memory once.

    The output is first written to ``<parquet_path>.tmp`` and moved into
    place only on success, so an interrupted run cannot leave a partial file
    that a later call would mistake for a finished conversion.

    Args:
        csv_path: Path of the source CSV file.
        parquet_path: Destination path. Defaults to ``csv_path`` with its
            extension replaced by '.parquet'.
        chunksize: Rows per chunk read from the CSV. A falsy value reads the
            file in one piece.

    Returns:
        ``parquet_path``. If that file already exists, nothing is converted.

    Raises:
        ValueError: If the CSV yields no chunks at all.
    """
    if parquet_path is None:
        parquet_path = os.path.splitext(csv_path)[0] + '.parquet'

    # Check if parquet file already exists
    if os.path.exists(parquet_path):
        print(f"Parquet file already exists: {parquet_path}")
        return parquet_path

    start_time = time.time()
    print(f"Converting {csv_path} to Parquet format...")

    loaded = load_hai_dataset(csv_path, chunksize=chunksize)
    # With a falsy chunksize the loader returns one DataFrame, not a reader;
    # wrap it so the loop below does not iterate over column names.
    chunk_iter = [loaded] if isinstance(loaded, pd.DataFrame) else loaded

    frames = []
    rows_read = 0
    for i, chunk in enumerate(chunk_iter):
        # Normalise the timestamp column on EVERY chunk, not just the first.
        time_cols = [col for col in chunk.columns if col.lower() in ('time', 'timestamp')]
        if time_cols:
            source_col = time_cols[0]
            chunk['timestamp'] = pd.to_datetime(chunk[source_col])
            if source_col != 'timestamp':
                chunk = chunk.drop(columns=source_col)

        frames.append(chunk)
        rows_read += len(chunk)

        if (i + 1) % 10 == 0:
            # Report rows actually read; the last chunk may be short.
            print(f"Processed {rows_read} rows...")

    if not frames:
        raise ValueError(f"No data could be read from {csv_path}")

    # Write to a temporary sibling file, then move it into place.
    tmp_path = f"{parquet_path}.tmp"
    try:
        combined = pd.concat(frames, ignore_index=True)
        combined.to_parquet(tmp_path, engine='pyarrow', index=False)
        os.replace(tmp_path, parquet_path)
    finally:
        # Only still present if the write or the move failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    elapsed = time.time() - start_time
    print(f"Conversion completed in {elapsed:.2f} seconds")
    return parquet_path

In [None]:
# Function to engineer features
def engineer_features(df, window_size=300):
    """Create engineered features for anomaly detection.

    Works on a copy; ``df`` itself is not modified. Features are only added
    when their input columns exist, so partial datasets are accepted.

    Added columns, in this order:
      * ``hour``, ``day``, ``day_of_week`` from the ``timestamp`` column
        (read through the ``.dt`` accessor).
      * ``P1_PC_error``, ``P1_LC_error``, ``P1_FC_error``, ``P1_TC_error``:
        setpoint minus process variable for each control loop.
      * ``<col>_rolling_mean`` / ``<col>_rolling_std`` for the four process
        variables, then for the four error columns.

    Args:
        df: Input DataFrame.
        window_size: Rolling window length in rows. The default of 300
            corresponds to 5 minutes at 1-second sampling.

    Returns:
        A new DataFrame with the original columns plus the features above.
    """
    result = df.copy()

    # Time-based features
    if 'timestamp' in result.columns:
        stamps = result['timestamp'].dt
        result['hour'] = stamps.hour
        result['day'] = stamps.day
        result['day_of_week'] = stamps.dayofweek

    # Control loops as (error column, setpoint column, process-variable column):
    # pressure, level, flow and temperature control.
    control_loops = [
        ('P1_PC_error', 'P1_B2016', 'P1_PIT01'),
        ('P1_LC_error', 'P1_B3004', 'P1_LIT01'),
        ('P1_FC_error', 'P1_B3005', 'P1_FT03'),
        ('P1_TC_error', 'P1_B4022', 'P1_TIT01'),
    ]

    # Control error = setpoint - process variable
    for error_col, sp_col, pv_col in control_loops:
        if sp_col in result.columns and pv_col in result.columns:
            result[error_col] = result[sp_col] - result[pv_col]

    # Rolling statistics: process variables first, then control errors, so
    # the output column order matches the documented one.
    rolling_targets = [pv_col for _, _, pv_col in control_loops]
    rolling_targets += [error_col for error_col, _, _ in control_loops]

    for col in rolling_targets:
        if col in result.columns:
            # min_periods=1 yields a mean from the first row on; the std of a
            # single observation is NaN.
            windowed = result[col].rolling(window=window_size, min_periods=1)
            result[f'{col}_rolling_mean'] = windowed.mean()
            result[f'{col}_rolling_std'] = windowed.std()

    return result

In [None]:
# Function to visualize dataset overview
def visualize_dataset_overview(df, title="Dataset Overview"):
    """Plot the P1 control loops and a correlation heatmap for a dataset.

    For each control loop whose setpoint (SP), process variable (PV) and
    control variable (CV) columns are all present in ``df``, a two-panel
    figure is shown: SP vs. PV on top, the CVs below. Loops with any missing
    column are skipped. Afterwards a correlation heatmap is drawn over every
    loop column that exists in ``df``.

    Args:
        df: DataFrame holding the process columns. Its ``timestamp`` column
            is used as the x-axis when present; otherwise the row index is
            used instead.
        title: Heading placed above each figure as its suptitle. A falsy
            value omits the heading.

    Returns:
        None. Every figure is displayed with ``plt.show()``.
    """
    # Line plots use every 100th row; the heatmap below uses the full frame.
    sampled_df = df.iloc[::100]

    # Fall back to the row index when the frame has no timestamp column.
    has_timestamp = 'timestamp' in sampled_df.columns
    x_values = sampled_df['timestamp'] if has_timestamp else sampled_df.index
    x_label = 'Time' if has_timestamp else 'Index'

    # Control loops: setpoint, process variable and control variable columns
    control_loops = {
        'P1-PC': {
            'SP': 'P1_B2016',
            'PV': 'P1_PIT01',
            'CV': ['P1_PCV01D', 'P1_PCV02D']
        },
        'P1-LC': {
            'SP': 'P1_B3004',
            'PV': 'P1_LIT01',
            'CV': ['P1_LCV01D']
        },
        'P1-FC': {
            'SP': 'P1_B3005',
            'PV': 'P1_FT03',
            'CV': ['P1_FCV03D']
        },
        'P1-TC': {
            'SP': 'P1_B4022',
            'PV': 'P1_TIT01',
            'CV': ['P1_FCV01D', 'P1_FCV02D']
        }
    }

    # Plot each control loop
    for loop_name, loop_vars in control_loops.items():
        sp = loop_vars['SP']
        pv = loop_vars['PV']
        cv = loop_vars['CV']

        # Skip loops that are not fully present in this dataset
        if not all(col in df.columns for col in [sp, pv] + cv):
            continue

        fig, axes = plt.subplots(2, 1, figsize=(14, 10))
        if title:
            fig.suptitle(title)

        # Plot setpoint and process variable
        axes[0].plot(x_values, sampled_df[sp], 'r-', label=f'Setpoint ({sp})')
        axes[0].plot(x_values, sampled_df[pv], 'b-', label=f'Process Variable ({pv})')
        axes[0].set_title(f'{loop_name} Control Loop: Setpoint vs. Process Variable')
        axes[0].set_ylabel('Value')
        axes[0].legend()

        # Plot control variables
        for control_var in cv:
            axes[1].plot(x_values, sampled_df[control_var], label=f'Control Variable ({control_var})')

        axes[1].set_title(f'{loop_name} Control Loop: Control Variables')
        axes[1].set_xlabel(x_label)
        axes[1].set_ylabel('Value (%)')
        axes[1].legend()

        # Reserve a strip at the top so the suptitle does not overlap the axes
        plt.tight_layout(rect=(0, 0, 1, 0.96))
        plt.show()

    # Correlation heatmap for key variables
    key_vars = []
    for loop in control_loops.values():
        key_vars.extend([loop['SP'], loop['PV']])
        key_vars.extend(loop['CV'])

    # Filter to existing columns
    key_vars = [col for col in key_vars if col in df.columns]

    if key_vars:
        heatmap_fig = plt.figure(figsize=(12, 10))
        if title:
            heatmap_fig.suptitle(title)
        corr_matrix = df[key_vars].corr()
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
        plt.title('Correlation Matrix of Key Variables')
        plt.tight_layout(rect=(0, 0, 1, 0.96))
        plt.show()

In [None]:
# Discover the dataset version directories: HAI versions first, then HAIEnd.
# Only the final path component of each match is kept.
dataset_versions = [
    os.path.basename(match)
    for pattern in ('hai-security-dataset/hai-*', 'hai-security-dataset/haiend-*')
    for match in glob(pattern)
]
print(f"Available dataset versions: {dataset_versions}")

In [None]:
# Process HAI-20.07 dataset as an example
HAI_VERSION = 'hai-20.07'
DATA_DIR = f'hai-security-dataset/{HAI_VERSION}'

# Convert training files to parquet
train_files = sorted(glob(f'{DATA_DIR}/train*.csv'))
if not train_files:
    # Fail with an actionable message instead of an IndexError on
    # train_parquet[0] further down.
    raise FileNotFoundError(f"No training CSV files found matching {DATA_DIR}/train*.csv")
train_parquet = [convert_to_parquet(file) for file in train_files]

# Load and analyze first training file
train_df = pd.read_parquet(train_parquet[0])
print(f"\nLoaded training dataset shape: {train_df.shape}")

# Visualize dataset overview
visualize_dataset_overview(train_df, title="HAI-20.07 Training Dataset Overview")

# Engineer features
train_df_engineered = engineer_features(train_df)

# Visualize engineered features (every 100th point); needs both the error
# column and the timestamp column used for the x-axis.
if {'timestamp', 'P1_PC_error'}.issubset(train_df_engineered.columns):
    plt.figure(figsize=(14, 6))
    plt.plot(train_df_engineered['timestamp'][::100], train_df_engineered['P1_PC_error'][::100])
    plt.title('P1-PC Control Error')
    plt.xlabel('Time')
    plt.ylabel('Error (SP - PV)')
    plt.show()

# Distribution of key process variables
key_vars = ['P1_PIT01', 'P1_LIT01', 'P1_FT03', 'P1_TIT01']
key_vars = [var for var in key_vars if var in train_df.columns]

if key_vars:
    fig, axes = plt.subplots(len(key_vars), 1, figsize=(14, 4*len(key_vars)))
    # plt.subplots returns a bare Axes (not an array) for a 1x1 grid;
    # normalise so axes[i] also works when only one variable is present.
    axes = np.atleast_1d(axes)

    for i, var in enumerate(key_vars):
        sns.histplot(train_df[var], kde=True, ax=axes[i])
        axes[i].set_title(f'Distribution of {var}')
        axes[i].set_xlabel('Value')
        axes[i].set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()