# HAI Security Dataset: Preprocessing All Versions

This notebook preprocesses all versions of the HIL-based Augmented ICS (HAI) Security Dataset. It handles the different formats and structures of each version (HAI-20.07, HAI-21.03, HAI-22.04, HAI-23.05, HAIEnd-23.05) and creates standardized processed data files for model training.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import time
from datetime import datetime
import dask.dataframe as dd  # For parallel computing with DataFrames
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib  # For saving preprocessing objects
import warnings
from tqdm.notebook import tqdm  # For progress tracking

# Optional GPU support: try to import the RAPIDS/CuPy stack and record the
# outcome in GPU_AVAILABLE. Only ImportError is tolerated, so any other failure
# raised while importing these libraries still propagates.
# NOTE(review): GPU_AVAILABLE, cudf and cp are not referenced anywhere else in
# the visible part of this notebook -- confirm whether they are still needed.
try:
    import cudf  # GPU-accelerated DataFrame library
    import cupy as cp  # GPU-accelerated NumPy-like library
    GPU_AVAILABLE = True
    print("GPU libraries loaded successfully. GPU acceleration is available.")
except ImportError:
    GPU_AVAILABLE = False
    print("GPU libraries not available. Falling back to CPU processing.")

# Suppress warnings
# This installs a blanket 'ignore' filter, so every warning category raised
# after this point is hidden for the rest of the session.
warnings.filterwarnings('ignore')

# Set plot style
# The calls below run in order, so later ones take precedence over earlier ones.
# NOTE(review): sns.set() presumably rewrites matplotlib rcParams and may
# override parts of the 'ggplot' style chosen on the previous line -- confirm.
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

## 1. Dataset Overview

First, let's explore the available dataset versions and their structure.

In [None]:
# Root folder that holds one sub-directory per HAI dataset release
base_dataset_path = '../hai-security-dataset/'

# Discover releases in a single pass: entries matching 'hai-*' come first,
# followed by entries matching 'haiend-*'. Only real directories are kept, so
# archives or stray files that share the prefix (e.g. zip files) are skipped.
dataset_versions = [
    os.path.basename(entry)
    for pattern in ('hai-*', 'haiend-*')
    for entry in glob.glob(f'{base_dataset_path}{pattern}')
    if os.path.isdir(os.path.join(base_dataset_path, os.path.basename(entry)))
]

print(f"Available dataset versions: {dataset_versions}")

In [None]:
# Create a function to get dataset information
def get_dataset_info(version):
    """
    Get information about a specific dataset version.

    CSV files are classified by file name (case-insensitive): any file whose
    name contains 'label' is a label file; the remaining files are training
    files if the name contains 'train' and test files if it contains 'test'.
    Label files are deliberately kept out of the train/test lists so that a
    name such as 'label-test1.csv' is not counted as test data.

    Args:
        version: Dataset version (e.g., 'hai-20.07')

    Returns:
        dict: Dataset information with the keys 'version', 'path',
            'train_files', 'test_files', 'label_files', 'total_files',
            'separator', 'timestamp_col', 'column_count' and 'sample_columns'.
            The last two are None when no CSV file could be read.
    """
    version_path = os.path.join(base_dataset_path, version)

    # Sort so the result (and the file sampled below) does not depend on the
    # arbitrary order in which the file system returns directory entries
    all_files = sorted(glob.glob(f'{version_path}/*.csv'))

    # Classify files by name
    train_files, test_files, label_files = [], [], []
    for path in all_files:
        name = os.path.basename(path).lower()
        if 'label' in name:
            label_files.append(path)
            continue
        if 'train' in name:
            train_files.append(path)
        if 'test' in name:
            test_files.append(path)

    # Determine separator (HAI-20.07 uses semicolon, others use comma)
    separator = ';' if version == 'hai-20.07' else ','

    # Determine timestamp column name
    timestamp_col = 'time' if version in ['hai-20.07', 'hai-21.03'] else 'timestamp'

    # Read the header of one file to report the column layout. A training file
    # is preferred, then a test file, so the reported columns describe sensor
    # data rather than a label-only file whenever data files exist.
    column_count = None
    sample_columns = None
    candidates = train_files or test_files or all_files
    if candidates:
        try:
            sample_df = pd.read_csv(candidates[0], sep=separator, nrows=1)
            column_count = len(sample_df.columns)
            sample_columns = sample_df.columns.tolist()
        except Exception as e:
            # Best effort: the summary is still useful without column details
            print(f"Error reading sample from {version}: {e}")

    return {
        'version': version,
        'path': version_path,
        'train_files': train_files,
        'test_files': test_files,
        'label_files': label_files,
        'total_files': len(all_files),
        'separator': separator,
        'timestamp_col': timestamp_col,
        'column_count': column_count,
        'sample_columns': sample_columns
    }

In [None]:
# Collect the metadata of every discovered version, keyed by version name
dataset_info = {version: get_dataset_info(version) for version in dataset_versions}

# Flatten the metadata into one row per version so it can be shown as a table
summary_data = [
    {
        'Version': version,
        'Train Files': len(info['train_files']),
        'Test Files': len(info['test_files']),
        'Label Files': len(info['label_files']),
        'Total Files': info['total_files'],
        'Separator': info['separator'],
        'Timestamp Column': info['timestamp_col'],
        'Column Count': info['column_count']
    }
    for version, info in dataset_info.items()
]

summary_df = pd.DataFrame(summary_data)
display(summary_df)

## 2. Preprocessing Pipeline

Now let's create a comprehensive preprocessing pipeline that can handle all HAI dataset versions.

In [None]:
class HAIDataPreprocessor:
    """
    Preprocessing pipeline for a single version of the HAI security dataset.

    The preprocessor locates the CSV files of one dataset version, loads them
    with version-specific parsing settings, fits a feature scaler on training
    data, and splits DataFrames into features, labels, and timestamps.

    The column roles are discovered lazily: the first file passed to
    load_file defines them for the lifetime of the instance.

        feature_columns: Columns that are neither timestamp, attack, nor label
        attack_columns: Columns whose name contains 'attack' (case-insensitive)
        label_columns: Columns whose name contains 'label' (case-insensitive)
    """
    def __init__(self, dataset_version, scaler_type='standard', base_path='../hai-security-dataset/'):
        """
        Initialize the preprocessor.

        Args:
            dataset_version: Dataset version (e.g., 'hai-20.07')
            scaler_type: Type of scaler ('standard' or 'minmax')
            base_path: Base path to the HAI security dataset
        """
        self.dataset_version = dataset_version
        self.dataset_path = os.path.join(base_path, dataset_version)
        self.scaler_type = scaler_type
        self.scaler = None

        # Set version-specific parameters
        self.timestamp_col = 'time' if dataset_version in ['hai-20.07', 'hai-21.03'] else 'timestamp'
        self.separator = ';' if dataset_version == 'hai-20.07' else ','

        # Column roles are filled in by the first call to load_file
        self.feature_columns = None
        self.attack_columns = None
        self.label_columns = None

    def get_file_list(self):
        """
        Get list of training, test, and label files.

        Files are classified by name (case-insensitive). Anything containing
        'label' is a label file and is excluded from the other two lists.
        Each list is sorted by path so that processing order, and therefore
        the file that defines the column roles, is the same on every run.

        Returns:
            tuple: (train_files, test_files, label_files)
        """
        train_files, test_files, label_files = [], [], []
        for path in sorted(glob.glob(f'{self.dataset_path}/*.csv')):
            name = os.path.basename(path).lower()
            if 'label' in name:
                label_files.append(path)
                continue
            if 'train' in name:
                train_files.append(path)
            if 'test' in name:
                test_files.append(path)

        return train_files, test_files, label_files

    def _infer_dtypes(self, file_path):
        """Build a column -> dtype mapping from the first five rows of a CSV file."""
        sample_df = pd.read_csv(file_path, sep=self.separator, nrows=5)

        dtypes = {}
        for col in sample_df.columns:
            if col == self.timestamp_col:
                dtypes[col] = 'object'  # Keep timestamp as object initially
            elif 'attack' in col.lower() or 'label' in col.lower():
                dtypes[col] = 'int8'  # Binary labels can be stored as int8
            elif sample_df[col].dtype == 'float64':
                dtypes[col] = 'float64'  # Keep as float64 to avoid conversion issues
            elif sample_df[col].dtype == 'int64':
                dtypes[col] = 'int32'  # Reduce precision to save memory
        return dtypes

    def _read_csv(self, file_path, sample_rows, dtypes=None):
        """Read a CSV with pandas (first sample_rows rows) or Dask (whole file)."""
        read_kwargs = {'sep': self.separator}
        if dtypes is not None:
            read_kwargs['dtype'] = dtypes

        if sample_rows:
            # For sampling, use pandas directly
            return pd.read_csv(file_path, nrows=sample_rows, **read_kwargs)

        # For the full file, let Dask read it in parallel blocks
        dask_df = dd.read_csv(file_path, blocksize="64MB", **read_kwargs)
        # Each Dask partition carries its own index starting at 0; rebuild a
        # unique 0..n-1 index so label-based operations behave as expected
        return dask_df.compute().reset_index(drop=True)

    def load_file(self, file_path, sample_rows=None):
        """
        Load a CSV file efficiently.

        Column dtypes are guessed from the first five rows to reduce memory
        usage. If that guess makes the read fail with a ValueError, the file
        is read again without any dtype specification.

        Args:
            file_path: Path to the CSV file
            sample_rows: Number of leading rows to read (None for all rows).
                This is the head of the file, not a random sample.

        Returns:
            DataFrame: Loaded data, with the timestamp column (if present)
                converted to datetime
        """
        start_time = time.time()

        try:
            df = self._read_csv(file_path, sample_rows, self._infer_dtypes(file_path))
        except ValueError as e:
            # If there's a dtype conversion error, try again without specifying dtypes
            print(f"Warning: Error with dtype conversion: {e}")
            print("Attempting to load without dtype specifications...")
            df = self._read_csv(file_path, sample_rows)

        # Convert timestamp
        if self.timestamp_col in df.columns:
            df[self.timestamp_col] = pd.to_datetime(df[self.timestamp_col])

        # Identify feature and attack columns if not already set
        if self.feature_columns is None:
            self.attack_columns = [col for col in df.columns if 'attack' in col.lower()]
            self.label_columns = [col for col in df.columns if 'label' in col.lower()]

            # All columns that are not timestamp, attack, or label are features
            exclude_cols = [self.timestamp_col] + self.attack_columns + self.label_columns
            self.feature_columns = [col for col in df.columns if col not in exclude_cols]

        print(f"Data loaded in {time.time() - start_time:.2f} seconds")
        return df

    def fit_scaler(self, train_files, sample_size=10000):
        """
        Fit a scaler on training data.

        The scaler is fitted on the first sample_size rows of every training
        file. It is stored on the instance only after fitting succeeded, so a
        failure never leaves an unfitted scaler behind.

        Args:
            train_files: List of training file paths
            sample_size: Number of leading rows to read from each file

        Returns:
            self: The fitted preprocessor

        Raises:
            ValueError: If the scaler type is unknown or train_files is empty
        """
        print(f"Fitting {self.scaler_type} scaler on training data...")

        if self.scaler_type not in ('standard', 'minmax'):
            raise ValueError(f"Unknown scaler type: {self.scaler_type}")
        if not train_files:
            raise ValueError("No training files provided; cannot fit the scaler")

        # Read the head of each training file; the first load_file call also
        # establishes self.feature_columns
        all_samples = []
        for file in train_files:
            df = self.load_file(file, sample_rows=sample_size)
            all_samples.append(df[self.feature_columns])

        # Concatenate samples and fit the scaler
        combined_samples = pd.concat(all_samples)
        scaler = StandardScaler() if self.scaler_type == 'standard' else MinMaxScaler()
        scaler.fit(combined_samples)
        self.scaler = scaler

        print(f"Scaler fitted on {len(combined_samples)} samples")
        return self

    def transform_data(self, df):
        """
        Apply preprocessing transformations to a DataFrame.

        Args:
            df: Input DataFrame

        Returns:
            tuple: (X, y, timestamps) - Features, labels, and timestamps.
                y is the first attack/label column present in df, or None if
                there is none. timestamps is None if df has no timestamp
                column. X is scaled only when a scaler has been fitted.

        Raises:
            ValueError: If the feature columns have not been determined yet
        """
        if self.feature_columns is None:
            raise ValueError(
                "Feature columns are not set; load a file or a saved preprocessor first"
            )

        # Extract features and timestamps
        X = df[self.feature_columns].copy()
        timestamps = df[self.timestamp_col].copy() if self.timestamp_col in df.columns else None

        # Use the first available label column; attack columns take priority.
        # The column lists may still be None on a partially configured instance.
        label_cols = list(self.attack_columns or []) + list(self.label_columns or [])
        y = next((df[col].copy() for col in label_cols if col in df.columns), None)

        # Apply scaling if scaler is fitted
        if self.scaler is not None:
            X_scaled = self.scaler.transform(X)
            X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

        return X, y, timestamps

    def save_preprocessor(self, output_dir='./models'):
        """
        Save the preprocessor to disk.

        Only the state needed to rebuild the instance is stored (as a plain
        dictionary), not the instance itself.

        Args:
            output_dir: Directory to save the preprocessor

        Returns:
            str: Path to the saved preprocessor
        """
        os.makedirs(output_dir, exist_ok=True)
        output_path = f"{output_dir}/hai_{self.dataset_version.replace('-', '_')}_{self.scaler_type}_preprocessor.joblib"

        # Create a dictionary with all necessary attributes
        preprocessor_dict = {
            'scaler': self.scaler,
            'feature_columns': self.feature_columns,
            'attack_columns': self.attack_columns,
            'label_columns': self.label_columns,
            'timestamp_col': self.timestamp_col,
            'separator': self.separator,
            'dataset_version': self.dataset_version,
            'scaler_type': self.scaler_type
        }

        # Save to disk
        joblib.dump(preprocessor_dict, output_path)
        print(f"Preprocessor saved to {output_path}")

        return output_path

    @classmethod
    def load_preprocessor(cls, input_path, base_path='../hai-security-dataset/'):
        """
        Load a preprocessor from disk.

        Args:
            input_path: Path to the saved preprocessor
            base_path: Base path to the HAI security dataset

        Returns:
            HAIDataPreprocessor: Loaded preprocessor
        """
        # SECURITY: joblib files are pickle-based, so loading one can execute
        # arbitrary code. Only load files produced by save_preprocessor from a
        # trusted location.
        preprocessor_dict = joblib.load(input_path)

        # Create a new instance
        preprocessor = cls(dataset_version=preprocessor_dict['dataset_version'],
                          scaler_type=preprocessor_dict['scaler_type'],
                          base_path=base_path)

        # Restore attributes ('label_columns' is optional in the saved state)
        preprocessor.scaler = preprocessor_dict['scaler']
        preprocessor.feature_columns = preprocessor_dict['feature_columns']
        preprocessor.attack_columns = preprocessor_dict['attack_columns']
        preprocessor.label_columns = preprocessor_dict.get('label_columns', [])
        preprocessor.timestamp_col = preprocessor_dict['timestamp_col']
        preprocessor.separator = preprocessor_dict['separator']

        print(f"Preprocessor loaded from {input_path}")
        return preprocessor

In [None]:
def process_and_save_data(preprocessor, file_paths, output_dir='./processed_data', chunk_size=100000):
    """
    Process and save data in chunks.

    Every input CSV is read chunk_size rows at a time. Each chunk is passed
    through preprocessor.transform_data and written to
    '<output_dir>/<input name without extension>_chunk<i>.npz' with two
    arrays: 'data' (the values) and 'columns' (the column names). The output
    columns are the features, followed by the timestamp column (if any) and
    an 'attack' column holding the labels (if any).

    Processing is best effort per file: if a file fails, the error is printed
    and the next file is processed. Chunks written before the failure stay on
    disk and remain in the returned list.
    NOTE(review): a file that lacks the preprocessor's feature columns (for
    example a label-only file) presumably fails inside transform_data and is
    reported this way -- confirm against the preprocessor in use.

    Args:
        preprocessor: HAIDataPreprocessor instance
        file_paths: List of file paths to process
        output_dir: Directory to save processed data
        chunk_size: Size of each chunk

    Returns:
        list: Paths to saved files
    """
    os.makedirs(output_dir, exist_ok=True)
    saved_files = []

    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        # Strip the extension regardless of its case so the recorded path
        # always ends in '.npz' and matches the file NumPy actually writes
        stem = os.path.splitext(file_name)[0]
        print(f"Processing {file_name}...")

        try:
            # The context manager closes the underlying file handle even when
            # a chunk fails half-way through the file
            with pd.read_csv(file_path, sep=preprocessor.separator, chunksize=chunk_size) as chunk_reader:
                for i, chunk in enumerate(chunk_reader):
                    # Convert timestamp
                    if preprocessor.timestamp_col in chunk.columns:
                        chunk[preprocessor.timestamp_col] = pd.to_datetime(chunk[preprocessor.timestamp_col])

                    # Transform data
                    X, y, timestamps = preprocessor.transform_data(chunk)

                    # Create output DataFrame
                    output_df = X.copy()
                    if timestamps is not None:
                        output_df[preprocessor.timestamp_col] = timestamps
                    if y is not None:
                        output_df['attack'] = y

                    # Save chunk
                    output_path = f"{output_dir}/{stem}_chunk{i}.npz"
                    np.savez_compressed(output_path,
                                       data=output_df.to_numpy(),
                                       columns=output_df.columns.tolist())

                    saved_files.append(output_path)
                    print(f"  Saved chunk {i} to {output_path}")
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch
            print(f"Error processing {file_name}: {e}")

    return saved_files

In [None]:
def preprocess_dataset_version(version, scaler_type='standard', base_path='../hai-security-dataset/',
                               output_root='./processed_data', model_dir='./models'):
    """
    Preprocess a specific HAI dataset version.

    The function fits a scaler on the training files, saves the fitted
    preprocessor, and then writes the processed training, test, and label
    files below '<output_root>/<version>/{train,test,label}'.

    NOTE(review): label files go through the same feature pipeline as the
    data files. If they do not contain the feature columns they are
    presumably reported as failed by process_and_save_data and
    label_saved_files stays empty -- confirm with real label files.

    Args:
        version: Dataset version (e.g., 'hai-21.03')
        scaler_type: Type of scaler ('standard' or 'minmax')
        base_path: Base path to the HAI security dataset
        output_root: Root directory for the processed data files
        model_dir: Directory in which the fitted preprocessor is saved

    Returns:
        tuple: (preprocessor, train_saved_files, test_saved_files, label_saved_files)

    Raises:
        ValueError: If the dataset version has no training files
    """
    print(f"\n{'='*50}")
    print(f"Preprocessing {version} dataset...")
    print(f"{'='*50}")

    # Initialize preprocessor
    preprocessor = HAIDataPreprocessor(dataset_version=version, scaler_type=scaler_type, base_path=base_path)

    # Get training, test, and label files
    train_files, test_files, label_files = preprocessor.get_file_list()
    print(f"Training files: {[os.path.basename(f) for f in train_files]}")
    print(f"Test files: {[os.path.basename(f) for f in test_files]}")
    print(f"Label files: {[os.path.basename(f) for f in label_files]}")

    # Fail early with an actionable message instead of an opaque error from
    # deep inside the scaler fitting
    if not train_files:
        raise ValueError(f"No training files found for {version} in {preprocessor.dataset_path}")

    # Fit scaler on training data
    preprocessor.fit_scaler(train_files)

    # Save preprocessor
    preprocessor.save_preprocessor(output_dir=model_dir)

    # Process and save training data
    train_output_dir = f'{output_root}/{version}/train'
    train_saved_files = process_and_save_data(preprocessor, train_files, output_dir=train_output_dir)

    # Process and save test data
    test_output_dir = f'{output_root}/{version}/test'
    test_saved_files = process_and_save_data(preprocessor, test_files, output_dir=test_output_dir)

    # Process and save label data if available
    label_saved_files = []
    if label_files:
        label_output_dir = f'{output_root}/{version}/label'
        label_saved_files = process_and_save_data(preprocessor, label_files, output_dir=label_output_dir)

    return preprocessor, train_saved_files, test_saved_files, label_saved_files

## 3. Preprocess All Dataset Versions

Now let's preprocess all available HAI dataset versions.

In [None]:
# Create a function to load and verify processed data
def load_processed_data(file_path):
    """
    Load processed data from NPZ file.

    The archive must contain a 'data' array with the values and a 'columns'
    array with the matching column names.

    Args:
        file_path: Path to the NPZ file

    Returns:
        DataFrame: Loaded data
    """
    # SECURITY: allow_pickle=True lets NumPy unpickle object arrays, which can
    # execute arbitrary code. It is enabled because the stored 'data' array may
    # be object-typed (e.g., timestamps mixed with numeric features); only load
    # files from a trusted source.
    # The context manager closes the archive's file handle once the arrays
    # have been read into memory.
    with np.load(file_path, allow_pickle=True) as npz_data:
        df = pd.DataFrame(npz_data['data'], columns=npz_data['columns'])

    return df

In [None]:
# Run the full pipeline for every discovered version and record the outcome.
# A failing version is logged and stored with its error message so that the
# remaining versions are still processed.
results = {}

for version in tqdm(dataset_versions, desc="Processing dataset versions"):
    try:
        preprocessor, train_files, test_files, label_files = preprocess_dataset_version(version)
    except Exception as e:
        print(f"Error processing {version}: {e}")
        outcome = {
            'status': 'error',
            'error': str(e)
        }
    else:
        outcome = {
            'preprocessor': preprocessor,
            'train_files': train_files,
            'test_files': test_files,
            'label_files': label_files,
            'status': 'success'
        }
    results[version] = outcome

## 4. Verify Processed Data

Let's verify the processed data for each dataset version.

In [None]:
# Spot-check the first saved training chunk of every successfully processed
# version: print its shape, show its first rows and report how many rows are
# flagged in each attack column.
for version, result in results.items():
    # Nothing to verify for failed versions or versions without saved chunks
    if result['status'] != 'success' or not result['train_files']:
        continue

    print(f"\n{'='*50}")
    print(f"Verifying processed data for {version}")
    print(f"{'='*50}")

    # The first saved chunk serves as the sample
    sample_file = result['train_files'][0]
    print(f"Loading {os.path.basename(sample_file)}...")

    try:
        loaded_df = load_processed_data(sample_file)
        print(f"Loaded data shape: {loaded_df.shape}")
        print("\nFirst 5 rows:")
        display(loaded_df.head())

        # Attack columns are recognised by name (case-insensitive)
        attack_cols = [col for col in loaded_df.columns if 'attack' in col.lower()]
        if not attack_cols:
            continue

        print(f"\nAttack columns: {attack_cols}")
        for col in attack_cols:
            attack_count = loaded_df[col].sum()
            attack_percentage = (attack_count / len(loaded_df)) * 100
            print(f"{col}: {attack_count} attacks ({attack_percentage:.2f}% of data)")
    except Exception as e:
        print(f"Error verifying {version}: {e}")

## 5. Visualize Data Distribution

Let's visualize the data distribution for each dataset version.

In [None]:
# Visualize data distribution for each version
# For every successfully processed version that has saved test chunks, the
# first test chunk is loaded and two figures are drawn: the attack label over
# time, and a close-up of the first attack period together with the first five
# feature columns. Any error is caught per version and printed, so one failing
# version does not stop the loop.
for version, result in results.items():
    if result['status'] == 'success' and result['test_files']:
        print(f"\n{'='*50}")
        print(f"Visualizing data for {version}")
        print(f"{'='*50}")
        
        # Load a test file
        # Only the first saved chunk is inspected, not the whole test set
        sample_file = result['test_files'][0]
        print(f"Loading {os.path.basename(sample_file)}...")
        
        try:
            loaded_df = load_processed_data(sample_file)
            
            # Check for attack column
            # Attack columns are matched by name; the timestamp column must be
            # called exactly 'time' or 'timestamp'
            attack_cols = [col for col in loaded_df.columns if 'attack' in col.lower()]
            timestamp_col = [col for col in loaded_df.columns if col in ['time', 'timestamp']]
            
            # Plots need both a label and a time axis; otherwise the version is skipped silently
            if attack_cols and timestamp_col:
                # Plot attack distribution over time
                plt.figure(figsize=(14, 6))
                plt.plot(loaded_df[timestamp_col[0]], loaded_df[attack_cols[0]])
                plt.title(f'{version}: {attack_cols[0]} Distribution Over Time')
                plt.xlabel('Time')
                plt.ylabel('Attack (1) / Normal (0)')
                plt.show()
                
                # Select a few features to visualize
                feature_cols = [col for col in loaded_df.columns 
                               if col not in attack_cols and col not in timestamp_col]
                selected_features = feature_cols[:5]  # First 5 features
                
                # Plot features during attack periods
                if loaded_df[attack_cols[0]].sum() > 0:
                    # Find attack periods
                    # Single pass over the label column: a 0 -> 1 transition opens a
                    # period and a 1 -> 0 transition closes it at the previous row.
                    # Values other than 0 and 1 leave the current state unchanged.
                    attack_starts = []
                    attack_ends = []
                    in_attack = False
                    
                    for i, val in enumerate(loaded_df[attack_cols[0]]):
                        if val == 1 and not in_attack:
                            attack_starts.append(i)
                            in_attack = True
                        elif val == 0 and in_attack:
                            attack_ends.append(i-1)
                            in_attack = False
                    
                    if in_attack:  # If dataset ends during an attack
                        attack_ends.append(len(loaded_df)-1)
                    
                    print(f"Found {len(attack_starts)} attack periods")
                    
                    # Visualize the first attack period
                    if attack_starts:
                        start_idx = max(0, attack_starts[0] - 100)  # Include some pre-attack data
                        end_idx = min(len(loaded_df)-1, attack_ends[0] + 100)  # Include some post-attack data
                        
                        # iloc slicing excludes end_idx itself, so the window stops one row earlier
                        attack_df = loaded_df.iloc[start_idx:end_idx].copy()
                        
                        # Plot features during attack
                        # One row for the label plus one row per selected feature
                        # NOTE(review): with no feature columns this requests a single
                        # subplot, in which case `axes` may not be indexable -- confirm
                        # whether that case can occur
                        fig, axes = plt.subplots(len(selected_features)+1, 1, figsize=(14, 4*(len(selected_features)+1)))
                        
                        # Plot attack label
                        axes[0].plot(attack_df[timestamp_col[0]], attack_df[attack_cols[0]], 'r-')
                        axes[0].set_title(f'{version}: Attack Period')
                        axes[0].set_ylabel('Attack')
                        
                        # Plot each feature
                        for i, feature in enumerate(selected_features):
                            axes[i+1].plot(attack_df[timestamp_col[0]], attack_df[feature])
                            axes[i+1].set_title(f'{feature} During Attack')
                            axes[i+1].set_ylabel('Value')
                        
                        plt.tight_layout()
                        plt.show()
        except Exception as e:
            print(f"Error visualizing {version}: {e}")

## 6. Summary of Preprocessing Results

Let's summarize the preprocessing results for all dataset versions.

In [None]:
# Create a summary of preprocessing results
def _summary_row(version, result):
    """Build one summary table row from a version's preprocessing outcome."""
    succeeded = result['status'] == 'success'
    return {
        'Version': version,
        'Preprocessor': 'Success' if succeeded else 'Failed',
        # File counts are only available for successful runs
        'Train Files': len(result['train_files']) if succeeded else 0,
        'Test Files': len(result['test_files']) if succeeded else 0,
        'Label Files': len(result['label_files']) if succeeded else 0,
        'Status': 'Success' if succeeded else f"Error: {result['error']}"
    }


summary_data = [_summary_row(version, result) for version, result in results.items()]

summary_df = pd.DataFrame(summary_data)
display(summary_df)

## 7. Conclusion

In this notebook, we've implemented a comprehensive preprocessing pipeline for all versions of the HAI security dataset. The pipeline includes:

1. Efficient data loading: Dask for parallel full-file reads, and chunked pandas reads when processing and saving large files
2. Handling different file formats and structures across dataset versions
3. Feature scaling and normalization
4. Processing and saving data in a standardized format for model training

The preprocessed data and fitted preprocessors are saved and ready to be used for training anomaly detection models in separate notebooks.