# HAI Security Dataset - Data Preprocessing

This notebook provides utility functions for processing the HAI security dataset using the Polars framework for efficient data handling.

In [None]:
# Install required libraries
!pip install polars pyarrow matplotlib seaborn scikit-learn tqdm

In [None]:
# Import libraries
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from pathlib import Path
import time
from tqdm.notebook import tqdm

# Notebook-wide plotting defaults: white-grid theme, viridis colour cycle, wide figures.
# The palette is set after the style so the style sheet does not override it.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Data Loading Functions

In [None]:
def lazy_load_csv(file_path, time_column='timestamp', time_format='%Y-%m-%d %H:%M:%S'):
    """
    Lazily scan a CSV file with down-cast numeric types and a parsed time column

    The first 1000 rows are read eagerly to infer column types. Columns inferred
    as Float64 / Int64 are overridden to Float32 / Int32 to save memory, and the
    time column is read as text and then parsed with ``time_format``.

    Note: the overrides are based on the 1000-row sample only, so a column whose
    later values do not fit the narrower type will fail when the frame is collected.

    Args:
        file_path (str): Path to CSV file
        time_column (str): Name of time column
        time_format (str): strptime-style format of the time column

    Returns:
        pl.LazyFrame: Lazy query over the CSV; nothing is materialised here
    """
    print(f"Lazy loading file: {file_path}")
    start_time = time.time()

    # Read a small sample eagerly, only to discover column names and inferred types
    sample_df = pl.read_csv(file_path, n_rows=1000)

    # Build per-column type overrides for the lazy scan
    schema_overrides = {}
    for name, dtype in zip(sample_df.columns, sample_df.dtypes):
        if name == time_column:
            # Read as string first, convert to datetime below
            schema_overrides[name] = pl.Utf8
        elif dtype == pl.Float64:
            schema_overrides[name] = pl.Float32
        elif dtype == pl.Int64:
            schema_overrides[name] = pl.Int32

    # `schema_overrides` is the current name of the former `dtypes` keyword
    df_lazy = pl.scan_csv(file_path, schema_overrides=schema_overrides)

    # `with_columns` replaces the removed `with_column`; the alias keeps the column name
    df_lazy = df_lazy.with_columns(
        pl.col(time_column).str.strptime(pl.Datetime, time_format).alias(time_column)
    )

    print(f"Loading completed in {time.time() - start_time:.2f} seconds")
    return df_lazy

def get_file_info(file_path, sample_lines=100):
    """
    Get basic information about a CSV file without reading it fully

    The row count is an estimate: the average byte length of the first
    ``sample_lines`` data lines (the header is excluded) is extrapolated to the
    rest of the file. It is exact when the file has no more data lines than the
    sample, and accurate when rows have similar lengths.

    Args:
        file_path (str): Path to file
        sample_lines (int): Maximum number of data lines to sample for the estimate

    Returns:
        dict: Dictionary with keys "file_name", "file_size_mb", "num_columns"
            and "estimated_rows" (data rows, header not counted)
    """
    file_path = Path(file_path)
    file_size = file_path.stat().st_size
    file_size_mb = file_size / (1024 * 1024)

    # Read a single row to get the column count
    sample_df = pl.read_csv(file_path, n_rows=1)
    num_columns = len(sample_df.columns)

    # Binary mode so line lengths are byte counts, comparable with the file size
    with open(file_path, 'rb') as f:
        header_bytes = len(f.readline())
        sampled_rows = 0
        sampled_bytes = 0
        for line in f:
            sampled_rows += 1
            sampled_bytes += len(line)
            if sampled_rows >= sample_lines:
                break

    if sampled_rows:
        # Scale the sampled row count by the ratio of data bytes to sampled bytes
        estimated_rows = round((file_size - header_bytes) * sampled_rows / sampled_bytes)
    else:
        # Empty file or header only
        estimated_rows = 0

    return {
        "file_name": file_path.name,
        "file_size_mb": file_size_mb,
        "num_columns": num_columns,
        "estimated_rows": int(estimated_rows)
    }

## 2. Chunked Processing for Large Files

In [None]:
def process_in_chunks(file_path, process_func, chunk_size=100000, **kwargs):
    """
    Process large CSV files in chunks

    The file is opened with ``pl.read_csv_batched`` and drained one batch at a
    time, so only a single chunk is handed to ``process_func`` at once.

    Args:
        file_path (str): Path to CSV file
        process_func (callable): Function called as ``process_func(chunk, **kwargs)``
        chunk_size (int): Passed to polars as ``batch_size``; the size of the
            chunks actually produced is decided by polars
        **kwargs: Additional arguments to pass to process_func

    Returns:
        list: Results of process_func, one per chunk, in file order
    """
    print(f"Starting chunked processing of file: {file_path}, chunk size: {chunk_size}")
    start_time = time.time()

    results = []

    # read_csv_batched returns a reader object, not an iterable:
    # batches must be pulled explicitly until the reader is exhausted.
    reader = pl.read_csv_batched(file_path, batch_size=chunk_size)
    while True:
        batches = reader.next_batches(1)
        if not batches:  # None (or empty) signals end of file
            break

        for chunk in batches:
            chunk_number = len(results) + 1
            print(f"Processing chunk {chunk_number}, size: {len(chunk)}")
            chunk_start = time.time()

            results.append(process_func(chunk, **kwargs))

            print(f"Chunk {chunk_number} processed in {time.time() - chunk_start:.2f} seconds")

    print(f"All chunks processed in {time.time() - start_time:.2f} seconds, total chunks: {len(results)}")
    return results

def save_to_efficient_format(df, output_path, format='parquet', partition_cols=None):
    """
    Save DataFrame to an efficient format (parquet or arrow)

    Args:
        df (pl.DataFrame or pl.LazyFrame): DataFrame to save; a LazyFrame is collected first
        output_path (str): Output path. Its parent directory is created if missing.
            When partition_cols is given, the data is written through pyarrow as a
            partitioned dataset and this path is used as the dataset root.
        format (str): Output format ('parquet' or 'arrow'), case-insensitive
        partition_cols (list): Columns to partition by (parquet only)

    Raises:
        ValueError: If format is neither 'parquet' nor 'arrow'
    """
    fmt = format.lower()
    # Validate before collecting so a typo does not cost a full query execution
    if fmt not in ('parquet', 'arrow'):
        raise ValueError(f"Unsupported format: {format}. Use 'parquet' or 'arrow'.")

    start_time = time.time()

    # If LazyFrame, collect it
    if isinstance(df, pl.LazyFrame):
        df = df.collect()

    # Make sure the destination directory exists (only for path-like targets)
    if isinstance(output_path, (str, os.PathLike)):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    if fmt == 'parquet':
        if partition_cols:
            # write_parquet has no partition_cols keyword of its own;
            # partitioning is requested through the pyarrow writer options.
            df.write_parquet(output_path, use_pyarrow=True, compression="zstd",
                             pyarrow_options={"partition_cols": partition_cols})
        else:
            df.write_parquet(output_path, use_pyarrow=True, compression="zstd")
    else:
        df.write_ipc(output_path, compression="zstd")

    print(f"Data saved to {output_path} in {time.time() - start_time:.2f} seconds")

## 3. Feature Engineering

In [None]:
def add_time_features(df_lazy, time_column='timestamp'):
    """
    Add time-based features to DataFrame

    Added columns:
        hour, day, month, year: calendar components of the time column
        day_of_week: ISO weekday as returned by polars ``dt.weekday()``
            (Monday = 1 ... Sunday = 7)
        is_weekend: True for Saturday (6) and Sunday (7)
        time_of_day: 'night' (hour < 6), 'morning' (< 12), 'afternoon' (< 18),
            otherwise 'evening'

    Args:
        df_lazy (pl.LazyFrame): Input LazyFrame
        time_column (str): Name of time column (must already be a datetime column)

    Returns:
        pl.LazyFrame: LazyFrame with time features added
    """
    timestamp = pl.col(time_column)
    hour = timestamp.dt.hour()
    # polars exposes the day of week as `weekday()`; there is no `day_of_week()`
    weekday = timestamp.dt.weekday()

    return df_lazy.with_columns([
        hour.alias('hour'),
        weekday.alias('day_of_week'),
        timestamp.dt.day().alias('day'),
        timestamp.dt.month().alias('month'),
        timestamp.dt.year().alias('year'),
        # Is weekend feature: with ISO numbering Saturday is 6 and Sunday is 7
        (weekday >= 6).alias('is_weekend'),
        # Time of day category
        pl.when(hour < 6).then(pl.lit('night'))
          .when(hour < 12).then(pl.lit('morning'))
          .when(hour < 18).then(pl.lit('afternoon'))
          .otherwise(pl.lit('evening')).alias('time_of_day')
    ])

def add_lag_features(df, columns, lags=[1, 5, 10], group_by=None):
    """
    Add lag features to DataFrame

    For every column / lag pair a new column ``{column}_lag_{lag}`` is added that
    holds the value from ``lag`` rows earlier (in the frame's current row order).
    The input frame is not modified.

    Args:
        df (pl.DataFrame): Input DataFrame
        columns (list): Columns to create lag features for
        lags (list): List of lag values (rows to shift by)
        group_by (str): Column to group by before creating lags, so that values
            are only shifted within each group

    Returns:
        pl.DataFrame: DataFrame with lag features added
    """
    # Keyed by output name so a repeated column/lag pair yields a single feature
    lag_exprs = {}
    for col in columns:
        for lag in lags:
            shifted = pl.col(col).shift(lag)
            if group_by:
                shifted = shifted.over(group_by)
            name = f"{col}_lag_{lag}"
            lag_exprs[name] = shifted.alias(name)

    # with_columns returns a new frame (the removed `with_column` is not used),
    # and one call adds every lag column at once.
    return df.with_columns(list(lag_exprs.values()))

def add_rolling_features(df, columns, windows=[5, 10, 30], group_by=None):
    """
    Append rolling-window statistics for the given columns

    For each column / window pair four columns are added, named
    ``{column}_rolling_mean_{window}``, ``..._std_...``, ``..._min_...`` and
    ``..._max_...``. When ``group_by`` is set, every statistic is evaluated
    per group via ``.over(group_by)``.

    Args:
        df (pl.DataFrame): Input DataFrame
        columns (list): Columns to create rolling features for
        windows (list): List of window sizes
        group_by (str): Column to group by before creating rolling features

    Returns:
        pl.DataFrame: DataFrame with rolling features added
    """
    def _scoped(expr):
        # Restrict the expression to each group only when grouping was requested
        return expr.over(group_by) if group_by else expr

    enriched = df.clone()

    for name in columns:
        series = pl.col(name)
        for size in windows:
            features = [
                (series.rolling_mean(size), f"{name}_rolling_mean_{size}"),
                (series.rolling_std(size), f"{name}_rolling_std_{size}"),
                (series.rolling_min(size), f"{name}_rolling_min_{size}"),
                (series.rolling_max(size), f"{name}_rolling_max_{size}"),
            ]
            enriched = enriched.with_columns(
                [_scoped(expr).alias(label) for expr, label in features]
            )

    return enriched

## 4. Visualization Functions

In [None]:
def plot_time_series(df, time_column, value_columns, title=None, figsize=(15, 8), attack_column=None):
    """
    Plot time series data

    Each value column is drawn as a line against the time column. If
    ``attack_column`` is given and present in ``df``, rows where it is greater
    than 0 are marked with red dots placed on the first value column.

    Args:
        df (pl.DataFrame): Input DataFrame
        time_column (str): Name of time column
        value_columns (list): List of columns to plot
        title (str): Plot title; a default is built from value_columns when omitted
        figsize (tuple): Figure size
        attack_column (str): Name of attack label column
    """
    show_attacks = bool(attack_column) and attack_column in df.columns

    # Convert only the columns that are actually drawn; the frame may be very wide.
    # dict.fromkeys drops duplicates while keeping order (select rejects repeated names).
    needed = [time_column, *value_columns]
    if show_attacks:
        needed.append(attack_column)
    pdf = df.select(list(dict.fromkeys(needed))).to_pandas()

    plt.figure(figsize=figsize)

    for col in value_columns:
        plt.plot(pdf[time_column], pdf[col], label=col)

    # Highlight attack regions; needs at least one value column to anchor the markers
    if show_attacks and value_columns:
        attack_regions = pdf[pdf[attack_column] > 0]
        if not attack_regions.empty:
            plt.scatter(attack_regions[time_column],
                        attack_regions[value_columns[0]],
                        color='red', alpha=0.5, s=10, label='Attack')

    plt.title(title or f"Time Series Plot of {', '.join(value_columns)}")
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def plot_correlation_matrix(df, columns=None, figsize=(12, 10)):
    """
    Draw an annotated heatmap of pairwise column correlations

    Only the lower triangle is shown; the upper triangle and the diagonal are
    masked out.

    Args:
        df (pl.DataFrame): Input DataFrame
        columns (list): Columns to correlate. When omitted, every column whose
            dtype is Float32, Float64, Int32 or Int64 is used.
        figsize (tuple): Figure size
    """
    if columns:
        selected = columns
    else:
        # Fall back to the numeric columns of the frame
        numeric_dtypes = [pl.Float32, pl.Float64, pl.Int32, pl.Int64]
        selected = [name for name in df.columns if df[name].dtype in numeric_dtypes]

    # Correlations are computed by pandas on the selected columns only
    corr_matrix = df.select(selected).to_pandas().corr()

    plt.figure(figsize=figsize)
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))
    palette = sns.diverging_palette(230, 20, as_cmap=True)

    heatmap_options = dict(
        mask=upper_triangle,
        cmap=palette,
        vmax=1,
        vmin=-1,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        annot=True,
        fmt=".2f",
    )
    sns.heatmap(corr_matrix, **heatmap_options)

    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

def plot_distribution(df, columns, bins=30, figsize=(15, 10)):
    """
    Plot distribution of columns

    One histogram with a KDE overlay is drawn per column, laid out on a grid
    that is two plots wide. Unused grid cells are hidden.

    Args:
        df (pl.DataFrame): Input DataFrame
        columns (list): List of columns to plot
        bins (int): Number of bins
        figsize (tuple): Figure size

    Raises:
        ValueError: If columns is empty
    """
    columns = list(columns)
    if not columns:
        # Without this check matplotlib fails later with an opaque "number of rows" error
        raise ValueError("columns must contain at least one column name")

    n_cols = 2
    n_rows = (len(columns) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always yields a 2-D array of axes, whatever the grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    # Convert only the plotted columns; duplicates are dropped for the select
    pdf = df.select(list(dict.fromkeys(columns))).to_pandas()

    for ax, col in zip(axes, columns):
        sns.histplot(pdf[col], bins=bins, kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.grid(True)

    # Hide unused subplots
    for ax in axes[len(columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Example Usage

In [None]:
# Example of how to use the functions
# Uncomment and modify as needed

# file_path = 'hai-security-dataset/hai-20.07/train1.csv'
# 
# # Get file info
# info = get_file_info(file_path)
# print(info)
# 
# # Lazy load data
# df_lazy = lazy_load_csv(file_path)
# 
# # Add time features
# df_lazy = add_time_features(df_lazy)
# 
# # Collect a sample for visualization
# df_sample = df_lazy.limit(10000).collect()
# 
# # Plot time series
# plot_time_series(df_sample, 'timestamp', ['P1_PIT01', 'P1_TIT01'], 
#                  title='Pressure and Temperature Time Series')
# 
# # Plot correlation matrix
# plot_correlation_matrix(df_sample, columns=['P1_PIT01', 'P1_TIT01', 'P1_FT01Z', 'P1_LIT01'])
# 
# # Save to efficient format
# save_to_efficient_format(df_lazy, 'processed_data/train1.parquet')