# HAI 20.07 Dataset: Optimized Visualization

This notebook aims to improve the visualization performance compared to `1_data_exploration.ipynb` by:
1. Using more aggressive data sampling (by default, keeping every 50th row once a frame exceeds 20,000 rows).
2. Using Plotly's `Scattergl` traces, which are optimized for larger datasets via WebGL.

## 1. Setup and Data Loading

In [None]:
import polars as pl
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import math

# Set Plotly's default template so figures created later in this notebook
# pick up the "plotly_white" theme without passing `template=` each time.
import plotly.io as pio
pio.templates.default = "plotly_white"

In [None]:
# Dataset directory, given as a relative path (resolved against the
# kernel's current working directory when the file is opened).
data_dir = "../../hai-security-dataset/hai-20.07/"

# Full path of the training CSV that the loading cell reads.
train_file = os.path.join(data_dir, "train1.csv")

# Echo the resolved path so a wrong location is easy to spot before loading.
print(f"Train file path: {train_file}")

In [None]:
# Load the training data using Polars.
#
# `df_train` is bound only when reading, timestamp parsing AND sorting all
# succeed. The whole pipeline is one expression, so if any step raises, the
# assignment never happens and `df_train` stays None. Later cells test
# `df_train is not None`, so they will never run on a half-processed frame
# (e.g. one whose 'time' column is still a string and unsorted).
df_train = None
try:
    df_train = (
        pl.read_csv(
            train_file,
            separator=';',
            try_parse_dates=False,  # timestamps are parsed explicitly below
            infer_schema_length=10000
        )
        .with_columns(
            # Parse the 'time' strings into Datetime values, keeping the column name.
            pl.col('time').str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S").alias('time')
        )
        .sort('time')
    )
    print("Training data loaded successfully.")
    print(f"Shape: {df_train.shape}")
except Exception as e:
    # Best-effort load: report the problem and leave df_train as None so the
    # rest of the notebook can skip plotting instead of crashing.
    print(f"Error loading training data: {e}")

## 2. Optimized Feature Visualization (Grouped Scattergl)

In [None]:
def plot_sensor_group_optimized(df, features, title_suffix="", sample_frac=50, max_points_marker=5000):
    """Plot a group of sensor columns against time using WebGL (Scattergl) traces.

    Frames with more than 20,000 rows are downsampled before plotting by keeping
    every ``sample_frac``-th row. When the frame has an 'attack' column, rows of
    the (sampled) frame where ``attack == 1`` are overlaid as red 'x' markers,
    positioned on the first plotted sensor.

    Args:
        df: Polars DataFrame containing a 'time' column and, optionally, an
            'attack' column; may be None, in which case nothing is plotted.
        features: Names of the sensor columns to plot. Names that are not
            columns of ``df`` are ignored.
        title_suffix: Text inserted into the figure title and log messages
            to identify the group (e.g. "1/8").
        sample_frac: Sampling stride, NOT a fraction: a value of N keeps one
            row in every N. Only used when the frame exceeds 20,000 rows.
        max_points_marker: Attack markers are drawn only when the number of
            attack rows in the plotted frame is below this limit.

    Returns:
        None. The figure is displayed with ``fig.show()``; skipped plots and
        plotting errors are reported with ``print``.

    Raises:
        ValueError: If sampling is required and ``sample_frac`` is less than 1.
    """
    if df is None or not features:
        print(f"Skipping plot for group {title_suffix}: Data not loaded or no features.")
        return

    # Only plot the requested features that actually exist in the frame.
    sensors = [col for col in features if col in df.columns]
    if not sensors:
        print(f"Skipping plot for group {title_suffix}: Could not find specified features in DataFrame.")
        return

    # Without a time axis there is nothing meaningful to draw; skip instead of
    # letting the column selection below raise.
    if 'time' not in df.columns:
        print(f"Skipping plot for group {title_suffix}: DataFrame has no 'time' column.")
        return

    # The attack label is optional: frames without it are plotted without markers.
    has_attack = 'attack' in df.columns
    plot_cols = ['time'] + (['attack'] if has_attack else []) + sensors

    # --- More Aggressive Sampling ---
    rows_before_sampling = df.height
    if rows_before_sampling > 20000:  # Start sampling earlier
        if sample_frac < 1:
            # Guard against 0 (division by zero) and against fractions such as
            # 0.02, which the parameter name might suggest but the stride logic
            # below cannot handle.
            raise ValueError(
                f"sample_frac must be an integer >= 1 (keep 1 row in every N), got {sample_frac!r}"
            )
        print(f"Sampling data (1/{sample_frac}) for plotting group {title_suffix} ({rows_before_sampling} rows).")
        # Trim to a whole number of strides, then keep the first row of each stride.
        slice_len = (rows_before_sampling // sample_frac) * sample_frac
        df_sample = (
            df.select(plot_cols)
            .slice(0, slice_len)
            .filter(pl.int_range(0, pl.len()).mod(sample_frac) == 0)
        )
        plot_title = f'Sensor Readings (Group {title_suffix}) - Sampled 1/{sample_frac}'
    else:
        df_sample = df.select(plot_cols)
        plot_title = f'Sensor Readings (Group {title_suffix})'
    # --- End Sampling Logic ---

    # Create figure
    fig = go.Figure()

    try:
        # Add one Scattergl line trace per sensor.
        df_sample_pd = df_sample.to_pandas()  # Convert once for plotting
        for sensor in sensors:
            fig.add_trace(
                go.Scattergl(x=df_sample_pd['time'], y=df_sample_pd[sensor], mode='lines', name=sensor)
            )

        # Add attack markers using Scattergl (only when labels are available).
        if has_attack:
            attack_points = df_sample.filter(pl.col('attack') == 1)
            if 0 < attack_points.height < max_points_marker:
                attack_points_pd = attack_points.to_pandas()
                # Markers need a y value; use the first plotted sensor of the group.
                y_marker_ref_col = sensors[0]
                fig.add_trace(go.Scattergl(x=attack_points_pd['time'],
                                           y=attack_points_pd[y_marker_ref_col],
                                           mode='markers', name='Attack',
                                           marker=dict(color='red', size=6, symbol='x', opacity=0.7)))
            elif attack_points.height >= max_points_marker:
                print(f"Too many attack points ({attack_points.height}) for group {title_suffix}, skipping markers.")

        fig.update_layout(title=plot_title, xaxis_title="Time", yaxis_title="Value", height=400)
        fig.show()

    except Exception as e:
        # Best-effort: report the failure so a loop over many groups keeps going.
        print(f"Error during plotting group {title_suffix}: {e}")

In [None]:
# Identify numerical sensor columns for plotting
if df_train is not None:
    # The timestamp and the attack-label columns are excluded from the sensor list.
    exclude_cols = ['time', 'attack', 'attack_P1', 'attack_P2', 'attack_P3']
    # Ask each dtype whether it is numeric instead of testing membership in
    # `pl.NUMERIC_DTYPES`, which is deprecated in newer Polars releases.
    sensor_cols = [
        col for col, dtype in df_train.schema.items()
        if dtype.is_numeric() and col not in exclude_cols
    ]
    print(f"Found {len(sensor_cols)} numerical sensor columns to plot.")

    # Plot in groups using the optimized function: one figure per group keeps
    # each chart readable and limits the number of traces per figure.
    group_size = 8
    num_groups = math.ceil(len(sensor_cols) / group_size)

    for i in range(num_groups):
        start_idx = i * group_size
        end_idx = start_idx + group_size
        feature_group = sensor_cols[start_idx:end_idx]
        # Use the new optimized plotting function
        plot_sensor_group_optimized(df_train, feature_group, title_suffix=f"{i+1}/{num_groups}", sample_frac=50)
else:
    print("Data not loaded, cannot plot.")

## 3. Further Exploration Ideas

*   **Correlation Analysis:** Examine correlations between different sensors.
*   **Distribution Plots:** Use histograms or density plots for individual features.
*   **Time-based Aggregations:** Analyze sensor behavior aggregated over minutes, hours, etc.
*   **Load Test Data:** Perform similar initial analysis on the test datasets.