# HAI 20.07 Dataset: External Plot Visualization

This notebook generates sensor plots similar to previous versions but saves them as external HTML files instead of embedding them directly. This significantly reduces the notebook file size.

Plots are saved in the `plots/` subdirectory of the kernel's working directory (normally the folder that contains this notebook).

## 1. Setup and Data Loading

In [1]:
import polars as pl
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import math
from IPython.display import display, HTML # To display links

# Set Plotly's default template to "plotly_white" so figures created later in
# this notebook pick it up without per-figure layout settings.
import plotly.io as pio
pio.templates.default = "plotly_white"

In [2]:
# Define paths
# The dataset location can be overridden with the HAI_DATA_DIR environment
# variable, so the notebook does not have to be edited when the dataset lives
# somewhere else. The default is resolved relative to the kernel's working
# directory.
data_dir = os.environ.get("HAI_DATA_DIR", "../../../hai-security-dataset/hai-20.07/")
train_file = os.path.join(data_dir, "train1.csv")
output_plot_dir = "plots" # Directory to save HTML plots

print(f"Train file path: {train_file}")
print(f"Output plot directory: {output_plot_dir}")

# Report a missing dataset here, next to the path definition, rather than only
# when the load cell fails.
if not os.path.isfile(train_file):
    print("Warning: training file not found; set HAI_DATA_DIR to the directory containing train1.csv.")

# Create the output directory if it doesn't exist
os.makedirs(output_plot_dir, exist_ok=True)

Train file path: ../../hai-security-dataset/hai-20.07/train1.csv
Output plot directory: plots


In [3]:
# Load the training data using Polars.
# df_train stays None on ANY failure (missing file, bad separator, unparsable
# timestamps) so the later cells' `df_train is not None` guard is reliable.
df_train = None
try:
    # Build the frame under a temporary name: if timestamp parsing or sorting
    # raises after the CSV was read, df_train must not be left pointing at a
    # half-processed frame with a string 'time' column.
    loaded_train = (
        pl.read_csv(
            train_file,
            separator=';',
            try_parse_dates=False,  # 'time' is parsed explicitly below with a fixed format
            infer_schema_length=100000
        )
        .with_columns(
            pl.col('time').str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S").alias('time')
        )
        .sort('time')
    )
except Exception as e:
    print(f"Error loading training data: {e}")
else:
    # Publish the frame only after every step succeeded.
    df_train = loaded_train
    print("Training data loaded successfully.")
    print(f"Shape: {df_train.shape}")

Error loading training data: No such file or directory (os error 2): ../../hai-security-dataset/hai-20.07/train1.csv


## 2. Generate and Save External Plots

In [4]:
def save_sensor_group_plot(df, features, filename_base, output_dir, title_suffix="", sample_frac=100, max_points_marker=5000, sample_threshold=10000):
    """Plot a group of sensor columns against time and save the figure as HTML.

    Args:
        df: Polars DataFrame holding a 'time' column, the sensor columns and,
            optionally, an 'attack' column. ``None`` is accepted and skipped.
        features: Names of the sensor columns to draw. Names missing from
            ``df``, duplicates, and the reserved names 'time'/'attack' are
            ignored.
        filename_base: Output file name without the ``.html`` extension.
        output_dir: Directory the HTML file is written to (created if needed).
        title_suffix: Group label used in the plot title and log messages.
        sample_frac: When sampling is active, keep every ``sample_frac``-th
            row. Must be >= 1.
        max_points_marker: Attack markers are drawn only when the sampled
            frame has fewer than this many rows with ``attack == 1``.
        sample_threshold: Sampling is applied only when ``df`` has more rows
            than this.

    Returns:
        The path of the written HTML file, or ``None`` when the plot was
        skipped or plotting/saving failed (a message is printed in both cases).
    """
    if df is None or not features:
        print(f"Skipping plot for group {title_suffix}: Data not loaded or no features.")
        return None

    # A stride below 1 cannot select rows; reject it up front instead of
    # failing later inside the sampling step.
    if sample_frac < 1:
        print(f"Skipping plot for group {title_suffix}: sample_frac must be >= 1 (got {sample_frac}).")
        return None

    available = set(df.columns)
    if 'time' not in available:
        print(f"Skipping plot for group {title_suffix}: DataFrame has no 'time' column.")
        return None

    # dict.fromkeys de-duplicates while keeping the caller's order; 'time' and
    # 'attack' are excluded so the select below never requests a column twice.
    sensors = [
        col for col in dict.fromkeys(features)
        if col in available and col not in ('time', 'attack')
    ]
    if not sensors:
        print(f"Skipping plot for group {title_suffix}: Could not find specified features in DataFrame.")
        return None

    # The attack column is optional: without it the sensors are still plotted,
    # just with no attack markers.
    has_attack = 'attack' in available
    plot_cols = ['time'] + (['attack'] if has_attack else []) + sensors

    # --- Sampling: trade detail for a smaller HTML file ---
    rows_before_sampling = df.height
    if rows_before_sampling > sample_threshold:
        print(f"Sampling data (1/{sample_frac}) for plotting group {title_suffix} ({rows_before_sampling} rows).")
        # Keep rows 0, sample_frac, 2*sample_frac, ... through the end of the
        # frame, so the trailing partial stride is not dropped.
        df_sample = df.select(plot_cols).gather_every(sample_frac)
        plot_title = f'Sensor Readings (Group {title_suffix}) - Sampled 1/{sample_frac}'
    else:
        df_sample = df.select(plot_cols)
        plot_title = f'Sensor Readings (Group {title_suffix})'
    # --- End Sampling Logic ---

    fig = go.Figure()

    # Best-effort: a failure in one group is reported and must not abort the
    # caller's loop over the remaining groups.
    try:
        df_sample_pd = df_sample.to_pandas() # Convert once for plotting
        for sensor in sensors:
            fig.add_trace(
                go.Scattergl(x=df_sample_pd['time'], y=df_sample_pd[sensor], mode='lines', name=sensor)
            )

        if has_attack:
            # Markers come from the sampled frame, so attack rows that fall
            # between sampled positions are not marked.
            attack_points = df_sample.filter(pl.col('attack') == 1)
            n_attack = attack_points.height
            if 0 < n_attack < max_points_marker:
                attack_points_pd = attack_points.to_pandas()
                # Markers are placed on the first plotted sensor's curve.
                y_marker_ref_col = sensors[0]
                fig.add_trace(go.Scattergl(x=attack_points_pd['time'],
                                           y=attack_points_pd[y_marker_ref_col],
                                           mode='markers', name='Attack',
                                           marker=dict(color='red', size=6, symbol='x', opacity=0.7)))
            elif n_attack >= max_points_marker:
                print(f"Too many attack points ({n_attack}) for group {title_suffix}, skipping markers.")

        fig.update_layout(title=plot_title, xaxis_title="Time", yaxis_title="Value", height=400)

        # Make sure the target directory exists so the function also works
        # when called without the notebook's setup cell having run.
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, f"{filename_base}.html")
        fig.write_html(filepath, include_plotlyjs='cdn') # Use CDN for smaller file size
        print(f"Saved plot to: {filepath}")
        return filepath # Return path for linking

    except Exception as e:
        print(f"Error during plotting/saving group {title_suffix}: {e}")
        return None

In [5]:
# Identify numerical sensor columns for plotting
if df_train is not None:
    # Timestamp and label columns are not sensors.
    exclude_cols = {'time', 'attack', 'attack_P1', 'attack_P2', 'attack_P3'}
    # dtype.is_numeric() replaces the deprecated pl.NUMERIC_DTYPES constant.
    sensor_cols = [
        col for col, dtype in df_train.schema.items()
        if dtype.is_numeric() and col not in exclude_cols
    ]
    print(f"Found {len(sensor_cols)} numerical sensor columns to plot.")

    # Plot in groups and save externally
    group_size = 8
    num_groups = math.ceil(len(sensor_cols) / group_size)
    plot_links = []

    for i in range(num_groups):
        start_idx = i * group_size
        feature_group = sensor_cols[start_idx:start_idx + group_size]
        filename = f"sensor_group_{i+1}"
        group_title = f"{i+1}/{num_groups}"

        # Use the saving function with more aggressive sampling (e.g., sample_frac=100)
        saved_path = save_sensor_group_plot(df_train, feature_group, filename, output_plot_dir,
                                            title_suffix=group_title, sample_frac=100)
        if saved_path:
            # Link to the path that was actually written. URLs always use
            # forward slashes, so normalise the OS separator (a backslash on
            # Windows) for the href.
            href = saved_path.replace(os.sep, "/")
            plot_links.append(f'<li><a href="{href}" target="_blank">Plot Group {group_title}</a></li>')

    # Display links to the saved plots
    if plot_links:
        display(HTML("<h3>Saved Plot Links:</h3><ul>" + "".join(plot_links) + "</ul>"))
    else:
        print("No plots were generated or saved.")
else:
    print("Data not loaded, cannot generate plots.")

Data not loaded, cannot generate plots.


## 3. Further Exploration Ideas

*   **Correlation Analysis:** Examine correlations between different sensors.
*   **Distribution Plots:** Use histograms or density plots for individual features.
*   **Time-based Aggregations:** Analyze sensor behavior aggregated over minutes, hours, etc.
*   **Load Test Data:** Perform similar initial analysis on the test datasets.