# HAI Dataset Training Data Analysis

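This notebook explores the processed HAI training data: per-loop control-error distributions and rolling statistics, correlations between control loops, control-variable (CV) behavior, and the relationships between setpoint (SP), process variable (PV), and CV.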

In [None]:
import polars as pl
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set plot style.
# The bare 'seaborn' matplotlib style name was deprecated in matplotlib 3.6 and
# later removed, so plt.style.use('seaborn') raises OSError on current
# releases. seaborn's own theme call applies its default look and works
# regardless of the installed matplotlib version.
sns.set_theme()
# Notebook-wide defaults; set after the theme call so they are not overwritten.
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

In [None]:
# Project directories, expressed relative to the notebook's working directory.
# Built from individual path components rather than a slash-joined string.
PROCESSED_DIR = Path('..', 'data', 'processed')
MODELS_DIR = Path('..', 'models')

In [None]:
# Load all processed training data.
# Path.glob yields matches in arbitrary, filesystem-dependent order; sort them
# so the row order of the combined frame (which the index-based plots in this
# notebook rely on) is the same on every run and every machine.
# NOTE(review): sorting is lexicographic by path — confirm the file naming
# makes that the intended chronological order.
train_files = sorted(PROCESSED_DIR.glob('train_*.parquet'))

# Fail early with a clear message instead of an opaque error from pl.concat([]).
if not train_files:
    raise FileNotFoundError(
        f"No 'train_*.parquet' files found in {PROCESSED_DIR.resolve()}"
    )

train_dfs = [pl.read_parquet(f) for f in train_files]

# Combine all training data into a single frame (files stacked in sorted order)
combined_df = pl.concat(train_dfs)

In [None]:
# For every control loop, draw a three-panel figure: the distribution of the
# loop's error column on top, then its `_PV_mean_30` and `_PV_std_30` columns
# plotted against row index.
control_loops = ['P1-PC', 'P1-LC', 'P1-FC', 'P1-TC']

for loop in control_loops:
    fig, (ax_error, ax_mean, ax_std) = plt.subplots(nrows=3, ncols=1, figsize=(15, 12))

    # Top panel: histogram with a KDE overlay of the control error
    errors = combined_df.select(f'{loop}_error').to_numpy().ravel()
    sns.histplot(errors, kde=True, ax=ax_error)
    ax_error.set_title(f'{loop} Error Distribution')

    # Lower panels: one line plot per precomputed rolling-statistic column
    rolling_panels = (
        (ax_mean, f'{loop}_PV_mean_30', f'{loop} 30-point Moving Average'),
        (ax_std, f'{loop}_PV_std_30', f'{loop} 30-point Moving Standard Deviation'),
    )
    for ax, column, title in rolling_panels:
        ax.plot(combined_df.select(column).to_numpy().ravel())
        ax.set_title(title)

    fig.tight_layout()
    plt.show()

In [None]:
# Cross-loop correlations for three column families: control error, the
# `_PV_mean_30` columns, and the `_PV_std_30` columns.
error_cols, ma_cols, std_cols = (
    [f'{loop}{suffix}' for loop in control_loops]
    for suffix in ('_error', '_PV_mean_30', '_PV_std_30')
)

# One row of three heatmaps, one per column family
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Correlation matrices are computed by pandas after converting each selection
error_corr = combined_df.select(error_cols).to_pandas().corr()
ma_corr = combined_df.select(ma_cols).to_pandas().corr()
std_corr = combined_df.select(std_cols).to_pandas().corr()

heatmap_panels = zip(
    axes,
    (error_corr, ma_corr, std_corr),
    (
        'Control Error Correlations',
        'Moving Average Correlations',
        'Standard Deviation Correlations',
    ),
)
for ax, corr, title in heatmap_panels:
    sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Analyze control variable patterns.
# Maps each control loop to the control-variable (CV) columns plotted for it.
cv_patterns = {
    'P1-PC': ['P1_PCV01D', 'P1_PCV02D'],
    'P1-LC': ['P1_LCV01D'],
    'P1-FC': ['P1_FCV03D'],
    'P1-TC': ['P1_FCV01D', 'P1_FCV02D']
}

for loop, cvs in cv_patterns.items():
    n_cvs = len(cvs)
    # squeeze=False keeps `axes` two-dimensional even for loops with a single
    # CV ('P1-LC', 'P1-FC'). Without it, plt.subplots(1, 2) returns a 1-D array
    # and the axes[i, 0] indexing below raises IndexError.
    fig, axes = plt.subplots(n_cvs, 2, figsize=(15, 5*n_cvs), squeeze=False)
    
    for i, cv in enumerate(cvs):
        # Left column: distribution of the CV values (histogram + KDE)
        cv_data = combined_df.select(cv).to_numpy().flatten()
        sns.histplot(cv_data, kde=True, ax=axes[i,0])
        axes[i,0].set_title(f'{cv} Distribution')
        
        # Right column: the same values plotted against row index
        axes[i,1].plot(cv_data)
        axes[i,1].set_title(f'{cv} Time Series')
    
    plt.tight_layout()
    plt.show()

In [None]:
# Analyze relationships between SP, PV, and CV.
# For each control loop: the columns holding its setpoint (SP), process
# variable (PV) and control variable (CV).
control_vars = {
    'P1-PC': {
        'SP': 'P1_B2016',
        'PV': 'P1_PIT01',
        'CV': 'P1_PCV01D'
    },
    'P1-LC': {
        'SP': 'P1_B3004',
        'PV': 'P1_LIT01',
        'CV': 'P1_LCV01D'
    },
    'P1-FC': {
        'SP': 'P1_B3005',
        'PV': 'P1_FT03',
        'CV': 'P1_FCV03D'
    },
    'P1-TC': {
        'SP': 'P1_B4022',
        'PV': 'P1_TIT01',
        'CV': 'P1_FCV01D'
    }
}

# Number of consecutive rows shown in the time-series panels.
TIME_SERIES_ROWS = 1000

# Use one contiguous window instead of combined_df.sample(1000). A random,
# unseeded sample does not preserve row order (so the "Time Series" panels were
# not time series), differs on every run, and raises when the frame has fewer
# than 1000 rows. head() returns whatever rows are available.
# NOTE(review): assumes combined_df rows are in chronological order — confirm.
sample_df = combined_df.head(TIME_SERIES_ROWS)

# The loop variable is `loop_vars` rather than `vars`, which would shadow the
# builtin vars() for the rest of the kernel session.
for loop, loop_vars in control_vars.items():
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # SP vs PV over the full dataset; low alpha so dense regions stand out
    axes[0,0].scatter(combined_df.select(loop_vars['SP']).to_numpy(),
                      combined_df.select(loop_vars['PV']).to_numpy(),
                      alpha=0.1)
    axes[0,0].set_title(f'{loop} SP vs PV')
    axes[0,0].set_xlabel('Setpoint')
    axes[0,0].set_ylabel('Process Variable')
    
    # Error vs CV over the full dataset
    axes[0,1].scatter(combined_df.select(f'{loop}_error').to_numpy(),
                      combined_df.select(loop_vars['CV']).to_numpy(),
                      alpha=0.1)
    axes[0,1].set_title(f'{loop} Error vs CV')
    axes[0,1].set_xlabel('Control Error')
    axes[0,1].set_ylabel('Control Variable')
    
    # Time series of SP and PV over the contiguous window
    axes[1,0].plot(sample_df.select(loop_vars['SP']).to_numpy(), label='SP')
    axes[1,0].plot(sample_df.select(loop_vars['PV']).to_numpy(), label='PV')
    axes[1,0].set_title(f'{loop} SP and PV Time Series')
    axes[1,0].legend()
    
    # Time series of CV over the same window
    axes[1,1].plot(sample_df.select(loop_vars['CV']).to_numpy(), label='CV')
    axes[1,1].set_title(f'{loop} CV Time Series')
    axes[1,1].legend()
    
    plt.tight_layout()
    plt.show()