# Residual & Model Analysis (Quantitative)

This notebook focuses on the quantitative modeling of energy reconstruction. It includes feature engineering (charge corrections), model training (Linear, Cyclic, Polynomial), cross-validation, and detailed residual analysis to evaluate performance across different energy ranges.

In [None]:
%load_ext autoreload
%autoreload 2
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Add project root to path.
# '..' is resolved against the current working directory, so the notebook is
# presumably meant to be run from its own sub-folder of the project — TODO confirm.
sys.path.append(os.path.abspath('..')) 

# Project-local helpers; these imports only resolve once the path entry above is in place.
from data_processing.utils import get_charge_totale_corr
from models.cyclic_model import modele_cyclique
from models.model_utils import validation_croisee, print_res_modeles, print_erreurs
from visualization.plots import print_plot_reg, print_binned_barplot_reg, evolution_correlation

# Apply one seaborn theme to every figure drawn in this notebook.
sns.set_theme(style="whitegrid")

## 1. Data Loading
Loading the processed dataframe.

In [None]:
# Candidate pickle files, tried in the order listed; the first one that loads wins.
DF_PKL_PATH = r"../base_de_donnee/df.pkl"
DF_EVENTS_PKL_PATH = r"../base_de_donnee/df_events/df.pkl"

# `df` stays None when no candidate could be loaded; later cells test for that.
df = None
for pkl_path in (DF_PKL_PATH, DF_EVENTS_PKL_PATH):
    if not os.path.exists(pkl_path):
        continue

    print(f"Loading data from {pkl_path}...")
    try:
        # NOTE: unpickling executes arbitrary code; only load files you produced yourself.
        df = pd.read_pickle(pkl_path)
        print(f"Successfully loaded {len(df)} events.")
    except Exception as e:
        # Best effort: report the failure and fall through to the next candidate.
        print(f"Failed to load pickle {pkl_path}: {e}")
    else:
        break

if df is None:
    print("No data found. Please run exploratory analysis first or ensure data path is correct.")


## 2. Feature Engineering: Charge Corrections
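We use corrected charges to improve energy resolution. If the `charge_corr` column is missing, the cell below currently falls back to copying `charge_totale` into `charge_corr`; the real correction (`get_charge_totale_corr`) is still a placeholder because it needs hit-level data.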

In [None]:
# Make sure a 'charge_corr' column exists for the "Linear (Corr)" model.
# The real correction is not wired in yet: the cell only copies 'charge_totale'
# as a stand-in, and says so explicitly when even that is impossible.
if df is not None:
    if 'charge_corr' in df.columns:
        print("Corrected charge columns found.")
    else:
        print("Calculating corrected charges (this might take a moment)...\n")
        # TODO: replace the fallback below with the real correction, e.g.
        #     df = get_charge_totale_corr(df)
        # NOTE(review): the signature of get_charge_totale_corr and the DataFrame layout
        # it expects (exploded hits vs. list columns) are not visible from this notebook;
        # an event-level summary frame may not carry the individual hits it needs.
        try:
            print("Charge correction logic placeholder. Ensure 'get_charge_totale_corr' is compatible with your DataFrame structure.")
            if 'charge_totale' in df.columns:
                # Fallback: reuse the uncorrected total charge so downstream cells still run.
                df['charge_corr'] = df['charge_totale']
                print("Using 'charge_totale' as 'charge_corr' for demonstration.")
            else:
                # Previously this case was silent, leaving later cells to fall back unnoticed.
                print("Neither 'charge_corr' nor 'charge_totale' is available; 'charge_corr' was not created.")
        except Exception as e:
            print(f"Error calculating corrections: {e}")

## 3. Model Definition & Comparison
We define the target variable and the feature sets, then set up one Linear Regression per feature set for comparison. The Cyclic model is kept as a commented-out placeholder, and no Polynomial fit is defined yet.

In [None]:
# Build the modelling table, the target and the feature matrices used by the later cells.
if df is not None:
    base_features = ['charge_totale', 'n_hits']
    geom_features = ['dwall', 'towall']

    # An optional feature group is used only when *all* of its columns exist
    # (checking 'dwall' alone would raise a KeyError when 'towall' is missing).
    has_geom = all(col in df.columns for col in geom_features)
    has_corr = 'charge_corr' in df.columns

    # Remove NaNs in every column used by any model: LinearRegression rejects NaN
    # inputs, and all models share the same target `y`, so they must share the same rows.
    required_cols = ['energy'] + base_features
    if has_geom:
        required_cols += geom_features
    if has_corr:
        required_cols.append('charge_corr')
    df_model = df.dropna(subset=required_cols).copy()

    # Target
    y = df_model['energy']

    # Feature Sets (optional sets fall back to the simple one when unavailable)
    X_simple = df_model[base_features]
    X_geom = df_model[base_features + geom_features] if has_geom else X_simple
    X_corr = df_model[['charge_corr', 'n_hits']] if has_corr else X_simple

    if not has_geom:
        print("Geometry columns ('dwall', 'towall') not all present: 'Linear (Geom)' uses the simple features.")
    if not has_corr:
        print("'charge_corr' not present: 'Linear (Corr)' uses the simple features.")

    # Models: name -> (unfitted estimator, feature matrix)
    models = {
        "Linear (Simple)": (LinearRegression(), X_simple),
        "Linear (Geom)": (LinearRegression(), X_geom),
        "Linear (Corr)": (LinearRegression(), X_corr),
        # Cyclic Model placeholder (requires specific optimization logic usually handled inside the class)
        # "Cyclic": (modele_cyclique(), X_simple)
    }

    print("Models defined.")

### Cross Validation Results
Evaluating Bias (`mu`) and Resolution (`sigma`) for each model.

In [None]:
# Cross-validate every candidate model and collect the outcome per model name.
results = {}

if df is not None:
    print("Running Cross-Validation...\n")
    for name, (model, X) in models.items():
        print(f"Evaluating {name}...")
        if name == "Cyclic":
            # The cyclic model may not be fully sklearn compliant, so it is skipped here.
            continue
        try:
            results[name] = validation_croisee(model, X, y, cv=5)
        except Exception as e:
            # Keep going: one failing model must not prevent the others from being evaluated.
            print(f"Failed to evaluate {name}: {e}")

    # Show the summary only when at least one model was evaluated successfully.
    if results:
        print_res_modeles(results)

## 4. Residual Analysis
Analyzing the relative residuals, (Predicted - Actual) / Actual, to identify systematic biases.

In [None]:
# Train best model on full dataset for analysis.
# NOTE: the model is fitted and evaluated on the same rows, so these are in-sample
# residuals; the cross-validation cell is the place to read generalisation performance.
best_model_name = "Linear (Geom)" # Example choice
RESIDUAL_XLIM = (-0.5, 0.5)  # window of relative residuals shown in the histogram

if df is not None and best_model_name in models:
    model, X = models[best_model_name]
    model.fit(X, y)
    y_pred = model.predict(X)
    residuals = (y_pred - y) / y

    # A zero true energy gives inf/NaN, which breaks the KDE; far outliers would stretch
    # the 100 bins over the full range and leave only a few bins inside the visible window.
    # Plot only finite residuals inside the window and bin that window directly.
    residuals_finite = residuals[np.isfinite(residuals)]
    residuals_shown = residuals_finite[residuals_finite.between(*RESIDUAL_XLIM)]
    n_hidden = len(residuals) - len(residuals_shown)
    if n_hidden:
        print(f"{n_hidden} of {len(residuals)} events lie outside {RESIDUAL_XLIM} (or are non-finite) and are not drawn.")

    # Global Residuals
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(residuals_shown, bins=100, binrange=RESIDUAL_XLIM, kde=True, ax=ax)
    ax.set_title(f"Relative Residuals Distribution ({best_model_name})")
    ax.set_xlabel("(Predicted - Actual) / Actual")
    ax.set_xlim(*RESIDUAL_XLIM)
    plt.show()
elif df is not None:
    # Without this message the cell would do nothing and later cells would fail on `y_pred`.
    print(f"Model '{best_model_name}' is not defined; skipping residual analysis.")

## 5. Detailed Error Analysis by Energy Bin
Breaking down the resolution and bias across different energy ranges.

In [None]:
# Bias (mean) and resolution (population std) of the relative error per true-energy bin.
# Runs only when the residual cell produced `y_pred` for the selected model.
if df is not None and best_model_name in models:
    # Define Bins
    ENERGY_MAX = 1500
    ENERGY_BIN_WIDTH = 100
    # np.arange excludes its stop value: extend it by one width so the last
    # bin (1400, 1500] exists and events in that range are not silently dropped.
    bins = np.arange(0, ENERGY_MAX + ENERGY_BIN_WIDTH, ENERGY_BIN_WIDTH)

    # NOTE(review): `print_erreurs` from model_utils might cover this, but its signature
    # is not visible here, so the statistics are computed manually.
    try:
        print("Calculating errors per bin...")

        df_res = pd.DataFrame({'energy': y, 'pred': y_pred})
        # Bins are right-closed, so rows with energy <= 0 or > ENERGY_MAX get no bin
        # and are left out of the statistics.
        df_res['bin'] = pd.cut(df_res['energy'], bins)
        df_res['rel_err'] = (df_res['pred'] - df_res['energy']) / df_res['energy']

        # observed=False keeps empty bins as NaN rows so the x axis stays uniform.
        rel_err_by_bin = df_res.groupby('bin', observed=False)['rel_err']
        bin_stats = pd.DataFrame({
            'bias': rel_err_by_bin.mean(),
            # ddof=0 matches the population std that np.std computed before.
            'resolution': rel_err_by_bin.std(ddof=0),
        })

        display(bin_stats)

        # Plotting
        fig, (ax_bias, ax_res) = plt.subplots(1, 2, figsize=(12, 5))

        bin_stats['bias'].plot(kind='bar', color='skyblue', ax=ax_bias)
        ax_bias.set_title("Bias per Energy Bin")
        ax_bias.set_ylabel("Bias ((Pred-True)/True)")

        bin_stats['resolution'].plot(kind='bar', color='salmon', ax=ax_res)
        ax_res.set_title("Resolution per Energy Bin")
        ax_res.set_ylabel("Resolution")

        fig.tight_layout()
        plt.show()

    except Exception as e:
        # Best effort: report the problem instead of stopping the notebook.
        print(f"Error in bin analysis: {e}")

## 6. Visualizing Fits
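Scatter plot of predicted versus true energy for the selected model, with the ideal diagonal as reference. The comparison of linear versus polynomial trends is not implemented yet.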

In [None]:
# True-vs-predicted scatter for the selected model.
# `y_pred` exists only when the residual cell found `best_model_name` in `models`,
# so the same condition is checked here instead of failing with a NameError.
if df is not None and best_model_name in models:
    # Scatter plot with the ideal (prediction == truth) diagonal
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y, y_pred, alpha=0.3, s=1, label='Data')
    ax.plot([0, 1500], [0, 1500], 'r--', label='Ideal')
    ax.set_xlabel("True Energy")
    ax.set_ylabel("Predicted Energy")
    ax.legend()
    ax.set_title("True vs Predicted Energy")
    plt.show()

    # TODO: add the polynomial fit visualization
    # print_plot_reg_poly(...) if available