In [1]:
import os, pickle, json
import numpy as np
import pandas as pd
from typing import Any, Dict, Optional, Tuple

BASE_DIR = "qlattice_yearly"

# ---- Load model + preprocessing ----
def load_qlattice_model(
    model_path: str,
    preproc_path: Optional[str] = None
) -> Tuple[Any, Optional[Dict[str, Any]]]:
    """
    Load a pickled model and, optionally, its preprocessing metadata.

    Args:
        model_path (str): Path to the pickled model file (.pkl).
        preproc_path (Optional[str]): Path to the preprocessing .npz archive
            holding the arrays 'feat_cols', 'mu', and 'sd'. Defaults to None,
            in which case no preprocessing data is loaded.

    Returns:
        Tuple[Any, Optional[Dict[str, Any]]]:
            - model: the unpickled object.
            - preproc: a dictionary with keys 'feat_cols' (a list),
              'mu' and 'sd' (arrays as stored in the archive), or None when
              `preproc_path` is None.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If the archive lacks one of the expected arrays.
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only load model files that come from a trusted source.
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    if preproc_path is None:
        return model, None

    # np.load on an .npz returns an NpzFile that keeps the archive open until
    # it is closed; the context manager releases the handle once the arrays
    # have been read into memory.
    # NOTE: allow_pickle=True is kept so object-dtype arrays still load; it
    # carries the same trust requirement as the model pickle above.
    with np.load(preproc_path, allow_pickle=True) as data:
        preproc: Dict[str, Any] = {
            "feat_cols": data["feat_cols"].tolist(),
            "mu": data["mu"],
            "sd": data["sd"],
        }

    return model, preproc

# ---- Prediction helper ----
def preprocess(df: pd.DataFrame, preproc: Dict[str, Any]) -> pd.DataFrame:
    """
    Standardize the feature columns of `df` with stored statistics.

    Each column listed in ``preproc["feat_cols"]`` is shifted by the matching
    entry of ``preproc["mu"]`` and divided by the matching entry of
    ``preproc["sd"]``. Remaining NaN values are replaced with 0.0.

    Args:
        df (pd.DataFrame): Input frame containing at least the feature columns.
        preproc (Dict[str, Any]): Mapping with keys 'feat_cols', 'mu', 'sd';
            'mu' and 'sd' are indexed by position in 'feat_cols'.

    Returns:
        pd.DataFrame: A new frame with only the feature columns, standardized.
        `df` itself is not modified.

    Raises:
        KeyError: If a feature column is missing from `df`.
        IndexError: If 'mu' or 'sd' is shorter than 'feat_cols'.
    """
    feat_cols = preproc["feat_cols"]
    X = df[feat_cols].copy()
    for i, c in enumerate(feat_cols):
        sd = preproc["sd"][i]
        # A zero spread would turn every non-mean value into +/-inf, which
        # fillna() does not remove. Treat it as 1.0 so the column is only
        # centered and the result stays finite.
        if sd == 0:
            sd = 1.0
        X[c] = (X[c] - preproc["mu"][i]) / sd
    return X.fillna(0.0)


def predict_safely(model: Any, df: pd.DataFrame, preproc: Dict[str, Any]) -> np.ndarray:
    """
    Predict with `model`, preferring standardized features.

    The features are first standardized with `preprocess`. If predicting on
    the standardized frame (or converting its result to a float array) raises,
    a RuntimeWarning describing the failure is emitted and the prediction is
    retried on the raw, unscaled feature columns.

    Args:
        model (Any): Object exposing a ``predict(frame)`` method.
        df (pd.DataFrame): Input frame containing the feature columns.
        preproc (Dict[str, Any]): Preprocessing mapping with at least
            'feat_cols' (plus whatever `preprocess` requires).

    Returns:
        np.ndarray: Predictions converted to a float array.

    Raises:
        Exception: Whatever the fallback prediction raises if it fails too.
    """
    import warnings

    X = preprocess(df, preproc)
    try:
        return np.asarray(model.predict(X), dtype=float)
    except Exception as exc:
        # Best-effort fallback is kept, but it is no longer silent: raw and
        # standardized inputs can give very different predictions, so the
        # caller needs to know which path produced the result.
        warnings.warn(
            f"Prediction on standardized features failed ({exc!r}); "
            "falling back to raw feature columns.",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.asarray(model.predict(df[preproc["feat_cols"]]), dtype=float)


# ---- Scaling computation ----
def compute_distribution_alignment(year_dir: str) -> Dict[str, float]:
    """
    Compute and save target/prediction distribution statistics for one year.

    The year is taken from the last component of `year_dir`. The function
    reads ``qlattice_val_predictions_<year>.csv`` from that directory, computes
    the mean and sample standard deviation (ddof=1) of its 'y_true' and
    'y_pred' columns, writes them to
    ``qlattice_distribution_stats_<year>.json`` in the same directory, and
    returns them.

    Args:
        year_dir (str): Directory for a single year; its final path component
            must be the year as digits. A trailing path separator is accepted.

    Returns:
        Dict[str, float]: Empty dict if the model, preprocessing, or
        validation file is missing. Otherwise a dict with keys:
            - 'year': the year as int.
            - 'val_pred_mean', 'val_pred_std': statistics of 'y_true'.
              NOTE: despite the "pred" in their names these hold the
              ground-truth statistics; the key names are kept unchanged so
              existing JSON/CSV consumers keep working.
            - 'model_pred_mean', 'model_pred_std': statistics of 'y_pred'.

    Raises:
        ValueError: If the directory name is not an integer.
        KeyError: If the CSV lacks a 'y_true' or 'y_pred' column.
    """
    # normpath strips a trailing separator; without it basename("dir/2017/")
    # is "" and the year would be silently skipped as "missing files".
    year = os.path.basename(os.path.normpath(year_dir))
    print(f"\n=== Processing {year} ===")

    model_path   = os.path.join(year_dir, f"qlattice_model_{year}.pkl")
    preproc_path = os.path.join(year_dir, f"qlattice_preproc_{year}.npz")
    val_csv      = os.path.join(year_dir, f"qlattice_val_predictions_{year}.csv")

    # The model and preprocessing files are still required so the same set of
    # years is processed as before, but they are no longer unpickled: nothing
    # below uses them, and loading them was pure overhead.
    if not all(os.path.exists(p) for p in [model_path, preproc_path, val_csv]):
        print(f"Missing files for {year}, skipping.")
        return {}

    # --- Validation stats ---
    val_df = pd.read_csv(val_csv)
    val_mean = float(val_df["y_true"].mean())
    val_std  = float(val_df["y_true"].std(ddof=1))

    pred_mean = float(val_df["y_pred"].mean())
    pred_std = float(val_df["y_pred"].std(ddof=1))

    out = {
        "year": int(year),
        "val_pred_mean": val_mean,
        "val_pred_std": val_std,
        "model_pred_mean": pred_mean,
        "model_pred_std": pred_std,
    }

    out_path = os.path.join(year_dir, f"qlattice_distribution_stats_{year}.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    print(f"Saved → {out_path}")

    return out


# ---- Main: run for all yearly subfolders ----
# Visit every all-digit subdirectory of BASE_DIR in sorted order, collect the
# non-empty per-year statistics, and write them out as one summary CSV.
all_stats = []
for sub in sorted(os.listdir(BASE_DIR)):
    year_path = os.path.join(BASE_DIR, sub)
    if os.path.isdir(year_path) and sub.isdigit():
        stats = compute_distribution_alignment(year_path)
        # An empty dict signals a skipped year (missing input files).
        if stats:
            all_stats.append(stats)

summary_path = os.path.join(BASE_DIR, "qlattice_distribution_summary.csv")
summary_df = pd.DataFrame(all_stats)
summary_df.to_csv(summary_path, index=False)
print("✅ Saved summary CSV: qlattice_distribution_summary.csv")



=== Processing 2017 ===


This version of Feyn and the QLattice is available for academic, personal, and non-commercial use. By using the community version of this software you agree to the terms and conditions which can be found at https://abzu.ai/eula.

Saved → qlattice_yearly/2017/qlattice_distribution_stats_2017.json

=== Processing 2018 ===
Saved → qlattice_yearly/2018/qlattice_distribution_stats_2018.json

=== Processing 2019 ===
Saved → qlattice_yearly/2019/qlattice_distribution_stats_2019.json

=== Processing 2020 ===
Saved → qlattice_yearly/2020/qlattice_distribution_stats_2020.json

=== Processing 2021 ===
Saved → qlattice_yearly/2021/qlattice_distribution_stats_2021.json

=== Processing 2022 ===
Saved → qlattice_yearly/2022/qlattice_distribution_stats_2022.json

=== Processing 2023 ===
Saved → qlattice_yearly/2023/qlattice_distribution_stats_2023.json

=== Processing 2024 ===
Saved → qlattice_yearly/2024/qlattice_distribution_stats_2024.json

=== Processing 2025 ===
Saved → qlattice_yearly/2025/qlattice_distribution_stats_2025.json
✅ Saved summary CSV: qlattice_distribution_summary.csv
