In [None]:
# ============================================
# FEYN QLattice — Rolling yearly training (2017–2025)
# Train on 2007..(T-3), validate on {T-2, T-1}, test on T
# Logs everything, saves ALL candidates + formulas, marks BEST
# Memory-lean: strategic gc.collect(), float32 features, minimized temporaries
# ============================================

import sys, subprocess, warnings, time, os, pickle, json, shutil, gc
import numpy as np
import pandas as pd
from typing import Dict, Any, List, Optional, Tuple
from scipy.stats import spearmanr, pearsonr

# -------- Paths (top-level) --------
# All paths below are relative (to the working directory the notebook runs in).
LOG_PATH                 = "feyn_qlattice_train.log"         # truncated on the first log() call, then appended to
BASE_MODELS_DIR          = "qlattice_models"                 # per-year subfolders of raw candidates
BASE_OUTPUT_DIR          = "qlattice_yearly"                 # per-year artifacts (best model, metrics, preds)
AGG_METRICS_CSV          = "qlattice_yearly_summary.csv"     # one row per test year

# -------- Data / columns --------
PATH               = "../data/ret_sample.parquet"   # input panel, read with pd.read_parquet
ID_COL             = "id"                           # entity identifier (used for sorting, never as a feature)
DATE_COL           = "date"                         # observation date; 'year' is derived from it when absent
TARGET             = "stock_ret"                    # regression target column
RANDOM_SEED        = 42                             # passed to the QLattice constructor when it accepts a seed
TRAIN_START_YEAR   = 2007                           # first calendar year included in every training window
TEST_YEARS         = list(range(2017, 2026))  # 2017..2025 inclusive

In [3]:

# ============== Logging ==============
def log(msg: Optional[str] = None) -> None:
    """Print a timestamped message and append it to the run log.

    On the very first call (with or without a message) the file at
    ``LOG_PATH`` is truncated and a "Log started" header is written; the
    start time is remembered on the function object as ``log.t0``.
    A ``None`` or empty ``msg`` produces no output beyond that one-time
    initialisation.
    """
    first_call = not hasattr(log, "t0")
    if first_call:
        log.t0 = time.time()
        with open(LOG_PATH, "w") as header_file:
            header_file.write(f"[{time.strftime('%H:%M:%S')}] Log started\n")

    if not msg:
        return

    stamped = f"[{time.strftime('%H:%M:%S')}] {msg}"
    print(stamped, flush=True)
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        log_file.write(stamped + "\n")

def save_text(path: str, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, creating parent directories as needed.

    Args:
        path: Destination file. May be a bare filename (no directory part).
        text: Content to write; any existing file is overwritten.
    """
    # os.path.dirname("file.txt") is "", and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    log(f"Wrote text file: {path}  (size={len(text)} chars)")

def save_pickle(path: str, obj: Any) -> None:
    """Pickle ``obj`` to ``path`` with the highest protocol, creating parent directories.

    Args:
        path: Destination file. May be a bare filename (no directory part).
        obj: Any picklable object; an existing file is overwritten.
    """
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    log(f"Wrote pickle: {path}")

def save_json(path: str, obj: Any) -> None:
    """Serialise ``obj`` as indented JSON (UTF-8) at ``path``, creating parent directories.

    Args:
        path: Destination file. May be a bare filename (no directory part).
        obj: A JSON-serialisable object; an existing file is overwritten.
    """
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2)
    log(f"Wrote JSON: {path}")

def save_csv_df(path: str, df: pd.DataFrame) -> None:
    """Write ``df`` to ``path`` as CSV without the index, creating parent directories.

    Args:
        path: Destination file. May be a bare filename (no directory part).
        df: Frame to write; an existing file is overwritten.
    """
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False)
    log(f"Wrote CSV: {path}  (rows={len(df)})")

def _try_symlink(src: str, dst: str) -> None:
    """Make ``dst`` a symlink to ``src``, falling back to a plain file copy.

    Any existing file or link at ``dst`` is removed first (best effort).

    Args:
        src: Existing file to point at (absolute, or relative to the CWD).
        dst: Path of the link (or copy) to create.
    """
    # lexists() is also true for dangling symlinks, which exists() misses.
    if os.path.lexists(dst):
        try:
            os.remove(dst)
        except OSError as e:
            # Best effort: carry on and let symlink/copy report the real problem.
            log(f"[warn] Could not remove existing {dst}: {e}")

    try:
        # A relative symlink target is resolved against the directory that
        # holds the link, not the CWD. Passing a CWD-relative `src` through
        # unchanged therefore creates a dangling link, so re-express it
        # relative to dst's directory.
        link_dir = os.path.dirname(os.path.abspath(dst))
        link_target = os.path.relpath(src, start=link_dir)
        os.symlink(link_target, dst)
        log(f"Created symlink: {dst} -> {src}")
    except (OSError, NotImplementedError, AttributeError, ValueError):
        # OSError: no privilege / unsupported filesystem; ValueError: relpath
        # across Windows drives; the others: platforms without os.symlink.
        shutil.copyfile(src, dst)
        log(f"Symlink not supported; copied instead: {dst}")

def _mark_best_candidate(year_models_dir: str, best_idx: int) -> None:
    """Drop a ``BEST_INDEX_<idx>.txt`` marker file into ``year_models_dir``.

    The file name carries the zero-padded (3 digits) index and the file
    body holds the plain index followed by a newline.
    """
    marker_name = f"BEST_INDEX_{best_idx:03d}.txt"
    marker_path = os.path.join(year_models_dir, marker_name)
    with open(marker_path, "w", encoding="utf-8") as marker_file:
        marker_file.write(f"{best_idx}\n")
    log(f"Wrote best-index marker: {marker_path}")

# ============== Feyn import ==============
# Import QLattice from feyn. If the import raises, install feyn with pip
# (using the interpreter that runs this code) and import again.
log("Checking feyn/QLattice...")
try:
    from feyn import QLattice
    log("QLattice import OK.")
except Exception:
    # NOTE(review): this catches every Exception, not only ImportError, so any
    # failure while importing feyn triggers a pip install. check_call raises
    # CalledProcessError if pip exits non-zero, and the second import is not
    # guarded, so a failure here stops the run.
    log("Installing feyn (quiet)...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "feyn"])
    from feyn import QLattice
    log("QLattice installed & imported.")

# Suppress all Python warnings issued from this point on.
warnings.filterwarnings("ignore")

# ============== Load ==============
# Read the full panel into memory; `df` is the module-level frame every
# later split is taken from.
log(f"Reading parquet: {PATH}")
df = pd.read_parquet(PATH)
log(f"Read done. Rows={len(df):,}, Cols={len(df.columns)}")
gc.collect()

# Ensure datetime, year
# is_datetime64_any_dtype also recognises tz-aware (extension) datetime
# columns; np.issubdtype raises TypeError on pandas extension dtypes.
if DATE_COL in df.columns and not pd.api.types.is_datetime64_any_dtype(df[DATE_COL]):
    log(f"Parsing {DATE_COL} to datetime...")
    # errors="coerce": unparsable values become NaT rather than raising.
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")

if "year" not in df.columns:
    if DATE_COL in df.columns:
        log("Deriving 'year' from date column...")
        df["year"] = df[DATE_COL].dt.year
    else:
        raise RuntimeError("No 'year' or parsable DATE_COL found to create splits.")
gc.collect()

# ============== Helpers ==============
# Column names that are never used as model features: identifiers, dates,
# the target itself and the derived 'year'. The rolling loop filters numeric
# columns against this set when building the per-year feature list.
drop_like = {ID_COL, DATE_COL, TARGET, "year", "month", "char_date", "gvkey", "iid", "ret_eom", "char_eom"}

def _make_ql() -> QLattice:
    """Build a QLattice, seeding it when the constructor supports a seed.

    Tries the ``random_seed`` keyword, then ``random_state``; if both are
    rejected with ``TypeError`` an unseeded QLattice is returned.
    """
    for seed_kwargs in ({"random_seed": RANDOM_SEED}, {"random_state": RANDOM_SEED}):
        try:
            return QLattice(**seed_kwargs)
        except TypeError:
            continue
    return QLattice()

def fit_candidates(df_small: pd.DataFrame, yname: str):
    """Run ``QLattice.auto_run`` on ``df_small`` and return the candidate models.

    Args:
        df_small: Frame passed as-is to ``auto_run``.
        yname: Name of the output column passed to ``auto_run``.

    Returns:
        Whatever ``auto_run`` returns (its length is logged).
    """
    lattice = _make_ql()
    log("QLattice.auto_run(...)")
    search_settings = {"max_complexity": 12, "n_epochs": 15, "criterion": "bic"}
    found = lattice.auto_run(df_small, yname, **search_settings)
    log(f"Got {len(found)} candidate models.")
    return found

def predict_safely(model: Any, df_small: pd.DataFrame, yname: str, xcols: List[str]) -> np.ndarray:
    """Predict with ``model``, trying a ``[y]+X`` frame first and ``X``-only second.

    Any exception raised by the first attempt (prediction, conversion or
    logging) triggers the second; an exception from the second propagates.

    Returns:
        Predictions as a C-ordered float32 array.
    """
    def _as_float32(raw: Any) -> np.ndarray:
        return np.asarray(raw, dtype=np.float32, order="C")

    try:
        with_target = _as_float32(model.predict(df_small[[yname] + xcols]))
        log(f"Prediction used `[y]+X` signature (rows={with_target.shape[0]}).")
    except Exception:
        features_only = _as_float32(model.predict(df_small[xcols]))
        log(f"Prediction used `X-only` signature (rows={features_only.shape[0]}).")
        return features_only
    return with_target

def summary_stats(y: np.ndarray, p: np.ndarray, prefix: str) -> Dict[str, Any]:
    """Compute regression metrics for targets ``y`` and predictions ``p``.

    NaNs are handled pairwise: a row where either value is NaN is left out
    of every statistic except ``<prefix>_n``.

    Args:
        y: Target values.
        p: Predicted values, same length as ``y``.
        prefix: Prepended (with an underscore) to every key of the result.

    Returns:
        Dict of plain Python numbers with keys ``<prefix>_n`` (count of
        finite targets), ``_mse``, ``_mae``, ``_rmse``, ``_pearson_r``,
        ``_spearman_ic`` and ``_r2``. A correlation that cannot be computed
        is NaN; ``_r2`` is NaN when the targets have zero total variance.
    """
    y32 = y.astype(np.float32, copy=False)
    p32 = p.astype(np.float32, copy=False)

    # Residuals are formed in float32; the aggregates are taken in float64
    # for numeric stability. Convert once instead of once per statistic.
    resid = (p32 - y32).astype(np.float64)
    mse = float(np.nanmean(resid ** 2))
    mae = float(np.nanmean(np.abs(resid)))
    del resid
    rmse = float(np.sqrt(mse))
    out: Dict[str, Any] = {
        f"{prefix}_n": int(np.isfinite(y32).sum()),
        f"{prefix}_mse": mse,
        f"{prefix}_mae": mae,
        f"{prefix}_rmse": rmse,
    }

    # Pairwise NaN omission for Pearson and R^2, matching nanmean above and
    # nan_policy="omit" below. Without it a single NaN prediction turns
    # Pearson r into NaN and makes R^2 compare SS_tot and SS_res over
    # different sets of rows.
    y64 = y32.astype(np.float64)
    p64 = p32.astype(np.float64)
    pair_ok = ~(np.isnan(y64) | np.isnan(p64))
    if not pair_ok.all():
        y64 = y64[pair_ok]
        p64 = p64[pair_ok]

    try:
        r, _ = pearsonr(y64, p64)
        out[f"{prefix}_pearson_r"] = float(r)
    except Exception as e:
        log(f"[warn] pearsonr failed: {e}")
        out[f"{prefix}_pearson_r"] = float("nan")
    try:
        ic = spearmanr(y32, p32, nan_policy="omit").correlation
        out[f"{prefix}_spearman_ic"] = float(ic) if ic is not None else float("nan")
    except Exception as e:
        log(f"[warn] spearmanr failed: {e}")
        out[f"{prefix}_spearman_ic"] = float("nan")

    ybar = float(np.nanmean(y64))
    ss_tot = float(np.nansum((y64 - ybar) ** 2))
    ss_res = float(np.nansum((y64 - p64) ** 2))
    out[f"{prefix}_r2"] = float("nan") if ss_tot == 0.0 else 1.0 - (ss_res / ss_tot)
    return out

def safe_formula(model: Any) -> str:
    """Return ``str(model.sympify())``, or ``""`` if that fails.

    Failures are logged as warnings and never raised, so a model whose
    formula cannot be extracted does not stop the caller.
    """
    try:
        # str() always yields a str, so no further type check is needed.
        s = str(model.sympify())
        log(f"Extracted symbolic representation (len={len(s)}).")
        return s
    except Exception as e:
        log(f"[warn] Could not extract symbolic formula: {e}")
        return ""

def _downcast_float32_inplace(df_small: pd.DataFrame, feat_cols: List[str]) -> None:
    """Convert every float or integer column listed in ``feat_cols`` to float32.

    Columns of any other dtype are left untouched. The frame is modified
    by column assignment; nothing is returned.
    """
    is_float = pd.api.types.is_float_dtype
    is_int = pd.api.types.is_integer_dtype
    for name in feat_cols:
        column = df_small[name]
        if is_float(column) or is_int(column):
            df_small[name] = column.astype(np.float32, copy=False)

def standardize_fit_transform(train_df_small: pd.DataFrame, val_df_small: pd.DataFrame, test_df_small: pd.DataFrame, feat_cols: List[str]):
    """Z-score ``feat_cols`` in all three frames using TRAIN statistics only.

    The mean and sample standard deviation (``ddof=1``, NaNs ignored) are
    computed per feature on the training frame in float64. Each frame's
    feature columns are then replaced by ``(x - mu) / sd`` as float32, with
    remaining NaNs set to 0. The three frames are modified in place.

    Args:
        train_df_small: Frame the statistics are fitted on.
        val_df_small: Validation frame, transformed with the train statistics.
        test_df_small: Test frame, transformed with the train statistics.
        feat_cols: Feature column names present in all three frames.

    Returns:
        ``(mu, sd)``: float64 arrays aligned with ``feat_cols``. NaN or
        zero standard deviations are replaced by 1.0.
    """
    # compute mu/sd in float64 (stability)
    mu = np.empty(len(feat_cols), dtype=np.float64)
    sd = np.empty(len(feat_cols), dtype=np.float64)

    for j, c in enumerate(feat_cols):
        arr = train_df_small[c].to_numpy(dtype=np.float64, copy=False)
        mu[j] = np.nanmean(arr)
        sd[j] = np.nanstd(arr, ddof=1)

    np.nan_to_num(sd, copy=False, nan=1.0)
    sd[~np.isfinite(sd)] = 1.0
    sd[sd == 0.0] = 1.0

    # Standardize one column at a time and assign the result back explicitly.
    # Writing through `to_numpy(copy=False)` only works while that array is a
    # writable view of the frame; when it is read-only or a copy (e.g. pandas
    # Copy-on-Write, non-float32 columns) it raises or silently changes
    # nothing. The per-column copy keeps peak memory at one column.
    for j, c in enumerate(feat_cols):
        for frame in (train_df_small, val_df_small, test_df_small):
            z = frame[c].to_numpy(dtype=np.float32, copy=True)
            np.subtract(z, mu[j], out=z, dtype=np.float32)
            np.divide(z, sd[j], out=z, dtype=np.float32)
            # Fill residual NaNs with 0 (infinities are left as they are).
            z[np.isnan(z)] = 0.0
            frame[c] = z

    log(f"Standardized & imputed: features={len(feat_cols)} | "
        f"train={train_df_small.shape} val={val_df_small.shape} test={test_df_small.shape}")
    return mu, sd


[23:33:00] Checking feyn/QLattice...


This version of Feyn and the QLattice is available for academic, personal, and non-commercial use. By using the community version of this software you agree to the terms and conditions which can be found at https://abzu.ai/eula.

[23:33:00] QLattice import OK.
[23:33:00] Reading parquet: ../data/ret_sample.parquet
[23:33:20] Read done. Rows=6,401,414, Cols=159


In [None]:
# ============== Rolling loop ==============
# For every test year T: split df by year, standardize features with train
# statistics, fit QLattice candidates on the train split, score every
# candidate on train/val/test, pick the one with the lowest validation MSE,
# and write models, formulas, metrics and predictions under per-year folders.
# The best model's metrics for each processed year are collected in agg_rows.
os.makedirs(BASE_MODELS_DIR, exist_ok=True)
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
gc.collect()

# One metrics dict per successfully processed test year.
agg_rows: List[Dict[str, Any]] = []

for T in TEST_YEARS:
    # Example: T=2018 -> val = 2016-2017, train = 2007-2015, test = 2018
    val_lo, val_hi = T - 2, T - 1
    train_hi = T - 3

    log("="*72)
    log(f"=== Processing TEST YEAR {T} "
        f"(Train {TRAIN_START_YEAR}..{train_hi}, Val in [{val_lo}, {val_hi}], Test=={T}) ===")

    # Define splits by calendar year.
    # NOTE(review): boolean-mask `.loc` selection normally materializes new
    # data in pandas rather than a view, so these frames likely hold copies
    # of the selected rows - confirm if memory is tight.
    train_mask = (df["year"] >= TRAIN_START_YEAR) & (df["year"] <= train_hi)
    val_mask   = (df["year"] >= val_lo) & (df["year"] <= val_hi)
    test_mask  = (df["year"] == T)

    train_df = df.loc[train_mask]
    val_df   = df.loc[val_mask]
    test_df  = df.loc[test_mask]
    log(f"Split sizes — train={len(train_df):,} val={len(val_df):,} test={len(test_df):,}")

    if train_df.empty or val_df.empty or test_df.empty:
        log(f"Skipping {T}: missing data in one or more splits.")
        del train_df, val_df, test_df
        gc.collect()
        continue

    # Sort each split by (id, date) when both columns exist
    if {ID_COL, DATE_COL} <= set(train_df.columns): train_df = train_df.sort_values([ID_COL, DATE_COL])
    if {ID_COL, DATE_COL} <= set(val_df.columns):   val_df   = val_df.sort_values([ID_COL, DATE_COL])
    if {ID_COL, DATE_COL} <= set(test_df.columns):  test_df  = test_df.sort_values([ID_COL, DATE_COL])

    # Feature selection: numeric columns not in drop_like, present in all three
    # splits, kept in the train split's column order
    num_cols_train = [c for c in train_df.select_dtypes(include=[np.number]).columns if c not in drop_like]
    num_cols_val   = [c for c in val_df.select_dtypes(include=[np.number]).columns   if c not in drop_like]
    num_cols_test  = [c for c in test_df.select_dtypes(include=[np.number]).columns  if c not in drop_like]
    feat_cols = [c for c in num_cols_train if (c in num_cols_val) and (c in num_cols_test)]
    log(f"Selected {len(feat_cols)} overlapping numeric features for year {T}.")

    if not feat_cols:
        log(f"Skipping {T}: no overlapping numeric features across splits.")
        del train_df, val_df, test_df
        gc.collect()
        continue

    # Target first, then features: this is the column layout handed to QLattice.
    cols = [TARGET] + feat_cols

    # Build small frames holding only target + features, then release the
    # full-width splits; NaN targets are dropped right after.
    # NOTE(review): id/date columns are not carried along, so the prediction
    # CSVs written at the end of this iteration cannot be joined back to rows.
    train_small = train_df.loc[:, cols]
    val_small   = val_df.loc[:, cols]
    test_small  = test_df.loc[:, cols]
    del train_df, val_df, test_df
    gc.collect()

    train_small = train_small[train_small[TARGET].notna()]
    val_small   = val_small[val_small[TARGET].notna()]
    test_small  = test_small[test_small[TARGET].notna()]
    log(f"After dropping NaN targets — train={len(train_small):,} val={len(val_small):,} test={len(test_small):,}")

    if train_small.empty or val_small.empty or test_small.empty:
        del train_small, val_small, test_small
        gc.collect()
        log(f"Skipping {T}: empty split after dropping NaN targets.")
        continue

    # Downcast features to float32 (saves memory)
    _downcast_float32_inplace(train_small, feat_cols)
    _downcast_float32_inplace(val_small,   feat_cols)
    _downcast_float32_inplace(test_small,  feat_cols)
    gc.collect()

    # Standardize using TRAIN stats only; impute z-space NaNs with 0 (in-place)
    mu, sd = standardize_fit_transform(train_small, val_small, test_small, feat_cols)
    gc.collect()

    # Fit candidates on TRAIN
    candidates = fit_candidates(train_small, TARGET)
    if not candidates:
        log(f"Skipping {T}: no candidates returned by QLattice.")
        del train_small, val_small, test_small
        gc.collect()
        continue

    # Per-year output dirs / filenames
    year_models_dir = os.path.join(BASE_MODELS_DIR, f"{T}")
    os.makedirs(year_models_dir, exist_ok=True)
    year_out_dir = os.path.join(BASE_OUTPUT_DIR, f"{T}")
    os.makedirs(year_out_dir, exist_ok=True)

    CANDIDATE_METRICS_JSON = os.path.join(year_out_dir, f"qlattice_candidates_metrics_{T}.json")
    CANDIDATE_METRICS_CSV  = os.path.join(year_out_dir, f"qlattice_candidates_metrics_{T}.csv")
    ALL_FORMULAS_TXT       = os.path.join(year_out_dir, f"qlattice_candidates_formulas_{T}.txt")
    MODEL_OUT    = os.path.join(year_out_dir, f"qlattice_model_{T}.pkl")
    PREPROC_OUT  = os.path.join(year_out_dir, f"qlattice_preproc_{T}.npz")
    FORMULAS_OUT = os.path.join(year_out_dir, f"qlattice_formulas_{T}.txt")
    METRICS_JSON = os.path.join(year_out_dir, f"qlattice_metrics_{T}.json")
    METRICS_CSV  = os.path.join(year_out_dir, f"qlattice_metrics_{T}.csv")
    TRAIN_PRED_CSV = os.path.join(year_out_dir, f"qlattice_train_predictions_{T}.csv")
    VAL_PRED_CSV   = os.path.join(year_out_dir, f"qlattice_val_predictions_{T}.csv")
    TEST_PRED_CSV  = os.path.join(year_out_dir, f"qlattice_test_predictions_{T}.csv")

    # Evaluate all candidates on VAL; pick best by Val MSE; save each model + formula
    y_train = train_small[TARGET].to_numpy(dtype=np.float32, copy=False)
    y_val   = val_small[TARGET].to_numpy(dtype=np.float32, copy=False)
    y_test  = test_small[TARGET].to_numpy(dtype=np.float32, copy=False)

    candidate_rows: List[Dict[str, Any]] = []
    formulas_concat: List[str] = []
    # best_idx stays -1 when no candidate has a finite validation MSE.
    best_idx, best_mse = -1, float("inf")

    log(f"Evaluating {len(candidates)} candidates for year {T} (selection by VALIDATION MSE)...")
    for i, m in enumerate(candidates):
        phat_tr = predict_safely(m, train_small, TARGET, feat_cols)
        phat_va = predict_safely(m, val_small,   TARGET, feat_cols)
        phat_te = predict_safely(m, test_small,  TARGET, feat_cols)

        # Train/val/test metrics are all recorded per candidate, but only
        # val_mse takes part in the selection below.
        row: Dict[str, Any] = {"year": T, "model_index": i}
        row.update(summary_stats(y_train, phat_tr, "train"))
        row.update(summary_stats(y_val,   phat_va, "val"))     # <-- used for selection
        row.update(summary_stats(y_test,  phat_te, "test"))

        # Persist the model
        model_path = os.path.join(year_models_dir, f"model_{i:03d}.pkl")
        save_pickle(model_path, m)
        row["model_path"] = model_path

        # Persist formula (symbolic)
        form = safe_formula(m)
        row["formula"] = form
        cand_form_path = os.path.join(year_models_dir, f"model_{i:03d}_formula.txt")
        save_text(cand_form_path, (form or "") + "\n")
        formulas_concat.append(f"### Candidate {i}\n{form}\n")

        # Best by validation MSE (strict '<': ties keep the earlier candidate)
        val_mse_i = row["val_mse"]
        log(f"[{T}] Model {i}: Val MSE = {val_mse_i:.6g}")
        if np.isfinite(val_mse_i) and (val_mse_i < best_mse):
            best_mse, best_idx = val_mse_i, i

        candidate_rows.append(row)

        # Free per-candidate arrays ASAP
        del phat_tr, phat_va, phat_te, form
        gc.collect()

    if best_idx < 0:
        log(f"Skipping {T}: no finite Val MSE among candidates.")
        del train_small, val_small, test_small, candidates, candidate_rows, formulas_concat
        gc.collect()
        continue

    # flag the best row
    for r in candidate_rows:
        r["is_best"] = (r["model_index"] == best_idx)
        r["best_selection_metric"] = "val_mse"

    # Save per-year candidate metrics + concatenated formulas
    save_json(CANDIDATE_METRICS_JSON, candidate_rows)
    save_csv_df(CANDIDATE_METRICS_CSV, pd.DataFrame(candidate_rows))
    save_text(ALL_FORMULAS_TXT, "\n".join(formulas_concat))
    del candidate_rows, formulas_concat
    gc.collect()

    # ====== MARK best model clearly inside the candidates folder ======
    log(f"[{T}] Selected best model index {best_idx} by lowest validation MSE = {best_mse:.6g}")
    _mark_best_candidate(year_models_dir, best_idx)

    # original file names
    best_model_path     = os.path.join(year_models_dir, f"model_{best_idx:03d}.pkl")
    best_formula_path   = os.path.join(year_models_dir, f"model_{best_idx:03d}_formula.txt")

    # highlighted copies with BEST__ prefix
    best_copy_pkl       = os.path.join(year_models_dir, f"BEST__model_{best_idx:03d}.pkl")
    best_copy_formula   = os.path.join(year_models_dir, f"BEST__model_{best_idx:03d}_formula.txt")
    shutil.copyfile(best_model_path,   best_copy_pkl)
    shutil.copyfile(best_formula_path, best_copy_formula)
    log(f"[{T}] Created BEST copies: {os.path.basename(best_copy_pkl)}, {os.path.basename(best_copy_formula)}")

    # handy symlinks (fallback to copy if symlinks not allowed)
    _try_symlink(best_model_path,   os.path.join(year_models_dir, "best_model.pkl"))
    _try_symlink(best_formula_path, os.path.join(year_models_dir, "best_model_formula.txt"))

    # small JSON with the winner's identity
    best_meta = {
        "year": T,
        "best_index": int(best_idx),
        "selected_by": "val_mse",
        "val_mse": float(best_mse),
        "model_path": best_model_path,
        "formula_path": best_formula_path,
        "train_years": [int(y) for y in range(TRAIN_START_YEAR, train_hi + 1)],
        "val_years": [val_lo, val_hi],
        "test_year": T,
    }
    save_json(os.path.join(year_models_dir, f"best_model_meta_{T}.json"), best_meta)
    del best_meta
    gc.collect()

    # ====== Save BEST model to per-year artifacts ======
    best_model = None
    try:
        # Reload the best model from disk so the 'candidates' list can be
        # released right below
        with open(best_model_path, "rb") as f:
            best_model = pickle.load(f)
        log(f"[{T}] Reloaded BEST model from {best_model_path}")
    except Exception:
        # fallback: take from candidates list
        best_model = candidates[best_idx]
        log(f"[{T}] Using in-memory BEST model (reload failed or skipped).")

    # free other candidates early
    del candidates
    gc.collect()

    forms = safe_formula(best_model)
    save_text(FORMULAS_OUT, (forms or "") + "\n")
    del forms
    gc.collect()

    # Persist best model copy in per-year artifacts
    save_pickle(MODEL_OUT, best_model)

    # Persist preprocessing (float64 stats; feature list).
    # feat_cols is stored as an object array, so reading it back needs
    # np.load(..., allow_pickle=True).
    np.savez_compressed(
        PREPROC_OUT,
        feat_cols=np.array(feat_cols, dtype=object),
        mu=np.asarray(mu, dtype=np.float64),
        sd=np.asarray(sd, dtype=np.float64),
    )
    log(f"Wrote NPZ: {PREPROC_OUT}  (feat_cols={len(feat_cols)})")

    # Predictions for best model (keep arrays float32); these repeat the
    # predictions already made for this model in the candidate loop
    yhat_train = predict_safely(best_model, train_small, TARGET, feat_cols)
    yhat_val   = predict_safely(best_model, val_small,   TARGET, feat_cols)
    yhat_test  = predict_safely(best_model, test_small,  TARGET, feat_cols)

    best_metrics: Dict[str, Any] = {"year": T, "best_model_index": int(best_idx), "best_model_val_mse": float(best_mse)}
    best_metrics.update(summary_stats(y_train, yhat_train, "train"))
    best_metrics.update(summary_stats(y_val,   yhat_val,   "val"))
    best_metrics.update(summary_stats(y_test,  yhat_test,  "test"))

    save_json(METRICS_JSON, best_metrics)
    save_csv_df(METRICS_CSV, pd.DataFrame([best_metrics]))

    # Dump preds (two columns: y_true, y_pred) then free arrays quickly
    save_csv_df(TRAIN_PRED_CSV, pd.DataFrame({"y_true": y_train.astype(np.float32, copy=False),
                                              "y_pred": yhat_train.astype(np.float32, copy=False)}))
    save_csv_df(VAL_PRED_CSV,   pd.DataFrame({"y_true": y_val.astype(np.float32, copy=False),
                                              "y_pred": yhat_val.astype(np.float32, copy=False)}))
    save_csv_df(TEST_PRED_CSV,  pd.DataFrame({"y_true": y_test.astype(np.float32, copy=False),
                                              "y_pred": yhat_test.astype(np.float32, copy=False)}))

    # Free per-year heavy objects
    del y_train, y_val, y_test, yhat_train, yhat_val, yhat_test
    del best_model, train_small, val_small, test_small, mu, sd
    gc.collect()

    # Add to aggregate (keep tiny dict only)
    agg_rows.append(best_metrics)
    del best_metrics
    gc.collect()


[04:37:18] Got 10 candidate models.
[04:37:18] Evaluating 10 candidates for year 2025 (selection by VALIDATION MSE)...
[04:37:19] Prediction used `[y]+X` signature (rows=4975091).
[04:37:20] Prediction used `[y]+X` signature (rows=727615).
[04:37:20] Prediction used `[y]+X` signature (rows=164546).
[04:37:21] Wrote pickle: qlattice_models/2025/model_000.pkl
[04:37:21] Extracted symbolic representation (len=45).
[04:37:21] Wrote text file: qlattice_models/2025/model_000_formula.txt  (size=46 chars)
[04:37:21] [2025] Model 0: Val MSE = 500.58
[04:37:22] Prediction used `[y]+X` signature (rows=4975091).
[04:37:22] Prediction used `[y]+X` signature (rows=727615).
[04:37:22] Prediction used `[y]+X` signature (rows=164546).
[04:37:23] Wrote pickle: qlattice_models/2025/model_001.pkl
[04:37:23] Extracted symbolic representation (len=26).
[04:37:23] Wrote text file: qlattice_models/2025/model_001_formula.txt  (size=27 chars)
[04:37:23] [2025] Model 1: Val MSE = 506.892
[04:37:24] Prediction us

FileNotFoundError: [Errno 2] No such file or directory: ''

In [7]:
# ============== Aggregate summary across all test years ==============
# Write one row per processed test year (the best model's metrics), sorted by year.
if agg_rows:
    # Use the declared AGG_METRICS_CSV rather than repeating the filename.
    # The "." prefix gives the path a non-empty directory part, which
    # save_csv_df passes to os.makedirs.
    save_csv_df(os.path.join(".", AGG_METRICS_CSV), pd.DataFrame(agg_rows).sort_values("year"))
else:
    log("No years processed successfully; aggregate CSV not created.")

# Final GC
gc.collect()
log("DONE: rolling yearly QLattice training (2017–2025).")


[08:49:19] Wrote CSV: ./qlattice_yearly_summary.csv  (rows=9)
[08:49:19] DONE: rolling yearly QLattice training (2017–2025).


In [None]:
import pickle
import numpy as np
import pandas as pd
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union

def load_qlattice_model(model_path: str) -> Any:
    """
    Unpickle and return the model stored at ``model_path``.

    The file is read with ``pickle.load``, which can execute arbitrary code
    embedded in the file; only load model files you produced or trust.
    """
    with open(model_path, "rb") as handle:
        return pickle.load(handle)


def predict_dataframe_per_row(
    model: Any,
    df: pd.DataFrame,
    *,
    # If you saved qlattice_preproc_<YEAR>.npz, pass it via np.load(..., allow_pickle=True)
    # or pass a dict with keys: 'feat_cols', 'mu', 'sd'
    preproc: Optional[Dict[str, Any]] = None,
    # If you know the feature list explicitly, pass it. Otherwise we infer.
    feat_cols: Optional[Sequence[str]] = None,
    # Name of target column (if present in df). If provided, we'll try the [y]+X signature first.
    yname: Optional[str] = None,
    # How to resolve NaNs when preproc is not given:
    #   - "median": fill NaNs with column medians (computed on df[feat_cols])
    #   - anything else (default "zero"): fill NaNs with 0.0
    impute_strategy: str = "zero",
) -> pd.Series:
    """
    Run the model on a DataFrame and return one prediction per row.

    Rows with NaNs are never dropped; NaNs are imputed instead.

    Feature selection / preprocessing, in priority order:
      1) ``preproc`` with 'feat_cols', 'mu', 'sd': the training features that
         are present in ``df`` are standardized as ``z = (x - mu) / sd`` using
         the statistics belonging to those same features; infinities and NaNs
         in z become 0. Training features missing from ``df`` are skipped, so
         prediction only fails if the model actually uses one of them.
      2) ``feat_cols``: those columns of ``df`` (no standardization), NaNs
         imputed via ``impute_strategy``, infinities set to 0.
      3) Otherwise: all numeric columns of ``df`` except ``yname``, treated
         as in (2).

    If ``yname`` is a column of ``df`` the model is first called with a
    ``[y] + X`` frame and, if that raises, with ``X`` only. Otherwise it is
    called with ``X`` only.

    Returns:
        Float Series named ``"y_pred"`` aligned to ``df.index``.

    Raises:
        TypeError: ``df`` is not a DataFrame.
        ValueError: no usable feature columns, ``mu``/``sd`` lengths do not
            match ``feat_cols``, or the prediction count differs from
            ``len(df)``.
        RuntimeError: ``yname`` is in ``df`` and both call signatures failed.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")

    # 1) Choose feature columns
    if preproc is not None and all(k in preproc for k in ("feat_cols", "mu", "sd")):
        # Use training-time feature order, restricted to what df provides
        train_cols = list(preproc["feat_cols"])
        keep = [j for j, c in enumerate(train_cols) if c in df.columns]
        if not keep:
            raise ValueError("No overlap between preproc['feat_cols'] and df columns.")

        mu = np.asarray(preproc["mu"], dtype=float).reshape(-1)
        sd = np.asarray(preproc["sd"], dtype=float).reshape(-1)
        # Validate against the full training feature list, then take the
        # entries of the kept columns so each column is scaled with its own
        # statistics.
        if mu.shape[0] != len(train_cols) or sd.shape[0] != len(train_cols):
            raise ValueError("preproc['mu']/['sd'] length does not match feat_cols.")

        xcols = [train_cols[j] for j in keep]
        X = df[xcols].astype(float, copy=False)
        # z-score (broadcast: column j uses mu[j], sd[j]); a zero sd yields
        # +/-inf, which is mapped to NaN and then to 0 like any other gap
        Xz = (X - mu[keep]) / sd[keep]
        X_infer = Xz.replace([np.inf, -np.inf], np.nan).fillna(0.0)

    else:
        # No preproc provided; use feat_cols if given, otherwise infer numeric
        if feat_cols is not None:
            xcols = [c for c in feat_cols if c in df.columns]
            if not xcols:
                raise ValueError("Provided feat_cols have no overlap with df columns.")
        else:
            # Infer: numeric columns except yname (if present)
            xcols = df.select_dtypes(include=[np.number]).columns.tolist()
            if yname and yname in xcols:
                xcols.remove(yname)
            if not xcols:
                raise ValueError("Could not infer any numeric feature columns from df.")

        X = df[xcols].astype(float, copy=False)

        # Impute NaNs (do NOT drop rows)
        if impute_strategy == "median":
            X_infer = X.fillna(X.median(axis=0, numeric_only=True))
        else:
            # default: zero
            X_infer = X.fillna(0.0)

        # avoid +/-inf
        X_infer = X_infer.replace([np.inf, -np.inf], 0.0)

    # 2) Predict using either the [y]+X or the X-only signature
    has_target = yname is not None and yname in df.columns
    if has_target:
        try:
            preds = model.predict(pd.concat([df[[yname]], X_infer], axis=1))
        except Exception:
            # flip the signature if the first attempt failed
            try:
                preds = model.predict(X_infer)
            except Exception as e2:
                raise RuntimeError(f"Model prediction failed (both signatures). Last error: {e2}") from e2
    else:
        # Without a target column there is no second signature to try, so
        # any error from the model propagates unchanged.
        preds = model.predict(X_infer)

    # Ensure 1D float series aligned to df index
    preds = np.asarray(preds, dtype=float).reshape(-1)
    if preds.shape[0] != len(df):
        raise ValueError(f"Predictions length {preds.shape[0]} does not match df length {len(df)}.")
    return pd.Series(preds, index=df.index, name="y_pred")
