In [1]:
# ============================================
# FEYN QLattice — Train on ALL features, select by Val MSE (2015–2016)
# Saves ALL candidate models + metrics
# ============================================

import sys, subprocess, warnings, time, os, pickle, json
import numpy as np
import pandas as pd
from typing import Dict, Any, List, Optional, Tuple
import pickle
from scipy.stats import spearmanr, pearsonr

# Output artifact paths (all relative to the notebook's working directory).
# Text log written by log(); truncated the first time log() is called.
LOG_PATH             = "feyn_qlattice_train.log"
MODEL_OUT            = "qlattice_model.pkl"              # best model (pickle)
# Compressed .npz holding 'feat_cols', 'mu' and 'sd' for use at inference time.
PREPROC_OUT          = "qlattice_preproc.npz"
FORMULAS_OUT         = "qlattice_formulas.txt"           # best model's formula
METRICS_JSON         = "qlattice_metrics.json"           # best model metrics (train/val)
# Same best-model metrics as METRICS_JSON, written as a one-row CSV.
METRICS_CSV          = "qlattice_metrics.csv"
TRAIN_PRED_CSV       = "qlattice_train_predictions.csv"  # best model preds
VAL_PRED_CSV         = "qlattice_val_predictions.csv"    # best model preds

# Where ALL candidate models (one pickle per model, plus formula text files) and
# the per-candidate metrics table are saved.
MODELS_DIR                 = "qlattice_models"
CANDIDATE_METRICS_JSON     = "qlattice_candidates_metrics.json"
CANDIDATE_METRICS_CSV      = "qlattice_candidates_metrics.csv"

# ============== Logging ==============
def log(msg: Optional[str] = None) -> None:
    """Print a timestamped message and append it to the file at LOG_PATH.

    On the first call the log file is truncated and a "Log started" header is
    written; the start time is remembered on the function as ``log.t0``.

    Args:
        msg: Text to log. ``None`` or an empty string only triggers the
            first-call initialisation and emits nothing.
    """
    if not hasattr(log, "t0"):
        log.t0 = time.time()
        # Create the file as UTF-8 so it matches the append mode used below;
        # messages contain non-ASCII characters such as "→" and "≤".
        with open(LOG_PATH, "w", encoding="utf-8") as f:
            f.write(f"[{time.strftime('%H:%M:%S')}] Log started\n")
    if msg:
        ts = f"[{time.strftime('%H:%M:%S')}] {msg}"
        print(ts, flush=True)
        with open(LOG_PATH, "a", encoding="utf-8") as f:
            f.write(ts + "\n")

# ============== Feyn import ==============
log("Checking feyn/QLattice...")
try:
    from feyn import QLattice
    log("QLattice import OK.")
except ImportError:
    # Only a missing package justifies a pip install. Any other failure while
    # importing feyn is left to propagate instead of being masked by a reinstall.
    # NOTE(review): the install is unpinned; consider pinning a feyn version.
    log("Installing feyn (quiet)...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "feyn"])
    from feyn import QLattice
    log("QLattice installed & imported.")

# Silences every warning for the rest of the session (including NumPy runtime warnings).
warnings.filterwarnings("ignore")

# ============== Config ==============
# Parquet file read into `df` by the Load step.
PATH         = "../data/ret_sample.parquet"
# Identifier and date columns: used as sort keys and excluded from the feature set.
ID_COL       = "id"
DATE_COL     = "date"
# Column the QLattice models are fitted to predict.
TARGET       = "stock_ret"
# Seed handed to the QLattice constructor by _make_ql().
RANDOM_SEED  = 42

# ============== Load ==============
log(f"Reading parquet: {PATH}")
df = pd.read_parquet(PATH)
log(f"Read done. Rows={len(df):,}, Cols={len(df.columns)}")

# Ensure datetime, year
# The pandas dtype helper is used instead of np.issubdtype because parquet columns can
# carry pandas extension dtypes (e.g. timezone-aware timestamps) that NumPy cannot
# interpret as a dtype and would raise TypeError on.
if DATE_COL in df and not pd.api.types.is_datetime64_any_dtype(df[DATE_COL]):
    log(f"Parsing {DATE_COL} to datetime...")
    # Unparseable values become NaT (errors="coerce") rather than raising.
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
if "year" not in df.columns:
    if DATE_COL in df:
        log("Deriving 'year' from date column...")
        df["year"] = df[DATE_COL].dt.year
    else:
        raise RuntimeError("No 'year' or parsable DATE_COL found to create splits.")

# ============== Split ==============
# TRAIN takes every row with year <= 2014; VAL takes years 2015 and 2016 (both inclusive).
log("Creating TRAIN (<=2014) and VAL (2015–2016) splits...")
train_df = df.loc[df["year"].le(2014)]
val_df   = df.loc[df["year"].between(2015, 2016)]
if val_df.empty or train_df.empty:
    raise RuntimeError("Empty train/val after year split. Check data coverage.")

log("Sorting by (id,date)...")
# Each split is ordered only when it actually has both key columns.
if set(train_df.columns).issuperset({ID_COL, DATE_COL}):
    train_df = train_df.sort_values(by=[ID_COL, DATE_COL])
if set(val_df.columns).issuperset({ID_COL, DATE_COL}):
    val_df = val_df.sort_values(by=[ID_COL, DATE_COL])
log(f"Train rows={len(train_df):,} | Val rows={len(val_df):,}")

# ============== Feature selection (ALL numeric except obvious drops) ==============
# Rows are never dropped for missing features; only rows whose TARGET is NaN are removed.
# Columns in `drop_like` (identifiers, dates, the target itself) are never used as features.
drop_like = {
    ID_COL, DATE_COL, TARGET,
    "year", "month", "char_date", "gvkey", "iid", "ret_eom", "char_eom",
}

num_cols_train: List[str] = [
    c
    for c in train_df.select_dtypes(include=[np.number]).columns
    if c not in drop_like
]
if len(num_cols_train) == 0:
    raise RuntimeError("No numeric candidate features found on TRAIN.")

# A feature must be numeric in VAL as well; TRAIN column order is preserved.
num_cols_val_set = set(val_df.select_dtypes(include=[np.number]).columns) - drop_like
feat_cols: List[str] = [c for c in num_cols_train if c in num_cols_val_set]
if len(feat_cols) == 0:
    raise RuntimeError("No overlapping numeric features between train and val.")

cols = [TARGET, *feat_cols]

# Independent copies of target + features, restricted to rows with a known TARGET.
train_small = train_df.loc[train_df[TARGET].notna(), cols].copy()
val_small   = val_df.loc[val_df[TARGET].notna(), cols].copy()

log(f"train_small (pre-norm) shape={train_small.shape} | val_small (pre-norm) shape={val_small.shape}")

# ============== Standardize using TRAIN stats only (NaN-aware) ==============
log("Standardizing features with TRAIN stats only (NaN-aware), then imputing remaining NaNs with 0 in z-space...")

# Per-feature mean and sample standard deviation (ddof=1) on TRAIN, skipping NaNs.
mu = np.array(
    [np.nanmean(train_small[c].to_numpy(dtype=float, copy=False)) for c in feat_cols],
    dtype=float,
)
sd = np.array(
    [np.nanstd(train_small[c].to_numpy(dtype=float, copy=False), ddof=1) for c in feat_cols],
    dtype=float,
)

# A zero or non-finite spread would break the division; use a neutral scale of 1 instead.
sd = np.where(np.isfinite(sd) & (sd != 0.0), sd, 1.0)

# z-score both splits with the TRAIN statistics; NaNs survive this step.
for c, center, scale in zip(feat_cols, mu, sd):
    train_small[c] = (train_small[c].to_numpy(dtype=float, copy=False) - center) / scale
    val_small[c]   = (val_small[c].to_numpy(dtype=float, copy=False) - center) / scale

# NaN -> 0.0 after scaling, i.e. missing values are imputed with the TRAIN mean.
for frame in (train_small, val_small):
    frame[feat_cols] = frame[feat_cols].fillna(0.0)

log("Standardization + NaN-imputation complete.")
log(f"train_small (post-norm) shape={train_small.shape} | val_small (post-norm) shape={val_small.shape}")

# ============== Train candidate models ==============
def _make_ql() -> QLattice:
    """Construct a QLattice seeded with RANDOM_SEED when the constructor allows it.

    The seed is offered first as ``random_seed`` and then as ``random_state``;
    if the constructor rejects both with ``TypeError``, an unseeded
    ``QLattice()`` is returned.
    """
    for seed_kwarg in ("random_seed", "random_state"):
        try:
            return QLattice(**{seed_kwarg: RANDOM_SEED})
        except TypeError:
            continue
    return QLattice()

def fit_candidates(df_small: pd.DataFrame, yname: str):
    """Run ``QLattice.auto_run`` on ``df_small`` and return the candidate models.

    Args:
        df_small: Frame passed to ``auto_run`` as the training data.
        yname: Name of the output column passed to ``auto_run``.

    Returns:
        Whatever ``auto_run`` returns (its length is logged as the candidate count).
    """
    lattice = _make_ql()
    log("QLattice.auto_run(...)")
    search_options = {"max_complexity": 25, "n_epochs": 18, "criterion": "bic"}
    found = lattice.auto_run(df_small, yname, **search_options)
    log(f"Got {len(found)} candidate models.")
    return found

def predict_safely(model: Any, df_small: pd.DataFrame, yname: str, xcols: List[str]) -> np.ndarray:
    """Predict with ``model``, trying the ``[target] + features`` layout first.

    If anything fails with the target column included (selecting the columns,
    predicting, or converting to float), the call is retried with the feature
    columns only; errors from that second attempt propagate.

    Returns:
        Predictions as a float NumPy array.
    """
    try:
        with_target = df_small[[yname] + xcols]
        raw = model.predict(with_target)
        return np.asarray(raw, dtype=float)
    except Exception:
        features_only = df_small[xcols]
        return np.asarray(model.predict(features_only), dtype=float)

def mse(y: np.ndarray, p: np.ndarray) -> float:
    """Mean squared error of predictions ``p`` against targets ``y``, skipping NaN terms."""
    squared_err = np.square(p - y)
    return float(np.nanmean(squared_err))

def summary_stats(y: np.ndarray, p: np.ndarray, prefix: str) -> Dict[str, Any]:
    """Compute error and correlation metrics for predictions ``p`` against targets ``y``.

    All error and correlation metrics use the same observations: pairs whose
    residual ``p - y`` is not NaN. Previously Pearson r became NaN as soon as a
    single prediction was NaN, and R^2 mixed a total sum of squares over all
    targets with a residual sum of squares over valid pairs only.

    Args:
        y: Target values.
        p: Predicted values, same length as ``y``.
        prefix: String prepended to every key (e.g. ``"train"`` -> ``"train_mse"``).

    Returns:
        Dict with keys ``{prefix}_n`` (count of finite targets), ``_mse``, ``_mae``,
        ``_rmse``, ``_pearson_r``, ``_spearman_ic`` and ``_r2``. A metric that cannot
        be computed is reported as NaN instead of raising.
    """
    y = np.asarray(y, dtype=float)
    p = np.asarray(p, dtype=float)
    resid = p - y
    mean_sq = float(np.nanmean(resid**2))
    out: Dict[str, Any] = {
        f"{prefix}_n": int(np.isfinite(y).sum()),
        f"{prefix}_mse": mean_sq,
        f"{prefix}_mae": float(np.nanmean(np.abs(resid))),
        f"{prefix}_rmse": float(np.sqrt(mean_sq)),
    }

    # Pairwise-complete observations: exactly the terms np.nanmean kept above.
    paired = ~np.isnan(resid)
    y_ok, p_ok = y[paired], p[paired]

    # Pearson r (pearsonr has no nan_policy, so NaN pairs are removed beforehand)
    try:
        r, _ = pearsonr(y_ok, p_ok)
        out[f"{prefix}_pearson_r"] = float(r)
    except Exception:
        out[f"{prefix}_pearson_r"] = float("nan")
    # Spearman
    try:
        ic = spearmanr(y, p, nan_policy="omit").correlation
        out[f"{prefix}_spearman_ic"] = float(ic) if ic is not None else float("nan")
    except Exception:
        out[f"{prefix}_spearman_ic"] = float("nan")
    # R^2, numerator and denominator over the same observations
    if y_ok.size == 0:
        out[f"{prefix}_r2"] = float("nan")
    else:
        ybar = float(np.nanmean(y_ok))
        ss_tot = float(np.nansum((y_ok - ybar)**2))
        ss_res = float(np.nansum((y_ok - p_ok)**2))
        out[f"{prefix}_r2"] = float("nan") if ss_tot == 0 else 1.0 - (ss_res / ss_tot)
    return out


def load_qlattice_model(
    model_path: str,
    preproc_path: Optional[str] = None
) -> Tuple[Any, Optional[Dict[str, Any]]]:
    """
    Load a trained QLattice model and optional preprocessing metadata.

    Args:
        model_path (str): Path to the pickled QLattice model file (.pkl).
        preproc_path (Optional[str]): Path to the preprocessing .npz file
            that contains 'feat_cols', 'mu', and 'sd'. Defaults to None.

    Returns:
        Tuple[Any, Optional[Dict[str, Any]]]:
            - model: the unpickled QLattice model object.
            - preproc: a dictionary with preprocessing info
              (keys: 'feat_cols', 'mu', 'sd') if provided, else None.

    Raises:
        OSError: if either file cannot be opened.
        KeyError: if the .npz archive lacks 'feat_cols', 'mu' or 'sd'.
    """
    # ---- Load model ----
    # SECURITY: unpickling executes arbitrary code; only load files you produced or trust.
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    preproc: Optional[Dict[str, Any]] = None

    # ---- Load preprocessing (optional) ----
    if preproc_path is not None:
        # np.load returns an NpzFile that keeps the archive open until closed, so it is
        # used as a context manager; arrays are read into memory when indexed.
        # allow_pickle=True is needed for archives whose 'feat_cols' is an object array.
        with np.load(preproc_path, allow_pickle=True) as data:
            preproc = {
                "feat_cols": data["feat_cols"].tolist(),
                "mu": data["mu"],
                "sd": data["sd"],
            }

    return model, preproc

log("Fitting candidates on TRAIN...")
candidates = fit_candidates(train_small, TARGET)
if not candidates:
    raise RuntimeError("No models returned by QLattice.auto_run")

# Directory that receives one pickle (and, when available, one formula file) per candidate.
os.makedirs(MODELS_DIR, exist_ok=True)

# ============== Evaluate ALL models; save each; pick best by Val MSE ==============
y_train = train_small[TARGET].to_numpy(dtype=float, copy=False)
y_val   = val_small[TARGET].to_numpy(dtype=float, copy=False)

candidate_rows: List[Dict[str, Any]] = []
best_idx, best_mse = -1, float("inf")

log("Evaluating candidates on VALIDATION and saving each model...")
for i, m in enumerate(candidates):
    # Score the candidate on both splits.
    phat_tr = predict_safely(m, train_small, TARGET, feat_cols)
    phat_va = predict_safely(m, val_small,   TARGET, feat_cols)

    # One metrics row per candidate: index first, then train_* keys, then val_* keys.
    row: Dict[str, Any] = {
        "model_index": i,
        **summary_stats(y_train, phat_tr, "train"),
        **summary_stats(y_val,   phat_va, "val"),
    }

    # Pickle the candidate under a zero-padded file name.
    model_path = os.path.join(MODELS_DIR, f"model_{i:03d}.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(m, f)
    row["model_path"] = model_path

    # Symbolic formula; an empty string marks "not available" and no file is written.
    try:
        form = str(m.sympify())
    except Exception:
        form = ""
    row["formula"] = form
    if form:
        with open(os.path.join(MODELS_DIR, f"model_{i:03d}_formula.txt"), "w", encoding="utf-8") as f:
            f.write(form + "\n")

    # Keep the lowest finite validation MSE seen so far (ties keep the earlier model).
    val_mse_i = row["val_mse"]
    log(f"  Model {i}: Val MSE = {val_mse_i:.6g}")
    if np.isfinite(val_mse_i) and val_mse_i < best_mse:
        best_idx, best_mse = i, val_mse_i

    candidate_rows.append(row)

if best_idx < 0:
    raise RuntimeError("Could not select a best model (no finite Val MSE).")
best_model = candidates[best_idx]
log(f"Selected model index {best_idx} with Val MSE = {best_mse:.6g}")

# Persist the per-candidate metrics table as JSON and as CSV.
log(f"Saving candidate metrics → {CANDIDATE_METRICS_JSON} / {CANDIDATE_METRICS_CSV}")
with open(CANDIDATE_METRICS_JSON, "w", encoding="utf-8") as f:
    json.dump(candidate_rows, f, indent=2)
pd.DataFrame(candidate_rows).to_csv(CANDIDATE_METRICS_CSV, index=False)

# ============== Save best-model artifacts (metrics + preds + preproc) ==============
# Predictions of the selected model on both splits.
yhat_train = predict_safely(best_model, train_small, TARGET, feat_cols)
yhat_val   = predict_safely(best_model, val_small,   TARGET, feat_cols)

# train_* metrics, then val_* metrics, then the selection bookkeeping.
best_metrics: Dict[str, Any] = {
    **summary_stats(y_train, yhat_train, "train"),
    **summary_stats(y_val,   yhat_val,   "val"),
    "best_model_index": int(best_idx),
    "best_model_val_mse": float(best_mse),
}

log("Selected-model metrics:")
for k, v in best_metrics.items():
    log(f"  {k}: {v}")

# Formula of the selected model; failure to extract it is logged and does not stop the run.
try:
    forms = str(best_model.sympify())
    log("Top symbolic formula(s) for best model:")
    print(forms)
    with open(FORMULAS_OUT, "w", encoding="utf-8") as f:
        f.write(forms + "\n")
except Exception as e:
    log(f"Could not extract symbolic formulas: {e}")

# Save best model
log(f"Saving BEST model → {MODEL_OUT}")
with open(MODEL_OUT, "wb") as f:
    pickle.dump(best_model, f)

# Save preprocessing for inference
# Feature names are stored as a fixed-width unicode array rather than an object array:
# object arrays are pickled inside the .npz and can only be read back with
# np.load(allow_pickle=True). `.tolist()` on the loaded array still yields plain str.
log(f"Saving preprocessing → {PREPROC_OUT}")
np.savez_compressed(
    PREPROC_OUT,
    feat_cols=np.array(feat_cols, dtype=str),
    mu=mu.astype(np.float64),
    sd=sd.astype(np.float64),
)

# Best-model metrics: one JSON document plus the same values as a one-row CSV.
log(f"Saving best-model metrics → {METRICS_JSON} / {METRICS_CSV}")
with open(METRICS_JSON, "w", encoding="utf-8") as f:
    json.dump(best_metrics, f, indent=2)
pd.DataFrame([best_metrics]).to_csv(METRICS_CSV, index=False)

# Per-row truth vs. prediction, one CSV per split (row index is not written).
log(f"Saving predictions → {TRAIN_PRED_CSV} / {VAL_PRED_CSV}")
for out_csv, truth, guess in (
    (TRAIN_PRED_CSV, y_train, yhat_train),
    (VAL_PRED_CSV, y_val, yhat_val),
):
    pd.DataFrame({"y_true": truth, "y_pred": guess}).to_csv(out_csv, index=False)

log("DONE: trained on ≤2014; saved ALL candidates; selected by 2015–2016 Val MSE.")


[19:42:02] Got 10 candidate models.
[19:42:02] Evaluating candidates on VALIDATION and saving each model...
[19:42:09]   Model 0: Val MSE = 50.1661
[19:42:14]   Model 1: Val MSE = 49.7872
[19:42:18]   Model 2: Val MSE = 49.9565
[19:42:22]   Model 3: Val MSE = 631.265
[19:42:25]   Model 4: Val MSE = 49.7872
[19:42:28]   Model 5: Val MSE = 49.7871
[19:42:33]   Model 6: Val MSE = 52.844
[19:42:36]   Model 7: Val MSE = 1.31187e+19
[19:42:39]   Model 8: Val MSE = 49.7872
[19:42:43]   Model 9: Val MSE = 49.7871
[19:42:43] Selected model index 5 with Val MSE = 49.7871
[19:42:43] Saving candidate metrics → qlattice_candidates_metrics.json / qlattice_candidates_metrics.csv
[19:42:47] Selected-model metrics:
[19:42:47]   train_n: 2912017
[19:42:47]   train_mse: 0.27630249556846875
[19:42:47]   train_mae: 0.11157694060211236
[19:42:47]   train_rmse: 0.5256448378596225
[19:42:47]   train_pearson_r: 0.2804722457169072
[19:42:47]   train_spearman_ic: 0.0012592760265154105
[19:42:47]   train_r2: 0.06

In [13]:
# Show the column index of the loaded frame (pandas abbreviates long indexes with "...").
print(df.columns)

Index(['id', 'date', 'ret_eom', 'gvkey', 'iid', 'excntry', 'stock_ret', 'year',
       'month', 'char_date',
       ...
       'betadown_252d', 'prc_highprc_252d', 'corr_1260d', 'betabab_1260d',
       'rmax5_rvol_21d', 'age', 'qmj', 'qmj_prof', 'qmj_growth', 'qmj_safety'],
      dtype='object', length=159)


In [3]:
import pickle
import numpy as np
from typing import Any, Dict, Tuple, Optional

def load_qlattice_model(
    model_path: str,
    preproc_path: Optional[str] = None
) -> Tuple[Any, Optional[Dict[str, Any]]]:
    """
    Load a trained QLattice model and optional preprocessing metadata.

    Args:
        model_path (str): Path to the pickled QLattice model file (.pkl).
        preproc_path (Optional[str]): Path to the preprocessing .npz file
            that contains 'feat_cols', 'mu', and 'sd'. Defaults to None.

    Returns:
        Tuple[Any, Optional[Dict[str, Any]]]:
            - model: the unpickled QLattice model object.
            - preproc: a dictionary with preprocessing info
              (keys: 'feat_cols', 'mu', 'sd') if provided, else None.

    Raises:
        OSError: if either file cannot be opened.
        KeyError: if the .npz archive lacks 'feat_cols', 'mu' or 'sd'.
    """
    # ---- Load model ----
    # SECURITY: unpickling executes arbitrary code; only load files you produced or trust.
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    preproc: Optional[Dict[str, Any]] = None

    # ---- Load preprocessing (optional) ----
    if preproc_path is not None:
        # np.load returns an NpzFile that keeps the archive open until closed, so it is
        # used as a context manager; arrays are read into memory when indexed.
        # allow_pickle=True is needed for archives whose 'feat_cols' is an object array.
        with np.load(preproc_path, allow_pickle=True) as data:
            preproc = {
                "feat_cols": data["feat_cols"].tolist(),
                "mu": data["mu"],
                "sd": data["sd"],
            }

    return model, preproc


In [8]:
# Load a pickled model and its preprocessing archive via load_qlattice_model.
# NOTE(review): both paths are absolute and specific to one user's machine; the other
# cells in this notebook use relative paths. Consider a configurable directory instead.
model, preproc = load_qlattice_model(
    model_path="/Users/tsemerdz/Projects/FIAM2025/test_models/qlattice_model.pkl",
    preproc_path="/Users/tsemerdz/Projects/FIAM2025/test_models/qlattice_preproc.npz"
)


In [16]:
# Print the symbolic expression of whichever `model` is currently loaded in the kernel.
print(model.sympify())

0.929863*exp((3.02132 - 0.0149948*ocfme)*(0.0576986*beme - 2.35076)) + 0.00613281


In [4]:
# Display the symbolic expression of the currently loaded `model` as the cell result.
model.sympify()

530.703*exp(-2.0*(0.660793*mispricingperf + 0.293699)**2 - 2.0*(-0.0060505*rdme + (-0.0948954*ivolhxz421d - 1.32439)*(0.00720373*seas25na - 1.80438) + 0.981202)**4) + 0.0205116

In [None]:
# Same expression display as the cell above; the result depends on which `model`
# is bound in the kernel when this cell runs.
model.sympify()

In [1]:
import pandas as pd
import numpy as np
from typing import Any, Dict, Optional, Sequence

def predict_with_qlattice(
    model: Any,
    df: pd.DataFrame,
    preproc: Optional[Dict[str, Any]] = None,
    target_col: Optional[str] = None,
    feat_cols: Optional[Sequence[str]] = None,
) -> np.ndarray:
    """
    Robust QLattice inference:
      - Ensures the exact feature columns (from `preproc['feat_cols']` or `feat_cols`).
      - Standardizes using `preproc['mu']`/`preproc['sd']` if provided (leak-safe).
      - Treats feature columns absent from `df` as missing values.
      - Replaces any NaN/±Inf left after scaling with 0.0, i.e. the training mean
        in z-space when `preproc` is given.
      - Tries the input layouts that QLattice models accept.

    Args:
      model: unpickled QLattice model.
      df:    DataFrame with raw features.
      preproc: optional dict with keys 'feat_cols', 'mu', 'sd'.
      target_col: if your model expects [y, X...] format, pass the target name (e.g. "stock_ret").
                  A dummy zero column is added if it's not among the feature columns.
      feat_cols: override list of feature names (used only if preproc is None).

    Returns:
      np.ndarray of predictions (float64); non-finite predictions are replaced by 0.0.

    Raises:
      ValueError: if preproc's 'feat_cols', 'mu' and 'sd' differ in length.
      Exception: whatever `model.predict` raised last, if every input layout fails.
    """
    # ---- Decide feature set ----
    if preproc is not None:
        use_cols = list(preproc["feat_cols"])
        mu = np.asarray(preproc["mu"], dtype=float)
        sd = np.asarray(preproc["sd"], dtype=float)
        if len(use_cols) != len(mu) or len(mu) != len(sd):
            raise ValueError("preproc feat_cols/mu/sd length mismatch.")
        # fix degenerate or non-finite sd/mu
        sd = np.where(~np.isfinite(sd) | (sd == 0.0), 1.0, sd)
        mu = np.where(~np.isfinite(mu), 0.0, mu)
        mu_s = pd.Series(mu, index=use_cols)
        sd_s = pd.Series(sd, index=use_cols)
    else:
        # No preproc: fall back to user-supplied feat_cols or all numeric columns
        if feat_cols is not None:
            use_cols = list(feat_cols)
        else:
            use_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        mu_s = None
        sd_s = None

    # ---- Build X with exact columns & sanitize ----
    # Columns absent from df come back as NaN. They must NOT be filled with raw 0.0
    # before scaling: that would feed the model (0 - mu) / sd, an arbitrary and possibly
    # extreme value, instead of the neutral mean used for missing values elsewhere.
    X = df.reindex(columns=use_cols).astype(float, copy=False)

    # Standardize if we have training stats
    if mu_s is not None and sd_s is not None:
        # vectorized, index-aligned ops
        X = (X - mu_s) / sd_s

    # Replace any residual non-finite values (missing columns included) with 0
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # If model expects [y] + X, make sure the y column exists (zeros; not used at inference)
    if target_col:
        if target_col not in X.columns:
            X = pd.concat([pd.DataFrame({target_col: np.zeros(len(X), dtype=float)}, index=X.index), X], axis=1)

    # ---- Predict robustly ----
    def _layouts():
        """Yield candidate input frames lazily so unused layouts are never materialized."""
        yield X  # current column order
        if target_col and target_col in X.columns:
            others = [c for c in X.columns if c != target_col]
            if X.columns[0] != target_col:
                yield X[[target_col] + others]  # target first
            yield X[others]  # last resort: features only

    last_err: Optional[Exception] = None
    preds = None
    for frame in _layouts():
        try:
            preds = np.asarray(model.predict(frame), dtype=float)
            break
        except Exception as err:
            last_err = err
    if preds is None:
        # Every layout failed; surface the most recent error.
        raise last_err

    # Final sanitize of outputs
    preds = np.where(np.isfinite(preds), preds, 0.0)
    return preds


In [4]:
# Load model + preprocessing (paths are relative to the notebook's working directory),
# then re-read the full parquet dataset into `df`.
# NOTE(review): this rebinds `model`, `preproc`, `PATH` and `df` from earlier cells.
model, preproc = load_qlattice_model("qlattice_model.pkl", "qlattice_preproc.npz")
PATH = "../data/ret_sample.parquet"
df = pd.read_parquet(PATH)

This version of Feyn and the QLattice is available for academic, personal, and non-commercial use. By using the community version of this software you agree to the terms and conditions which can be found at https://abzu.ai/eula.

In [None]:
# Run predictions on the rows with year >= 2017, using the loaded preprocessing stats.
# target_col="stock_ret" makes predict_with_qlattice add a zero-filled placeholder
# column of that name when it is not among the feature columns.
y_pred = predict_with_qlattice(model, df[df["year"] >= 2017], preproc, target_col="stock_ret")

In [None]:
# y_pred[2451122]

np.float64(0.020511564125235738)