In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import time 
import joblib
import json 

from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

sns.set(context="notebook", style="whitegrid")
RNG = 42

# ----- link transform to keep predictions in [-1, 1] -----
_EPS = 1e-6
def y_to_z(y):
    y_ = np.clip(y, -1 + _EPS, 1 - _EPS)
    return np.arctanh(y_)
def z_to_y(z):
    return np.clip(np.tanh(z), -1.0, 1.0)

def report_metrics(y_true, y_pred, label=""):
    rmse = root_mean_squared_error(y_true, y_pred)
    mae  = mean_absolute_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)
    evs  = explained_variance_score(y_true, y_pred)
    print(f"{label}RMSE={rmse:.4f} | MAE={mae:.4f} | R^2={r2:.4f} | EVS={evs:.4f}")
    return dict(rmse=rmse, mae=mae, r2=r2, evs=evs)

def concat_years(start_year: int, end_year: int, base_dir: str):
    """
    Concatenate yearly parquet files into a single DataFrame.
    
    Parameters
    ----------
    start_year : int
        First year (inclusive).
    end_year : int
        Last year (inclusive).
    base_dir : str
        Path to the directory containing parquet files.
    
    Returns
    -------
    pd.DataFrame
        Combined DataFrame of all years.
    """
    dfs = []
    for year in range(start_year, end_year + 1):
        path = os.path.join(base_dir, f"{year}_mgmt_training_10K.parquet")
        if os.path.exists(path):
            year_df = pd.read_parquet(path, engine="fastparquet")
            dfs.append(year_df)
        else:
            print(f"⚠️ Skipping missing file: {path}")
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()


In [2]:
df = concat_years(2005, 2013, "../../data/training_data_surprise_model")

# Ensure "date" is string and extract the year
df["year"] = df["date"].astype(str).str[:4].astype(int)

# Training: 2005–2011
train_df = df[df["year"].between(2005, 2011)]

# Testing: 2012–2023
test_df  = df[df["year"].between(2012, 2012)]

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)

# Build feature/target arrays
X_train = np.vstack(train_df["mgmt_embedding"].values).astype(np.float32)
y_train = train_df["ni_be"].to_numpy(np.float32)

X_test  = np.vstack(test_df["mgmt_embedding"].values).astype(np.float32)
y_test  = test_df["ni_be"].to_numpy(np.float32)

# Enforce bounds + latent transform
y_train = np.clip(y_train, -1, 1)
y_test  = np.clip(y_test, -1, 1)

z_train = y_to_z(y_train)
z_test  = y_to_z(y_test)

X_train.shape, X_test.shape


Train shape: (24077, 6)
Test shape : (3174, 6)


((24077, 768), (3174, 768))

In [10]:
# ----- fixed, reasonable defaults (fast + good for embeddings) -----
N_COMPONENTS = 96
ALPHA        = 5.0         # ridge strength
GAMMA        = 0.05        # RBF smoothness

# Pipeline: (optional) Standardize -> PCA -> KernelRidge(RBF)
# Standardization helps PCA; embeddings are often already scaled, but this is safe.
base_pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),  # with_mean=False for sparse safety; ok for dense too
    ("pca", PCA(random_state=RNG, n_components=N_COMPONENTS)),
    ("krr", KernelRidge(kernel="rbf", alpha=ALPHA, gamma=GAMMA))
])

# Wrap so we train on atanh(y) but score/predict on bounded y
model = TransformedTargetRegressor(
    regressor=base_pipe,
    func=y_to_z,
    inverse_func=z_to_y
)


In [5]:
y_train = np.clip(y_train, -1, 1)
y_test  = np.clip(y_test,  -1, 1)

model.fit(X_train, y_train, verbose=1)



0,1,2
,regressor,Pipeline(step...rnel='rbf'))])
,transformer,
,func,<function y_t...t 0x13ca420c0>
,inverse_func,<function z_t...t 0x13ca42160>
,check_inverse,True

0,1,2
,copy,True
,with_mean,False
,with_std,True

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42

0,1,2
,alpha,1
,kernel,'rbf'
,gamma,
,degree,3
,coef0,1
,kernel_params,


In [9]:
y_tr_pred = model.predict(X_train)
y_te_pred = model.predict(X_test)
train_metrics = report_metrics(y_train, y_tr_pred)
test_metrics  = report_metrics(y_test,  y_te_pred)
print("Train:", train_metrics)
print("Test :", test_metrics)

# ----- save artifacts -----
os.makedirs("artifacts", exist_ok=True)
ts = time.strftime("%Y%m%d-%H%M%S")
model_path = f"artifacts/pca_krr_bounded_{ts}.joblib"
meta_path  = f"artifacts/pca_krr_bounded_{ts}.json"

joblib.dump(model, model_path)
with open(meta_path, "w") as f:
    json.dump({
        "timestamp": ts,
        "model_file": os.path.basename(model_path),
        "params": {"n_components": N_COMPONENTS, "alpha": ALPHA, "gamma": GAMMA},
        "train_metrics": train_metrics,
        "test_metrics": test_metrics,
        "notes": "PCA->KernelRidge(RBF) wrapped in TransformedTargetRegressor to bound outputs in [-1,1]"
    }, f, indent=2)

print(f"\nSaved model → {model_path}")
print(f"Saved meta  → {meta_path}")

RMSE=0.3664 | MAE=0.2806 | R^2=-0.2444 | EVS=-0.0266
RMSE=0.4473 | MAE=0.3419 | R^2=-1.1662 | EVS=-0.7668
Train: {'rmse': 0.3664322793483734, 'mae': 0.2806130051612854, 'r2': -0.24443936347961426, 'evs': -0.02658998966217041}
Test : {'rmse': 0.44730281829833984, 'mae': 0.3419446349143982, 'r2': -1.1661734580993652, 'evs': -0.7667900323867798}


PicklingError: Can't pickle <function y_to_z at 0x13ca420c0>: it's not the same object as __main__.y_to_z