In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from joblib import dump
import pathlib

# --- run configuration ------------------------------------------------------
RNG = 42        # seed passed to both the KFold splitter and CatBoost
N_FOLDS = 10    # number of cross-validation folds
FEAT_FILE = "./output/umich_train_features_clean.csv"  # input feature table
MODEL_FILE = "./models/umich_catboost.cbm"             # final model output
OOF_FILE = "./models/umich_oof.npy"                    # out-of-fold predictions

# Create the output folders up front (no error if they already exist).
for _out_dir in ("models", "predictions"):
    pathlib.Path(_out_dir).mkdir(exist_ok=True)

# Read the feature table and split it into the target vector and the
# feature matrix.
df = pd.read_csv(FEAT_FILE)

# Columns that are not model inputs: three descriptive columns (presumably
# sample identifiers -- confirm against the feature-building step) plus the
# label column itself. Every remaining column becomes a feature.
_non_feature_cols = ["Dataset", "Mixture 1", "Mixture 2", "Experimental Values"]

y = df["Experimental Values"].values.astype(np.float32)
X = df.drop(columns=_non_feature_cols).values

print(f"> training matrix  : {X.shape}")
print(f"> target mean/std  : {y.mean():.3f} / {y.std():.3f}")

# CatBoost hyper-parameters; the same settings are used for every CV fold
# and for the final model trained on all rows.
params = {
    "iterations": 400,
    "depth": 6,
    "learning_rate": 0.03,
    "l2_leaf_reg": 3,
    "loss_function": "RMSE",
    "random_seed": RNG,
    "verbose": 0,
}

# Shuffled K-fold cross-validation. Each row is predicted exactly once by a
# model that never saw it, giving out-of-fold (OOF) predictions plus per-fold
# Pearson correlation and RMSE.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RNG)
oof_pred = np.zeros_like(y, dtype=np.float32)
pearsons = []
rmses = []

for fold, (tr, val) in enumerate(kf.split(X, y), 1):
    model = CatBoostRegressor(**params)
    # Fit on the training rows only. The held-out fold is deliberately NOT
    # passed as eval_set: when an eval_set is supplied CatBoost turns on
    # use_best_model by default and keeps the number of trees that scores
    # best on that set. Selecting the model on the same rows it is then
    # scored on leaks validation information and inflates the CV metrics;
    # it would also make the folds differ from the final model, which is
    # trained for the full iteration count with no eval set.
    model.fit(X[tr], y[tr])
    pred = model.predict(X[val])

    # Store predictions at the original row positions of the held-out fold.
    oof_pred[val] = pred
    rmses.append(np.sqrt(mean_squared_error(y[val], pred)))
    pearsons.append(pearsonr(y[val], pred)[0])

    print(f"fold {fold:2d}: Pearson={pearsons[-1]:.3f}  RMSE={rmses[-1]:.3f}")

print(f"\n>> CV mean Pearson = {np.mean(pearsons):.3f} ± {np.std(pearsons):.3f}")
print(f">> CV mean RMSE    = {np.mean(rmses):.3f}")

# Persist the OOF predictions (row order matches the feature file).
np.save(OOF_FILE, oof_pred)

# Refit on every available row with the same hyper-parameters, then write
# the trained model to disk.
final_model = CatBoostRegressor(**params).fit(X, y)
final_model.save_model(MODEL_FILE)
print(f"saved final CatBoost model → {MODEL_FILE}")

> training matrix  : (500, 138)
> target mean/std  : 0.568 / 0.157
fold  1: Pearson=0.642  RMSE=0.115
fold  2: Pearson=0.795  RMSE=0.117
fold  3: Pearson=0.439  RMSE=0.125
fold  4: Pearson=0.577  RMSE=0.140
fold  5: Pearson=0.617  RMSE=0.123
fold  6: Pearson=0.388  RMSE=0.140
fold  7: Pearson=0.571  RMSE=0.150
fold  8: Pearson=0.511  RMSE=0.120
fold  9: Pearson=0.614  RMSE=0.139
fold 10: Pearson=0.659  RMSE=0.111

>> CV mean Pearson = 0.581 ± 0.110
>> CV mean RMSE    = 0.128
saved final CatBoost model → ./models/umich_catboost.cbm
