Regression Models

In [64]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

In [65]:
# import regression data (feature frames and target frames for both splits)
# NOTE(review): 'x_test_reg.csv' is lower-case while the training features
# file is 'X_train_reg.csv' — confirm the on-disk name; the difference
# matters on case-sensitive filesystems.
X_test = pd.read_csv('x_test_reg.csv')
X_train = pd.read_csv('X_train_reg.csv')
y_test_or = pd.read_csv('y_test_reg.csv')
y_train_or = pd.read_csv('y_train_reg.csv')

# The log-return conversion pairs target rows with feature rows by position,
# so each split must have the same number of rows in X and y.
assert len(X_train) == len(y_train_or), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test_or), "X_test / y_test row counts differ"

# set predictor indices
idx_tr = X_train.index
idx_te = X_test.index

In [66]:
# convert prices to log returns
cols = ['P+1','P+7','P+14']


def _log_returns(future_prices, base_prices, horizons, index):
    """Build a frame of log(P_h / P0), one column per horizon.

    Both operands are converted to NumPy arrays before dividing, so rows are
    paired by position rather than by index label; ``index`` is only attached
    to the resulting frame.
    """
    p0 = base_prices.to_numpy()
    returns = {}
    for h in horizons:
        returns[h] = np.log(future_prices[h].to_numpy() / p0)
    return pd.DataFrame(returns, index=index)


y_train = _log_returns(y_train_or, X_train['P0'], cols, idx_tr)
y_test = _log_returns(y_test_or, X_test['P0'], cols, idx_te)

In [71]:
# define models
# Each candidate is a Pipeline whose first step median-imputes missing
# feature values; only the Ridge pipeline also standardises the features
# before the estimator.

ridge = Pipeline([
    ("imp",   SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("mdl",   Ridge(alpha=1.0, random_state=0))
])

hgb = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("mdl", HistGradientBoostingRegressor(
        max_depth=4, learning_rate=0.05, max_iter=500, random_state=0))
])

xgb = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("mdl", XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1))
])

# (display name, estimator) pairs consumed by the evaluation loop
model_list = [("Ridge", ridge), ("HistGB", hgb), ("XGBoost", xgb)]

# -------------------------------------------------------
# 2) Train/evaluate per horizon
# -------------------------------------------------------
def eval_one(y_tr, y_te, name, est, idx_tr, idx_te):
    """Fit ``est`` on the training rows and score it on the test rows.

    Features are read from the notebook-level ``X_train`` / ``X_test`` frames.
    Features and targets are both selected with the same index labels, so the
    rows stay paired (and the returned predictions are ordered like
    ``idx_te``) even if an index is a subset or re-ordering of the full frame.

    Parameters
    ----------
    y_tr, y_te : label-indexable targets (``.loc`` is used) for train / test.
    name : label stored under the ``"name"`` key of the metrics dict.
    est : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    idx_tr, idx_te : index labels selecting the training / test rows.

    Returns
    -------
    (dict, predictions)
        The dict has keys ``"name"``, ``"RMSE"``, ``"MAE"`` and ``"R2"``;
        ``predictions`` is whatever ``est.predict`` returns for the test rows.
    """
    X_fit = X_train.loc[idx_tr]
    X_eval = X_test.loc[idx_te]
    y_true = y_te.loc[idx_te]  # selected once, reused by all three metrics

    est.fit(X_fit, y_tr.loc[idx_tr])
    yhat = est.predict(X_eval)

    metrics = {
        "name": name,
        "RMSE": root_mean_squared_error(y_true, yhat),
        "MAE":  mean_absolute_error(y_true, yhat),
        "R2":   r2_score(y_true, yhat)
    }
    return metrics, yhat

rows, preds_store = [], {}
# Iterate over the target columns themselves so the horizons evaluated always
# match the ones present in y_train (no second hard-coded list to keep in sync).
for horizon in y_train.columns:
    for name, est in model_list:
        # The same estimator object is re-fitted for every horizon, so after
        # the loop each pipeline holds only the fit for the last horizon.
        row, yhat = eval_one(y_train[horizon], y_test[horizon], name, est, idx_tr, idx_te)
        row["timeframe"] = horizon
        rows.append(row)

        # (optional) predicted **prices** for this horizon
        # Phat = P0 * exp(rhat)
        preds_store[(name, horizon)] = pd.Series(
            X_test.loc[idx_te, 'P0'].to_numpy() * np.exp(yhat),
            index=idx_te, name=f'Phat{name}{horizon}'
        )

# One row per (model, horizon), grouped by model and ordered by RMSE within
# each model. A single sort is enough: an earlier sort on other keys would be
# overridden by this one.
results_reg = (
    pd.DataFrame(rows)
    .sort_values(['name', 'RMSE'])
    .reset_index(drop=True)
)
print(results_reg)

# One column of predicted prices per (model, horizon) pair
preds_price = pd.concat(preds_store.values(), axis=1)

      name      RMSE       MAE        R2 timeframe
0   HistGB  0.065371  0.046393  0.013361       P+1
1   HistGB  0.118662  0.082453  0.074639       P+7
2   HistGB  0.197167  0.121960  0.115558      P+14
3    Ridge  0.065727  0.044886  0.002583       P+1
4    Ridge  0.119957  0.082501  0.054329       P+7
5    Ridge  0.206520  0.116462  0.029654      P+14
6  XGBoost  0.067264  0.046912 -0.044596       P+1
7  XGBoost  0.118770  0.083388  0.072958       P+7
8  XGBoost  0.196567  0.119951  0.120930      P+14
