In [1]:
import yfinance as yf
import ta
import pandas as pd
import numpy as np
import joblib

from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, ParameterGrid
from sklearn.metrics import mean_squared_error

def rmse(a, b):
    """Return the root-mean-squared error between ``a`` and ``b``.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    square root of that value is returned as a built-in ``float``.
    """
    mse = mean_squared_error(a, b)
    root = np.sqrt(mse)
    return float(root)

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Augment an OHLCV frame with indicator, calendar, lag and rolling features.

    Args:
        df: Frame holding ``Date`` (datetime-like), ``Open``, ``High``,
            ``Low``, ``Close`` and ``Volume`` columns.

    Returns:
        A new frame made of the output of ``ta.add_all_ta_features`` (called
        with ``fillna=True``) followed, in this order, by: ``month``,
        ``dayofweek``, ``return_{1,5,10}``, ``cci``, ``stc``, the
        ``open/close/vol`` lags for 1, 2 and 5 rows, and the 5- and 20-row
        rolling mean/std of ``Close``.

    Note:
        ``stc`` is a 5-row rolling standard deviation of ``Close`` and is
        therefore numerically the same as ``close_std_5``; the column name is
        kept so the produced feature set stays unchanged.
    """
    enriched = ta.add_all_ta_features(
        df,
        open="Open",
        high="High",
        low="Low",
        close="Close",
        volume="Volume",
        fillna=True,
    )

    # Handles on the columns every hand-built feature is derived from.
    dates = enriched["Date"]
    opens = enriched["Open"]
    closes = enriched["Close"]
    volumes = enriched["Volume"]

    derived = pd.DataFrame(index=enriched.index)

    # Calendar features.
    derived["month"] = dates.dt.month
    derived["dayofweek"] = dates.dt.dayofweek

    # Close-to-close percentage changes over several horizons.
    for horizon in (1, 5, 10):
        derived[f"return_{horizon}"] = closes.pct_change(horizon)

    derived["cci"] = ta.trend.cci(
        enriched["High"], enriched["Low"], closes, window=14
    )
    derived["stc"] = closes.rolling(window=5).std()

    # Lagged raw values; columns are grouped per lag (open, close, vol).
    lag_sources = (("open", opens), ("close", closes), ("vol", volumes))
    for lag in (1, 2, 5):
        for prefix, series in lag_sources:
            derived[f"{prefix}_lag_{lag}"] = series.shift(lag)

    # Rolling mean / standard deviation of the close.
    for w in (5, 20):
        window = closes.rolling(w)
        derived[f"close_ma_{w}"] = window.mean()
        derived[f"close_std_{w}"] = window.std()

    combined = pd.concat([enriched, derived], axis=1)
    return combined.copy()

def _select_best_params(X, y, param_grid, model_kwargs, n_splits=3):
    """Return the grid entry with the lowest mean walk-forward validation RMSE.

    Args:
        X: Feature frame, rows in chronological order.
        y: Target series aligned with ``X``.
        param_grid: Mapping accepted by ``ParameterGrid``.
        model_kwargs: Keyword arguments shared by every ``XGBRegressor``.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Raises:
        RuntimeError: If no candidate produced a finite score, so that no
            parameter set could be selected.
    """
    # The folds do not depend on the candidate, so compute them once.
    folds = list(TimeSeriesSplit(n_splits=n_splits).split(X))

    best_params = None
    best_score = float("inf")

    for params in ParameterGrid(param_grid):
        fold_scores = []
        for tr_idx, va_idx in folds:
            model = XGBRegressor(**model_kwargs, **params)
            # No eval_set here: without early stopping it would only add
            # per-round evaluation work and never influence the fitted trees.
            model.fit(X.iloc[tr_idx], y.iloc[tr_idx], verbose=False)
            pred = model.predict(X.iloc[va_idx])
            fold_scores.append(rmse(y.iloc[va_idx], pred))

        score = float(np.mean(fold_scores))
        # A NaN score compares False here and is therefore never selected.
        if score < best_score:
            best_score = score
            best_params = params

    if best_params is None:
        raise RuntimeError(
            "Grid search produced no finite validation score; "
            "cannot select hyper-parameters"
        )
    return best_params


def train_model(ticker="TSM", start="2015-01-01"):
    """Download daily prices, build features and fit an XGBoost regressor.

    The target is the next row's open log-return,
    ``log(Open[t+1]) - log(Open[t])``. Every column produced by
    ``add_features`` except ``Date`` and ``target`` is used as a feature
    (this includes the raw price and volume columns).

    Args:
        ticker: Symbol passed to ``yfinance.download``.
        start: Start date passed to ``yfinance.download``.

    Returns:
        dict with keys ``ticker``, ``start``, ``model`` (the regressor fitted
        on all usable rows), ``feature_cols`` (ordered feature names) and
        ``best_params`` (the selected grid entry).

    Raises:
        ValueError: If the download is empty or no row survives cleaning.
        RuntimeError: If the grid search cannot select any parameter set.
    """
    df = yf.download(ticker, start=start, auto_adjust=False)
    if df is None or df.empty:
        raise ValueError(
            f"No price data returned for ticker {ticker!r} (start={start!r})"
        )

    # Keep only the first column level (the price field when the download
    # comes back with MultiIndex columns).
    df.columns = df.columns.get_level_values(0)
    df = df.reset_index()

    df = add_features(df)

    # target: next-day open log-return
    df["target"] = np.log(df["Open"].shift(-1)) - np.log(df["Open"])

    # dropna() alone leaves +/-inf in place (e.g. log(0) for a zero open or a
    # percentage change from zero); convert those to NaN so such rows are
    # dropped together with the warm-up rows and the final target-less row.
    df = df.replace([np.inf, -np.inf], np.nan).dropna().copy()
    if df.empty:
        raise ValueError(
            f"No usable rows left for ticker {ticker!r} after feature cleaning"
        )

    feature_cols = [c for c in df.columns if c not in ["Date", "target"]]
    X = df[feature_cols]
    y = df["target"]

    # Small grid: 8 parameter combinations x 3 time-series folds = 24 fits.
    param_grid = {
        "n_estimators": [400, 800],
        "max_depth": [3, 4],
        "learning_rate": [0.05],
        "subsample": [0.8],
        "colsample_bytree": [0.9],
        "min_child_weight": [3],
        "reg_lambda": [3.0],
        "reg_alpha": [0.0, 0.1],
    }

    # Settings shared by the cross-validation models and the final model.
    model_kwargs = {
        "objective": "reg:squarederror",
        "random_state": 42,
        "tree_method": "hist",
        "n_jobs": -1,
    }

    best_params = _select_best_params(X, y, param_grid, model_kwargs, n_splits=3)

    # Refit on every usable row with the selected hyper-parameters.
    final_model = XGBRegressor(**model_kwargs, **best_params)
    final_model.fit(X, y, verbose=False)

    bundle = {
        "ticker": ticker,
        "start": start,
        "model": final_model,
        "feature_cols": feature_cols,
        "best_params": best_params,
    }
    return bundle

if __name__ == "__main__":
    # Script entry point: train on TSM daily data starting 2015-01-01.
    bundle = train_model("TSM", "2015-01-01")
    # Persist the whole bundle dict (model, feature column names, selected
    # hyper-parameters, ticker and start date) with joblib; the relative path
    # resolves against the current working directory.
    joblib.dump(bundle, "tsm_open_model.joblib")
    print("Saved -> tsm_open_model.joblib")
    # Short summary of what was trained.
    print("Best params:", bundle["best_params"])
    print("Feature cols:", len(bundle["feature_cols"]))


[*********************100%***********************]  1 of 1 completed


Saved -> tsm_open_model.joblib
Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 400, 'reg_alpha': 0.1, 'reg_lambda': 3.0, 'subsample': 0.8}
Feature cols: 112
