In [20]:
import pandas as pd
from DBConnection import get_engine

from FeatureEngineering import load_home_game_data,build_model_dataframe

from pipeline_code.mainMLM import run_ml_pipeline
from pipeline_code.DataPreprocessor import DataPreprocessor
from pipeline_code.ModelMetricDisplays import display_predictions_vs_actuals
from Backtest import backtest_betting_strategy
from DisplayTodaysPicks import display_todays_picks


# -----------------------------
# CONFIG
# -----------------------------
# Label column: passed to build_model_dataframe as `target` and to the
# ML pipeline as `target_column`.
TARGET_COLUMN = "homeCover"

# Stats and look-back windows handed to build_model_dataframe.
ROLLING_STATS = ["Pace", "OrbPct", "PointsOffTO", "Rating"]
ROLL_WINDOWS = [3]

# Columns selected as model inputs for both training and prediction.
# Keep the order stable: the same list is used to slice the frame that is
# fed to an already-fitted model.
FEATURES = [
    "restDiff",
    "orbpct_diff_l3",
    "pace_diff_l3",
    "rating_diff_l3",
    "pointsoffto_diff_l3",
    "homeSpread",
]

# Seed forwarded to run_ml_pipeline as `random_seed`.
RANDOM_SEED = 42

In [21]:
# -----------------------------
# LOAD + BUILD DATA
# -----------------------------
engine = get_engine()

raw_games = load_home_game_data(engine)

df_model = build_model_dataframe(
    game_df=raw_games,
    rolling_stats=ROLLING_STATS,
    windows=ROLL_WINDOWS,
    target=TARGET_COLUMN,
)

# Split future vs historical on the current UTC instant.
# Timestamp.utcnow() is deprecated in pandas >= 2.2; Timestamp.now(tz="UTC")
# yields the same timezone-aware value.
# NOTE(review): assumes df_model["startDate"] is tz-aware — confirm.
now_utc = pd.Timestamp.now(tz="UTC")

# To treat the most recent games as "future" (e.g. for a dry run), move the
# cutoff back on both comparisons: now_utc - pd.Timedelta(days=14).
df_future = df_model[df_model["startDate"] >= now_utc]

# Historical rows only, restricted to rows that have homePoints and the first
# feature populated, with the label cast to int.
# Built as a single chain so the cast never assigns into a filtered slice,
# and keyed on TARGET_COLUMN so the label name is defined in one place.
df_train = (
    df_model[df_model["startDate"] < now_utc]
    .dropna(subset=["homePoints", FEATURES[0]])
    .assign(**{TARGET_COLUMN: lambda played: played[TARGET_COLUMN].astype(int)})
)

In [22]:
# -----------------------------
# TRAIN MODEL
# -----------------------------
preprocessor = DataPreprocessor()

# Training frame: model inputs, the timestamp column used for the
# time-based split, and the label.
train_columns = FEATURES + ["startDate", TARGET_COLUMN]
train_df = df_train[train_columns]

# Everything that tunes the run lives in one mapping so it is easy to scan.
pipeline_settings = dict(
    target_column=TARGET_COLUMN,
    model_type="xgb",
    random_seed=RANDOM_SEED,
    metric="precision",
    n_trials=100,
    test_size=0.2,
    show_plots=False,
    split_type="time",
    time_col="startDate",
)

model = run_ml_pipeline(
    df=train_df,
    preprocessor=preprocessor,
    **pipeline_settings,
)


=== Training XGBoost Model ===

Problem type: CLASSIFICATION
Split type used: time
Train rows: 19276
Test rows: 4819

Optimizing hyperparameters...
Optimizing for metric: precision
Optimization direction: maximize
Pruning enabled: True
✅ Trial #0 completed
✅ Trial #1 completed
✅ Trial #2 completed
✅ Trial #3 completed
✅ Trial #4 completed
🔴 PRUNING Trial #5
✅ Trial #6 completed
🔴 PRUNING Trial #7
✅ Trial #8 completed
🔴 PRUNING Trial #9
🔴 PRUNING Trial #10
🔴 PRUNING Trial #11
🔴 PRUNING Trial #12
✅ Trial #13 completed
🔴 PRUNING Trial #14
🔴 PRUNING Trial #15
🔴 PRUNING Trial #16
✅ Trial #17 completed
🔴 PRUNING Trial #18
🔴 PRUNING Trial #19
🔴 PRUNING Trial #20
🔴 PRUNING Trial #21
🔴 PRUNING Trial #22
🔴 PRUNING Trial #23
✅ Trial #24 completed
🔴 PRUNING Trial #25
🔴 PRUNING Trial #26
🔴 PRUNING Trial #27
🔴 PRUNING Trial #28
🔴 PRUNING Trial #29
🔴 PRUNING Trial #30
🔴 PRUNING Trial #31
🔴 PRUNING Trial #32
🔴

In [27]:
from Backtest import optimize_threshold_range
from pipeline_code.PickleModels import load_model

# Earlier artifact, kept for reference: saved_models/HomeSpreadCover_XGBoost.pkl
model = load_model("saved_models/HomeSpreadCover_XGBoostv2.pkl")

# -----------------------------
# EVALUATION
# -----------------------------

# Test-split features with the true label, predicted label and predicted
# probability (all read from the loaded model object) attached as columns.
test_df = model.X_test.assign(
    actual=model.y_test.values,
    pred=model.y_test_pred,
    proba=model.y_test_proba,
)

# Pricing shared by the threshold search and the backtest so the two
# cannot drift apart.
bet_terms = {"odds": -110, "stake": 1.0}

# Search lower/upper probability cut-offs, ranking configurations by
# ROI per bet and requiring at least 200 bets.
optimization_results = optimize_threshold_range(
    test_df,
    lower_range=(0.0, 0.5),
    upper_range=(0.5, 1.0),
    step=0.001,
    min_bets=200,
    optimize_metric="roi_per_bet",
    **bet_terms,
)

# Winning configuration from the search.
best = optimization_results['best_config']

# Re-run the backtest at the winning thresholds.
summary, bets = backtest_betting_strategy(
    test_df,
    lower_threshold=best['lower_threshold'],
    upper_threshold=best['upper_threshold'],
    **bet_terms,
)

# Report the metrics stored on the winning configuration.
print(f"\nLower Threshold: {100*best['lower_threshold']:.1f}%")
print(f"Upper Threshold: {100*best['upper_threshold']:.1f}%")

print(f"\nTotal Bets: {best['bets']:.0f}")
print(f"Win rate: {100 * best['win_rate']:.2f}%")
print(f"Units: {best['total_units']:.2f}")
print(f"ROI Per Bet: {best['roi_per_bet']:.4f}")
print(f"Sharpe Ratio: {best['sharpe']:.4f}")
print(f"Max Drawdown: {best['max_drawdown']:.2f}")

Model loaded successfully from saved_models/HomeSpreadCover_XGBoostv2.pkl

Lower Threshold: 42.3%
Upper Threshold: 53.0%

Total Bets: 206
Win rate: 61.17%
Units: 34.55
ROI Per Bet: 0.1677
Sharpe Ratio: 0.1798
Max Drawdown: -5.00


In [28]:
from DisplayTodaysPicks import display_todays_picks

from Backtest import optimize_threshold_range
from pipeline_code.PickleModels import load_model

# Earlier artifact, kept for reference: saved_models/HomeSpreadCover_XGBoost.pkl
model = load_model("saved_models/HomeSpreadCover_XGBoostv2.pkl")

# -----------------------------
# PREDICT UPCOMING GAMES
# -----------------------------
# Requires `best` (the winning threshold configuration) from the evaluation
# cell and `df_future` / `engine` from the data-loading cell.

X_pred = df_future[FEATURES]

# NOTE(review): the concat below calls .reset_index on the result, so
# model.predict is expected to return a pandas object — confirm.
preds = model.predict(X_pred)

# Both sides are re-indexed 0..n-1 so the column-wise concat pairs rows by
# position instead of by df_future's original index labels.
df_out = pd.concat(
    [df_future.reset_index(drop=True), preds.reset_index(drop=True)],
    axis=1,
)

picks_df = display_todays_picks(
    df_out,
    best['lower_threshold'],
    best['upper_threshold'],
    engine,
)

Model loaded successfully from saved_models/HomeSpreadCover_XGBoostv2.pkl
No games met tail criteria.


In [29]:
# Show the value returned by display_todays_picks; Jupyter renders nothing
# for this cell when that value is None.
picks_df

In [9]:
from DBConnection import upsert_via_staging

# Persist the picks only when there is something to write. The None check
# keeps the cell from raising AttributeError on days when no picks frame was
# produced. NOTE(review): display_todays_picks appears to return None when no
# game qualifies — confirm.
if picks_df is not None and not picks_df.empty:

    pks = ["gameId"]
    # Key and audit-timestamp columns are not treated as data columns.
    # NOTE(review): presumably insert_date/update_date are maintained by the
    # upsert itself — confirm against upsert_via_staging.
    non_data_columns = {*pks, "insert_date", "update_date"}
    data_columns = [c for c in picks_df.columns if c not in non_data_columns]

    # NOTE(review): dry_run=False presumably performs a real write to
    # CBB.SpreadModelPicks — confirm before re-running.
    upsert_via_staging(
        df              = picks_df,
        table_name      = "SpreadModelPicks",
        primary_keys    = pks,
        data_columns    = data_columns,
        engine          = engine,
        schema          = 'CBB',
        dry_run         = False
    )