In [1]:
import setuptools.dist
import polars as pl
import numpy as np
from scipy.stats import spearmanr
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import re
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
import optuna
import tensorflow as tf 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Input
import random
import matplotlib.pyplot as plt
import seaborn as sns
import joblib




  from .autonotebook import tqdm as notebook_tqdm
2025-03-17 18:09:26.671405: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Notebook-wide settings: RNG seed and the share of rows used for training.
SEED = 42
TRAIN_SIZE = 0.7

# Keep rows flagged as NCAA tournament games, plus every row from the 2025 season.
is_tourney_game = pl.col("NCAATourneyFlag_A") == 1
is_2025_season = pl.col("Season") == 2025
data = pl.read_parquet("model_data.parquet").filter(is_tourney_game | is_2025_season)

## Target variable
- WinFlag_A

## Categorical variables (one hot encode)
- GameLocation
- TeamConf (A & B)
- CoachName (A & B)
- TeamID (A & B)
- Season

## Categorical variables (leave alone)
- RegSeasonFlag_A
- ConfTourneyFlag_A
- NCAATourneyFlag_A

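## Numeric variables (standardize)
- ActiveTourneyWins_School (A & B)
- ActiveTourneyWins_Coach (A & B)
- NCAATourneySeed (A & B)
- ActiveAPRank (A & B)
- ActivePOMRank (A & B)
- ActiveNETRank (A & B)
- SeasonBestAPRank (A & B)
- SeasonBestPOMRank (A & B)
- All averaged metrics that are not already on a zero-to-one scale (Self & Opponent, Overall & Last5, A & B)

**Note:** the preprocessing cells below currently standardize only the columns whose names contain `Avg`; the tournament-wins, seed, and rank columns listed above are passed to the one-hot encoder instead.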

## Numeric variables (leave alone)
- RollingWinPct (Self & Opponent, Overall & Last5, A & B)
- RollingP5WinPctOverall (Self & Opponent, A & B)
- RollingNP5WinPctOverall (Self & Opponent, A & B)
- FreeThrowPct (Self & Opponent, Overall & Last5, A & B)
- ThreePtPct (Self & Opponent, Overall & Last5, A & B)
- FieldGoalPct (Self & Opponent, Overall & Last5, A & B)




In [3]:
# Split into train/test sets, stratifying on the NCAA-tournament flag.
X, y = data.drop(["WinFlag_A"]), data["WinFlag_A"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=TRAIN_SIZE, random_state=SEED, stratify=X['NCAATourneyFlag_A']
)

# Per-row training weights: rows from the 2025 season get weight 2, all others weight 1.
train_weights = X_train.select(
    pl.when(pl.col("Season") == 2025).then(pl.lit(2)).otherwise(pl.lit(1)).alias("train_weights")
)['train_weights'].to_numpy()

# Columns fed to the one-hot encoder. Defined once so the train and test
# selections cannot drift apart.
ONE_HOT_COLS = [
    'GameLocation_A',
    'TeamConf_A', 'TeamConf_B',
    'CoachName_A', 'CoachName_B',
    'TeamID_A', 'TeamID_B',
    'Season',
    'NCAATourneySeed_A', 'NCAATourneySeed_B',
    'ActiveTourneyWins_School_A', 'ActiveTourneyWins_School_B',
    'ActiveTourneyWins_Coach_A', 'ActiveTourneyWins_Coach_B',
    'ActiveAPRank_A', 'ActivePOMRank_A', 'ActiveNETRank_A', 'SeasonBestAPRank_A', 'SeasonBestPOMRank_A',
    'ActiveAPRank_B', 'ActivePOMRank_B', 'ActiveNETRank_B', 'SeasonBestAPRank_B', 'SeasonBestPOMRank_B',
]

# Flag columns that are passed through to the model unchanged.
FLAG_COLS = ["RegSeasonFlag_A", "ConfTourneyFlag_A", "NCAATourneyFlag_A"]

# The encoder is fitted on the training rows only; handle_unknown='ignore' lets
# the test transform proceed when it meets a category not seen during fitting.
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_1 = enc.fit_transform(X_train.select(ONE_HOT_COLS))
X_test_1 = enc.transform(X_test.select(ONE_HOT_COLS))

X_train_2 = X_train.select(FLAG_COLS).to_numpy()
X_test_2 = X_test.select(FLAG_COLS).to_numpy()

# Categorical feature matrix: one-hot block followed by the untouched flags.
X_train_categorical = np.hstack((X_train_1, X_train_2))
X_test_categorical = np.hstack((X_test_1, X_test_2))




In [5]:
scaler = StandardScaler()

# Standardize every column whose name contains "Avg"; the scaler is fitted on
# the training rows only and then applied to the test rows.
scaler_cols = [col for col in X_train.columns if re.match(r'.*Avg.*', col) is not None]

X_train_3 = scaler.fit_transform(X_train.select(scaler_cols))
X_test_3 = scaler.transform(X_test.select(scaler_cols))

# Win-percentage and shooting-percentage columns are passed through unscaled.
# The same base names exist for team A and team B, so the list is generated
# (all "_A" columns first, then all "_B" columns) instead of being typed twice.
PCT_BASE_COLS = [
    "RollingWinPct_Overall", "RollingWinPct_Last5",
    "RollingP5WinPct_Overall", "RollingNP5WinPct_Overall",
    "FreeThrowPct_Overall", "FreeThrowPct_Last5",
    "OppFreeThrowPct_Overall", "OppFreeThrowPct_Last5",
    "FieldGoalPct_Overall", "FieldGoalPct_Last5",
    "OppFieldGoalPct_Overall", "OppFieldGoalPct_Last5",
    "ThreePtPct_Overall", "ThreePtPct_Last5",
    "OppThreePtPct_Overall", "OppThreePtPct_Last5",
]
pct_cols = [f"{base}_{team}" for team in ("A", "B") for base in PCT_BASE_COLS]

X_train_4 = X_train.select(pct_cols).to_numpy()
X_test_4 = X_test.select(pct_cols).to_numpy()

# Numeric feature matrix: standardized block followed by the unscaled percentages.
X_train_numeric = np.hstack((X_train_3, X_train_4))
X_test_numeric = np.hstack((X_test_3, X_test_4))

In [6]:
# Model-ready inputs: categorical columns first, numeric columns appended to their right.
X_train_final = np.concatenate([X_train_categorical, X_train_numeric], axis=1)
X_test_final = np.concatenate([X_test_categorical, X_test_numeric], axis=1)

In [7]:
# Report the shapes of the train/test inputs and targets, one per line.
print(
    f"Input variable dataset shape (train): {X_train_final.shape}",
    f"Target variable shape (train): {y_train.shape}",
    f"Input variable dataset shape (test): {X_test_final.shape}",
    f"Target variable shape (test): {y_test.shape}",
    sep="\n",
)


Input variable dataset shape (train): (4219, 4474)
Target variable shape (train): (4219,)
Input variable dataset shape (test): (1809, 4474)
Target variable shape (test): (1809,)


In [8]:
# Logistic regression: l2 penalty
def logistic_objective(trial):
    """Optuna objective for L2-penalised logistic regression.

    Samples ``C`` log-uniformly from [1e-5, 1e2] and a solver from
    ``lbfgs``/``liblinear``, fits on ``X_train_final``/``y_train``, and returns
    the mean squared error between ``y_test`` and column 1 of
    ``predict_proba(X_test_final)``.

    NOTE(review): the score is computed on the test split, so the best value
    reported by the study is an optimistic estimate.
    """
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)
    solver = trial.suggest_categorical('solver', ['lbfgs','liblinear'])
    params = {'random_state':SEED, 'penalty': 'l2', 'C':C, 'solver':solver, 'max_iter':1000}
    # liblinear does not use n_jobs and scikit-learn warns when it is set to
    # more than one core, so only request parallelism for the other solver.
    if solver != 'liblinear':
        params['n_jobs'] = -1

    mod = LogisticRegression(**params)
    mod.fit(X = X_train_final, y = y_train)

    y_pred = mod.predict_proba(X_test_final)[:,1]
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Seed the sampler so the sequence of suggested hyperparameters is repeatable across runs.
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(logistic_objective, n_trials = 100)

print(study.best_params)
print(study.best_value)


[I 2025-03-17 18:10:20,838] A new study created in memory with name: no-name-1ca58c46-ef82-4ce9-9cf9-39fc6389c652
[I 2025-03-17 18:10:21,105] Trial 0 finished with value: 0.2340264937111996 and parameters: {'C': 6.558900787768207e-05, 'solver': 'liblinear'}. Best is trial 0 with value: 0.2340264937111996.
[I 2025-03-17 18:10:21,797] Trial 1 finished with value: 0.29356114613766887 and parameters: {'C': 15.99917553970912, 'solver': 'liblinear'}. Best is trial 0 with value: 0.2340264937111996.
[I 2025-03-17 18:10:34,218] Trial 2 finished with value: 0.31823083820558684 and parameters: {'C': 93.12330260039107, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.2340264937111996.
[I 2025-03-17 18:10:34,353] Trial 3 finished with value: 0.2261280017271386 and parameters: {'C': 0.00013969215862233891, 'solver': 'liblinear'}. Best is trial 3 with value: 0.2261280017271386.
[I 2025-03-17 18:10:42,087] Trial 4 finished with value: 0.30688801690187006 and parameters: {'C': 36.60242682072474, 'solv

{'C': 0.0216474637891819, 'solver': 'lbfgs'}
0.18750008742374427


In [None]:
# Logistic regression: l1 penalty
def logistic_objective_2(trial):
    """Optuna objective for L1-penalised logistic regression.

    Samples ``C`` log-uniformly from [1e-5, 1e2] and a solver from
    ``liblinear``/``saga``, fits on ``X_train_final``/``y_train``, and returns
    the mean squared error between ``y_test`` and column 1 of
    ``predict_proba(X_test_final)``.

    NOTE(review): the score is computed on the test split, so the best value
    reported by the study is an optimistic estimate.
    """
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear','saga'])
    params = {'random_state':SEED, 'penalty': 'l1', 'C':C, 'solver':solver, 'max_iter':1000}
    # liblinear does not use n_jobs and scikit-learn warns when it is set to
    # more than one core, so only request parallelism for the other solver.
    if solver != 'liblinear':
        params['n_jobs'] = -1

    mod = LogisticRegression(**params)
    mod.fit(X = X_train_final, y = y_train)

    y_pred = mod.predict_proba(X_test_final)[:,1]
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Seed the sampler so the sequence of suggested hyperparameters is repeatable across runs.
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(logistic_objective_2, n_trials = 100)

print(study.best_params)
print(study.best_value)



In [None]:
# Logistic regression: elastic-net penalty (mix of l1 and l2)
def logistic_objective_3(trial):
    """Optuna objective for elastic-net logistic regression (saga solver).

    Samples ``C`` log-uniformly from [1e-5, 1e2] and ``l1_ratio`` from [0, 1],
    fits on the training matrices and returns the mean squared error between
    ``y_test`` and the predicted probability in column 1 on ``X_test_final``.
    """
    # Suggest calls stay in this order: C first, then l1_ratio.
    reg_strength = trial.suggest_float('C', 1e-5, 1e2, log=True)
    l1_share = trial.suggest_float('l1_ratio', 0, 1)

    clf = LogisticRegression(
        random_state=SEED,
        l1_ratio=l1_share,
        penalty='elasticnet',
        C=reg_strength,
        solver='saga',
        n_jobs=-1,
        max_iter=1000,
    )
    clf.fit(X = X_train_final, y = y_train)

    win_prob = clf.predict_proba(X_test_final)[:,1]
    return mean_squared_error(y_test, win_prob)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable across runs.
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(logistic_objective_3, n_trials = 100)

print(study.best_params)
print(study.best_value)

In [None]:
# Random forest
def rf_objective(trial):
    """Optuna objective for a random forest classifier.

    Samples ``n_estimators`` from [10, 500] and ``min_samples_split`` from
    [2, 20], fits on the training matrices and returns the mean squared error
    between ``y_test`` and the predicted probability in column 1 on
    ``X_test_final``.
    """
    # Keyword arguments are evaluated left to right, so n_estimators is
    # suggested before min_samples_split.
    forest_kwargs = dict(
        random_state=SEED,
        n_estimators=trial.suggest_int('n_estimators',10,500),
        min_samples_split=trial.suggest_int('min_samples_split',2,20),
        n_jobs=-1,
    )

    forest = RandomForestClassifier(**forest_kwargs)
    forest.fit(X = X_train_final, y = y_train)

    win_prob = forest.predict_proba(X_test_final)[:,1]
    return mean_squared_error(y_test, win_prob)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable across runs.
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(rf_objective, n_trials = 100)

print(study.best_params)
print(study.best_value)

In [None]:
# TODO: use native TensorFlow hyperparameter tuning methods.

# Neural network search: seed the Python, NumPy and TensorFlow random number
# generators before any model is built.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

def nn_objective(trial):
    """Optuna objective for a fully connected binary classifier.

    Samples the width of each hidden layer (``neurons``, 16-256), the Adam
    learning rate (log-uniform in [1e-5, 1e-2]) and the number of hidden layers
    (``dense_layers``, 1-5). Builds a ReLU network with a single sigmoid output,
    trains it for 10 epochs on the training matrices and returns the mean
    squared error between ``y_test`` and the predicted probabilities on
    ``X_test_final``.

    NOTE(review): the score is computed on the test split, so the best value
    reported by the study is an optimistic estimate.
    """
    # Release Keras state left behind by models from earlier trials so memory
    # use does not grow over the course of the study.
    tf.keras.backend.clear_session()

    neurons = trial.suggest_int('neurons',16,256)
    # The range spans three orders of magnitude, so sample on a log scale
    # (as the C parameter is in the logistic-regression objectives).
    learning_rate = trial.suggest_float('learning_rate',1e-5,1e-2, log=True)
    dense_layers = trial.suggest_int('dense_layers',1,5)

    mod = Sequential([
        Input(shape=(X_train_final.shape[1],))
    ])
    for _ in range(dense_layers):
        mod.add(Dense(neurons, activation='relu'))
    mod.add(Dense(1, activation='sigmoid'))

    mod.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                loss = 'binary_crossentropy',
                metrics=['mse'])

    # Hand Keras a plain NumPy target array rather than relying on it to accept
    # whatever series type the train/test split returned.
    mod.fit(X_train_final, np.asarray(y_train), epochs=10, batch_size=256, verbose=0)
    # verbose=0 keeps predict() from printing a progress bar on every trial.
    y_pred = mod.predict(X_test_final, verbose=0).flatten()
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Seed the sampler so the sequence of suggested hyperparameters is repeatable across runs.
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(nn_objective, n_trials = 100)

print(study.best_params)
print(study.best_value)



In [12]:
# Soft-voting ensemble: random forest + logistic regression
def ensemble_objective_1(trial):
    """Optuna objective for a soft-voting ensemble of a random forest and a logistic regression.

    Samples the logistic regression's ``C`` (log-uniform in [1e-4, 1]) and the
    forest's ``n_estimators`` (75-500) and ``min_samples_split`` (2-10). The
    ensemble is fitted on the training matrices with ``train_weights`` as
    sample weights, and the function returns the mean squared error between
    ``y_test`` and the predicted probability in column 1 on ``X_test_final``.
    """
    # Dict entries are evaluated top to bottom, so the suggest calls run in the
    # order C, n_estimators, min_samples_split.
    lr_settings = {
        'C': trial.suggest_float('C', 1e-4, 1, log=True),
        'n_jobs': -1,
        'random_state': SEED,
        'max_iter': 1000,
    }
    forest_settings = {
        'n_estimators': trial.suggest_int('n_estimators',75, 500),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'n_jobs': -1,
        'random_state': SEED,
    }

    members = [
        ('rf', RandomForestClassifier(**forest_settings)),
        ('lr', LogisticRegression(**lr_settings)),
    ]
    voter = VotingClassifier(estimators=members, voting='soft', n_jobs=-1)
    voter.fit(X = X_train_final, y = y_train, sample_weight = train_weights)

    win_prob = voter.predict_proba(X_test_final)[:,1]
    return mean_squared_error(y_test, win_prob)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable across runs.
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(ensemble_objective_1, n_trials = 100)

print(study.best_params)
print(study.best_value)

[I 2025-03-17 18:16:23,749] A new study created in memory with name: no-name-0925de75-7aef-4546-ab4e-03fd54a8d632
[I 2025-03-17 18:16:37,686] Trial 0 finished with value: 0.19012500675873895 and parameters: {'C': 0.018505160166819416, 'n_estimators': 472, 'min_samples_split': 7}. Best is trial 0 with value: 0.19012500675873895.
[I 2025-03-17 18:16:45,363] Trial 1 finished with value: 0.19036319213145617 and parameters: {'C': 0.010943151893460191, 'n_estimators': 470, 'min_samples_split': 5}. Best is trial 0 with value: 0.19012500675873895.
[I 2025-03-17 18:16:51,164] Trial 2 finished with value: 0.19302665898813134 and parameters: {'C': 0.0018540739509317887, 'n_estimators': 305, 'min_samples_split': 10}. Best is trial 0 with value: 0.19012500675873895.
[I 2025-03-17 18:16:56,704] Trial 3 finished with value: 0.19117528938574996 and parameters: {'C': 0.09520312474025192, 'n_estimators': 370, 'min_samples_split': 2}. Best is trial 0 with value: 0.19012500675873895.
[I 2025-03-17 18:16:5

{'C': 0.029817784326631003, 'n_estimators': 112, 'min_samples_split': 10}
0.18899149472146098


In [13]:
# Final model: a single L2-penalised logistic regression. C and solver are the
# best parameters printed by the l2 logistic-regression study earlier in this
# notebook. No voting ensemble is fitted here.
params = {'random_state':SEED, 
          'penalty': 'l2', 
          'C':0.0216474637891819, 
          'solver':'lbfgs', 
          'n_jobs':-1, 
          'max_iter':1000}

# Fitted on the training split only, without sample weights.
model = LogisticRegression(**params).fit(X = X_train_final, y = y_train)

# Export scaler, encoder, and model for dashboard use.
# NOTE(review): the model file is named 'ensemble_model.joblib' although it
# holds a single LogisticRegression; the name is kept because the code that
# loads it is not visible here.
joblib.dump(model, 'ensemble_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
# Trailing semicolon suppresses the returned file list in the cell output.
joblib.dump(enc, 'encoder.joblib');



['encoder.joblib']