In [1]:
import warnings
warnings.filterwarnings('ignore')
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from datetime import date
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

load_dotenv()
SQL_PASS = os.getenv('SQL_PASS')
engine = create_engine(f"mysql+pymysql://root:{SQL_PASS}@localhost:3306/daily_lockz")
df = pd.read_sql(f"SELECT * FROM ncaab_games", engine, index_col='index')
df = df[df["date"] < pd.Timestamp('2024-03-18')]
df['winner'] = (df['total'] > df['total_opp']).astype(int)

def make_season(date):
    season = str(date).split('-')[0].split('20')[-1]
    return season
df['season'] = df['date'].apply(make_season)

df = df.sort_values("date")
df = df.reset_index(drop=True)
del df["index_opp"]
del df["logo_opp"]

def add_target(group):
    group["target"] = group["winner"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target)
df["target"][pd.isnull(df["target"])] = 2
df["target"] = df["target"].astype(int, errors="ignore")

nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns].copy()

removed_columns = ["season", "date", "winner", "target", "team", "team_opp", "logo"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

df_rolling5 = df[list(selected_columns) + ['team']]
def find_team_averages5(team):
    rolling = team.rolling(5).mean(numeric_only=True)
    return rolling
df_rolling5 = df_rolling5.groupby(['team'], group_keys=False).apply(find_team_averages5)
rolling_cols5 = [f"{col}_5" for col in df_rolling5.columns]
df_rolling5.columns = rolling_cols5

df_rolling10 = df[list(selected_columns) + ['team']]
def find_team_averages10(team):
    rolling = team.rolling(10).mean(numeric_only=True)
    return rolling
df_rolling10 = df_rolling10.groupby(['team'], group_keys=False).apply(find_team_averages10)
rolling_cols10 = [f"{col}_10" for col in df_rolling10.columns]
df_rolling10.columns = rolling_cols10

# 15-game rolling averages
df_rolling15 = df[list(selected_columns) + ['team']]
def find_team_averages15(team):
    rolling = team.rolling(15).mean(numeric_only=True)
    return rolling
df_rolling15 = df_rolling15.groupby(['team'], group_keys=False).apply(find_team_averages15)
rolling_cols15 = [f"{col}_15" for col in df_rolling15.columns]
df_rolling15.columns = rolling_cols15

# Combine all features
df = pd.concat([df, df_rolling5], axis=1)
df = pd.concat([df, df_rolling10], axis=1)
df = pd.concat([df, df_rolling15], axis=1)
df = df.dropna()

# Add next game info
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

# home_team = 'illinois'
# away_team = 'iowa-state'
EAST = [('north-carolina','wagner'),('mississippi-state','michigan-state'),('saint-marys-ca','grand-canyon'),('alabama','college-of-charleston')
        ,('clemson','new-mexico'),('baylor','colgate'),('dayton','nevada'),('arizona','long-beach-state')]

for GAME in EAST:
    home_team = GAME[0]
    away_team = GAME[1]
    null_indices = df[(df['team'] == home_team) & (df['home_next'].isnull())].index
    for idx in null_indices:
        df.at[idx, 'home_next'] = 0
        df.at[idx, 'team_opp_next'] = away_team
        df.at[idx, 'date_next'] = date.today()
    null_indices = df[(df['team'] == away_team) & (df['home_next'].isnull())].index
    for idx in null_indices:
        df.at[idx, 'home_next'] = 0
        df.at[idx, 'team_opp_next'] = home_team
        df.at[idx, 'date_next'] = date.today()

full = df.merge(df[rolling_cols5 + rolling_cols10 + rolling_cols15 + ["team_opp_next", "date_next", "team"]], 
            left_on=["team", "date_next"], 
            right_on=["team_opp_next", "date_next"])

removed_columns = list(full.columns[full.dtypes != 'float']) + removed_columns
features = full.columns[~full.columns.isin(removed_columns)]

predictors = list(features)
def scikit_backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        probs = model.predict_proba(test[predictors])[:, 1]
        preds = model.predict(test[predictors])
        
        results = pd.DataFrame({
            "actual": test["target"],
            "prediction": preds,
            "win_probability": probs,
        })
        
        all_predictions.append(results)
    
    return pd.concat(all_predictions)

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
predictions = scikit_backtest(full, model, predictors)

accuracy = accuracy_score(predictions["actual"], predictions["prediction"])
print(f"Model accuracy: {accuracy:.4f}")

for GAME in EAST:
    teams = full[(full['target'] == 2) & (full['team_x'].isin(GAME))]['team_x']
    score = predictions[predictions['actual'] == 2]
    score = score.loc[teams.index]
    teams = teams.reset_index(drop=True)
    score = score.reset_index(drop=True)
    
    if score['prediction'][0] == 1:
        index = 0
    elif score['prediction'][1] == 1:
        index = 1
    else:
        if score['win_probability'][0] >= score['win_probability'][1]:
            index = 0
        else:
            index = 1
        if score['win_probability'][0] + score['win_probability'][1] >= 0.75:
            if index == 0: index = 1 
            else: index = 0
    print(teams[index])

Model accuracy: 0.6478
north-carolina
mississippi-state
saint-marys-ca
college-of-charleston
new-mexico
colgate
dayton
arizona


In [2]:
predictions[predictions['actual'] == 2]

Unnamed: 0,actual,prediction,win_probability
15650,2,1,0.51
15660,2,1,0.69
15662,2,0,0.38
15675,2,1,0.55
15681,2,0,0.47
15723,2,0,0.44
15739,2,0,0.47
15769,2,0,0.34
15772,2,1,0.57
15785,2,0,0.32


In [3]:
full[(full['target'] == 2)]['team_x']

15650           saint-marys-ca
15660    college-of-charleston
15662                   wagner
15675                  colgate
15681                  clemson
15723                   dayton
15739                   nevada
15769                  alabama
15772                  arizona
15785                   baylor
15793           michigan-state
15810             grand-canyon
15811           north-carolina
15812         long-beach-state
15813               new-mexico
15816        mississippi-state
Name: team_x, dtype: object