In [1]:
import pandas as pd 
import pandas as pd

from nba_betting_ai.data.processing import prepare_game_data, merge_game_data
from nba_betting_ai.data.storage import get_engine, load_teams, load_games, load_gameflow
from nba_betting_ai.training.pipeline import filter_nba_matchups, train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.multioutput import MultiOutputClassifier

In [2]:
seasons_of_interest = ['2023-24', '2024-25']


engine = get_engine()
df_teams = load_teams(engine)
df_games = load_games(engine)
df_games = filter_nba_matchups(df_games, df_teams)
df_games = df_games[df_games['season_id'].isin(seasons_of_interest)]
df_gameflow = load_gameflow(engine, game_id = list(df_games['game_id'].unique()))
df_games = prepare_game_data(df_games, df_gameflow)

(Timestamp('2023-10-05 00:00:00'), Timestamp('2024-06-17 00:00:00'))

In [3]:
seed = 66
test_size = 0.2
n = 20

games_idx_train, games_idx_test, gameflow_idx_train, gameflow_idx_test = train_test_split(df_games, df_gameflow, test_size=test_size, n=n, seed=seed)
df_games_train = df_games.loc[games_idx_train]
df_games_test = df_games.loc[games_idx_test]
df_gameflow_train = df_gameflow.loc[gameflow_idx_train]
df_gameflow_test = df_gameflow.loc[gameflow_idx_test]

print(f"Original data: {df_gameflow.shape[0]} score changes in {len(df_games['game_id'].unique())} games through "
      "{len(df_games['season_id'].unique())} seasons.")
print(f"Train data: {df_gameflow_train.shape[0]} score changes in {len(df_games_train['game_id'].unique())} games.")
print(f"Test data: {df_gameflow_test.shape[0]} score changes in {len(df_games_test['game_id'].unique())} games.")

print(f"Train games have happened between {df_games_train['game_date'].dt.date.min()} and {df_games_train['game_date'].dt.date.max()},"
      f" and test games have happened between {df_games_test['game_date'].dt.date.min()} and {df_games_test['game_date'].dt.date.max()}.")

Original data: 243395 score changes in 1988 games through {len(df_games['season_id'].unique())} seasons.
Train data: 31760 score changes in 1588 games.
Test data: 8000 score changes in 400 games.
Train games have happened between 2023-10-05 and 2024-11-08, and test games have happened between 2024-11-09 and 2025-01-07.


In [4]:
df_games_train.columns

Index(['season_id', 'team_id', 'team_abbreviation', 'team_name', 'game_id',
       'game_date', 'matchup', 'wl', 'points_for', 'points_against', 'win',
       'season_wins', 'season_pts_for', 'season_pts_against_so_far',
       'season_games', 'last_5_wins', 'last_5_pts_for_avg',
       'last_5_pts_for_total', 'last_5_pts_against_avg',
       'last_5_pts_against_total'],
      dtype='object')

In [5]:
df_train = merge_game_data(df_gameflow_train.index, df_games_train, df_gameflow_train)
df_test = merge_game_data(df_gameflow_test.index, df_games_test, df_gameflow_test)
len(df_train), len(df_test)

(1588, 400)

In [6]:
feature_cols = ['home_team_abbreviation', 'away_team_abbreviation', 'home_score', 'away_score', 'time_remaining', 'win']
feature_rename = ['HomeName', 'AwayName', 'HomeScore', 'AwayScore', 'TimeRemaining', 'HomeWin']
X_train = pd.merge(df_train, df_gameflow, left_on='game_id', right_on='game_id')
X_test = pd.merge(df_test, df_gameflow, left_on='game_id', right_on='game_id')
del df_games, df_train, df_test, df_gameflow_train, df_gameflow_test, df_games_train, df_games_test

print(f'Final train data set size: {X_train.shape[0]}')
print(f'Final test data set size: {X_test.shape[0]}')

Final train data set size: 194997
Final test data set size: 48398


In [7]:
X_train

Unnamed: 0,game_id,win,home_team_id,home_team_abbreviation,home_season_wins,home_season_pts_for,home_season_pts_against_so_far,home_season_games,home_last_5_wins,home_last_5_pts_for_avg,...,away_season_pts_against_so_far,away_season_games,away_last_5_wins,away_last_5_pts_for_avg,away_last_5_pts_for_total,away_last_5_pts_against_avg,away_last_5_pts_against_total,home_score,away_score,time_remaining
0,0012300001,0,1610612742,DAL,,,,0,,,...,,0,,,,,,0,3,2861
1,0012300001,0,1610612742,DAL,,,,0,,,...,,0,,,,,,1,3,2788
2,0012300001,0,1610612742,DAL,,,,0,,,...,,0,,,,,,1,5,2752
3,0012300001,0,1610612742,DAL,,,,0,,,...,,0,,,,,,1,7,2722
4,0012300001,0,1610612742,DAL,,,,0,,,...,,0,,,,,,1,10,2691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194992,0022400185,0,1610612737,ATL,5.0,1473.0,1549.0,13,2.0,115.0,...,1470.0,14,3.0,106.2,531.0,105.2,526.0,119,120,42
194993,0022400185,0,1610612737,ATL,5.0,1473.0,1549.0,13,2.0,115.0,...,1470.0,14,3.0,106.2,531.0,105.2,526.0,120,120,12
194994,0022400185,0,1610612737,ATL,5.0,1473.0,1549.0,13,2.0,115.0,...,1470.0,14,3.0,106.2,531.0,105.2,526.0,121,120,12
194995,0022400185,0,1610612737,ATL,5.0,1473.0,1549.0,13,2.0,115.0,...,1470.0,14,3.0,106.2,531.0,105.2,526.0,121,122,8


In [8]:
X_train = X_train[feature_cols].rename(columns=dict(zip(feature_cols, feature_rename)))
X_test = X_test[feature_cols].rename(columns=dict(zip(feature_cols, feature_rename)))

In [None]:
def add_home_away_scores(df: pd.DataFrame) -> pd.DataFrame:
    # Create the 'HomeWin' and 'AwayWin' columns based on scores
    df['AwayWin'] = 1 - df['HomeWin'] 

    # In case of a draw (both scores are equal), set both to 0
    df.loc[df['HomeScore'] == df['AwayScore'], ['HomeWin', 'AwayWin']] = 0
    return df

X_train = add_home_away_scores(X_train)
X_test = add_home_away_scores(X_test)

# Display the DataFrame with the 'HomeWin' and 'AwayWin' columns
X_train

Unnamed: 0,HomeName,AwayName,HomeScore,AwayScore,TimeRemaining,HomeWin,AwayWin
0,DAL,MIN,0,3,2861,0,1
1,DAL,MIN,1,3,2788,0,1
2,DAL,MIN,1,5,2752,0,1
3,DAL,MIN,1,7,2722,0,1
4,DAL,MIN,1,10,2691,0,1
...,...,...,...,...,...,...,...
194992,ATL,DET,119,120,42,0,1
194993,ATL,DET,120,120,12,0,0
194994,ATL,DET,121,120,12,0,1
194995,ATL,DET,121,122,8,0,1


In [10]:
team_encoder = OneHotEncoder()
encoded_names = team_encoder.fit_transform(X_train['AwayName'].values.reshape(-1, 1)).toarray()
X_train.loc[:, team_encoder.get_feature_names_out()] = encoded_names
X_train.loc[:, team_encoder.get_feature_names_out()] = team_encoder.transform(X_train['HomeName'].values.reshape(-1, 1)).toarray()
X_test.loc[:, team_encoder.get_feature_names_out()] = team_encoder.transform(X_test['AwayName'].values.reshape(-1, 1)).toarray()
X_test.loc[:, team_encoder.get_feature_names_out()] = team_encoder.transform(X_test['HomeName'].values.reshape(-1, 1)).toarray()

scaler = StandardScaler()
X_train[['HomeScore', 'AwayScore', 'TimeRemaining']] = scaler.fit_transform(X_train[['HomeScore', 'AwayScore', 'TimeRemaining']])
X_test[['HomeScore', 'AwayScore', 'TimeRemaining']] = scaler.transform(X_test[['HomeScore', 'AwayScore', 'TimeRemaining']])


# Drop the original categorical columns
X_train = X_train.drop(['AwayName', 'HomeName'], axis=1)
X_test = X_test.drop(['AwayName', 'HomeName'], axis=1)

# Prepare features (X) and target (y)
y_train = X_train[['HomeWin', 'AwayWin']]
X_train = X_train.drop(['HomeWin', 'AwayWin'], axis=1)
y_test = X_test[['HomeWin', 'AwayWin']]
X_test = X_test.drop(['HomeWin', 'AwayWin'], axis=1)


# Initialize LogisticRegression model
log_reg_model = LogisticRegression(random_state=42, max_iter=1000)  # Increase max_iter if convergence warnings occur

# Wrap Logistic Regression in a MultiOutputClassifier
model = MultiOutputClassifier(log_reg_model)

# Train the Logistic Regression model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model's accuracy for both HomeWin and AwayWin
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Get the predicted probabilities for each class (HomeWin and AwayWin)
y_prob = model.predict_proba(X_test)

# The `predict_proba` method returns probabilities for both classes (0 and 1)
# y_prob[:, 1] will give you the probability for the 'win' class (1)
df_test_with_prob = X_test.copy()

# For each target column (HomeWin and AwayWin), you need to extract the probabilities for each target separately.
df_test_with_prob['predicted_home_win_probability'] = y_prob[0][:, 1]
df_test_with_prob['predicted_away_win_probability'] = y_prob[1][:, 1]

# Map the encoded columns back to their original names
df_test_with_prob['AwayName'] = team_encoder.inverse_transform(df_test_with_prob[team_encoder.get_feature_names_out()]).reshape(-1)
df_test_with_prob['HomeName'] = team_encoder.inverse_transform(df_test_with_prob[team_encoder.get_feature_names_out()]).reshape(-1)
df_test_with_prob = df_test_with_prob.drop(team_encoder.get_feature_names_out(), axis=1)

df_test_with_prob[scaler.get_feature_names_out()] = scaler.inverse_transform(df_test_with_prob[scaler.get_feature_names_out()])

# Format the output in the desired format
team_win_probabilities = df_test_with_prob[['AwayName', 'HomeName', 'predicted_home_win_probability', 'predicted_away_win_probability']]

# Display the table
team_win_probabilities

Accuracy: 0.6865366337451961


Unnamed: 0,AwayName,HomeName,predicted_home_win_probability,predicted_away_win_probability
0,TOR,TOR,0.262338,0.651334
1,TOR,TOR,0.214054,0.709428
2,TOR,TOR,0.154442,0.784876
3,TOR,TOR,0.137716,0.806586
4,TOR,TOR,0.172678,0.761290
...,...,...,...,...
48393,CHA,CHA,0.102920,0.879674
48394,CHA,CHA,0.115876,0.864723
48395,CHA,CHA,0.102872,0.879624
48396,CHA,CHA,0.091129,0.893075
