# Objective

Predict the outcome of NBA games based on seasonal and game-specific statistics.

**Target:**

A binary variable indicating the outcome of the game for the home team: win (1) or loss (0). This prediction will be based on factors like team performance metrics (e.g., points scored, field goal percentage), home/away status, and historical performance against the opposing team.


In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [119]:
nba_gamelog_data = pd.read_csv("./data/nba_gamelog.csv")
nba_gamelog_data.describe()

# Find any missing data in the dataset.
missing_values = nba_gamelog_data.isnull().sum()

missing_values

home          0
season        0
game          0
date          0
away          0
team_wl       0
team_score    0
opp_score     0
team_fg       0
team_fga      0
team_fgp      0
team_3p       0
team_3pa      0
team_3pp      0
team_ft       0
team_fta      0
team_ftp      0
team_orb      0
team_trb      0
team_ast      0
team_stl      0
team_blk      0
team_tov      0
team_pf       0
opp_fg        0
opp_fga       0
opp_fgp       0
opp_3p        0
opp_3pa       0
opp_3pp       0
opp_ft        0
opp_fta       0
opp_ftp       0
opp_orb       0
opp_trb       0
opp_ast       0
opp_stl       0
opp_blk       0
opp_tov       0
opp_pf        0
dtype: int64

In [120]:
unique_teams = set(nba_gamelog_data["home"].apply(lambda team: team).to_list() + nba_gamelog_data["away"].apply(lambda team: team).to_list())
f"There are {len(unique_teams)} unique teams: {unique_teams}."

"There are 38 unique teams: {'BKN', 'NOK', 'WAS', 'CLE', 'VAN', 'NJN', 'HOU', 'MEM', 'SAC', 'CHI', 'MIL', 'MIN', 'UTA', 'GSW', 'NOP', 'SAS', 'ATL', 'BRK', 'CHO', 'MIA', 'IND', 'TOR', 'ORL', 'OKC', 'PHO', 'LAC', 'DET', 'DEN', 'BOS', 'POR', 'PHI', 'SEA', 'CHA', 'DAL', 'LAL', 'NYK', 'CHH', 'NOH'}."

In [121]:
# Historically convert teams to their 2024 team abbreviation.

historic_to_current = {
    "BRK": "BKN",
	"NJN": "BKN",
	"SEA": "OKC",
	"NOK": "NOP",
	"NOH": "NOP",
	"CHH": "NOP",
	"VAN": "MEM",
	"CHO": "CHA",
}

nba_gamelog_data["home"] = nba_gamelog_data["home"].apply(lambda team: team if team not in historic_to_current else historic_to_current[team])
nba_gamelog_data["away"] = nba_gamelog_data["away"].apply(lambda team: team if team not in historic_to_current else historic_to_current[team])

f"Unique home teams: {len(nba_gamelog_data['home'].unique())}", f"Unique away teams: {len(nba_gamelog_data['away'].unique())}"

('Unique home teams: 30', 'Unique away teams: 30')

In [122]:
# Convert tean_wl to a numerical field 1 indicates the home team won a 0 indicates the away team won.

nba_gamelog_data["win"] = nba_gamelog_data["team_wl"].apply(lambda wl: 1 if wl == "W" else 0)
nba_gamelog_data["win"].head()

0    0
1    0
2    1
3    0
4    0
Name: win, dtype: int64

In [123]:
from datetime import datetime

# Convert date fields into a datetime

nba_gamelog_data["date"]= nba_gamelog_data["date"].apply(lambda dateStr: datetime.strptime(dateStr, "%Y-%m-%d").timestamp())
nba_gamelog_data["date"].head()

0    941522400.0
1    941695200.0
2    941868000.0
3    942040800.0
4    942213600.0
Name: date, dtype: float64

In [124]:
# Add historical win / loss rate

win_rates = nba_gamelog_data.groupby('home')['win'].mean()
nba_gamelog_data['home_win_rate'] = nba_gamelog_data['home'].map(win_rates)
nba_gamelog_data['away_win_rate'] = nba_gamelog_data['away'].map(win_rates)
nba_gamelog_data.head()

Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,opp_orb,opp_trb,opp_ast,opp_stl,opp_blk,opp_tov,opp_pf,win,home_win_rate,away_win_rate
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,12,42,23,5,5,15,30,0,0.451204,0.418416
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,12,38,24,15,6,11,25,0,0.451204,0.50695
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,17,39,14,6,6,14,26,1,0.451204,0.459432
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,22,49,28,6,15,7,23,0,0.451204,0.538772
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,15,49,27,9,10,18,24,0,0.451204,0.470811


In [125]:
# Get season win rate 

nba_gamelog_data["team_season"] = nba_gamelog_data["home"] + nba_gamelog_data["season"].apply(lambda season: "_" + str(season))
nba_gamelog_data["away_season"] = nba_gamelog_data["away"] + nba_gamelog_data["season"].apply(lambda season: "_" + str(season))

season_win_rates = nba_gamelog_data.groupby('team_season')['win'].mean()
nba_gamelog_data['home_season_performance'] = nba_gamelog_data["team_season"].map(season_win_rates)
nba_gamelog_data['away_season_performance'] = nba_gamelog_data['away_season'].map(season_win_rates)

nba_gamelog_data.head()

Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,opp_blk,opp_tov,opp_pf,win,home_win_rate,away_win_rate,team_season,away_season,home_season_performance,away_season_performance
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,5,15,30,0,0.451204,0.418416,ATL_2000,WAS_2000,0.341463,0.353659
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,6,11,25,0,0.451204,0.50695,ATL_2000,MIL_2000,0.341463,0.512195
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,6,14,26,1,0.451204,0.459432,ATL_2000,CHI_2000,0.341463,0.207317
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,15,7,23,0,0.451204,0.538772,ATL_2000,DEN_2000,0.341463,0.426829
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,10,18,24,0,0.451204,0.470811,ATL_2000,MEM_2000,0.341463,0.268293


In [148]:
# Previous overall performance

def previous_performance(row: pd.Series):
    team = row["home"]
    date = row["date"]
    return nba_gamelog_data.query(f"home == '{team}' and date < {date}")["win"].mean()

nba_gamelog_data["win_percent_to_date"] = nba_gamelog_data.apply(previous_performance, axis=1)
nba_gamelog_data["win_percent_to_date"] = nba_gamelog_data["win_percent_to_date"].fillna(0)
nba_gamelog_data.head()

Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,win,home_win_rate,away_win_rate,team_season,away_season,home_season_performance,away_season_performance,win_percent_before_date,win_percent_to_date,season_win_to_date
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,0,0.451204,0.418416,ATL_2000,WAS_2000,0.341463,0.353659,,0.0,0.0
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,0,0.451204,0.50695,ATL_2000,MIL_2000,0.341463,0.512195,0.0,0.0,0.0
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,1,0.451204,0.459432,ATL_2000,CHI_2000,0.341463,0.207317,0.0,0.0,0.0
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,0,0.451204,0.538772,ATL_2000,DEN_2000,0.341463,0.426829,0.333333,0.333333,0.333333
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,0,0.451204,0.470811,ATL_2000,MEM_2000,0.341463,0.268293,0.25,0.25,0.25


In [154]:
# Calculate season previous performance.

def season_performance(row: pd.Series):
    team = row["home"]
    date = row["date"]
    season = row["season"]
    return nba_gamelog_data.query(f"season == {season} and home == '{team}' and date < {date}")["win"].mean()


nba_gamelog_data["season_win_to_date"] = nba_gamelog_data.apply(season_performance, axis=1)
nba_gamelog_data["season_win_to_date"] = nba_gamelog_data["season_win_to_date"].fillna(0)
nba_gamelog_data.head()

Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,win,home_win_rate,away_win_rate,team_season,away_season,home_season_performance,away_season_performance,win_percent_before_date,win_percent_to_date,season_win_to_date
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,0,0.451204,0.418416,ATL_2000,WAS_2000,0.341463,0.353659,,0.0,0.0
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,0,0.451204,0.50695,ATL_2000,MIL_2000,0.341463,0.512195,0.0,0.0,0.0
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,1,0.451204,0.459432,ATL_2000,CHI_2000,0.341463,0.207317,0.0,0.0,0.0
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,0,0.451204,0.538772,ATL_2000,DEN_2000,0.341463,0.426829,0.333333,0.333333,0.333333
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,0,0.451204,0.470811,ATL_2000,MEM_2000,0.341463,0.268293,0.25,0.25,0.25


In [155]:
# Calculate overall head to head matchup between teams.

def previous_head_to_head(row: pd.Series):
    team = row["home"]
    away= row["away"]
    date = row["date"]
    return nba_gamelog_data.query(f"(home == '{team}' and away == '{away}' or home == '{away}' and away == '{team}') and date < {date}")["win"].mean() + 1


nba_gamelog_data["head_to_head"] = nba_gamelog_data.apply(previous_head_to_head, axis=1)
nba_gamelog_data["head_to_head"] = nba_gamelog_data["head_to_head"].fillna(0)
nba_gamelog_data.head()

In [None]:
# Calculate season head to head matchup between teams.

def season_head_to_head(row: pd.Series):
    team = row["home"]
    away = row["away"]
    date = row["date"]
    season = row["season"]
    return nba_gamelog_data.query(f"season == {season} and (home == '{team}' and away == '{away}' or home == '{away}' and away == '{team}') and date < {date}")["win"].mean() + 1


nba_gamelog_data["season_head_to_head"] = nba_gamelog_data.apply(season_head_to_head, axis=1)
nba_gamelog_data["season_head_to_head"] = nba_gamelog_data["season_head_to_head"].fillna(0)
nba_gamelog_data.head()

SyntaxError: Python keyword not valid identifier in numexpr query (<unknown>, line 1)