# Objective

Predict the outcome of NBA games based on seasonal and game-specific statistics.

**Target:**

A binary variable indicating the outcome of the game for the home team: win (1) or loss (0). This prediction will be based on factors like team performance metrics (e.g., points scored, field goal percentage), home/away status, and historical performance against the opposing team.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
nba_gamelog_data = pd.read_csv("./data/nba_gamelog.csv")
nba_gamelog_data.describe()

# Find any missing data in the dataset.
missing_values = nba_gamelog_data.isnull().sum()

missing_values

home          0
season        0
game          0
date          0
away          0
team_wl       0
team_score    0
opp_score     0
team_fg       0
team_fga      0
team_fgp      0
team_3p       0
team_3pa      0
team_3pp      0
team_ft       0
team_fta      0
team_ftp      0
team_orb      0
team_trb      0
team_ast      0
team_stl      0
team_blk      0
team_tov      0
team_pf       0
opp_fg        0
opp_fga       0
opp_fgp       0
opp_3p        0
opp_3pa       0
opp_3pp       0
opp_ft        0
opp_fta       0
opp_ftp       0
opp_orb       0
opp_trb       0
opp_ast       0
opp_stl       0
opp_blk       0
opp_tov       0
opp_pf        0
dtype: int64

## Data Cleanup


In [3]:
# Each game should be represented twice once for the home team and once for the away team. Clear up all duplicate games.

print(f"There were originally {len(nba_gamelog_data)} games logged.")
nba_gamelog_data = nba_gamelog_data.drop_duplicates()
f"{len(nba_gamelog_data)}"

There were originally 117236 games logged.


'58726'

In [4]:
unique_teams = set(
    nba_gamelog_data["home"].apply(lambda team: team).to_list()
    + nba_gamelog_data["away"].apply(lambda team: team).to_list()
)
f"There are {len(unique_teams)} unique teams: {unique_teams}."

"There are 38 unique teams: {'OKC', 'BKN', 'MEM', 'MIA', 'ORL', 'PHO', 'SEA', 'PHI', 'UTA', 'CHI', 'NOK', 'TOR', 'NJN', 'DEN', 'DAL', 'ATL', 'GSW', 'IND', 'WAS', 'CHO', 'POR', 'LAL', 'BOS', 'MIN', 'NYK', 'SAC', 'CHA', 'HOU', 'CHH', 'DET', 'SAS', 'BRK', 'VAN', 'CLE', 'MIL', 'NOH', 'LAC', 'NOP'}."

In [5]:
# Historically convert teams to their 2024 team abbreviation.

historic_to_current = {
    "BRK": "BKN",
    "NJN": "BKN",
    "SEA": "OKC",
    "NOK": "NOP",
    "NOH": "NOP",
    "CHH": "NOP",
    "VAN": "MEM",
    "CHO": "CHA",
}

nba_gamelog_data["home"] = nba_gamelog_data["home"].apply(
    lambda team: team if team not in historic_to_current else historic_to_current[team]
)
nba_gamelog_data["away"] = nba_gamelog_data["away"].apply(
    lambda team: team if team not in historic_to_current else historic_to_current[team]
)

f"Unique home teams: {len(nba_gamelog_data['home'].unique())}", f"Unique away teams: {len(nba_gamelog_data['away'].unique())}"

('Unique home teams: 30', 'Unique away teams: 30')

In [6]:
# Convert tean_wl to a numerical field 1 indicates the home team won a 0 indicates the away team won.

nba_gamelog_data["win"] = nba_gamelog_data["team_wl"].apply(
    lambda wl: 1 if wl == "W" else 0
)
nba_gamelog_data["win"].head()

0    0
1    0
2    1
3    0
4    0
Name: win, dtype: int64

In [7]:
from datetime import datetime

# Convert date fields into a datetime

nba_gamelog_data["date"] = nba_gamelog_data["date"].apply(
    lambda dateStr: datetime.strptime(dateStr, "%Y-%m-%d").timestamp()
)
nba_gamelog_data["date"].head()

0    941522400.0
1    941695200.0
2    941868000.0
3    942040800.0
4    942213600.0
Name: date, dtype: float64

In [8]:
# Add historical win / loss rate

win_rates = nba_gamelog_data.groupby("home")["win"].mean()
nba_gamelog_data["home_win_rate"] = nba_gamelog_data["home"].map(win_rates)
nba_gamelog_data["away_win_rate"] = nba_gamelog_data["away"].map(win_rates)
nba_gamelog_data.head()

Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,opp_orb,opp_trb,opp_ast,opp_stl,opp_blk,opp_tov,opp_pf,win,home_win_rate,away_win_rate
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,12,42,23,5,5,15,30,0,0.451417,0.418182
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,12,38,24,15,6,11,25,0,0.451417,0.507064
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,17,39,14,6,6,14,26,1,0.451417,0.459241
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,22,49,28,6,15,7,23,0,0.451417,0.539082
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,15,49,27,9,10,18,24,0,0.451417,0.470737


In [9]:
# Get season win rate

nba_gamelog_data["team_season"] = nba_gamelog_data["home"] + nba_gamelog_data[
    "season"
].apply(lambda season: "_" + str(season))
nba_gamelog_data["away_season"] = nba_gamelog_data["away"] + nba_gamelog_data[
    "season"
].apply(lambda season: "_" + str(season))

season_win_rates = nba_gamelog_data.groupby("team_season")["win"].mean()
nba_gamelog_data["home_season_performance"] = nba_gamelog_data["team_season"].map(
    season_win_rates
)
nba_gamelog_data["away_season_performance"] = nba_gamelog_data["away_season"].map(
    season_win_rates
)

nba_gamelog_data.head()

Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,opp_blk,opp_tov,opp_pf,win,home_win_rate,away_win_rate,team_season,away_season,home_season_performance,away_season_performance
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,5,15,30,0,0.451417,0.418182,ATL_2000,WAS_2000,0.341463,0.353659
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,6,11,25,0,0.451417,0.507064,ATL_2000,MIL_2000,0.341463,0.512195
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,6,14,26,1,0.451417,0.459241,ATL_2000,CHI_2000,0.341463,0.207317
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,15,7,23,0,0.451417,0.539082,ATL_2000,DEN_2000,0.341463,0.426829
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,10,18,24,0,0.451417,0.470737,ATL_2000,MEM_2000,0.341463,0.268293


In [10]:
# Previous overall performance


def previous_performance(row: pd.Series):
    team = row["home"]
    away = row["away"]
    date = row["date"]
    team_df = nba_gamelog_data.query(f"home == '{team}' and date < {date}")
    opp_df = nba_gamelog_data.query(f"home == '{away}' and date < {date}")
    row["avg_win_to_date"] = team_df["win"].mean()
    row["avg_team_pts"] = team_df["team_score"].mean()
    row["avg_team_fg"] = team_df["team_fg"].mean()
    row["avg_team_fga"] = team_df["team_fga"].mean()
    row["avg_team_fgp"] = team_df["team_fgp"].mean()
    row["avg_team_3p"] = team_df["team_3p"].mean()
    row["avg_team_3pa"] = team_df["team_3pa"].mean()
    row["avg_team_3pp"] = team_df["team_3pp"].mean()
    row["avg_team_ft"] = team_df["team_ft"].mean()
    row["avg_team_fta"] = team_df["team_fta"].mean()
    row["avg_team_trb"] = team_df["team_trb"].mean()
    row["avg_team_ast"] = team_df["team_ast"].mean()
    row["avg_team_stl"] = team_df["team_stl"].mean()
    row["avg_team_blk"] = team_df["team_blk"].mean()
    row["avg_team_tov"] = team_df["team_tov"].mean()
    row["avg_team_pf"] = team_df["team_pf"].mean()
    row["avg_team_pts_allowed"] = team_df["opp_score"].mean()
    row["avg_opp_win_to_date"] = opp_df["win"].mean()
    row["avg_opp_pts"] = opp_df["team_score"].mean()
    row["avg_opp_fg"] = opp_df["team_fg"].mean()
    row["avg_opp_fga"] = opp_df["team_fga"].mean()
    row["avg_opp_fgp"] = opp_df["team_fgp"].mean()
    row["avg_opp_3p"] = opp_df["team_3p"].mean()
    row["avg_opp_3pa"] = opp_df["team_3pa"].mean()
    row["avg_opp_3pp"] = opp_df["team_3pp"].mean()
    row["avg_opp_ft"] = opp_df["team_ft"].mean()
    row["avg_opp_fta"] = opp_df["team_fta"].mean()
    row["avg_opp_trb"] = opp_df["team_trb"].mean()
    row["avg_opp_ast"] = opp_df["team_ast"].mean()
    row["avg_opp_stl"] = opp_df["team_stl"].mean()
    row["avg_opp_blk"] = opp_df["team_blk"].mean()
    row["avg_opp_tov"] = opp_df["team_tov"].mean()
    row["avg_opp_pf"] = opp_df["team_pf"].mean()
    row["avg_opp_pts_allowed"] = opp_df["opp_score"].mean()
    return row


nba_gamelog_data = nba_gamelog_data.apply(previous_performance, axis=1)
nba_gamelog_data.head()

Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,avg_opp_3pp,avg_opp_ft,avg_opp_fta,avg_opp_trb,avg_opp_ast,avg_opp_stl,avg_opp_blk,avg_opp_tov,avg_opp_pf,avg_opp_pts_allowed
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,,,,,,,,,,
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,0.429,19.0,28.0,35.0,20.0,10.0,2.0,8.0,25.0,93.0
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,0.3115,24.5,33.5,35.0,19.5,8.5,6.0,15.0,19.5,94.5
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,0.432333,19.0,24.333333,41.0,20.666667,6.666667,9.333333,19.0,20.0,102.0
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,0.206333,23.0,31.333333,40.666667,19.666667,7.666667,2.666667,14.333333,23.333333,101.0


In [11]:
# Calculate season previous performance.


def season_performance(row: pd.Series):
    team = row["home"]
    away = row["away"]
    date = row["date"]
    season = row["season"]
    team_season_df = nba_gamelog_data.query(
        f"season == {season} and home == '{team}' and date < {date}"
    )
    opp_season_df = nba_gamelog_data.query(
        f"season == {season} and home == '{away}' and date < {date}"
    )
    row["season_win_to_date"] = team_season_df["win"].mean()
    row["season_team_pts"] = team_season_df["team_score"].mean()
    row["season_team_fg"] = team_season_df["team_fg"].mean()
    row["season_team_fga"] = team_season_df["team_fga"].mean()
    row["season_team_fgp"] = team_season_df["team_fgp"].mean()
    row["season_team_3p"] = team_season_df["team_3p"].mean()
    row["season_team_3pa"] = team_season_df["team_3pa"].mean()
    row["season_team_3pp"] = team_season_df["team_3pp"].mean()
    row["season_team_ft"] = team_season_df["team_ft"].mean()
    row["season_team_fta"] = team_season_df["team_fta"].mean()
    row["season_team_trb"] = team_season_df["team_trb"].mean()
    row["season_team_ast"] = team_season_df["team_ast"].mean()
    row["season_team_stl"] = team_season_df["team_stl"].mean()
    row["season_team_blk"] = team_season_df["team_blk"].mean()
    row["season_team_tov"] = team_season_df["team_tov"].mean()
    row["season_team_pf"] = team_season_df["team_pf"].mean()
    row["season_team_pts_allowed"] = team_season_df["opp_score"].mean()
    row["season_opp_win_to_date"] = opp_season_df["win"].mean()
    row["season_opp_pts"] = opp_season_df["team_score"].mean()
    row["season_opp_fg"] = opp_season_df["team_fg"].mean()
    row["season_opp_fga"] = opp_season_df["team_fga"].mean()
    row["season_opp_fgp"] = opp_season_df["team_fgp"].mean()
    row["season_opp_3p"] = opp_season_df["team_3p"].mean()
    row["season_opp_3pa"] = opp_season_df["team_3pa"].mean()
    row["season_opp_3pp"] = opp_season_df["team_3pp"].mean()
    row["season_opp_ft"] = opp_season_df["team_ft"].mean()
    row["season_opp_fta"] = opp_season_df["team_fta"].mean()
    row["season_opp_trb"] = opp_season_df["team_trb"].mean()
    row["season_opp_ast"] = opp_season_df["team_ast"].mean()
    row["season_opp_stl"] = opp_season_df["team_stl"].mean()
    row["season_opp_blk"] = opp_season_df["team_blk"].mean()
    row["season_opp_tov"] = opp_season_df["team_tov"].mean()
    row["season_opp_pf"] = opp_season_df["team_pf"].mean()
    row["season_opp_pts_allowed"] = opp_season_df["opp_score"].mean()
    return row


nba_gamelog_data = nba_gamelog_data.apply(season_performance, axis=1)
nba_gamelog_data.head()

Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,season_opp_3pp,season_opp_ft,season_opp_fta,season_opp_trb,season_opp_ast,season_opp_stl,season_opp_blk,season_opp_tov,season_opp_pf,season_opp_pts_allowed
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,,,,,,,,,,
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,0.429,19.0,28.0,35.0,20.0,10.0,2.0,8.0,25.0,93.0
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,0.3115,24.5,33.5,35.0,19.5,8.5,6.0,15.0,19.5,94.5
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,0.432333,19.0,24.333333,41.0,20.666667,6.666667,9.333333,19.0,20.0,102.0
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,0.206333,23.0,31.333333,40.666667,19.666667,7.666667,2.666667,14.333333,23.333333,101.0


In [12]:
# Calculate overall head to head matchup between teams.


def head_to_head(row: pd.Series):
    team = row["home"]
    away = row["away"]
    date = row["date"]
    home_win_rate = nba_gamelog_data.query(
        f"home == '{team}' and away == '{away}' and date < {date}"
    )["win"].mean()
    away_win_rate = (
        1
        - nba_gamelog_data.query(
            f"home == '{away}' and away == '{team}' and date < {date}"
        )["win"].mean()
    )
    return (home_win_rate + away_win_rate) / 2


nba_gamelog_data["head_to_head"] = nba_gamelog_data.apply(head_to_head, axis=1)
nba_gamelog_data.head()

  nba_gamelog_data["head_to_head"] = nba_gamelog_data.apply(head_to_head, axis=1)


Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,season_opp_ft,season_opp_fta,season_opp_trb,season_opp_ast,season_opp_stl,season_opp_blk,season_opp_tov,season_opp_pf,season_opp_pts_allowed,head_to_head
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,,,,,,,,,,
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,19.0,28.0,35.0,20.0,10.0,2.0,8.0,25.0,93.0,
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,24.5,33.5,35.0,19.5,8.5,6.0,15.0,19.5,94.5,
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,19.0,24.333333,41.0,20.666667,6.666667,9.333333,19.0,20.0,102.0,
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,23.0,31.333333,40.666667,19.666667,7.666667,2.666667,14.333333,23.333333,101.0,


In [13]:
# Calculate overall head to head matchup between teams.


def season_head_to_head(row: pd.Series):
    team = row["home"]
    away = row["away"]
    date = row["date"]
    season = row["season"]
    home_win_rate = nba_gamelog_data.query(
        f"season == {season} and home == '{team}' and away == '{away}' and date < {date}"
    )["win"].mean()
    away_win_rate = (
        1
        - nba_gamelog_data.query(
            f"season == {season} and home == '{away}' and away == '{team}' and date < {date}"
        )["win"].mean()
    )
    return (home_win_rate + away_win_rate) / 2


nba_gamelog_data["season_head_to_head"] = nba_gamelog_data.apply(
    season_head_to_head, axis=1
)
nba_gamelog_data.head()

  nba_gamelog_data["season_head_to_head"] = nba_gamelog_data.apply(


Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,season_opp_fta,season_opp_trb,season_opp_ast,season_opp_stl,season_opp_blk,season_opp_tov,season_opp_pf,season_opp_pts_allowed,head_to_head,season_head_to_head
0,ATL,2000,1,941522400.0,WAS,L,87,94,31,78,...,,,,,,,,,,
1,ATL,2000,2,941695200.0,MIL,L,109,119,41,83,...,28.0,35.0,20.0,10.0,2.0,8.0,25.0,93.0,,
2,ATL,2000,3,941868000.0,CHI,W,113,97,44,81,...,33.5,35.0,19.5,8.5,6.0,15.0,19.5,94.5,,
3,ATL,2000,4,942040800.0,DEN,L,100,115,39,82,...,24.333333,41.0,20.666667,6.666667,9.333333,19.0,20.0,102.0,,
4,ATL,2000,5,942213600.0,MEM,L,97,102,39,92,...,31.333333,40.666667,19.666667,7.666667,2.666667,14.333333,23.333333,101.0,,


In [14]:
nba_gamelog_data.fillna(0)

Unnamed: 0,home,season,game,date,away,team_wl,team_score,opp_score,team_fg,team_fga,...,season_opp_fta,season_opp_trb,season_opp_ast,season_opp_stl,season_opp_blk,season_opp_tov,season_opp_pf,season_opp_pts_allowed,head_to_head,season_head_to_head
0,ATL,2000,1,9.415224e+08,WAS,L,87,94,31,78,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,ATL,2000,2,9.416952e+08,MIL,L,109,119,41,83,...,28.000000,35.000000,20.000000,10.000000,2.000000,8.000000,25.000000,93.000000,0.000000,0.0
2,ATL,2000,3,9.418680e+08,CHI,W,113,97,44,81,...,33.500000,35.000000,19.500000,8.500000,6.000000,15.000000,19.500000,94.500000,0.000000,0.0
3,ATL,2000,4,9.420408e+08,DEN,L,100,115,39,82,...,24.333333,41.000000,20.666667,6.666667,9.333333,19.000000,20.000000,102.000000,0.000000,0.0
4,ATL,2000,5,9.422136e+08,MEM,L,97,102,39,92,...,31.333333,40.666667,19.666667,7.666667,2.666667,14.333333,23.333333,101.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58721,WAS,2024,48,1.706854e+09,MIA,L,102,110,37,90,...,23.270833,41.541667,25.770833,7.250000,3.354167,12.333333,18.229167,111.583333,0.305263,0.0
116450,IND,2024,12,1.700374e+09,ORL,L,116,128,42,95,...,27.166667,44.750000,23.750000,9.583333,5.083333,15.250000,20.750000,105.833333,0.600000,0.0
116542,LAL,2024,22,1.701756e+09,PHO,W,106,103,38,102,...,27.100000,44.100000,26.000000,6.950000,7.050000,14.200000,19.150000,113.250000,0.531250,1.0
116872,ORL,2024,13,1.700374e+09,IND,W,128,116,43,87,...,20.090909,41.909091,30.818182,7.454545,5.818182,12.181818,20.818182,123.363636,0.400000,0.0


In [15]:
nba_gamelog_data = nba_gamelog_data.drop(
    ["team_wl", "team_season", "away_season"], axis=1
)

In [16]:
nba_gamelog_data.to_csv("data/cleaned_nba_gamelog.csv", encoding="utf-8", index=False)