# 250A Project - Predicting Basketball Game Outcomes with Bayesian Networks


In [None]:
import pandas as pd
import kagglehub
import os

## Data Loading and EDA

In [2]:
# Optional: run this only if you want to download the original Kaggle dataset and query it with SQL.
# The dataset also contains a play_by_play.csv file that was too big to push; we don't think we need it.
#------------------------------------------------------------------------------------------------
# path = kagglehub.dataset_download("wyattowalsh/basketball")

# print("Path to dataset files:", path)

### DataFrame Descriptions

In [None]:
# Preview every CSV table in the data folder: print its column names and
# its first five rows so the available tables can be compared at a glance.
data_folder = '../data'

# os.listdir returns entries in arbitrary order, so sort the names to make the
# printed listing reproducible. play_by_play.csv is deliberately skipped.
csv_files = sorted(
    f for f in os.listdir(data_folder)
    if f.endswith('.csv') and f != 'play_by_play.csv'
)

print("CSV tables and their descriptions:\n")

for csv_file in csv_files:
    path = os.path.join(data_folder, csv_file)
    try:
        # nrows=5 reads only the first rows, enough for a preview.
        df = pd.read_csv(path, nrows=5)
    except Exception as e:
        # Best-effort preview: report the unreadable file and move on to the next one.
        print(f"Could not read {csv_file}: {e}\n")
    else:
        print(f"Table: {csv_file}")
        print("Columns:", list(df.columns))
        print(df.head())
        print("\n" + "-"*60 + "\n")



CSV tables and their descriptions:

Table: game_info.csv
Columns: ['game_id', 'game_date', 'attendance', 'game_time']
    game_id            game_date  attendance  game_time
0  24600001  1946-11-01 00:00:00         NaN        NaN
1  24600003  1946-11-02 00:00:00         NaN        NaN
2  24600002  1946-11-02 00:00:00         NaN        NaN
3  24600004  1946-11-02 00:00:00         NaN        NaN
4  24600005  1946-11-02 00:00:00         NaN        NaN

------------------------------------------------------------

Table: player.csv
Columns: ['id', 'full_name', 'first_name', 'last_name', 'is_active']
      id            full_name first_name     last_name  is_active
0  76001       Alaa Abdelnaby       Alaa     Abdelnaby          0
1  76002      Zaid Abdul-Aziz       Zaid    Abdul-Aziz          0
2  76003  Kareem Abdul-Jabbar     Kareem  Abdul-Jabbar          0
3     51   Mahmoud Abdul-Rauf    Mahmoud    Abdul-Rauf          0
4   1505    Tariq Abdul-Wahad      Tariq   Abdul-Wahad          0


In [5]:
# Read the player table from the data folder into a DataFrame.
players = pd.read_csv(filepath_or_buffer='../data/player.csv')

In [9]:
# Keep only the rows whose is_active flag equals 1, then display the result.
active_players = players.loc[players['is_active'].eq(1)]
active_players

Unnamed: 0,id,full_name,first_name,last_name,is_active
10,1630173,Precious Achiuwa,Precious,Achiuwa,1
22,203500,Steven Adams,Steven,Adams,1
24,1628389,Bam Adebayo,Bam,Adebayo,1
29,1630534,Ochai Agbaji,Ochai,Agbaji,1
41,1630583,Santi Aldama,Santi,Aldama,1
...,...,...,...,...,...
4806,201152,Thaddeus Young,Thaddeus,Young,1
4808,1629027,Trae Young,Trae,Young,1
4809,1630209,Omer Yurtseven,Omer,Yurtseven,1
4812,203469,Cody Zeller,Cody,Zeller,1


In [15]:
# Read the game table, then display the last 20 rows after an ascending sort on
# game_date. No date parsing is requested here, so the column is compared as read
# from the CSV; the timestamps shown in the output are ISO-formatted, which sort
# chronologically even as text.
games = pd.read_csv(filepath_or_buffer='../data/game.csv')
(
    games
    .sort_values(by='game_date')
    .tail(20)
)

Unnamed: 0,season_id,team_id_home,team_abbreviation_home,team_name_home,game_id,game_date,matchup_home,wl_home,min,fgm_home,...,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,video_available_away,season_type
65675,42022,1610612755,PHI,Philadelphia 76ers,42200216,2023-05-11 00:00:00,PHI vs. BOS,L,240,30.0,...,50.0,22.0,7.0,6.0,18.0,14.0,95.0,9,1,Playoffs
65677,42022,1610612747,LAL,Los Angeles Lakers,42200236,2023-05-12 00:00:00,LAL vs. GSW,W,240,39.0,...,53.0,25.0,3.0,2.0,11.0,30.0,101.0,-21,1,Playoffs
65678,42022,1610612748,MIA,Miami Heat,42200206,2023-05-12 00:00:00,MIA vs. NYK,W,240,33.0,...,41.0,13.0,2.0,9.0,12.0,22.0,92.0,-4,1,Playoffs
65679,42022,1610612738,BOS,Boston Celtics,42200217,2023-05-14 00:00:00,BOS vs. PHI,W,240,41.0,...,37.0,15.0,4.0,7.0,14.0,14.0,88.0,-24,1,Playoffs
65680,42022,1610612743,DEN,Denver Nuggets,42200311,2023-05-16 00:00:00,DEN vs. LAL,W,240,50.0,...,30.0,30.0,6.0,4.0,7.0,21.0,126.0,-6,1,Playoffs
65681,42022,1610612738,BOS,Boston Celtics,42200301,2023-05-17 00:00:00,BOS vs. MIA,L,240,42.0,...,35.0,20.0,12.0,3.0,15.0,22.0,123.0,7,1,Playoffs
65682,42022,1610612743,DEN,Denver Nuggets,42200312,2023-05-18 00:00:00,DEN vs. LAL,W,240,39.0,...,40.0,26.0,10.0,7.0,12.0,19.0,103.0,-5,1,Playoffs
65683,42022,1610612738,BOS,Boston Celtics,42200302,2023-05-19 00:00:00,BOS vs. MIA,L,240,37.0,...,45.0,23.0,9.0,5.0,11.0,17.0,111.0,6,1,Playoffs
65684,42022,1610612747,LAL,Los Angeles Lakers,42200313,2023-05-20 00:00:00,LAL vs. DEN,L,240,38.0,...,39.0,30.0,7.0,1.0,6.0,19.0,119.0,11,1,Playoffs
65685,42022,1610612748,MIA,Miami Heat,42200303,2023-05-21 00:00:00,MIA vs. BOS,W,240,46.0,...,57.0,25.0,3.0,3.0,15.0,24.0,102.0,-26,1,Playoffs


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of
# `games` that describe() includes by default. The differing counts per column
# show which stats have missing values.
games.describe()

Unnamed: 0,season_id,team_id_home,game_id,min,fgm_home,fga_home,fg_pct_home,fg3m_home,fg3a_home,fg3_pct_home,...,dreb_away,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,video_available_away
count,65698.0,65698.0,65698.0,65698.0,65685.0,50251.0,50208.0,52480.0,47015.0,46624.0,...,46700.0,49973.0,49897.0,46849.0,47073.0,47013.0,62847.0,65698.0,65698.0,65698.0
mean,22949.338747,1609926000.0,25847470.0,221.003486,39.672269,83.992796,0.467321,5.735099,17.741146,0.346136,...,30.238073,42.119645,22.135419,7.854148,4.681537,15.19986,23.097284,100.991567,-3.627569,0.20133
std,5000.3055,33243130.0,6303760.0,67.903521,6.770802,9.164445,0.059423,4.537337,10.54581,0.151234,...,5.588675,6.867396,5.380805,3.031766,2.50082,4.299798,5.227208,14.418755,13.091395,0.400997
min,12005.0,45.0,10500000.0,0.0,4.0,0.0,0.14,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,-73.0,0.0
25%,21981.0,1610613000.0,21300530.0,240.0,35.0,78.0,0.427,2.0,10.0,0.261,...,26.0,37.0,18.0,6.0,3.0,12.0,20.0,92.0,-12.0,0.0
50%,21997.0,1610613000.0,26300070.0,240.0,40.0,84.0,0.467,5.0,16.0,0.348,...,30.0,42.0,22.0,8.0,4.0,15.0,23.0,101.0,-4.0,0.0
75%,22011.0,1610613000.0,28800690.0,240.0,44.0,89.0,0.506,9.0,24.0,0.42975,...,34.0,47.0,26.0,10.0,6.0,18.0,26.0,110.0,5.0,0.0
max,42022.0,1610617000.0,49800090.0,365.0,84.0,240.0,0.697,28.0,77.0,1.0,...,60.0,90.0,89.0,27.0,19.0,40.0,115.0,196.0,68.0,1.0


In [17]:
# List every column label in `games` to see which home/away stats are available.
games.columns

Index(['season_id', 'team_id_home', 'team_abbreviation_home', 'team_name_home',
       'game_id', 'game_date', 'matchup_home', 'wl_home', 'min', 'fgm_home',
       'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home', 'fg3_pct_home',
       'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home', 'dreb_home',
       'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home', 'pf_home',
       'pts_home', 'plus_minus_home', 'video_available_home', 'team_id_away',
       'team_abbreviation_away', 'team_name_away', 'matchup_away', 'wl_away',
       'fgm_away', 'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away',
       'fg3_pct_away', 'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away',
       'dreb_away', 'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away',
       'pf_away', 'pts_away', 'plus_minus_away', 'video_available_away',
       'season_type'],
      dtype='object')

## Modeling

In [None]:
# TODO: build the Bayesian network, probably with pgmpy or pomegranate.