In [1]:
import pandas as pd
import csv
import numpy as np
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
%matplotlib inline


In [2]:
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams

nba_teams = teams.get_teams()
# Select the dictionary for the team whose abbreviation is 'PHX' (the Phoenix
# Suns in the output below); it contains the team ID.
# NOTE: the `celtics` / `celtics_id` names are misleading leftovers. They are
# kept unchanged so any other cell that refers to them keeps working.
celtics = [team for team in nba_teams if team['abbreviation'] == 'PHX'][0]
celtics_id = celtics['id']

# Query for games where the selected team (PHX) was playing.
gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=celtics_id)
# The first DataFrame of those returned is what we want.
games = gamefinder.get_data_frames()[0]
# Game counts grouped by the last 4 characters of SEASON_ID, from '2020' on.
# Bound to a name because only the last expression of a cell is displayed;
# as a bare expression in the middle of the cell the result was discarded.
games_per_season = games.groupby(games.SEASON_ID.str[-4:])[['GAME_ID']].count().loc['2020':]

# Subset the games to those whose SEASON_ID ends in '2020'.
games_2021 = games[games.SEASON_ID.str[-4:] == '2020']
games_2021.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42020,1610612756,PHX,Phoenix Suns,42000313,2021-06-24,PHX @ LAC,L,240,92,...,0.923,11,32,43,24,5,3,10,19,-14.0
1,42020,1610612756,PHX,Phoenix Suns,42000312,2021-06-22,PHX vs. LAC,W,239,104,...,0.889,7,32,39,24,5,3,9,20,1.0
2,42020,1610612756,PHX,Phoenix Suns,42000311,2021-06-20,PHX vs. LAC,W,240,120,...,1.0,7,36,43,31,5,3,7,16,6.0
3,42020,1610612756,PHX,Phoenix Suns,42000234,2021-06-13,PHX @ DEN,W,241,125,...,0.935,7,41,48,18,6,6,14,21,7.0
4,42020,1610612756,PHX,Phoenix Suns,42000233,2021-06-11,PHX @ DEN,W,239,116,...,0.9,4,34,38,26,10,2,12,13,14.0


In [3]:
from basketball_reference_scraper.seasons import get_schedule

In [4]:
# Fetch the schedule for 2020 with the playoffs flag switched on.
playoff2020 = get_schedule(2020, playoffs=True)

In [5]:
# Display the schedule frame fetched above (rendered as the cell output).
playoff2020

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS
1060,2020-08-17,Utah Jazz,125,Denver Nuggets,135
1061,2020-08-17,Brooklyn Nets,110,Toronto Raptors,134
1062,2020-08-17,Philadelphia 76ers,101,Boston Celtics,109
1063,2020-08-17,Dallas Mavericks,110,Los Angeles Clippers,118
1064,2020-08-18,Orlando Magic,122,Milwaukee Bucks,110
...,...,...,...,...,...
1138,2020-10-02,Miami Heat,114,Los Angeles Lakers,124
1139,2020-10-04,Los Angeles Lakers,104,Miami Heat,115
1140,2020-10-06,Los Angeles Lakers,102,Miami Heat,96
1141,2020-10-09,Miami Heat,111,Los Angeles Lakers,108


In [30]:
# Fetch the schedule for 2019 with the playoffs flag switched on, then show it.
playoff2019 = get_schedule(2019, playoffs=True)
playoff2019

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS
1231,2019-04-13,Brooklyn Nets,111,Philadelphia 76ers,102
1232,2019-04-13,Orlando Magic,104,Toronto Raptors,101
1233,2019-04-13,Los Angeles Clippers,104,Golden State Warriors,121
1234,2019-04-13,San Antonio Spurs,101,Denver Nuggets,96
1235,2019-04-14,Indiana Pacers,74,Boston Celtics,84
...,...,...,...,...,...
1308,2019-06-02,Golden State Warriors,109,Toronto Raptors,104
1309,2019-06-05,Toronto Raptors,123,Golden State Warriors,109
1310,2019-06-07,Toronto Raptors,105,Golden State Warriors,92
1311,2019-06-10,Golden State Warriors,106,Toronto Raptors,105


In [6]:
# Dump the 2020 playoff schedule values to a comma-separated text file.
# No `header` argument is passed, so only the values are written (no column
# names), each formatted with the '% s' string format.
np.savetxt(
    '2020playoff.csv',
    playoff2020,
    fmt="% s",
    delimiter=",",
)

In [7]:
# Read '2020data.csv' (resolved relative to the working directory) into a
# DataFrame and preview its first five rows.
csv_path = Path('2020data.csv')
df_2020 = pd.read_csv(filepath_or_buffer=csv_path)
df_2020.head(n=5)

Unnamed: 0,Season,Time,VTeam,Hteam,VScore,Hscore,Mar.,FGM,FGA,FG%,...,OPOFF,OPDEF,OPREB,OPAST,OPSTL,OPBLK,OPTO,OPPF,OPPTS,H Betting Line
0,(2019-20),8/17/2020 0:00,UTA,DEN,125,135,-10,40.1,85.1,0.47,...,10.08,32.56,42.64,25.37,7.19,4.44,14.26,20.47,109.18,160
1,(2019-20),8/17/2020 0:00,BRO,TOR,110,134,-24,40.4,90.3,0.45,...,10.92,35.14,46.06,25.57,7.22,5.49,16.83,20.36,106.51,450
2,(2019-20),8/17/2020 0:00,PHI,BOS,101,109,-8,41.1,87.9,0.47,...,10.32,34.01,44.33,22.4,7.14,5.5,15.24,20.65,107.35,230
3,(2019-20),8/17/2020 0:00,DAL,LAC,110,118,-8,41.7,90.3,0.46,...,10.71,34.6,45.31,23.74,7.5,4.69,14.07,22.88,109.9,285
4,(2019-20),8/18/2020 0:00,ORL,MIL,122,110,12,39.3,88.6,0.44,...,9.49,36.32,45.81,24.16,7.37,4.47,14.11,21.73,108.59,900


In [8]:
# Index the 2020 frame by its 'Time' column.
# The guard plus rebinding (instead of `inplace=True`) makes this cell safe to
# re-run: once 'Time' is already the index it is no longer a column, and an
# unconditional set_index('Time') would raise KeyError.
if 'Time' in df_2020.columns:
    df_2020 = df_2020.set_index('Time')
df_2020

Unnamed: 0_level_0,Season,VTeam,Hteam,VScore,Hscore,Mar.,FGM,FGA,FG%,3FGM,...,OPOFF,OPDEF,OPREB,OPAST,OPSTL,OPBLK,OPTO,OPPF,OPPTS,H Betting Line
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8/17/2020 0:00,(2019-20),UTA,DEN,125,135,-10,40.1,85.1,0.47,13.4,...,10.08,32.56,42.64,25.37,7.19,4.44,14.26,20.47,109.18,160
8/17/2020 0:00,(2019-20),BRO,TOR,110,134,-24,40.4,90.3,0.45,13.1,...,10.92,35.14,46.06,25.57,7.22,5.49,16.83,20.36,106.51,450
8/17/2020 0:00,(2019-20),PHI,BOS,101,109,-8,41.1,87.9,0.47,11.6,...,10.32,34.01,44.33,22.40,7.14,5.50,15.24,20.65,107.35,230
8/17/2020 0:00,(2019-20),DAL,LAC,110,118,-8,41.7,90.3,0.46,15.1,...,10.71,34.60,45.31,23.74,7.50,4.69,14.07,22.88,109.90,285
8/18/2020 0:00,(2019-20),ORL,MIL,122,110,12,39.3,88.6,0.44,11.1,...,9.49,36.32,45.81,24.16,7.37,4.47,14.11,21.73,108.59,900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10/2/2020 0:00,(2019-20),MIA,LAL,114,124,-10,39.5,84.4,0.47,13.4,...,9.41,32.86,42.27,23.41,8.24,3.70,15.92,21.75,107.65,450
10/4/2020 0:00,(2019-20),LAL,MIA,104,115,-11,42.3,88.3,0.48,11.0,...,9.23,33.30,42.53,24.58,7.47,4.14,14.00,21.70,109.10,-420
10/6/2020 0:00,(2019-20),LAL,MIA,102,96,6,42.3,88.3,0.48,11.0,...,9.23,33.30,42.53,24.58,7.47,4.14,14.00,21.70,109.10,-330
10/9/2020 0:00,(2019-20),MIA,LAL,111,108,3,39.5,84.4,0.47,13.4,...,9.41,32.86,42.27,23.41,8.24,3.70,15.92,21.75,107.65,280


In [9]:
# Feature columns fed to the model: the base box-score stat names, the same
# names with an 'OP' prefix (presumably the opponent's stats; verify against
# the CSV schema), and finally the 'H Betting Line' column.
_base_stats = [
    'FGM', 'FGA', 'FG%', '3FGM', '3FGA', '3FG%', 'FTM', 'FTA', 'FT%',
    'OFF', 'DEF', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS',
]
df_2020_x_var_list = [
    *_base_stats,
    *(f'OP{stat}' for stat in _base_stats),
    'H Betting Line',
]

In [10]:
# Training inputs: the selected feature columns of the 2020 frame.
df_2020_x_train = df_2020.loc[:, df_2020_x_var_list]
# Training target: the 'Mar.' column of the same frame.
df_2020_y_train = df_2020.loc[:, 'Mar.']

In [13]:
# Read '2021playoff.csv' (resolved relative to the working directory) into a
# DataFrame and preview its first five rows. Note that `csv_path` is re-bound
# here; it previously pointed at the 2020 file.
csv_path = Path('2021playoff.csv')
df_2021 = pd.read_csv(filepath_or_buffer=csv_path)
df_2021.head(n=5)

Unnamed: 0,Season,Time,VTeam,Hteam,VScore,Hscore,Mar.,FGM,FGA,FG%,...,OPOFF,OPDEF,OPREB,OPAST,OPSTL,OPBLK,OPTO,OPPF,OPPTS,H Betting Line
0,(2020-21),5/22/2021 0:00,MIA,MIL,107.0,109.0,-2,39.22,83.74,0.47,...,8.04,33.46,41.5,26.32,7.9,3.97,14.07,18.93,108.07,190.0
1,(2020-21),5/22/2021 0:00,DAL,LAC,113.0,103.0,10,41.08,87.32,0.47,...,9.13,34.21,43.33,22.88,6.25,4.32,12.07,19.39,112.44,210.0
2,(2020-21),5/22/2021 0:00,BOS,BRO,93.0,104.0,-11,41.46,88.9,0.47,...,10.63,33.63,44.25,23.46,7.72,5.32,14.06,20.43,112.63,320.0
3,(2020-21),5/22/2021 0:00,POR,DEN,123.0,109.0,14,41.29,91.08,0.45,...,10.64,33.9,44.54,21.26,6.89,5.04,11.1,18.92,116.11,100.0
4,(2020-21),5/23/2021 0:00,WAS,PHI,118.0,125.0,-7,43.17,90.93,0.47,...,9.68,35.51,45.19,25.49,7.33,4.13,14.4,21.6,116.64,300.0


In [14]:
# Index the 2021 frame by its 'Time' column.
# The guard plus rebinding (instead of `inplace=True`) makes this cell safe to
# re-run: once 'Time' is already the index it is no longer a column, and an
# unconditional set_index('Time') would raise KeyError.
if 'Time' in df_2021.columns:
    df_2021 = df_2021.set_index('Time')
df_2021

Unnamed: 0_level_0,Season,VTeam,Hteam,VScore,Hscore,Mar.,FGM,FGA,FG%,3FGM,...,OPOFF,OPDEF,OPREB,OPAST,OPSTL,OPBLK,OPTO,OPPF,OPPTS,H Betting Line
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5/22/2021 0:00,(2020-21),MIA,MIL,107.0,109.0,-2,39.22,83.74,0.47,12.94,...,8.04,33.46,41.50,26.32,7.90,3.97,14.07,18.93,108.07,190.0
5/22/2021 0:00,(2020-21),DAL,LAC,113.0,103.0,10,41.08,87.32,0.47,13.81,...,9.13,34.21,43.33,22.88,6.25,4.32,12.07,19.39,112.44,210.0
5/22/2021 0:00,(2020-21),BOS,BRO,93.0,104.0,-11,41.46,88.90,0.47,13.60,...,10.63,33.63,44.25,23.46,7.72,5.32,14.06,20.43,112.63,320.0
5/22/2021 0:00,(2020-21),POR,DEN,123.0,109.0,14,41.29,91.08,0.45,15.72,...,10.64,33.90,44.54,21.26,6.89,5.04,11.10,18.92,116.11,100.0
5/23/2021 0:00,(2020-21),WAS,PHI,118.0,125.0,-7,43.17,90.93,0.47,10.18,...,9.68,35.51,45.19,25.49,7.33,4.13,14.40,21.60,116.64,300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6/30/2021 0:00,(2020-21),PHO,LAC,,,0,43.31,88.29,0.49,13.06,...,8.75,34.19,42.94,26.93,7.18,4.33,12.53,19.08,115.28,
7/1/2021 0:00,(2020-21),ATL,MIL,,,0,40.79,87.24,0.47,12.43,...,10.56,35.07,45.63,24.13,6.99,4.75,13.24,19.33,113.69,
7/2/2021 0:00,(2020-21),LAC,PHO,,,0,41.79,86.69,0.48,14.26,...,9.42,34.74,44.15,24.39,7.07,4.10,13.19,19.21,114.01,
7/3/2021 0:00,(2020-21),MIL,ATL,,,0,44.74,91.81,0.49,14.42,...,10.29,37.83,48.13,25.47,8.13,4.64,13.82,17.28,120.13,


In [15]:
# The 2021 test features must be exactly the columns (and column order) the
# model was trained on, so derive the list from the 2020 training list rather
# than maintaining a second hand-typed copy that can silently drift.
# A fresh list is created so mutating one does not affect the other.
df_2021_x_var_list = list(df_2020_x_var_list)

In [18]:
# Inclusive bounds of the 2021 test window, written in the same text format
# as the values of the 'Time' column.
playoff2021_testing_start, playoff2021_testing_end = (
    '5/22/2021 0:00',
    '6/20/2021 0:00',
)

In [19]:
# Select the test window by comparing parsed dates instead of slicing the
# string-typed, non-unique 'Time' index by label. A label slice depends on the
# physical row order and on both bound labels being present in the index
# (a missing label raises KeyError on an unsorted index); a date comparison
# selects the rows whose date falls inside the inclusive window regardless.
# The original string index is preserved on the selected rows.
_time_2021 = pd.to_datetime(df_2021.index)
_in_test_window = (
    (_time_2021 >= pd.Timestamp(playoff2021_testing_start))
    & (_time_2021 <= pd.Timestamp(playoff2021_testing_end))
)
df_2021_test_X = df_2021.loc[_in_test_window, df_2021_x_var_list]
df_2021_test_y = df_2021.loc[_in_test_window, 'Mar.']

In [20]:
# Fit a scikit-learn random-forest classifier (100 trees, max depth 3, fixed
# random_state for repeatability) using just the 2020 training set.
# NOTE(review): the target is the 'Mar.' column and a *classifier* is used, so
# each distinct target value is presumably treated as its own class label;
# confirm this is intended rather than a regressor.
df_2020_forest_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=0,
)
df_2020_forest_model.fit(X=df_2020_x_train, y=df_2020_y_train)



RandomForestClassifier(max_depth=3, random_state=0)

In [21]:
# Predict the target for every row of the 2021 test features using the model
# fitted on the 2020 data.
playoff2021_predictions = df_2020_forest_model.predict(X=df_2021_test_X)


In [29]:
# Put the actual test targets and the model's predictions side by side in one
# two-column frame, then show its last 20 rows.
playoff2021_results = df_2021_test_y.to_frame().assign(
    **{"Predicted Mar.": playoff2021_predictions}
)
playoff2021_results.tail(n=20)

Unnamed: 0_level_0,Mar.,Predicted Mar.
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
6/8/2021 0:00,-3,-12
6/9/2021 0:00,-25,-12
6/10/2021 0:00,-3,-12
6/10/2021 0:00,-6,-12
6/11/2021 0:00,16,8
6/11/2021 0:00,14,-2
6/12/2021 0:00,-26,-12
6/13/2021 0:00,-11,8
6/13/2021 0:00,7,8
6/14/2021 0:00,-3,8
