In [2]:
from scipy import stats
import time
# import requests
from bs4 import BeautifulSoup
import threading
import pandas as pd
import tqdm
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler, PowerTransformer
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

# --- Scraping endpoints ------------------------------------------------------
# Site root, plus the per-day box score index (filled in via str.format with
# month/day/year).
base_url = 'https://www.basketball-reference.com/'
day_scores_base_url = 'https://www.basketball-reference.com/boxscores/?month={month}&day={day}&year={year}'

# --- Local storage -----------------------------------------------------------
# Directory holding the exported CSV / pickle files read by this notebook.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
data_path = r'/media/td/Samsung_T5/sports/nba'
# data_path = r'C:\Users\TristanDelforge\Documents\sports_predictor\nba'
db_name = 'nba_db'

# --- Table / file base names -------------------------------------------------
box_score_link_table_name = 'boxscore_links'

box_score_details_table_name = 'boxscore_details'
processed_team_data_table_name = 'processed_team_data'
player_detail_table_name = 'player_details'
processed_player_data_table_name = 'processed_player_data'
aggregated_player_data_table_name = 'aggregated_player_data'
combined_feature_file_data_table_name = 'combined_feature_file'
past_n_game_dataset_table_name = 'past_n_game_dataset'

# Name of the label column (1 = the row's team won the game).
target = 'win'

# --- Scraper bookkeeping -----------------------------------------------------
date_record_pickle_file_name = 'scraped_dates'
box_score_record_pickle_file_name = 'scraped_games'
max_tries = 5
file_lock = threading.Lock()

# --- Rating parameters -------------------------------------------------------
# NOTE(review): presumably Elo-style rating settings consumed by code outside
# this notebook section; confirm their exact meaning there.
starting_rating = 1000
rating_k_factor = 100
rating_floor = 100
rating_ceiling = 10000
rating_d = 1000
k_min_sensitivity = 1

# --- pandas display ----------------------------------------------------------
# Widen the display limits so the wide box score frames render in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000


In [3]:
# Load the team-level and player-level box score exports ('|'-delimited CSVs).
# low_memory=False makes pandas read each file in one pass instead of chunks.
team_data = pd.read_csv(
    f'{data_path}/{box_score_details_table_name}.csv',
    sep='|',
    low_memory=False,
)
player_data = pd.read_csv(
    f'{data_path}/{player_detail_table_name}.csv',
    sep='|',
    low_memory=False,
)

# Quick size check: (rows, columns) for each frame.
team_data.shape, player_data.shape

((25756, 45), (262446, 48))

In [4]:
# Preview the first five team box score rows to check the columns loaded as expected.
team_data.head(n=5)

Unnamed: 0,team_tag,team_link,team_name,opponent_tag,opponent_link,opponent_name,location,win,year,month,day,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus,ts_pct,efg_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,off_rtg,def_rtg
0,hou,https://www.basketball-reference.com//teams/HO...,Houston Rockets,por,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,"Rose Garden Arena, Portland, Oregon",0,2009,10,27,240,30,81,0.37,5,18,0.278,22,29,0.759,10,23,33,18,12,2,16,26,87,,0.464,0.401,0.222,0.358,20.4,65.7,39.3,60.0,12.4,3.6,14.6,100.0,90.1,99.4
1,por,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,hou,https://www.basketball-reference.com//teams/HO...,Houston Rockets,"Rose Garden Arena, Portland, Oregon",1,2009,10,27,240,33,77,0.429,10,21,0.476,20,22,0.909,12,39,51,23,9,12,26,27,96,,0.554,0.494,0.273,0.286,34.3,79.6,60.7,69.7,9.3,19.0,23.1,100.0,99.4,90.1
2,lac,https://www.basketball-reference.com//teams/LA...,Los Angeles Clippers,lal,https://www.basketball-reference.com//teams/LA...,Los Angeles Lakers,"STAPLES Center, Los Angeles, California",0,2009,10,27,240,39,87,0.448,3,15,0.2,11,16,0.688,15,36,51,27,10,4,20,27,92,,0.489,0.466,0.172,0.184,33.3,67.9,52.0,69.2,10.3,5.9,17.5,100.0,94.4,101.6
3,lal,https://www.basketball-reference.com//teams/LA...,Los Angeles Lakers,lac,https://www.basketball-reference.com//teams/LA...,Los Angeles Clippers,"STAPLES Center, Los Angeles, California",1,2009,10,27,240,35,85,0.412,4,17,0.235,25,37,0.676,17,30,47,17,13,4,16,15,99,,0.489,0.435,0.2,0.435,32.1,66.7,48.0,48.6,13.3,5.6,13.6,100.0,101.6,94.4
4,was,https://www.basketball-reference.com//teams/WA...,Washington Wizards,dal,https://www.basketball-reference.com//teams/DA...,Dallas Mavericks,"American Airlines Center, Dallas, Texas",1,2009,10,27,240,39,84,0.464,4,13,0.308,20,24,0.833,9,37,46,19,6,4,9,29,102,,0.539,0.488,0.155,0.286,22.5,77.1,52.3,48.7,6.7,6.9,8.7,100.0,113.9,101.6


In [5]:
# Preview the first five player box score rows to check the columns loaded as expected.
player_data.head(n=5)

Unnamed: 0,ast,ast_pct,blk,blk_pct,day,def_rtg,drb,drb_pct,efg_pct,fg,fg3,fg3_pct,fg3a,fg3a_per_fga_pct,fg_pct,fga,ft,ft_pct,fta,fta_per_fga_pct,location,month,mp,off_rtg,opponent_link,opponent_name,opponent_tag,orb,orb_pct,pf,player_link,player_name,plus_minus,pts,reason,stl,stl_pct,team_link,team_name,team_tag,tov,tov_pct,trb,trb_pct,ts_pct,usg_pct,win,year
0,5.0,27.7,1.0,2.1,27,102.0,2.0,6.6,0.471,8.0,0.0,0.0,6.0,0.353,0.471,17.0,3.0,0.75,4.0,0.235,"Rose Garden Arena, Portland, Oregon",10,41:41,101.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,1.0,2.4,2.0,https://www.basketball-reference.com//players/...,Aaron Brooks,7.0,19.0,,2.0,2.4,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,2.0,9.6,3.0,4.1,0.506,21.8,0,2009
1,2.0,11.4,0.0,0.0,27,103.0,3.0,12.5,0.5,3.0,2.0,0.5,4.0,0.5,0.375,8.0,4.0,0.667,6.0,0.75,"Rose Garden Arena, Portland, Oregon",10,32:58,84.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,0.0,0.0,3.0,https://www.basketball-reference.com//players/...,Trevor Ariza,-10.0,12.0,,1.0,1.5,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,4.0,27.3,3.0,5.2,0.564,19.4,0,2009
2,0.0,0.0,0.0,0.0,27,98.0,3.0,14.8,0.278,2.0,1.0,0.25,4.0,0.444,0.222,9.0,3.0,0.75,4.0,0.444,"Rose Garden Arena, Portland, Oregon",10,27:43,77.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,1.0,3.5,1.0,https://www.basketball-reference.com//players/...,Shane Battier,-22.0,8.0,,2.0,3.6,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,1.0,8.5,4.0,8.2,0.372,18.6,0,2009
3,1.0,8.0,0.0,0.0,27,94.0,2.0,11.0,0.5,3.0,0.0,,0.0,0.0,0.5,6.0,0.0,,0.0,0.0,"Rose Garden Arena, Portland, Oregon",10,24:52,120.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,3.0,11.8,1.0,https://www.basketball-reference.com//players/...,Chuck Hayes,-8.0,6.0,,3.0,6.0,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,0.0,0.0,5.0,11.5,0.5,10.6,0,2009
4,1.0,7.8,1.0,3.9,27,95.0,4.0,24.8,0.167,1.0,0.0,,0.0,0.0,0.167,6.0,1.0,0.5,2.0,0.333,"Rose Garden Arena, Portland, Oregon",10,22:05,36.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,0.0,0.0,4.0,https://www.basketball-reference.com//players/...,Luis Scola,-8.0,3.0,,1.0,2.3,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,3.0,30.4,4.0,10.4,0.218,19.6,0,2009


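- Stats negatively correlated with winning (team level):
- Defensive rating (`def_rtg`) is strongly correlated with losing (about -0.52). A team's `def_rtg` mirrors its opponent's `off_rtg`, so a high value means the team was under defensive pressure and rarely wins that game.
- Personal fouls (`pf`) are also correlated with losing (about -0.12).
- Turnovers (`tov`, `tov_pct`) - TODO: investigate, the result is not intuitive.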

In [6]:
# Pairwise correlations between the numeric team box score stats.
# Only numeric/bool columns are correlated: team_data also carries text columns
# (team names, links, location), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently dropping them.
team_data_corr = team_data.select_dtypes(include=['number', 'bool']).corr()

# Keep only stats with a noticeable relationship to the outcome:
# correlation with 'win' above 0.2 or below -0.08.
team_data_corr = team_data_corr[(team_data_corr['win'] > .2) | (team_data_corr['win'] < -.08)]

# Show the surviving stats ordered from most negative to most positive.
team_data_corr.sort_values('win')[['win']]

Unnamed: 0,win
def_rtg,-0.51808
pf,-0.123578
tov,-0.119115
tov_pct,-0.110968
fg3,0.212561
trb,0.261446
fg3_pct,0.306229
ast,0.320717
drb,0.343178
fg,0.369003


In [7]:
# Pairwise correlations between the numeric player box score stats.
# Only numeric/bool columns are correlated: player_data also carries text
# columns (e.g. 'mp' as "MM:SS", names, links, location), and DataFrame.corr()
# raises on those in pandas >= 2.0 instead of silently dropping them.
player_data_corr = player_data.select_dtypes(include=['number', 'bool']).corr()

# Individual player stats relate to the team result far more weakly than team
# totals, so a looser |correlation| > 0.05 cut-off is used here.
player_data_corr = player_data_corr[(player_data_corr['win'] > .05) | (player_data_corr['win'] < -.05)]

# Show the surviving stats ordered from most negative to most positive.
player_data_corr.sort_values('win')[['win']]

Unnamed: 0,win
def_rtg,-0.425587
fg,0.059951
fg3,0.061186
ast,0.063367
drb,0.06514
pts,0.066164
fg3_pct,0.10023
fg_pct,0.102537
efg_pct,0.110622
ts_pct,0.112799


In [19]:
past_n_game_obj = None

# Load the pre-built "past n games" dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
with open(f'{data_path}/past_n_game_dataset_4_False.pkl', 'rb') as pkl_file:
    past_n_game_dataset = pickle.load(pkl_file)

# Top-level keys of the loaded mapping (team tags, per the output below).
past_n_game_dataset.keys()

dict_keys(['cho', 'okc', 'mem', 'nyk', 'mia', 'mil', 'lac', 'bos', 'sac', 'ind', 'brk', 'cle', 'phi', 'sas', 'por', 'uta', 'min', 'dal', 'den', 'atl', 'det', 'tor', 'hou', 'noh', 'chi', 'nop', 'pho', 'gsw', 'was', 'njn', 'cha', 'lal', 'orl'])

In [20]:
# Spot-check: Phoenix's ('pho') games in December 2017, showing date parts and assists.
team_data.loc[
    team_data['team_tag'].eq('pho') & team_data['month'].eq(12) & team_data['year'].eq(2017),
    ['month', 'day', 'ast'],
].head(20)

Unnamed: 0,month,day,ast
21172,12,2,24
21188,12,4,25
21206,12,5,23
21235,12,7,22
21259,12,9,11
21300,12,12,22
21317,12,13,20
21368,12,16,22
21400,12,18,17
21418,12,20,22


In [22]:
# Column labels used to display one stored history matrix: the box score
# stats, then a home flag, then four rating columns (r1..r4).
# NOTE(review): the column order is assumed to match how the pickle was built;
# confirm against the code that generated it.
initial_team_data_columns = [
    'ast', 'ast_pct', 'blk', 'blk_pct', 'def_rtg', 'drb', 'drb_pct', 'efg_pct',
    'fg', 'fg3', 'fg3_pct', 'fg3a', 'fg3a_per_fga_pct', 'fg_pct', 'fga', 'ft',
    'ft_pct', 'fta', 'fta_per_fga_pct', 'mp', 'off_rtg', 'orb', 'orb_pct', 'pf',
    'plus_minus', 'pts', 'stl', 'stl_pct', 'tov', 'tov_pct', 'trb', 'trb_pct',
    'ts_pct', 'usg_pct',
] + ['home'] + [f'r{rating_idx}' for rating_idx in range(1, 5)]

# Inspect the history stored for Phoenix under the 2017-12-31 phi/pho game key.
pd.DataFrame(
    past_n_game_dataset['pho']["['2017-12-31', 'phi', 'pho']"],
    columns=initial_team_data_columns,
)


Unnamed: 0,ast,ast_pct,blk,blk_pct,def_rtg,drb,drb_pct,efg_pct,fg,fg3,fg3_pct,fg3a,fg3a_per_fga_pct,fg_pct,fga,ft,ft_pct,fta,fta_per_fga_pct,mp,off_rtg,orb,orb_pct,pf,plus_minus,pts,stl,stl_pct,tov,tov_pct,trb,trb_pct,ts_pct,usg_pct,home,r1,r2,r3,r4
0,0.427427,0.697698,0.042042,0.02953,0.481982,0.492492,0.172673,0.296296,0.162663,0.703704,0.627127,0.663664,0.706206,0.187688,0.369369,0.724725,0.544044,0.711211,0.729229,0.0,0.563564,0.66016,0.552052,0.221221,,0.353854,0.101602,0.118619,0.171672,0.195195,0.57958,0.271772,0.364865,0.0,1.0,0.135719,0.695169,0.0,0.108695
1,0.653153,0.569069,0.3999,0.317317,0.832332,0.169169,0.157658,0.648148,0.653654,0.778779,0.463463,0.861862,0.858859,0.574575,0.633133,0.355856,0.100601,0.567067,0.531532,0.0,0.56006,0.362863,0.327828,0.658659,,0.633634,0.190691,0.169169,0.451451,0.422923,0.146146,0.107107,0.537037,0.0,1.0,0.167207,0.697696,0.317818,0.798845
2,0.653153,0.951952,0.559059,0.481982,0.248749,0.697197,0.608108,0.481481,0.117618,0.511512,0.546547,0.473974,0.648148,0.423924,0.054054,0.929429,0.518519,0.935435,0.965966,0.0,0.305806,0.107107,0.185185,0.96997,,0.415916,0.190691,0.169169,0.912913,0.931431,0.394895,0.640641,0.647147,0.0,1.0,0.147307,0.697431,0.0,0.152157
3,0.778278,0.693694,0.559059,0.607608,0.714715,0.421421,0.693694,0.794795,0.718719,0.88989,0.783784,0.807808,0.809309,0.684685,0.583083,0.41992,0.175676,0.567067,0.542543,0.0,0.93994,0.904905,0.918418,0.386887,,0.762763,0.190691,0.225726,0.171672,0.178679,0.73974,0.872372,0.728729,0.0,0.0,0.178922,0.700387,1.0,0.825301


In [23]:
# Load the architecture-search log before inspecting it: `results` is otherwise
# first assigned in a later cell, so this cell raised NameError on a fresh
# kernel (Restart & Run All).
results = pd.read_csv(f'{data_path}/nn_architectures.csv')

# (rows, columns) of the search log at the time of reading.
results.shape

(3, 12)

In [37]:
# Re-read the architecture-search log and rank every run, best accuracy first.
# NOTE(review): the file is re-read in each cell, presumably because the search
# keeps appending to it; confirm.
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results.sort_values(by='accuracy', ascending=False)


Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy
0,16,3,6,2,128,1,1,1,cnn,8,True,0.577778


In [27]:
# Re-read the architecture-search log and rank only the 'dnn' runs, best accuracy first.
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results.loc[results['network_type'].eq('dnn')].sort_values(by='accuracy', ascending=False)


Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy
15,2,1,1,2,64,1,1,1,dnn,64,False,0.640994
14,64,2,1,2,512,1,1,1,dnn,64,False,0.638199
13,8,4,1,3,64,1,1,1,dnn,64,False,0.632609
12,4,6,1,2,512,1,2,1,dnn,32,True,0.630435
10,1,4,7,1,512,1,2,2,dnn,32,True,0.625776
9,4,2,1,2,64,2,2,2,dnn,8,True,0.619876
8,32,3,7,1,64,1,1,1,dnn,16,False,0.618012
7,2,5,5,3,512,2,2,1,dnn,32,False,0.614596
6,8,5,1,2,512,1,1,2,dnn,128,True,0.613354
5,4,4,3,3,64,1,2,1,dnn,8,True,0.605901


In [34]:
# Re-read the architecture-search log and rank only the 'cnn' runs, best accuracy first.
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results.loc[results['network_type'].eq('cnn')].sort_values(by='accuracy', ascending=False)


Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy


In [33]:
# Re-read the architecture-search log and rank only the 'rnn' runs, best accuracy first.
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results.loc[results['network_type'].eq('rnn')].sort_values(by='accuracy', ascending=False)


Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy
25,16,2,2,1,64,1,1,1,rnn,64,False,0.648758
24,256,7,6,1,64,1,1,2,rnn,32,True,0.647205
23,32,3,5,1,64,1,1,2,rnn,64,True,0.641304
22,64,4,6,1,64,2,1,1,rnn,32,False,0.638199
21,16,3,1,1,512,1,1,2,rnn,32,True,0.637578
20,4,5,4,2,64,1,1,2,rnn,32,True,0.634783
19,8,3,1,2,512,1,1,2,rnn,64,True,0.632298
18,1,1,4,3,64,2,1,1,rnn,32,False,0.628882
17,2,2,7,2,512,1,1,2,rnn,64,False,0.628571
16,64,5,4,2,64,1,1,2,rnn,64,False,0.626708
