In [2]:
from scipy import stats
import time
# import requests
from bs4 import BeautifulSoup
import threading
import pandas as pd
import tqdm
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler, PowerTransformer
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

base_url = 'https://www.basketball-reference.com/'
day_scores_base_url = 'https://www.basketball-reference.com/boxscores/?month={month}&day={day}&year={year}'
data_path = r'/media/td/Samsung_T5/sports/nba'
# data_path = r'C:\Users\TristanDelforge\Documents\sports_predictor\nba'
db_name = 'nba_db'
box_score_link_table_name = 'boxscore_links'

box_score_details_table_name = 'boxscore_details'
processed_team_data_table_name = 'processed_team_data'
player_detail_table_name = 'player_details'
processed_player_data_table_name = 'processed_player_data'
aggregated_player_data_table_name = 'aggregated_player_data'
combined_feature_file_data_table_name = 'combined_feature_file'
past_n_game_dataset_table_name = 'past_n_game_dataset'
target = 'win'

date_record_pickle_file_name = 'scraped_dates'
box_score_record_pickle_file_name = 'scraped_games'
max_tries = 5
file_lock = threading.Lock()

starting_rating = 1000
rating_k_factor = 100
rating_floor = 100
rating_ceiling = 10000
rating_d = 1000
k_min_sensitivity = 1

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [3]:
team_data = pd.read_csv('{data_path}/{db_name}.csv'.format(data_path=data_path,db_name=box_score_details_table_name),sep='|', low_memory=False)
player_data = pd.read_csv('{data_path}/{db_name}.csv'.format(data_path=data_path,db_name=player_detail_table_name), sep='|', low_memory=False)

team_data.shape, player_data.shape

((25756, 45), (262446, 48))

In [4]:
team_data.head()

Unnamed: 0,team_tag,team_link,team_name,opponent_tag,opponent_link,opponent_name,location,win,year,month,day,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus,ts_pct,efg_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,off_rtg,def_rtg
0,hou,https://www.basketball-reference.com//teams/HO...,Houston Rockets,por,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,"Rose Garden Arena, Portland, Oregon",0,2009,10,27,240,30,81,0.37,5,18,0.278,22,29,0.759,10,23,33,18,12,2,16,26,87,,0.464,0.401,0.222,0.358,20.4,65.7,39.3,60.0,12.4,3.6,14.6,100.0,90.1,99.4
1,por,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,hou,https://www.basketball-reference.com//teams/HO...,Houston Rockets,"Rose Garden Arena, Portland, Oregon",1,2009,10,27,240,33,77,0.429,10,21,0.476,20,22,0.909,12,39,51,23,9,12,26,27,96,,0.554,0.494,0.273,0.286,34.3,79.6,60.7,69.7,9.3,19.0,23.1,100.0,99.4,90.1
2,lac,https://www.basketball-reference.com//teams/LA...,Los Angeles Clippers,lal,https://www.basketball-reference.com//teams/LA...,Los Angeles Lakers,"STAPLES Center, Los Angeles, California",0,2009,10,27,240,39,87,0.448,3,15,0.2,11,16,0.688,15,36,51,27,10,4,20,27,92,,0.489,0.466,0.172,0.184,33.3,67.9,52.0,69.2,10.3,5.9,17.5,100.0,94.4,101.6
3,lal,https://www.basketball-reference.com//teams/LA...,Los Angeles Lakers,lac,https://www.basketball-reference.com//teams/LA...,Los Angeles Clippers,"STAPLES Center, Los Angeles, California",1,2009,10,27,240,35,85,0.412,4,17,0.235,25,37,0.676,17,30,47,17,13,4,16,15,99,,0.489,0.435,0.2,0.435,32.1,66.7,48.0,48.6,13.3,5.6,13.6,100.0,101.6,94.4
4,was,https://www.basketball-reference.com//teams/WA...,Washington Wizards,dal,https://www.basketball-reference.com//teams/DA...,Dallas Mavericks,"American Airlines Center, Dallas, Texas",1,2009,10,27,240,39,84,0.464,4,13,0.308,20,24,0.833,9,37,46,19,6,4,9,29,102,,0.539,0.488,0.155,0.286,22.5,77.1,52.3,48.7,6.7,6.9,8.7,100.0,113.9,101.6


In [5]:
player_data.head()

Unnamed: 0,ast,ast_pct,blk,blk_pct,day,def_rtg,drb,drb_pct,efg_pct,fg,fg3,fg3_pct,fg3a,fg3a_per_fga_pct,fg_pct,fga,ft,ft_pct,fta,fta_per_fga_pct,location,month,mp,off_rtg,opponent_link,opponent_name,opponent_tag,orb,orb_pct,pf,player_link,player_name,plus_minus,pts,reason,stl,stl_pct,team_link,team_name,team_tag,tov,tov_pct,trb,trb_pct,ts_pct,usg_pct,win,year
0,5.0,27.7,1.0,2.1,27,102.0,2.0,6.6,0.471,8.0,0.0,0.0,6.0,0.353,0.471,17.0,3.0,0.75,4.0,0.235,"Rose Garden Arena, Portland, Oregon",10,41:41,101.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,1.0,2.4,2.0,https://www.basketball-reference.com//players/...,Aaron Brooks,7.0,19.0,,2.0,2.4,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,2.0,9.6,3.0,4.1,0.506,21.8,0,2009
1,2.0,11.4,0.0,0.0,27,103.0,3.0,12.5,0.5,3.0,2.0,0.5,4.0,0.5,0.375,8.0,4.0,0.667,6.0,0.75,"Rose Garden Arena, Portland, Oregon",10,32:58,84.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,0.0,0.0,3.0,https://www.basketball-reference.com//players/...,Trevor Ariza,-10.0,12.0,,1.0,1.5,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,4.0,27.3,3.0,5.2,0.564,19.4,0,2009
2,0.0,0.0,0.0,0.0,27,98.0,3.0,14.8,0.278,2.0,1.0,0.25,4.0,0.444,0.222,9.0,3.0,0.75,4.0,0.444,"Rose Garden Arena, Portland, Oregon",10,27:43,77.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,1.0,3.5,1.0,https://www.basketball-reference.com//players/...,Shane Battier,-22.0,8.0,,2.0,3.6,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,1.0,8.5,4.0,8.2,0.372,18.6,0,2009
3,1.0,8.0,0.0,0.0,27,94.0,2.0,11.0,0.5,3.0,0.0,,0.0,0.0,0.5,6.0,0.0,,0.0,0.0,"Rose Garden Arena, Portland, Oregon",10,24:52,120.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,3.0,11.8,1.0,https://www.basketball-reference.com//players/...,Chuck Hayes,-8.0,6.0,,3.0,6.0,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,0.0,0.0,5.0,11.5,0.5,10.6,0,2009
4,1.0,7.8,1.0,3.9,27,95.0,4.0,24.8,0.167,1.0,0.0,,0.0,0.0,0.167,6.0,1.0,0.5,2.0,0.333,"Rose Garden Arena, Portland, Oregon",10,22:05,36.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,0.0,0.0,4.0,https://www.basketball-reference.com//players/...,Luis Scola,-8.0,3.0,,1.0,2.3,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,3.0,30.4,4.0,10.4,0.218,19.6,0,2009


- Negatively correlated with win
- High stat def rating is highly correlated with losing (.51). Teams under pressure don't win that game.
- Personal fouls also correlated with losing (.13)
- Turnovers - (investigate, not intuitive)
- 

In [6]:
team_data_corr = team_data.corr()
team_data_corr = team_data_corr[(team_data_corr['win'] > .2)|(team_data_corr['win'] < -.08)]
team_data_corr.sort_values('win')[['win']]

Unnamed: 0,win
def_rtg,-0.51808
pf,-0.123578
tov,-0.119115
tov_pct,-0.110968
fg3,0.212561
trb,0.261446
fg3_pct,0.306229
ast,0.320717
drb,0.343178
fg,0.369003


In [7]:
player_data_corr = player_data.corr()
player_data_corr = player_data_corr[(player_data_corr['win'] > .05)|(player_data_corr['win'] < -.05)]
player_data_corr.sort_values('win')[['win']]

Unnamed: 0,win
def_rtg,-0.425587
fg,0.059951
fg3,0.061186
ast,0.063367
drb,0.06514
pts,0.066164
fg3_pct,0.10023
fg_pct,0.102537
efg_pct,0.110622
ts_pct,0.112799


In [64]:
past_n_game_obj = None
with open('{data_path}/{db_name}.pkl'.format(data_path=data_path,
                                                             db_name='past_n_game_dataset_4_False'), 'rb') as f:
    past_n_game_dataset = pickle.load(f)
past_n_game_dataset.keys()

dict_keys(['cle', 'cho', 'sea', 'uta', 'mem', 'njn', 'min', 'por', 'phi', 'cha', 'mil', 'orl', 'lal', 'hou', 'den', 'nyk', 'tor', 'nop', 'lac', 'sas', 'noh', 'sac', 'pho', 'chi', 'bos', 'brk', 'gsw', 'atl', 'dal', 'was', 'mia', 'det', 'ind', 'okc'])

In [63]:
team_data[(team_data['team_tag'] == 'pho')&(team_data['month'] == 12)&(team_data['year'] == 2017)][['month', 'day', 'ast']].head(20)

Unnamed: 0,month,day,ast
21172,12,2,24
21188,12,4,25
21206,12,5,23
21235,12,7,22
21259,12,9,11
21300,12,12,22
21317,12,13,20
21368,12,16,22
21400,12,18,17
21418,12,20,22


In [66]:
initial_team_data_columns = ['ast', 'ast_pct', 'blk', 'blk_pct', 'def_rtg', 'drb', 'drb_pct', 'efg_pct',
                                          'fg', 'fg3', 'fg3_pct', 'fg3a', 'fg3a_per_fga_pct', 'fg_pct', 'fga', 'ft',
                                          'ft_pct',
                                          'fta', 'fta_per_fga_pct', 'mp', 'off_rtg', 'orb', 'orb_pct', 'pf',
                                          'plus_minus', 'pts', 'stl', 'stl_pct', 'tov', 'tov_pct', 'trb', 'trb_pct',
                                          'ts_pct',
                                          'usg_pct', 'home', 'r1', 'r2', 'r3', 'r4']

pd.DataFrame(data=past_n_game_dataset['pho']["['2017-12-31', 'phi', 'pho']"])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113
0,0.438939,0.701201,0.042543,0.030531,0.476476,0.511512,0.180681,0.298298,0.17017,0.723223,0.622122,0.693694,0.731732,0.185686,0.391391,0.713714,0.543043,0.6997,0.715215,0.0,0.557057,0.657157,0.54004,0.215716,0.0,0.361361,0.105606,0.121622,0.175676,0.197197,0.591592,0.271772,0.364364,0.0,1.0,0.173465,1.0,0.0,0.080581,0.034034,0.805305,0.86987,0.558559,0.712713,0.45996,0.254254,0.59009,0.215716,0.063063,0.693694,0.612112,0.343844,0.814314,0.172673,0.432432,0.158659,0.134635,0.0,0.476476,0.859359,0.819319,0.651151,0.0,0.302803,0.196697,0.229229,0.111111,0.107608,0.865866,0.728228,0.183684,0.0,0.0,0.147344,0.0,0.0,0.358358,0.667167,-0.762763,-0.839339,-0.082082,-0.201201,-0.279279,0.044044,-0.41992,0.507508,0.559059,0.0,0.11962,-0.158158,-0.422923,0.541041,0.110611,0.541041,0.580581,0.0,0.080581,-0.202202,-0.279279,-0.435435,0.0,0.058559,-0.091091,-0.107608,0.064565,0.08959,-0.274274,-0.456456,0.180681,0.0,1.0,0.026122,1.0,0.0
1,0.663163,0.575576,0.402402,0.322823,0.829329,0.181181,0.165165,0.650651,0.661662,0.795796,0.46046,0.877377,0.873874,0.57007,0.651151,0.346346,0.097598,0.556056,0.516517,0.0,0.553554,0.357357,0.316817,0.651151,0.0,0.640641,0.196697,0.172673,0.457958,0.424925,0.153153,0.107608,0.535035,0.0,1.0,0.153105,0.0,0.492492,0.221221,0.078579,0.698699,0.786286,0.553554,0.649149,0.683183,0.68969,0.783283,0.317317,0.586086,0.211712,0.187688,0.782282,0.551552,0.874875,0.842342,0.77978,0.745245,0.0,0.829329,0.738238,0.834835,0.563564,0.0,0.85035,0.315816,0.278779,0.56006,0.507508,0.749249,0.892392,0.794294,0.0,0.0,0.476395,1.0,0.853353,0.441942,0.496997,-0.296296,-0.463463,0.275776,-0.467968,-0.518018,-0.039039,-0.121622,0.478478,-0.125626,0.665666,0.686186,-0.212212,0.0996,-0.528529,-0.744745,-0.223724,-0.228729,0.0,-0.275776,-0.380881,-0.518018,0.087588,0.0,-0.20971,-0.119119,-0.106106,-0.102102,-0.082583,-0.596096,-0.784785,-0.259259,0.0,1.0,-0.323291,-1.0,-0.360861
2,0.663163,0.952953,0.561562,0.487988,0.244745,0.712713,0.61962,0.484484,0.123624,0.534034,0.542543,0.50951,0.677177,0.422422,0.06006,0.922923,0.517017,0.92993,0.961962,0.0,0.301301,0.104104,0.177177,0.968969,0.0,0.424925,0.196697,0.172673,0.915415,0.930931,0.408408,0.640641,0.644645,0.0,1.0,0.18537,1.0,0.853353,0.019019,0.012513,0.97998,0.997304,0.301301,0.237738,0.822823,0.094595,0.289289,0.068068,0.028529,0.407908,0.351351,0.166667,0.698198,0.842843,0.737738,0.77978,0.715215,0.0,0.244745,0.459459,0.38038,0.951952,0.0,0.361361,0.804304,0.784284,0.175676,0.147648,0.241742,0.35986,0.170671,0.0,0.0,0.157931,0.0,0.853353,0.644144,0.94044,-0.418418,-0.509316,-0.056557,0.474975,-0.203203,0.38989,-0.165666,0.465966,0.514014,0.101602,0.325826,0.255756,-0.638138,0.08008,-0.220721,0.15015,0.246747,0.0,0.056557,-0.355355,-0.203203,0.017017,0.0,0.063564,-0.607608,-0.611612,0.73974,0.783283,0.166667,0.280781,0.473974,0.0,1.0,0.027439,1.0,0.0
3,0.785285,0.696196,0.561562,0.613113,0.70971,0.439439,0.704705,0.797798,0.726226,0.8999,0.777778,0.827828,0.827828,0.681682,0.602603,0.409409,0.172172,0.556056,0.527528,0.0,0.937437,0.903403,0.913413,0.379379,0.0,0.768769,0.196697,0.229229,0.175676,0.17968,0.749249,0.871371,0.727227,0.0,0.0,0.222034,1.0,0.853353,0.591091,0.652653,0.123123,0.137638,0.937437,0.3003,0.086587,0.565065,0.436937,0.534034,0.477477,0.558058,0.607107,0.558058,0.338839,0.6001,0.732733,0.504004,0.549049,0.0,0.70971,0.259259,0.295295,0.72973,0.0,0.486486,0.315816,0.364865,0.111111,0.147648,0.195195,0.128629,0.607608,0.0,1.0,0.229156,0.0,0.492492,0.194194,0.043544,0.438438,0.475475,-0.227728,0.139139,0.618118,0.232733,0.289289,0.365866,0.3003,0.26977,0.220721,0.123624,0.263764,-0.190691,-0.560561,0.052052,-0.021522,0.0,0.227728,0.644144,0.618118,-0.35035,0.0,0.282282,-0.119119,-0.135636,0.064565,0.032032,0.554054,0.742743,0.11962,0.0,-1.0,-0.007122,1.0,0.360861


In [23]:
results.shape

(3, 12)

In [85]:
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results.sort_values('accuracy', ascending = False).head(10)


Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy
24,72,11,1,4,69,4,2,1,cnn,64,False,0.673
23,8,10,9,1,35,4,1,1,rnn,64,False,0.670056
22,60,7,5,3,67,4,3,1,rnn,64,False,0.668718
21,19,7,7,3,91,2,1,1,rnn,64,True,0.668451
20,124,5,6,2,69,1,1,1,rnn,64,False,0.668183
19,55,8,10,3,22,4,2,1,rnn,64,False,0.667648
18,74,11,1,1,101,4,2,1,dnn,64,True,0.666577
17,73,12,11,2,68,2,4,1,dnn,64,True,0.665775
16,90,9,9,4,44,3,1,1,rnn,64,True,0.664704
15,24,12,14,2,71,2,1,1,rnn,64,True,0.664437
