In [1]:
import os
import pickle
import random
import threading
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
import tqdm
from bs4 import BeautifulSoup
# import requests
from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer

# ---------------------------------------------------------------------------
# Notebook configuration: every tunable constant lives in this one block.
# ---------------------------------------------------------------------------

# Source site and the URL template for one day's box scores
# (filled in with month / day / year via str.format).
base_url = 'https://www.basketball-reference.com/'
day_scores_base_url = 'https://www.basketball-reference.com/boxscores/?month={month}&day={day}&year={year}'

# Directory holding the CSV / pickle files read by the cells below.
# Set the NBA_DATA_PATH environment variable to point the notebook at another
# machine's copy of the data; when it is unset or empty, the original hard-coded
# location is used, so existing setups keep working unchanged.
# Alternative location (commented out in the original):
# data_path = r'/media/td/Samsung_T5/sports/nba'
data_path = os.environ.get('NBA_DATA_PATH') or r'C:\Users\TristanDelforge\Documents\sports_predictor\nba'
db_name = 'nba_db'  # NOTE(review): not referenced by the visible cells — confirm it is still needed
box_score_link_table_name = 'boxscore_links'

# Table names; the visible cells use them as file stems: <data_path>/<table_name>.csv or .pkl
box_score_details_table_name = 'boxscore_details'
processed_team_data_table_name = 'processed_team_data'
player_detail_table_name = 'player_details'
processed_player_data_table_name = 'processed_player_data'
aggregated_player_data_table_name = 'aggregated_player_data'
combined_feature_file_data_table_name = 'combined_feature_file'
past_n_game_dataset_table_name = 'past_n_game_dataset'
target = 'win'  # name of the outcome column (1 = win, 0 = loss in the tables shown below)

# Scraper bookkeeping.
# NOTE(review): these are not used by the visible cells; presumably shared with the scraping code — confirm.
date_record_pickle_file_name = 'scraped_dates'
box_score_record_pickle_file_name = 'scraped_games'
max_tries = 5
file_lock = threading.Lock()

# Rating-system parameters.
# NOTE(review): the names suggest an Elo-style rating, but no visible cell uses them — confirm.
starting_rating = 1000
rating_k_factor = 100
rating_floor = 100
rating_ceiling = 10000
rating_d = 1000
k_min_sensitivity = 1

# Let wide / long DataFrames render without truncation in cell outputs.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [2]:
# Read the team-level and player-level box score tables from data_path.
# Both files are pipe-delimited; low_memory=False is passed through to pandas
# for both reads, exactly as before.
team_data, player_data = (
    pd.read_csv(
        '{data_path}/{db_name}.csv'.format(data_path=data_path, db_name=table_name),
        sep='|',
        low_memory=False,
    )
    for table_name in (box_score_details_table_name, player_detail_table_name)
)

# Show (rows, columns) of each frame as the cell output.
team_data.shape, player_data.shape

((6840, 45), (86353, 48))

In [3]:
# Preview the first five team-level rows (one row per team per game).
team_data.head(n=5)

Unnamed: 0,team_tag,team_link,team_name,opponent_tag,opponent_link,opponent_name,location,win,year,month,day,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus,ts_pct,efg_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,off_rtg,def_rtg
0,por,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,min,https://www.basketball-reference.com//teams/MI...,Minnesota Timberwolves,"Target Center, Minneapolis, Minnesota",1,2017,1,1,240,36,73,0.493,6,22,0.273,17,28,0.607,4,37,41,13,10,8,14,19,95,,0.557,0.534,0.301,0.384,10.5,74.0,46.6,36.1,10.5,12.5,14.1,100.0,100.1,93.8
1,min,https://www.basketball-reference.com//teams/MI...,Minnesota Timberwolves,por,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,"Target Center, Minneapolis, Minnesota",0,2017,1,1,240,35,87,0.402,7,23,0.304,12,18,0.667,13,34,47,21,11,5,16,25,89,,0.469,0.443,0.264,0.207,26.0,89.5,53.4,60.0,11.6,9.8,14.4,100.0,93.8,100.1
2,det,https://www.basketball-reference.com//teams/DE...,Detroit Pistons,mia,https://www.basketball-reference.com//teams/MI...,Miami Heat,"AmericanAirlines Arena, Miami, Florida",1,2017,1,1,240,42,86,0.488,9,22,0.409,14,22,0.636,10,38,48,16,7,5,12,18,107,,0.559,0.541,0.256,0.256,24.4,92.7,58.5,38.1,7.5,9.8,11.1,100.0,114.9,105.2
3,mia,https://www.basketball-reference.com//teams/MI...,Miami Heat,det,https://www.basketball-reference.com//teams/DE...,Detroit Pistons,"AmericanAirlines Arena, Miami, Florida",0,2017,1,1,240,35,77,0.455,14,26,0.538,14,18,0.778,3,31,34,25,6,4,10,22,98,,0.577,0.545,0.338,0.234,7.3,75.6,41.5,71.4,6.4,6.3,10.5,100.0,105.2,114.9
4,tor,https://www.basketball-reference.com//teams/TO...,Toronto Raptors,lal,https://www.basketball-reference.com//teams/LA...,Los Angeles Lakers,"STAPLES Center, Los Angeles, California",1,2017,1,1,240,42,75,0.56,10,17,0.588,29,32,0.906,5,35,40,18,5,9,16,25,123,,0.69,0.627,0.227,0.427,15.6,68.6,48.2,42.9,5.2,15.0,15.2,100.0,128.6,119.2


In [4]:
# Preview the first five player-level rows (one row per player per game).
player_data.head(n=5)

Unnamed: 0,ast,ast_pct,blk,blk_pct,day,def_rtg,drb,drb_pct,efg_pct,fg,fg3,fg3_pct,fg3a,fg3a_per_fga_pct,fg_pct,fga,ft,ft_pct,fta,fta_per_fga_pct,location,month,mp,off_rtg,opponent_link,opponent_name,opponent_tag,orb,orb_pct,pf,player_link,player_name,plus_minus,pts,reason,stl,stl_pct,team_link,team_name,team_tag,tov,tov_pct,trb,trb_pct,ts_pct,usg_pct,win,year
0,1.0,3.8,1.0,1.9,1,97.0,8.0,19.2,0.45,4.0,1.0,0.2,5.0,0.5,0.4,10.0,1.0,0.5,2.0,0.2,"Target Center, Minneapolis, Minnesota",1,40:04,80.0,https://www.basketball-reference.com//teams/MI...,Minnesota Timberwolves,min,0.0,0.0,3.0,https://www.basketball-reference.com//players/...,Allen Crabbe,14.0,10.0,,0.0,0.0,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,2.0,15.5,8.0,10.9,0.46,15.5,1,2017
1,3.0,22.3,1.0,1.9,1,92.0,4.0,9.8,0.7,16.0,3.0,0.5,6.0,0.24,0.64,25.0,8.0,0.889,9.0,0.36,"Target Center, Minneapolis, Minnesota",1,39:14,136.0,https://www.basketball-reference.com//teams/MI...,Minnesota Timberwolves,min,1.0,3.2,3.0,https://www.basketball-reference.com//players/...,CJ McCollum,7.0,43.0,,3.0,3.9,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,3.0,9.4,5.0,7.0,0.742,39.4,1,2017
2,3.0,11.7,0.0,0.0,1,93.0,6.0,16.2,0.143,1.0,0.0,0.0,2.0,0.286,0.143,7.0,0.0,,0.0,0.0,"Target Center, Minneapolis, Minnesota",1,35:39,52.0,https://www.basketball-reference.com//teams/MI...,Minnesota Timberwolves,min,0.0,0.0,3.0,https://www.basketball-reference.com//players/...,Al-Farouq Aminu,10.0,2.0,,2.0,2.8,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,0.0,0.0,6.0,9.2,0.143,9.5,1,2017
3,1.0,4.7,2.0,4.6,1,91.0,4.0,11.8,0.429,3.0,0.0,0.0,2.0,0.286,0.429,7.0,2.0,0.667,3.0,0.429,"Target Center, Minneapolis, Minnesota",1,32:37,108.0,https://www.basketball-reference.com//teams/MI...,Minnesota Timberwolves,min,1.0,3.9,2.0,https://www.basketball-reference.com//players/...,Maurice Harkless,10.0,8.0,,2.0,3.1,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,0.0,0.0,5.0,8.4,0.481,12.3,1,2017
4,3.0,19.0,0.0,0.0,1,94.0,6.0,19.9,0.667,6.0,0.0,,0.0,0.0,0.667,9.0,6.0,0.6,10.0,1.111,"Target Center, Minneapolis, Minnesota",1,29:00,128.0,https://www.basketball-reference.com//teams/MI...,Minnesota Timberwolves,min,2.0,8.7,2.0,https://www.basketball-reference.com//players/...,Mason Plumlee,11.0,18.0,,1.0,1.7,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,por,2.0,13.0,8.0,15.0,0.672,25.7,1,2017


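- Stats negatively correlated with `win` (team level):
- A high defensive rating (`def_rtg`) is strongly correlated with losing (noted as .51; the table below shows r ≈ -0.52). Teams under pressure tend not to win that game.
- Personal fouls (`pf`) are also correlated with losing (noted as .13; the table below shows r ≈ -0.08).
- Turnovers (`tov`, `tov_pct`) - TODO: investigate, the relationship is not intuitive.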
- 

In [5]:
# Correlate every numeric team stat with every other, then keep only the rows whose
# correlation with `win` is above .2 or below -.08, and show them sorted by that value.
#
# The string columns (team/opponent tag, link and name, location) are dropped
# explicitly before calling corr(): pandas >= 2.0 no longer discards non-numeric
# columns silently and raises instead. select_dtypes works on old and new pandas alike.
# NOTE(review): the two thresholds are asymmetric (.2 vs -.08) — confirm this is intentional.
team_data_corr = team_data.select_dtypes(include=['number', 'bool']).corr()
team_data_corr = team_data_corr[(team_data_corr['win'] > .2)|(team_data_corr['win'] < -.08)]
team_data_corr.sort_values('win')[['win']]

Unnamed: 0,win
def_rtg,-0.523151
tov,-0.112771
tov_pct,-0.107155
pf,-0.084628
fg3,0.264511
trb,0.287366
ast,0.3084
fg3_pct,0.340934
drb,0.359578
fg,0.382432


In [6]:
# Correlate every numeric player stat with every other, then keep only the rows whose
# correlation with `win` is above .05 or below -.05, and show them sorted by that value.
#
# The string columns (e.g. location, mp as "MM:SS", links, names, tags) are dropped
# explicitly before calling corr(): pandas >= 2.0 no longer discards non-numeric
# columns silently and raises instead. select_dtypes works on old and new pandas alike.
player_data_corr = player_data.select_dtypes(include=['number', 'bool']).corr()
player_data_corr = player_data_corr[(player_data_corr['win'] > .05)|(player_data_corr['win'] < -.05)]
player_data_corr.sort_values('win')[['win']]

Unnamed: 0,win
def_rtg,-0.460235
fg,0.059796
ast,0.060459
pts,0.064656
drb,0.065264
fg3,0.06704
fg3_pct,0.095914
fg_pct,0.103754
ts_pct,0.110535
efg_pct,0.111211


In [7]:
# Load the pickled past-n-game dataset from data_path and list its top-level keys
# (team tags, per the output below).
# NOTE(review): pickle.load can execute arbitrary code — only open files you produced yourself.
# NOTE(review): past_n_game_obj is set to None and not used by the visible cells — confirm it is needed.
past_n_game_obj = None
with open(
    '{data_path}/{db_name}.pkl'.format(db_name=past_n_game_dataset_table_name, data_path=data_path),
    'rb',
) as f:
    past_n_game_dataset = pickle.load(f)
past_n_game_dataset.keys()

dict_keys(['sac', 'phi', 'orl', 'mem', 'hou', 'cle', 'cho', 'min', 'den', 'mia', 'was', 'tor', 'ind', 'okc', 'sas', 'chi', 'lac', 'dal', 'bos', 'lal', 'nop', 'mil', 'gsw', 'brk', 'nyk', 'pho', 'det', 'atl', 'por', 'uta'])

In [21]:
# Phoenix's games in December 2017: date parts and assists, first 20 rows.
# Used to cross-check the per-game history stored in past_n_game_dataset.
team_data.loc[
    team_data['team_tag'].eq('pho') & team_data['month'].eq(12) & team_data['year'].eq(2017),
    ['month', 'day', 'ast'],
].head(20)

Unnamed: 0,month,day,ast
2270,12,2,24
2286,12,4,25
2304,12,5,23
2333,12,7,22
2357,12,9,11
2398,12,12,22
2415,12,13,20
2466,12,16,22
2498,12,18,17
2516,12,20,22


In [15]:
# Look up Phoenix's history for one game. The inner key is the string form of
# a [date, team_tag, opponent_tag] list, so it must be quoted exactly like this.
pho_games = past_n_game_dataset['pho']
pho_games["['2017-12-29', 'pho', 'sac']"]

array([[24.0, 75.0, 5.0, 7.5, 100.2, 35.0, 77.8, 0.5, 32.0, 8.0, 0.364,
        22.0, 0.306, 0.444, 72.0, 27.0, 0.7709999999999999, 35.0, 0.486,
        240.0, 102.2, 6.0, 17.6, 29.0, nan, 99.0, 5.0, 5.2, 19.0, 17.9,
        41.0, 51.9, 0.5660000000000001, 100.0, 1.0],
       [24.0, 60.0, 4.0, 5.9, 119.0, 27.0, 67.5, 0.529, 40.0, 11.0,
        0.344, 32.0, 0.37200000000000005, 0.465, 86.0, 15.0, 0.625, 24.0,
        0.27899999999999997, 240.0, 109.7, 9.0, 20.9, 22.0, nan, 106.0,
        5.0, 5.2, 13.0, 11.9, 36.0, 43.4, 0.5489999999999999, 100.0, 1.0],
       [21.0, 63.6, 1.0, 1.6, 107.5, 32.0, 68.1, 0.469, 33.0, 10.0,
        0.385, 26.0, 0.321, 0.40700000000000003, 81.0, 21.0, 0.778, 27.0,
        0.33299999999999996, 240.0, 109.8, 12.0, 25.5, 17.0, nan, 97.0,
        4.0, 4.5, 10.0, 9.7, 44.0, 46.8, 0.522, 100.0, 1.0],
       [22.0, 62.9, 3.0, 5.3, 111.6, 36.0, 80.0, 0.402, 35.0, 8.0,
        0.22899999999999998, 35.0, 0.361, 0.361, 97.0, 17.0, 0.895, 19.0,
        0.196, 240.0, 98.