In [1]:
from scipy import stats
import time
# import requests
from bs4 import BeautifulSoup
import threading
import pandas as pd
import tqdm
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler, PowerTransformer
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

base_url = 'https://www.basketball-reference.com/'
day_scores_base_url = 'https://www.basketball-reference.com/boxscores/?month={month}&day={day}&year={year}'
data_path = r'/media/td/Samsung_T5/sports/nba'
# data_path = r'C:\Users\TristanDelforge\Documents\sports_predictor\nba'
db_name = 'nba_db'
box_score_link_table_name = 'boxscore_links'

box_score_details_table_name = 'boxscore_details'
processed_team_data_table_name = 'processed_team_data'
player_detail_table_name = 'player_details'
processed_player_data_table_name = 'processed_player_data'
aggregated_player_data_table_name = 'aggregated_player_data'
combined_feature_file_data_table_name = 'combined_feature_file'
past_n_game_dataset_table_name = 'past_n_game_dataset'
target = 'win'

date_record_pickle_file_name = 'scraped_dates'
box_score_record_pickle_file_name = 'scraped_games'
max_tries = 5
file_lock = threading.Lock()

starting_rating = 1000
rating_k_factor = 100
rating_floor = 100
rating_ceiling = 10000
rating_d = 1000
k_min_sensitivity = 1

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [None]:
team_data = pd.read_csv('{data_path}/{db_name}.csv'.format(data_path=data_path,db_name=box_score_details_table_name),sep='|', low_memory=False)
player_data = pd.read_csv('{data_path}/{db_name}.csv'.format(data_path=data_path,db_name=player_detail_table_name), sep='|', low_memory=False)

team_data.shape, player_data.shape

In [None]:
team_data.head()

In [None]:
player_data.head()

- Negatively correlated with win
- High stat def rating is highly correlated with losing (.51). Teams under pressure don't win that game.
- Personal fouls also correlated with losing (.13)
- Turnovers - (investigate, not intuitive)
- 

In [None]:
team_data_corr = team_data.corr()
team_data_corr = team_data_corr[(team_data_corr['win'] > .2)|(team_data_corr['win'] < -.08)]
team_data_corr.sort_values('win')[['win']]

In [None]:
player_data_corr = player_data.corr()
player_data_corr = player_data_corr[(player_data_corr['win'] > .05)|(player_data_corr['win'] < -.05)]
player_data_corr.sort_values('win')[['win']]

In [None]:

past_n_game_obj = None
with open('{data_path}/{db_name}.pkl'.format(data_path=data_path,
                                                             db_name='past_n_game_dataset_32_False'), 'rb') as f:
    past_n_game_dataset = pickle.load(f)
past_n_game_dataset.keys()


In [None]:
team_data[(team_data['team_tag'] == 'pho')&(team_data['month'] == 12)&(team_data['year'] == 2017)][['month', 'day', 'ast']].head(20)

In [None]:
initial_team_data_columns = ['ast', 'ast_pct', 'blk', 'blk_pct', 'def_rtg', 'drb', 'drb_pct', 'efg_pct',
                                          'fg', 'fg3', 'fg3_pct', 'fg3a', 'fg3a_per_fga_pct', 'fg_pct', 'fga', 'ft',
                                          'ft_pct',
                                          'fta', 'fta_per_fga_pct', 'mp', 'off_rtg', 'orb', 'orb_pct', 'pf',
                                          'plus_minus', 'pts', 'stl', 'stl_pct', 'tov', 'tov_pct', 'trb', 'trb_pct',
                                          'ts_pct',
                                          'usg_pct', 'home', 'r1', 'r2', 'r3', 'r4']

pd.DataFrame(data=past_n_game_dataset['pho']["['2017-12-31', 'phi', 'pho']"])


In [None]:
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results.sort_values('accuracy', ascending = False).head(10)


In [48]:
results.shape

(24, 15)

In [2]:
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results.sort_values('accuracy', ascending = False).head(10)

Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy,pooling_choice,recurrent_layers_width,resnet
34,61,8,1,2,13,1,2,1,LocallyConnected1D,32,False,0.681332,,51,True
33,2,2,4,1,55,2,2,1,dnn,16,False,0.678217,,31,True
32,90,1,5,2,64,1,1,1,LocalConvLSTM1DCell,32,True,0.677633,layers.AveragePooling1D,61,True
31,36,6,5,2,23,1,2,1,LocalConvLSTM1DCell,8,True,0.674713,layers.MaxPooling1D,60,True
30,90,8,8,1,94,1,2,1,LocallyConnected1D,16,False,0.674129,,55,True
29,15,2,1,2,45,2,2,1,Bidirectional_GRU,16,False,0.674129,layers.AveragePooling1D,102,False
28,83,4,9,1,23,1,2,1,Bidirectional_LSTM,32,False,0.673934,layers.AveragePooling1D,67,False
27,6,1,3,2,62,1,2,1,dnn,8,True,0.67374,,122,True
26,48,4,7,2,86,1,1,1,Conv1D2+GRU,16,True,0.672766,layers.AveragePooling1D,58,False
25,118,2,5,1,96,2,2,1,LocalConvLSTM1DCell,16,False,0.671404,,124,True


In [3]:
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results[results['history_lengths'] == 4].sort_values('accuracy', ascending = False).head(20)

Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy,pooling_choice,recurrent_layers_width,resnet
13,16,6,7,2,33,1,2,1,LocallyConnected1D,4,True,0.664201,layers.AveragePooling1D,13,True
9,19,5,3,3,97,1,1,1,LSTM,4,True,0.661476,layers.AveragePooling1D,70,True
3,27,1,4,3,92,1,2,1,Bidirectional_LSTM,4,False,0.65875,,73,True
0,44,3,3,3,85,1,2,1,Conv1D,4,True,0.645902,layers.MaxPooling1D,17,True


In [4]:
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results[results['history_lengths'] == 8].sort_values('accuracy', ascending = False).head(20)

Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy,pooling_choice,recurrent_layers_width,resnet
31,36,6,5,2,23,1,2,1,LocalConvLSTM1DCell,8,True,0.674713,layers.MaxPooling1D,60,True
27,6,1,3,2,62,1,2,1,dnn,8,True,0.67374,,122,True
23,29,8,9,2,89,1,1,1,LSTM,8,True,0.67082,layers.AveragePooling1D,26,True
22,80,8,6,2,26,1,1,1,LSTM,8,True,0.67043,layers.AveragePooling1D,43,False
21,89,8,9,3,58,1,1,1,Conv1D2+GRU,8,True,0.669846,layers.MaxPooling1D,5,True
17,51,2,1,3,110,1,1,1,LocalConvLSTM1DCell,8,False,0.667316,,119,False
14,110,1,6,3,46,1,2,1,GRU,8,True,0.664396,,33,False
11,95,3,1,1,33,2,2,1,LocalConvLSTM1DCell2,8,True,0.661865,layers.AveragePooling1D,55,True
4,18,8,7,3,35,2,2,1,Conv1D2,8,True,0.659334,layers.MaxPooling1D,83,True
1,96,6,7,3,55,2,1,1,LocalConvLSTM1DCell2,8,False,0.653494,layers.MaxPooling1D,101,True


In [5]:
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results[results['history_lengths'] == 16].sort_values('accuracy', ascending = False).head(20)

Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy,pooling_choice,recurrent_layers_width,resnet
33,2,2,4,1,55,2,2,1,dnn,16,False,0.678217,,31,True
29,15,2,1,2,45,2,2,1,Bidirectional_GRU,16,False,0.674129,layers.AveragePooling1D,102,False
30,90,8,8,1,94,1,2,1,LocallyConnected1D,16,False,0.674129,,55,True
26,48,4,7,2,86,1,1,1,Conv1D2+GRU,16,True,0.672766,layers.AveragePooling1D,58,False
25,118,2,5,1,96,2,2,1,LocalConvLSTM1DCell,16,False,0.671404,,124,True
18,78,6,6,3,52,2,2,1,Bidirectional_LSTM,16,False,0.667316,layers.MaxPooling1D,108,False
15,52,7,3,3,120,2,2,1,GRU,16,True,0.666926,layers.AveragePooling1D,18,False
8,42,2,4,3,87,2,1,1,LocalConvLSTM1DCell,16,False,0.661476,,126,True
5,49,7,3,1,20,2,1,1,Conv1D2,16,True,0.660113,layers.AveragePooling1D,1,False


In [6]:
results = pd.read_csv(f'{data_path}/nn_architectures.csv')
results[results['history_lengths'] == 32].sort_values('accuracy', ascending = False).head(20)

Unnamed: 0,filters,kernel_size,pool_size,dense_top_layers,dense_layers_width,convolutional_layers,recurrent_layers,dnn_layers,network_type,history_lengths,transpose_history,accuracy,pooling_choice,recurrent_layers_width,resnet
34,61,8,1,2,13,1,2,1,LocallyConnected1D,32,False,0.681332,,51,True
32,90,1,5,2,64,1,1,1,LocalConvLSTM1DCell,32,True,0.677633,layers.AveragePooling1D,61,True
28,83,4,9,1,23,1,2,1,Bidirectional_LSTM,32,False,0.673934,layers.AveragePooling1D,67,False
24,3,8,4,2,3,1,1,1,dnn,32,False,0.671209,layers.AveragePooling1D,92,True
20,7,9,5,1,73,1,2,1,Bidirectional_GRU,32,False,0.669457,layers.MaxPooling1D,110,False
19,94,2,5,2,105,2,1,1,GRU,32,False,0.669262,,68,True
16,24,5,7,3,75,2,2,1,Bidirectional_GRU,32,True,0.666926,,102,True
12,67,4,3,3,7,1,1,1,Conv1D,32,False,0.663617,,48,False
10,89,6,5,3,100,1,1,1,LocalConvLSTM1DCell,32,False,0.661865,layers.AveragePooling1D,102,True
6,85,1,9,1,24,2,2,1,LocalConvLSTM1DCell2,32,True,0.660892,,91,False
