In [1]:
import copy
import pickle
import random
import re
import sys
import threading
import time

import lightgbm
import lightgbm as lgb
import numpy as np
import pandas as pd
# import requests
import tqdm
from bs4 import BeautifulSoup
from scipy import stats
from sklearn import model_selection, metrics
from sklearn.preprocessing import StandardScaler, PowerTransformer

# Project packages live outside site-packages; extend the search path before importing them.
sys.path.append('/home/td/Documents/sports_predictor/nba')
sys.path.append('/home/td/Documents/sports_predictor')

from nba.common import (
    timeit,
)
   


# --- Source site -------------------------------------------------------------
# Root URL of basketball-reference.com and the URL template of the per-day box score
# listing; the template carries {month}, {day} and {year} placeholders.
base_url = 'https://www.basketball-reference.com/'
day_scores_base_url = 'https://www.basketball-reference.com/boxscores/?month={month}&day={day}&year={year}'

# --- Storage -----------------------------------------------------------------
# Directory holding the data files. The loading cells of this notebook read
# '<data_path>/<table_name>.csv' for some of the table names below.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
data_path = r'/media/td/Samsung_T5/sports/nba'
# data_path = r'C:\Users\TristanDelforge\Documents\sports_predictor\nba'
db_name = 'nba_db'
box_score_link_table_name = 'boxscore_links'
general_feature_data_table_name = 'general_features'

box_score_details_table_name = 'boxscore_details'
processed_team_data_table_name = 'processed_team_data'
player_detail_table_name = 'player_details'
processed_player_data_table_name = 'processed_player_data'
aggregated_player_data_table_name = 'aggregated_player_data'
combined_feature_file_data_table_name = 'combined_feature_file'
past_n_game_dataset_table_name = 'past_n_game_dataset'
# Name of the label column passed as the prediction target.
target = 'win'

# --- Scraper bookkeeping -----------------------------------------------------
# Not referenced by the visible cells of this notebook; presumably consumed by the
# scraping code elsewhere in the project (TODO confirm).
date_record_pickle_file_name = 'scraped_dates'
box_score_record_pickle_file_name = 'scraped_games'
max_tries = 5
file_lock = threading.Lock()

# --- Rating parameters -------------------------------------------------------
# Not referenced by the visible cells of this notebook; presumably the parameters of
# an Elo-style rating computed elsewhere in the project (TODO confirm meaning/units).
starting_rating = 1000
rating_k_factor = 100
rating_floor = 100
rating_ceiling = 10000
rating_d = 1000
k_min_sensitivity = 1

# --- Display -----------------------------------------------------------------
# Raise the pandas display limits so wide/long frames are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [2]:

class Model():
    """Small wrapper that trains, applies and inspects a model of a given type.

    Only ``model_type == 'lightgbm'`` is implemented. Any other type raises
    ``ValueError`` from ``fit``/``predict``/``evaluate`` instead of silently doing
    nothing or returning ``None``.

    Class attributes:
        max_iter: not used by the code in this class.
        lightgbm_max_iter: upper bound on the number of boosting rounds.
        lightgbm_early_stopping_rounds: patience (in rounds) of the early stopping.
        lightgbm_log_period: how often (in rounds) the evaluation metrics are logged.
        supported_model_types: model types the methods below know how to handle.
    """
    max_iter = 1000000

    lightgbm_max_iter = 10000
    lightgbm_early_stopping_rounds = 100
    lightgbm_log_period = 100
    supported_model_types = ('lightgbm',)

    def __init__(self, model_type, model_params):
        """Store the model type and the parameter dict handed to the training call."""
        self.model_type = model_type
        self.model_params = model_params
        self.transformers_dict = dict()
        # Populated by fit(); None means "not fitted yet".
        self.model = None
        self.columns = None

    def _require_supported_type(self):
        """Raise ValueError when model_type is not one this class implements."""
        if self.model_type not in self.supported_model_types:
            raise ValueError('Unsupported model type: {}'.format(self.model_type))

    def _require_fitted(self):
        """Raise when the model type is unsupported or fit() has not produced a model."""
        self._require_supported_type()
        # getattr keeps this safe for instances created without __init__ (e.g. unpickled).
        if getattr(self, 'model', None) is None:
            raise RuntimeError('Model is not fitted yet; call fit() first')

    @timeit
    def fit(self, x, y):
        """Train on ``x``/``y``, holding out part of the rows for early stopping.

        Args:
            x: feature frame; its ``columns`` are remembered for ``evaluate``.
            y: target values aligned with ``x``.

        Raises:
            ValueError: if ``model_type`` is not supported.
        """
        self._require_supported_type()
        print('entered fit, x shape: {}'.format(x.shape))
        self.transformers_dict = dict()
        self.columns = x.columns

        # Internal train/validation split; the validation part drives early stopping.
        x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y)
        lgtrain = lightgbm.Dataset(x_train, y_train)
        lgvalid = lightgbm.Dataset(x_val, y_val)

        # The early_stopping_rounds / verbose_eval keyword arguments of
        # lightgbm.train() were removed in LightGBM 4.0, so the equivalent callbacks
        # are used. log_evaluation was called print_evaluation before LightGBM 3.3.
        log_callback = getattr(lightgbm, 'log_evaluation', None) or lightgbm.print_evaluation

        self.model = lightgbm.train(
            self.model_params,
            lgtrain,
            num_boost_round=self.lightgbm_max_iter,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            callbacks=[
                lightgbm.early_stopping(self.lightgbm_early_stopping_rounds),
                log_callback(self.lightgbm_log_period),
            ],
        )

    @timeit
    def predict(self, x):
        """Return the model's predictions for ``x`` at the best boosting iteration.

        Raises:
            ValueError: if ``model_type`` is not supported.
            RuntimeError: if called before ``fit``.
        """
        self._require_fitted()
        return self.model.predict(x, num_iteration=self.model.best_iteration)

    def evaluate(self):
        """Return gain-based feature importances, most important first.

        Returns:
            DataFrame with columns ``column`` and ``feature_importance``, sorted by
            ``feature_importance`` in descending order.

        Raises:
            ValueError: if ``model_type`` is not supported.
            RuntimeError: if called before ``fit``.
        """
        self._require_fitted()
        importances = self.model.feature_importance('gain', iteration=self.model.best_iteration)
        # Explicit column names keep the frame well-formed even when there are no rows.
        importance_df = pd.DataFrame(
            list(zip(self.columns, importances)),
            columns=['column', 'feature_importance'],
        )
        return importance_df.sort_values('feature_importance', ascending=False)
        

## Raw data

In [3]:
# Raw box score tables, stored as pipe-delimited CSV files under data_path:
# one team-level table and one player-level table.
team_df = pd.read_csv(
    f'{data_path}/{box_score_details_table_name}.csv',
    sep='|',
    low_memory=False,
)
player_df = pd.read_csv(
    f'{data_path}/{player_detail_table_name}.csv',
    sep='|',
    low_memory=False,
)


In [4]:
# (rows, columns) of the team-level and the player-level table.
team_df.shape, player_df.shape

((11462, 46), (145507, 49))

In [5]:
# Preview the first rows of the team-level table.
team_df.head()

Unnamed: 0,team_tag,team_link,team_name,opponent_tag,opponent_link,opponent_name,location,win,score_diff,year,month,day,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus,ts_pct,efg_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,off_rtg,def_rtg
0,hou,https://www.basketball-reference.com//teams/HO...,Houston Rockets,tor,https://www.basketball-reference.com//teams/TO...,Toronto Raptors,"Air Canada Centre, Toronto, Canada",0,-3.0,2015,3,30,240,35,84,0.417,8,27,0.296,18,31,0.581,12,29,41,21,7,3,13,22,96,,0.492,0.464,0.321,0.369,25.0,70.7,46.1,60.0,7.5,5.4,11.7,100.0,102.9,106.1
1,tor,https://www.basketball-reference.com//teams/TO...,Toronto Raptors,hou,https://www.basketball-reference.com//teams/HO...,Houston Rockets,"Air Canada Centre, Toronto, Canada",1,3.0,2015,3,30,240,35,78,0.449,8,22,0.364,21,27,0.778,12,36,48,20,9,7,15,26,99,,0.551,0.5,0.282,0.346,29.3,75.0,53.9,57.1,9.6,12.3,14.3,100.0,106.1,102.9
2,pho,https://www.basketball-reference.com//teams/PH...,Phoenix Suns,por,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,"Moda Center, Portland, Oregon",0,-23.0,2015,3,30,240,38,87,0.437,3,16,0.188,7,10,0.7,8,30,38,18,6,2,12,11,86,,0.47,0.454,0.184,0.115,17.4,76.9,44.7,47.4,6.4,3.4,11.6,100.0,91.7,116.3
3,por,https://www.basketball-reference.com//teams/PO...,Portland Trail Blazers,pho,https://www.basketball-reference.com//teams/PH...,Phoenix Suns,"Moda Center, Portland, Oregon",1,23.0,2015,3,30,240,44,86,0.512,12,27,0.444,9,10,0.9,9,38,47,26,6,5,14,10,109,,0.603,0.581,0.314,0.116,23.1,82.6,55.3,59.1,6.4,7.0,13.4,100.0,116.3,91.7
4,lal,https://www.basketball-reference.com//teams/LA...,Los Angeles Lakers,phi,https://www.basketball-reference.com//teams/PH...,Philadelphia 76ers,"Wells Fargo Center, Philadelphia, Pennsylvania",1,2.0,2015,3,30,265,40,82,0.488,10,21,0.476,23,30,0.767,12,33,45,27,7,10,17,26,113,,0.593,0.549,0.256,0.366,27.9,62.3,46.9,67.5,6.9,15.2,15.2,100.0,112.1,110.1


In [6]:
# Preview the first rows of the player-level table.
player_df.head()

Unnamed: 0,ast,ast_pct,blk,blk_pct,day,def_rtg,drb,drb_pct,efg_pct,fg,fg3,fg3_pct,fg3a,fg3a_per_fga_pct,fg_pct,fga,ft,ft_pct,fta,fta_per_fga_pct,location,month,mp,off_rtg,opponent_link,opponent_name,opponent_tag,orb,orb_pct,pf,player_link,player_name,plus_minus,pts,reason,score_diff,stl,stl_pct,team_link,team_name,team_tag,tov,tov_pct,trb,trb_pct,ts_pct,usg_pct,win,year
0,5.0,21.3,1.0,2.2,30,100.0,8.0,24.0,0.433,5.0,3.0,0.375,8.0,0.533,0.333,15.0,2.0,0.5,4.0,0.267,"Air Canada Centre, Toronto, Canada",3,39:05,93.0,https://www.basketball-reference.com//teams/TO...,Toronto Raptors,tor,0.0,0.0,2.0,https://www.basketball-reference.com//players/...,Trevor Ariza,-12.0,15.0,,-3.0,2.0,2.6,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,2.0,10.7,8.0,11.0,0.447,20.8,0,2015
1,5.0,26.8,0.0,0.0,30,110.0,3.0,9.3,0.477,9.0,3.0,0.375,8.0,0.364,0.409,22.0,10.0,0.833,12.0,0.545,"Air Canada Centre, Toronto, Canada",3,37:57,115.0,https://www.basketball-reference.com//teams/TO...,Toronto Raptors,tor,2.0,5.3,3.0,https://www.basketball-reference.com//players/...,James Harden,-13.0,31.0,,-3.0,1.0,1.4,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,3.0,9.9,5.0,7.1,0.568,34.6,0,2015
2,2.0,11.6,0.0,0.0,30,107.0,3.0,12.6,0.214,3.0,0.0,0.0,4.0,0.286,0.214,14.0,2.0,0.5,4.0,0.286,"Air Canada Centre, Toronto, Canada",3,27:51,58.0,https://www.basketball-reference.com//teams/TO...,Toronto Raptors,tor,1.0,3.6,3.0,https://www.basketball-reference.com//players/...,Josh Smith,6.0,8.0,,-3.0,1.0,1.8,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,2.0,11.3,4.0,7.7,0.254,27.7,0,2015
3,0.0,0.0,0.0,0.0,30,116.0,0.0,0.0,0.8,3.0,2.0,0.667,3.0,0.6,0.6,5.0,0.0,,0.0,0.0,"Air Canada Centre, Toronto, Canada",3,25:16,119.0,https://www.basketball-reference.com//teams/TO...,Toronto Raptors,tor,0.0,0.0,1.0,https://www.basketball-reference.com//players/...,Jason Terry,-7.0,8.0,,-3.0,0.0,0.0,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,1.0,16.7,0.0,0.0,0.8,10.3,0,2015
4,1.0,8.4,0.0,0.0,30,112.0,2.0,11.4,1.0,3.0,0.0,,0.0,0.0,1.0,3.0,1.0,0.5,2.0,0.667,"Air Canada Centre, Toronto, Canada",3,20:27,144.0,https://www.basketball-reference.com//teams/TO...,Toronto Raptors,tor,2.0,9.8,4.0,https://www.basketball-reference.com//players/...,Joey Dorsey,5.0,7.0,,-3.0,0.0,0.0,https://www.basketball-reference.com//teams/HO...,Houston Rockets,hou,1.0,20.5,4.0,10.5,0.902,10.4,0,2015


## Player stats aggregated per game

In [7]:
# Tiny toy frame (4 rows, columns a/b/c) used to sanity-check the shift + rolling logic.
df_test = pd.DataFrame({
    'a': [1, 1, 1, 2],
    'b': [2, 2, 2, 2],
    'c': [3, 3, 3, 3],
})


In [8]:
# Shifting by one row first means each rolling mean only sees the *previous* rows,
# never the current one (the output starts with two all-NaN rows for a window of 2).
(
    df_test
    .shift(1)
    .rolling(2)
    .mean()
)

Unnamed: 0,a,b,c
0,,,
1,,,
2,1.0,2.0,3.0
3,1.0,2.0,3.0


## Encoding evaluation

## Feature evaluation

In [None]:
# General feature table (pipe-delimited CSV under data_path); show its (rows, columns).
feature_df = pd.read_csv(
    f'{data_path}/{general_feature_data_table_name}.csv',
    sep='|',
)
feature_df.shape

In [None]:
# Preview the first rows of the feature table.
feature_df.head()

In [None]:
from scipy import stats



# LightGBM parameter sets used by evaluate_features. Both share the same tree settings
# and differ only in objective and reported metric.

# Binary target: log-loss objective, error rate as the evaluation metric.
lgbm_params_classification = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_error',
    learning_rate=0.01,
    max_depth=-1,
    num_leaves=127,
)

# Continuous target: squared-error objective, absolute error as the evaluation metric.
lgbm_params_regression = dict(
    boosting_type='gbdt',
    objective='l2',
    metric='l1',
    learning_rate=0.01,
    max_depth=-1,
    num_leaves=127,
)



def evaluate_features(df, feature_list, target, target_type = 'classification'):
    """Score a set of features against a target, individually and jointly.

    Each feature is first regressed on its own against ``target`` with
    ``scipy.stats.linregress``. Then a LightGBM ``Model`` is trained on all features
    together (on a fixed ``random_state = 1`` train/validation split of ``df``) and
    scored on the validation part.

    Args:
        df: frame containing every column in ``feature_list`` and the ``target`` column.
        feature_list: names of the feature columns to evaluate.
        target: name of the target column.
        target_type: ``'classification'`` (accuracy on predictions rounded to 0/1) or
            ``'regression'`` (R^2 on the raw predictions).

    Returns:
        ``(feature_importance, preds)``. ``feature_importance`` is the frame returned
        by ``Model.evaluate()`` (columns ``column`` and ``feature_importance``) extended
        with the per-feature ``slope``, ``intercept``, ``r_value``, ``r2_value``,
        ``p_value`` and ``std_err``, plus a constant ``model_performance`` column.
        ``preds`` are the validation predictions: rounded ints for classification,
        raw model outputs for regression.

    Raises:
        ValueError: if ``target_type`` is neither of the two supported values.
    """
    if target_type == 'classification':
        params = copy.deepcopy(lgbm_params_classification)
    elif target_type == 'regression':
        params = copy.deepcopy(lgbm_params_regression)
    else:
        # ValueError is still an Exception, so existing `except Exception` callers work.
        raise ValueError('Invalid target type: {}'.format(target_type))

    # Univariate linear fit of every feature against the target.
    stat_names = ['slope', 'intercept', 'r_value', 'r2_value', 'p_value', 'std_err']
    results = dict()
    for feature in feature_list:
        slope, intercept, r_value, p_value, std_err = stats.linregress(df[feature], df[target])
        results[feature] = {'slope': slope,
                            'intercept': intercept,
                            'r_value': r_value,
                            'r2_value': r_value * r_value,
                            'p_value': p_value,
                            'std_err': std_err}

    # Joint model on all features, scored on a held-out split.
    df_train, df_val = model_selection.train_test_split(df, random_state = 1)
    model = Model('lightgbm', params)
    model.fit(df_train[feature_list], df_train[target])

    preds = model.predict(df_val[feature_list])

    if target_type == 'classification':
        # Turn predicted probabilities into hard 0/1 labels before computing accuracy.
        preds = np.rint(preds).astype(int)
        model_score = metrics.accuracy_score(df_val[target], preds)
    else:
        # Regression predictions are scored as-is; rounding them would distort R^2.
        model_score = metrics.r2_score(df_val[target], preds)

    feature_importance = model.evaluate()
    # Attach the univariate statistics row by row, keyed on the feature name.
    # (DataFrame.apply without axis=1 iterates over columns, not rows, which is why a
    # plain per-row lookup is used here.)
    for stat_name in stat_names:
        feature_importance[stat_name] = [
            results[column][stat_name] for column in feature_importance['column']
        ]
    feature_importance['model_performance'] = model_score

    return feature_importance, preds

    

In [None]:
n = 100
# Columns of the "past n games" team aggregates. The (?!\d) lookahead stops
# `past_{n}` from also matching a longer window that merely starts with the same digits.
past_n_pattern = re.compile(r'team_aggregate_past_{}(?!\d)'.format(n))
cols_100 = [i for i in feature_df.columns if past_n_pattern.search(i)]

# evaluate_features defaults to target_type='classification', matching the binary 'win' target.
feature_evaluation_100, _ = evaluate_features(feature_df, cols_100, 'win')
feature_evaluation_100.sort_values('feature_importance', ascending = False)

In [None]:
# Per window size n: the feature evaluation frame and the validation predictions.
feature_dict = dict()
pred_dict = dict()

for n in [1, 3, 5, 10, 20, 50, 100]:
    # (?!\d) keeps e.g. n=1 from also selecting the past_10 / past_100 columns.
    past_n_pattern = re.compile(r'team_aggregate_past_{}(?!\d)'.format(n))
    cols = [i for i in feature_df.columns if past_n_pattern.search(i)]
    temp_df, preds = evaluate_features(feature_df, cols, 'win')
    feature_dict[n] = temp_df
    pred_dict[n] = preds