In [34]:
import sys
sys.path.append('/home/td/Documents/sports_predictor/nba')
sys.path.append('/home/td/Documents/sports_predictor')

import pandas as pd
import lightgbm
import pandas
from sklearn import metrics, model_selection
from nba.common import (
    timeit,
)
import numpy as np
from nba.data_pipeline import load_general_feature_file
from scipy import stats

# Raise pandas' display limits so wide/long frames are shown without truncation.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_colwidth', 1000),
):
    pd.set_option(_option_name, _option_value)


In [2]:
# Column names that gbm_rfe and select_linear_features skip when building
# their candidate feature lists; empty means every column is a candidate.
cols_to_drop = list()

In [3]:

class Model():
    """Wrapper around a LightGBM booster: train with early stopping, predict, rank features.

    Only ``model_type == 'lightgbm'`` is implemented. Every public method
    raises ``ValueError`` for any other value instead of silently doing
    nothing and returning ``None``.
    """
    # Not referenced by any method of this class.
    max_iter = 1000000

    # Upper bound on boosting rounds, and the number of rounds without
    # validation improvement after which training stops.
    lightgbm_max_iter = 10000
    lightgbm_early_stopping_rounds = 100

    def __init__(self, model_type, model_params):
        """Store the model kind and the parameter dict handed to ``lightgbm.train``."""
        self.model_type = model_type
        self.model_params = model_params
        self.transformers_dict = dict()

    def _require_supported_model_type(self):
        """Raise ``ValueError`` unless ``model_type`` is one this class implements."""
        if self.model_type != 'lightgbm':
            raise ValueError(
                "unsupported model_type: {!r} (only 'lightgbm' is implemented)".format(self.model_type)
            )

    def _early_stopping_kwargs(self):
        """Return the early-stopping/logging arguments the installed LightGBM accepts.

        Releases whose ``lightgbm.train`` still has ``early_stopping_rounds`` get
        exactly the original keyword arguments; releases that removed them
        (LightGBM 4.x) get the equivalent callbacks.
        """
        import inspect  # local import: only needed for this compatibility probe

        if 'early_stopping_rounds' in inspect.signature(lightgbm.train).parameters:
            return {
                'early_stopping_rounds': self.lightgbm_early_stopping_rounds,
                'verbose_eval': 100,
            }
        return {
            'callbacks': [
                lightgbm.early_stopping(self.lightgbm_early_stopping_rounds),
                lightgbm.log_evaluation(100),
            ],
        }

    @timeit
    def fit(self, x, y):
        """Train on ``x``/``y``, holding out part of the rows for early stopping.

        ``x`` must expose ``.shape`` and ``.columns``; the column labels are
        remembered for ``evaluate``. The internal hold-out split is seeded with
        ``model_params['seed']`` when present, so repeated fits on the same
        data use the same validation rows.
        """
        self._require_supported_model_type()
        print('entered fit, x shape: {}'.format(x.shape))
        self.transformers_dict = dict()
        self.columns = x.columns

        split_seed = (self.model_params or {}).get('seed')
        x_train, x_val, y_train, y_val = model_selection.train_test_split(
            x, y, random_state=split_seed
        )
        lgtrain = lightgbm.Dataset(x_train, y_train)
        lgvalid = lightgbm.Dataset(x_val, y_val)

        self.model = lightgbm.train(
            self.model_params,
            lgtrain,
            num_boost_round=self.lightgbm_max_iter,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            **self._early_stopping_kwargs()
        )

    @timeit
    def predict(self, x):
        """Return the booster's raw predictions for ``x`` at its best iteration."""
        self._require_supported_model_type()
        return self.model.predict(x, num_iteration=self.model.best_iteration)

    def evaluate(self):
        """Return a DataFrame of gain importances, most important feature first.

        Columns are ``'column'`` (labels remembered by ``fit``) and
        ``'feature_importance'``.
        """
        self._require_supported_model_type()
        gains = self.model.feature_importance('gain', iteration=self.model.best_iteration)
        # Explicit column names keep the frame sortable even with zero features.
        table = pd.DataFrame(
            list(zip(self.columns, gains)),
            columns=['column', 'feature_importance'],
        )
        return table.sort_values('feature_importance', ascending=False)
        

In [17]:

def gbm_rfe(x, y, lgbm_params, step_size=.01, min_features=8, score_type = 'accuracy', random_state=None):
    """Recursive feature elimination driven by LightGBM gain importances.

    Each round fits a fresh ``Model`` on the current feature list, scores it on
    a held-out split, then keeps the highest-gain features (dropping any with
    zero gain) for the next round. The loop stops once the list has shrunk to
    ``min_features``; that final, smallest list is not itself scored.

    Parameters
    ----------
    x, y : feature frame and target; columns listed in ``cols_to_drop`` are ignored.
    lgbm_params : parameter dict passed to ``Model('lightgbm', ...)``.
    step_size : fraction of the current features to discard per round. At least
        one feature is always discarded, so the loop terminates for any value.
    min_features : stop shrinking once this many features remain.
    score_type : ``'accuracy'`` or ``'r2_score'``; the metric used both for
        selecting the winner and for the per-round progress line.
    random_state : optional seed for the train/validation split.

    Returns
    -------
    list
        The scored feature list with the best validation score (the earliest
        one on ties). If no round ran because there were already no more than
        ``min_features`` candidates, the candidate list is returned unchanged.

    Raises
    ------
    ValueError
        If ``score_type`` is not a supported metric name.
    """
    scorers = {'accuracy': metrics.accuracy_score, 'r2_score': metrics.r2_score}
    if score_type not in scorers:
        raise ValueError(
            'score_type must be one of {}, got {!r}'.format(sorted(scorers), score_type)
        )
    score = scorers[score_type]

    features = [i for i in x.columns.tolist() if i not in cols_to_drop]
    x_train, x_val, y_train, y_val = model_selection.train_test_split(
        x, y, random_state=random_state
    )
    feature_result_dict = dict()

    while len(features) > min_features:
        # Cap at len - 1 so a tiny/zero step_size cannot stall the loop.
        shrunk = min(int(len(features) * (1 - step_size)), len(features) - 1)
        next_step_num_of_features = max(shrunk, min_features)

        model = Model('lightgbm', lgbm_params)
        model.fit(x_train[features], y_train)

        preds_train = np.rint(model.predict(x_train[features])).astype(int)
        preds_val = np.rint(model.predict(x_val[features])).astype(int)

        # Report and record with the metric that was asked for; accuracy_score
        # is not applicable to every target when score_type is 'r2_score'.
        val_score = score(y_val, preds_val)
        feature_result_dict[tuple(features)] = val_score
        print(len(features), score(y_train, preds_train), val_score)

        importances = model.evaluate().sort_values('feature_importance', ascending=False)
        importances = importances[importances['feature_importance'] > 0]
        features = importances['column'].tolist()[:next_step_num_of_features]

    if not feature_result_dict:
        # Nothing to eliminate: no round was run.
        return features

    # max() keeps the first maximal entry, and scores may legitimately be <= 0.
    best_features, _ = max(feature_result_dict.items(), key=lambda item: item[1])
    return list(best_features)


In [18]:

def select_linear_features(x_train_glm, y_train, max_feature_corr):
    """Greedily pick features that correlate with the target but not with each other.

    Candidate columns (those not in ``cols_to_drop``) are ranked by the
    absolute correlation coefficient that ``stats.linregress`` reports against
    ``y_train``. Walking that ranking from strongest to weakest, a column is
    kept unless its absolute correlation with an already-kept column exceeds
    ``max_feature_corr``.

    Columns with fewer than two distinct values, or whose correlation with the
    target is not finite, are skipped: their correlation is undefined and would
    otherwise make the ranking order arbitrary.

    Parameters
    ----------
    x_train_glm : DataFrame of candidate features.
    y_train : target values aligned with the rows of ``x_train_glm``.
    max_feature_corr : largest absolute pairwise correlation tolerated between
        two selected features.

    Returns
    -------
    list
        Selected column names, ordered from strongest to weakest target
        correlation; empty when there are no usable candidates.
    """
    ranked = []
    for col in x_train_glm.columns:
        if col in cols_to_drop:
            continue
        if x_train_glm[col].nunique(dropna=False) <= 1:
            continue
        r_value = stats.linregress(x_train_glm[col], y_train)[2]
        if np.isfinite(r_value):
            ranked.append((col, abs(r_value)))
    # Stable sort: equally correlated columns keep their original column order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    # A list (not a set) keeps the result order deterministic across runs.
    selected = []
    for col, _ in ranked:
        # any() stops at the first already-selected column that is too similar.
        redundant = any(
            abs(stats.linregress(x_train_glm[col], x_train_glm[kept])[2]) > max_feature_corr
            for kept in selected
        )
        if not redundant:
            selected.append(col)
    return selected


In [19]:
# Load the feature table, then separate the two targets from the predictors.
feature_df = load_general_feature_file()
y1, y2 = feature_df['win'], feature_df['score_diff']
# 'key' is removed along with both targets so it is never used as a predictor.
x = feature_df.drop(columns=['win', 'score_diff', 'key'])

function: 'load_general_feature_file' starting
function: 'load_general_feature_file' finished in  2.86 seconds


In [22]:
# LightGBM settings for the binary classifier trained on the 'win' target (y1).
# The "data_random_seed" entry used to appear twice; the duplicate is removed
# (the resulting dict is unchanged).
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    "learning_rate": 0.01,
    "max_depth": -1,
    'num_leaves': 31,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    'bagging_freq': 1,
    "seed":1,
    "data_random_seed":1,
    "bagging_seed":1,
}


model = Model('lightgbm', lgbm_params)
# Seed the hold-out split with the same seed as the booster so the reported
# test accuracy is reproducible from a fresh kernel.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y1, random_state=lgbm_params['seed']
)


In [23]:
# classification_columns = select_linear_features(x_train, y_train, .9)
# x_train.shape, len(classification_columns)


In [24]:
# Run recursive feature elimination for the classifier, then show the training
# frame's shape next to the number of features that were selected.
classification_columns = gbm_rfe(
    x_train,
    y_train,
    lgbm_params,
    step_size=.01,
    min_features=8,
    score_type='accuracy',
)
x_train.shape, len(classification_columns)



function: 'fit' starting
entered fit, x shape: (5018, 2500)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.151209	valid's binary_error: 0.360159
Early stopping, best iteration is:
[52]	train's binary_error: 0.179378	valid's binary_error: 0.350598
function: 'fit' finished in  12.70 seconds
function: 'predict' starting
function: 'predict' finished in  0.39 seconds
function: 'predict' starting
function: 'predict' finished in  0.35 seconds
2500 0.7777999202869669 0.630005977286312
function: 'fit' starting
entered fit, x shape: (5018, 886)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.163168	valid's binary_error: 0.355378
Early stopping, best iteration is:
[51]	train's binary_error: 0.190805	valid's binary_error: 0.34502
function: 'fit' finished in  5.22 seconds
function: 'predict' starting
function: 'predict' finished in  0.22 seconds
function: 'predict' starting
function: 'predict' finished in  0.11

function: 'predict' starting
function: 'predict' finished in  0.03 seconds
function: 'predict' starting
function: 'predict' finished in  0.03 seconds
190 0.7803905938620964 0.625821876867902
function: 'fit' starting
entered fit, x shape: (5018, 188)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.18363	valid's binary_error: 0.348207
Early stopping, best iteration is:
[28]	train's binary_error: 0.218708	valid's binary_error: 0.337849
function: 'fit' finished in  1.37 seconds
function: 'predict' starting
function: 'predict' finished in  0.03 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
188 0.7514946193702671 0.6228332337118948
function: 'fit' starting
entered fit, x shape: (5018, 173)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.184162	valid's binary_error: 0.322709
Early stopping, best iteration is:
[43]	train's binary_error: 0.212065	valid's binary_error: 0.3

function: 'predict' starting
function: 'predict' finished in  0.02 seconds
95 0.7692307692307693 0.6240286909742977
function: 'fit' starting
entered fit, x shape: (5018, 94)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.191602	valid's binary_error: 0.337052
[200]	train's binary_error: 0.154664	valid's binary_error: 0.333865
Early stopping, best iteration is:
[165]	train's binary_error: 0.164762	valid's binary_error: 0.326693
function: 'fit' finished in  1.85 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
94 0.79473893981666 0.6210400478182905
function: 'fit' starting
entered fit, x shape: (5018, 93)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.199043	valid's binary_error: 0.341833
[200]	train's binary_error: 0.157056	valid's binary_error: 0.328287
Early stopping, best iteration is:
[19

[300]	train's binary_error: 0.124635	valid's binary_error: 0.323506
Early stopping, best iteration is:
[280]	train's binary_error: 0.131012	valid's binary_error: 0.318725
function: 'fit' finished in  1.26 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
65 0.8220406536468713 0.6323968918111178
function: 'fit' starting
entered fit, x shape: (5018, 64)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.216317	valid's binary_error: 0.32988
Early stopping, best iteration is:
[21]	train's binary_error: 0.242626	valid's binary_error: 0.318725
function: 'fit' finished in  0.34 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
64 0.7383419689119171 0.6359832635983264
function: 'fit' starting
entered fit, x shape: (5018, 63)
Training until v

function: 'predict' starting
function: 'predict' finished in  0.02 seconds
51 0.7744121163810282 0.6306037059175135
function: 'fit' starting
entered fit, x shape: (5018, 50)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.208079	valid's binary_error: 0.344223
[200]	train's binary_error: 0.17114	valid's binary_error: 0.341833
[300]	train's binary_error: 0.138453	valid's binary_error: 0.340239
Early stopping, best iteration is:
[273]	train's binary_error: 0.145894	valid's binary_error: 0.332271
function: 'fit' finished in  1.84 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
50 0.8074930251096054 0.6228332337118948
function: 'fit' starting
entered fit, x shape: (5018, 49)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.211533	valid's binary_error: 0.315538
[200]	train's binary_error: 0.178049	

KeyboardInterrupt: 

In [26]:
# Fit on the training split, round the predicted scores to 0/1 labels and
# report accuracy on the held-out test split.
model.fit(x_train, y_train)
preds = np.rint(model.predict(x_test)).astype(int)
metrics.accuracy_score(y_test, preds)

function: 'fit' starting
entered fit, x shape: (6691, 2500)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.178756	valid's binary_error: 0.365212
Early stopping, best iteration is:
[72]	train's binary_error: 0.195098	valid's binary_error: 0.353855
function: 'fit' finished in  15.93 seconds
function: 'predict' starting
function: 'predict' finished in  0.48 seconds


0.6405199462124608

In [36]:
# First 100 rows of the gain-importance table for whatever `model` is bound to.
# NOTE(review): this cell's execution count (36) is later than the regression
# cells further down, and its saved table matches the regression cell's table
# exactly, so the saved output appears to come from the regression model.
# Re-run the notebook top to bottom to confirm.
model.evaluate().iloc[:100]

Unnamed: 0,column,feature_importance
1434,team_pregame_rating_0_diff,3250996.0
1850,home_diff,1238582.0
2089,team_score_diff_past_20_game_avg_diff,899765.2
1851,team_player_game_aggregate_plus_minus_avg_past_20_game_avg_diff,800080.9
5,home,379213.8
1276,team_score_diff_past_5_game_avg_diff,231219.4
2104,team_player_game_aggregate_plus_minus_avg_past_5_game_avg_diff,215971.1
1759,team_score_diff_past_5_game_max_diff,101691.7
2496,team_pregame_rating_1_diff,95695.86
633,team_fg3a_past_5_game_max,91383.85


In [28]:
# LightGBM settings for the regressor trained on the 'score_diff' target (y2).
# The "data_random_seed" entry used to appear twice; the duplicate is removed
# (the resulting dict is unchanged).
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    "learning_rate": 0.01,
    "max_depth": -1,
    'num_leaves': 31,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    'bagging_freq': 1,
    "seed":1,
    "data_random_seed":1,
    "bagging_seed":1,
}
model = Model('lightgbm', lgbm_params)
# Seed the hold-out split with the same seed as the booster so the reported
# test score is reproducible from a fresh kernel.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y2, random_state=lgbm_params['seed']
)


In [29]:
# reg_columns = select_linear_features(x_train, y_train, .9)
# x_train.shape, len(reg_columns)

In [30]:
# reg_columns = gbm_rfe(x_train, y_train, lgbm_params, step_size=.1, min_features=8, score_type = 'r2_score')
# x_train.shape, len(reg_columns)

In [32]:
# Fit on the training split, round the predictions to whole numbers and report
# R^2 on the held-out test split.
model.fit(x_train, y_train)
preds = np.rint(model.predict(x_test)).astype(int)
metrics.r2_score(y_test, preds)

function: 'fit' starting
entered fit, x shape: (6691, 2500)
Training until validation scores don't improve for 100 rounds
[100]	train's l1: 9.62347	valid's l1: 10.3959
[200]	train's l1: 8.5448	valid's l1: 10.2348
[300]	train's l1: 7.70795	valid's l1: 10.2079
[400]	train's l1: 6.99903	valid's l1: 10.2016
[500]	train's l1: 6.37406	valid's l1: 10.2122
Early stopping, best iteration is:
[401]	train's l1: 6.9925	valid's l1: 10.2003
function: 'fit' finished in  43.51 seconds
function: 'predict' starting
function: 'predict' finished in  0.40 seconds


0.1522225961667436

In [35]:
# First 100 rows of the gain-importance table for the currently bound `model`.
model.evaluate().iloc[:100]

Unnamed: 0,column,feature_importance
1434,team_pregame_rating_0_diff,3250996.0
1850,home_diff,1238582.0
2089,team_score_diff_past_20_game_avg_diff,899765.2
1851,team_player_game_aggregate_plus_minus_avg_past_20_game_avg_diff,800080.9
5,home,379213.8
1276,team_score_diff_past_5_game_avg_diff,231219.4
2104,team_player_game_aggregate_plus_minus_avg_past_5_game_avg_diff,215971.1
1759,team_score_diff_past_5_game_max_diff,101691.7
2496,team_pregame_rating_1_diff,95695.86
633,team_fg3a_past_5_game_max,91383.85
