In [1]:
import sys
sys.path.append('/home/td/Documents/sports_predictor/nba')
sys.path.append('/home/td/Documents/sports_predictor')

import pandas as pd
import lightgbm
import pandas
from sklearn import metrics, model_selection
from nba.common import (
    timeit,
)
import numpy as np
from nba.data_pipeline import load_general_feature_file
from scipy import stats
import copy

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
pd.options.display.max_colwidth = 1000


Using TensorFlow backend.


In [2]:
# Column names that gbm_rfe and select_linear_features exclude from selection (none by default).
cols_to_drop = list()

In [3]:

class Model():
    """Wrapper that trains, applies and inspects a LightGBM booster.

    Only ``model_type == 'lightgbm'`` is implemented. Any other value makes
    ``fit``, ``predict`` and ``evaluate`` raise ``ValueError`` instead of
    silently doing nothing and returning ``None``.
    """

    SUPPORTED_MODEL_TYPES = ('lightgbm',)

    def __init__(self, model_type, model_params, max_iter = 10000, early_stopping_rounds = 100,
                 random_state = None):
        """Store the configuration; no training happens here.

        Args:
            model_type: backend name; must be one of ``SUPPORTED_MODEL_TYPES``
                by the time ``fit``/``predict``/``evaluate`` is called.
            model_params: parameter dict handed unchanged to ``lightgbm.train``.
            max_iter: value passed as ``num_boost_round``.
            early_stopping_rounds: value passed as ``early_stopping_rounds``.
            random_state: seed forwarded to the train/validation split made
                inside ``fit``. ``None`` (the default) keeps the split
                unseeded, exactly as before this parameter existed.
        """
        self.model_type = model_type
        self.model_params = model_params
        # Reset on every fit; not otherwise used by this class.
        self.transformers_dict = dict()
        self.lightgbm_early_stopping_rounds = early_stopping_rounds
        self.lightgbm_max_iter = max_iter
        self.random_state = random_state
        # Populated by fit(); defined here so the attributes always exist.
        self.model = None
        self.columns = None

    def _require_supported_type(self):
        """Raise ValueError when ``model_type`` has no implementation."""
        if self.model_type not in self.SUPPORTED_MODEL_TYPES:
            raise ValueError(
                'unsupported model_type: {!r} (supported: {})'.format(
                    self.model_type, ', '.join(self.SUPPORTED_MODEL_TYPES)))

    def _require_fitted(self):
        """Raise RuntimeError when the model is used before ``fit``."""
        if self.model is None:
            raise RuntimeError('Model has not been fitted yet; call fit() first')

    @timeit
    def fit(self, x, y):
        """Train the booster on ``(x, y)``.

        ``x`` must expose ``.shape`` and ``.columns``. The data is split with
        ``model_selection.train_test_split`` (default proportions); the
        held-out part is used only as the early-stopping validation set.

        Raises:
            ValueError: if ``model_type`` is not supported.
        """
        self._require_supported_type()
        print('entered fit, x shape: {}'.format(x.shape))
        self.transformers_dict = dict()
        # Remembered so evaluate() can pair importances with column names.
        self.columns = x.columns

        x_train, x_val, y_train, y_val = model_selection.train_test_split(
            x, y, random_state=self.random_state)
        lgtrain = lightgbm.Dataset(x_train, y_train)
        lgvalid = lightgbm.Dataset(x_val, y_val)

        self.model = lightgbm.train(
            self.model_params,
            lgtrain,
            num_boost_round=self.lightgbm_max_iter,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            early_stopping_rounds=self.lightgbm_early_stopping_rounds,
            verbose_eval=100
        )

    @timeit
    def predict(self, x):
        """Return the booster's predictions for ``x`` at its best iteration.

        Raises:
            ValueError: if ``model_type`` is not supported.
            RuntimeError: if called before ``fit``.
        """
        self._require_supported_type()
        self._require_fitted()
        return self.model.predict(x, num_iteration=self.model.best_iteration)

    def evaluate(self):
        """Return gain-based feature importances, largest first.

        Returns:
            DataFrame with columns ``column`` and ``feature_importance``,
            sorted by ``feature_importance`` descending.

        Raises:
            ValueError: if ``model_type`` is not supported.
            RuntimeError: if called before ``fit``.
        """
        self._require_supported_type()
        self._require_fitted()
        gains = self.model.feature_importance('gain', iteration=self.model.best_iteration)
        output = [
            {'column': column, 'feature_importance': gain}
            for column, gain in zip(self.columns, gains)
        ]
        return pd.DataFrame.from_dict(output).sort_values('feature_importance', ascending=False)
        

In [4]:

def gbm_rfe(x, y, lgbm_params, step_size=.01, min_features=8, score_type = 'accuracy', random_state=None):
    """Recursive feature elimination driven by LightGBM gain importances.

    The data is split once into train/validation parts. While more than
    ``min_features`` features remain, a ``Model`` is fitted on the current
    features, scored on the validation part, and the feature list is reduced
    to the most important features with strictly positive importance. The
    feature set with the best validation score is returned.

    Note: the loop stops as soon as the feature count is no longer above
    ``min_features``, so that last, smallest set is never scored itself.

    Args:
        x: feature frame; columns listed in the notebook-level
            ``cols_to_drop`` are ignored.
        y: target aligned with ``x``.
        lgbm_params: parameter dict passed to ``Model``.
        step_size: fraction of features to drop per round; must be > 0.
        min_features: elimination stops once this many (or fewer) remain.
        score_type: ``'accuracy'`` or ``'r2_score'``.
        random_state: optional seed for the train/validation split; ``None``
            keeps the previous unseeded behaviour.

    Returns:
        list of column names for the best-scoring feature set. If no round
        ran (``x`` already has at most ``min_features`` usable columns), the
        usable columns are returned unchanged.

    Raises:
        ValueError: on an unknown ``score_type`` or a non-positive ``step_size``.
    """
    # Validate first so bad arguments fail fast instead of returning None
    # (unknown score_type) or looping forever (step_size <= 0).
    if score_type not in ('accuracy', 'r2_score'):
        raise ValueError(
            "score_type must be 'accuracy' or 'r2_score', got {!r}".format(score_type))
    if step_size <= 0:
        raise ValueError('step_size must be > 0, got {!r}'.format(step_size))

    score_fn = metrics.accuracy_score if score_type == 'accuracy' else metrics.r2_score

    features = [i for i in x.columns.tolist() if i not in cols_to_drop]
    x_train, x_val, y_train, y_val = model_selection.train_test_split(
        x, y, random_state=random_state)
    feature_result_dict = dict()

    while len(features) > min_features:
        next_step_num_of_features = max(int(len(features)*(1-step_size)), min_features)
        model = Model('lightgbm', lgbm_params, early_stopping_rounds= 10)
        model.fit(x_train[features], y_train)

        # Predictions are rounded to integers before scoring, matching how the
        # notebook's evaluation cells score the final models.
        preds_train = np.rint(model.predict(x_train[features])).astype(int)
        preds_val = np.rint(model.predict(x_val[features])).astype(int)

        # Report progress with the same metric that drives the selection.
        train_score = score_fn(y_train, preds_train)
        val_score = score_fn(y_val, preds_val)
        feature_result_dict[tuple(features)] = val_score
        print(len(features), train_score, val_score)

        importances = model.evaluate().sort_values('feature_importance', ascending = False)
        importances = importances[importances['feature_importance'] > 0]
        features = importances['column'].tolist()[:next_step_num_of_features]

    if not feature_result_dict:
        return features

    # max() keeps the first best entry on ties and also works when every
    # score is <= 0 (possible for R^2), where a 0-initialised scan found nothing.
    best_features = max(feature_result_dict, key=feature_result_dict.get)
    return list(best_features)


In [5]:

def select_linear_features(x_train_glm, y_train, max_feature_corr):
    """Greedily pick features correlated with the target but not with each other.

    Columns (excluding the notebook-level ``cols_to_drop``) are ranked by the
    absolute correlation coefficient of a simple linear regression against
    ``y_train``. Walking that ranking from strongest to weakest, a column is
    kept only if its absolute correlation with every already-kept column is
    not above ``max_feature_corr``.

    Args:
        x_train_glm: DataFrame of candidate features.
        y_train: target values aligned with ``x_train_glm``.
        max_feature_corr: largest absolute correlation allowed between two
            selected features.

    Returns:
        list of selected column names, ordered from strongest to weakest
        correlation with the target. Empty when there are no usable columns.
    """
    def _abs_r(a, b):
        # Absolute correlation coefficient; NaN when it cannot be computed
        # (e.g. a constant column, which some scipy versions reject with
        # ValueError and others report as NaN).
        try:
            return abs(stats.linregress(a, b).rvalue)
        except ValueError:
            return float('nan')

    candidates = [col for col in x_train_glm.columns if col not in cols_to_drop]

    feature_target_corr = []
    for col in candidates:
        r_value = _abs_r(x_train_glm[col], y_train)
        # NaN cannot be ranked and would make the sort order unpredictable.
        if not np.isnan(r_value):
            feature_target_corr.append((col, r_value))
    feature_target_corr.sort(key = lambda pair: pair[1], reverse = True)

    selected = []
    for col, _ in feature_target_corr:
        # any() stops at the first conflicting feature, so no further
        # regressions are run for a column that is already rejected.
        too_correlated = any(
            _abs_r(x_train_glm[col], x_train_glm[kept]) > max_feature_corr
            for kept in selected
        )
        if not too_correlated:
            selected.append(col)

    return selected


In [6]:
# Load the feature table, pull out the two targets, and keep the rest as predictors.
feature_df = load_general_feature_file()
y1 = feature_df['win']
y2 = feature_df['score_diff']
x = feature_df.drop(columns=['win', 'score_diff', 'key'])

function: 'load_general_feature_file' starting
function: 'load_general_feature_file' finished in  18.62 seconds


In [7]:
# LightGBM settings for the binary 'win' target.
lgbm_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_error',
    learning_rate=0.01,
    max_depth=-1,
    num_leaves=127,
)

# Same settings with a larger learning rate, used for the feature-elimination fits.
lgbm_params_fast = {**lgbm_params, 'learning_rate': .1}

# Hold out a test split of the 'win' target and create the model to be fitted later.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y1)
model = Model('lightgbm', lgbm_params)


In [8]:
# Recursive feature elimination for the classifier, scored by validation accuracy.
classification_columns = gbm_rfe(
    x_train,
    y_train,
    lgbm_params_fast,
    step_size=.01,
    min_features=4,
    score_type='accuracy',
)
x_train.shape, len(classification_columns)


function: 'fit' starting
entered fit, x shape: (5465, 14436)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[23]	train's binary_error: 0.00146413	valid's binary_error: 0.348208
function: 'fit' finished in  66.34 seconds
function: 'predict' starting
function: 'predict' finished in  4.81 seconds
function: 'predict' starting
function: 'predict' finished in  4.48 seconds
14436 0.9118023787740165 0.637211855104281
function: 'fit' starting
entered fit, x shape: (5465, 2328)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[42]	train's binary_error: 0.000976086	valid's binary_error: 0.356255
function: 'fit' finished in  18.43 seconds
function: 'predict' starting
function: 'predict' finished in  0.37 seconds
function: 'predict' starting
function: 'predict' finished in  0.35 seconds
2328 0.9101555352241537 0.6487376509330406
function: 'fit' starting
entered fit, x shape: (5465, 1932)
Training until v

336 0.905032021957914 0.6383095499451152
function: 'fit' starting
entered fit, x shape: (5465, 332)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[5]	train's binary_error: 0.0700342	valid's binary_error: 0.355523
function: 'fit' finished in  0.99 seconds
function: 'predict' starting
function: 'predict' finished in  0.05 seconds
function: 'predict' starting
function: 'predict' finished in  0.04 seconds
332 0.8585544373284538 0.6383095499451152
function: 'fit' starting
entered fit, x shape: (5465, 272)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[14]	train's binary_error: 0.0156174	valid's binary_error: 0.349671
function: 'fit' finished in  1.31 seconds
function: 'predict' starting
function: 'predict' finished in  0.05 seconds
function: 'predict' starting
function: 'predict' finished in  0.04 seconds
272 0.9008234217749314 0.6394072447859495
function: 'fit' starting
entered fit, x shape:

181 0.9114364135407136 0.641602634467618
function: 'fit' starting
entered fit, x shape: (5465, 179)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[35]	train's binary_error: 0.00146413	valid's binary_error: 0.337966
function: 'fit' finished in  0.84 seconds
function: 'predict' starting
function: 'predict' finished in  0.04 seconds
function: 'predict' starting
function: 'predict' finished in  0.05 seconds
179 0.9143641354071363 0.6443468715697036
function: 'fit' starting
entered fit, x shape: (5465, 177)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[11]	train's binary_error: 0.0317228	valid's binary_error: 0.322604
function: 'fit' finished in  0.39 seconds
function: 'predict' starting
function: 'predict' finished in  0.03 seconds
function: 'predict' starting
function: 'predict' finished in  0.03 seconds
177 0.8955169258920402 0.6350164654226125
function: 'fit' starting
entered fit, x shap

143 0.8922232387923147 0.637211855104281
function: 'fit' starting
entered fit, x shape: (5465, 141)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[31]	train's binary_error: 0.00170815	valid's binary_error: 0.321141
function: 'fit' finished in  1.69 seconds
function: 'predict' starting
function: 'predict' finished in  0.03 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
141 0.9183897529734675 0.6448957189901208
function: 'fit' starting
entered fit, x shape: (5465, 139)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[29]	train's binary_error: 0.00170815	valid's binary_error: 0.326993
function: 'fit' finished in  0.57 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
139 0.9169258920402562 0.641602634467618
function: 'fit' starting
entered fit, x shap

107 0.9086916742909423 0.6410537870472008
function: 'fit' starting
entered fit, x shape: (5465, 105)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[12]	train's binary_error: 0.0431918	valid's binary_error: 0.312363
function: 'fit' finished in  0.52 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
105 0.8894784995425434 0.6427003293084522
function: 'fit' starting
entered fit, x shape: (5465, 103)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[12]	train's binary_error: 0.0380673	valid's binary_error: 0.316752
function: 'fit' finished in  0.69 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
103 0.8922232387923147 0.6256860592755215
function: 'fit' starting
entered fit, x shap

entered fit, x shape: (5465, 84)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[18]	train's binary_error: 0.0175695	valid's binary_error: 0.342356
function: 'fit' finished in  0.27 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
84 0.9011893870082343 0.6558726673984633
function: 'fit' starting
entered fit, x shape: (5465, 83)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[30]	train's binary_error: 0.00219619	valid's binary_error: 0.32041
function: 'fit' finished in  0.85 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
83 0.9182067703568161 0.6421514818880352
function: 'fit' starting
entered fit, x shape: (5465, 82)
Training until validation scores don't improve for 10 rou

Early stopping, best iteration is:
[12]	train's binary_error: 0.0512445	valid's binary_error: 0.334309
function: 'fit' finished in  0.61 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
66 0.8779505946935041 0.654774972557629
function: 'fit' starting
entered fit, x shape: (5465, 65)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[5]	train's binary_error: 0.108834	valid's binary_error: 0.342356
function: 'fit' finished in  0.48 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
65 0.8327538883806038 0.6245883644346871
function: 'fit' starting
entered fit, x shape: (5465, 64)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[9]	train's binary_error: 0.0783309	valid's binary_error: 

function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
48 0.9121683440073193 0.6448957189901208
function: 'fit' starting
entered fit, x shape: (5465, 47)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[18]	train's binary_error: 0.0253782	valid's binary_error: 0.324067
function: 'fit' finished in  0.73 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
47 0.8999085086916743 0.6525795828759605
function: 'fit' starting
entered fit, x shape: (5465, 46)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[35]	train's binary_error: 0.00268424	valid's binary_error: 0.32553
function: 'fit' finished in  1.13 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
function: 'pred

function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
30 0.910704483074108 0.6454445664105378
function: 'fit' starting
entered fit, x shape: (5465, 29)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[36]	train's binary_error: 0.00512445	valid's binary_error: 0.316752
function: 'fit' finished in  0.25 seconds
function: 'predict' starting
function: 'predict' finished in  0.02 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
29 0.9169258920402562 0.6350164654226125
function: 'fit' starting
entered fit, x shape: (5465, 28)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[29]	train's binary_error: 0.011469	valid's binary_error: 0.328456
function: 'fit' finished in  0.85 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
function: 'predict' starting
function: 'predi

function: 'predict' starting
function: 'predict' finished in  0.00 seconds
12 0.7785910338517841 0.6437980241492866
function: 'fit' starting
entered fit, x shape: (5465, 11)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[31]	train's binary_error: 0.0395315	valid's binary_error: 0.337235
function: 'fit' finished in  0.13 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
function: 'predict' starting
function: 'predict' finished in  0.00 seconds
11 0.8860018298261665 0.6399560922063666
function: 'fit' starting
entered fit, x shape: (5465, 10)
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[5]	train's binary_error: 0.197413	valid's binary_error: 0.383321
function: 'fit' finished in  0.11 seconds
function: 'predict' starting
function: 'predict' finished in  0.01 seconds
function: 'predict' starting
function: 'predict' finished in  0.00 seconds
10 0.756084172003

((7287, 14436), 234)

In [9]:
# Refit on the selected columns, round the model output to integer labels, and score on the test split.
model.fit(x_train[classification_columns], y_train)
preds = np.rint(model.predict(x_test[classification_columns])).astype(int)
metrics.accuracy_score(y_test, preds)

function: 'fit' starting
entered fit, x shape: (7287, 234)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.0570906	valid's binary_error: 0.362239
[200]	train's binary_error: 0.010796	valid's binary_error: 0.351262
[300]	train's binary_error: 0.00182983	valid's binary_error: 0.349616
Early stopping, best iteration is:
[235]	train's binary_error: 0.0073193	valid's binary_error: 0.34303
function: 'fit' finished in  12.88 seconds
function: 'predict' starting
function: 'predict' finished in  0.05 seconds


0.6270069987649238

In [10]:
# Show the 100 most important features of the fitted model.
model.evaluate().iloc[:100]

Unnamed: 0,column,feature_importance
0,team_pregame_rating_0_diff_vs_opponent_feature,26234.969584
1,home_diff_vs_opponent_feature,10834.238347
2,team_aggregate_past_50_game_skew_win_diff_vs_opponent_feature,3621.864334
8,team_aggregate_past_50_game_avg_player_stats_aggregated_by_game_plus_minus_max_diff_vs_opponent_feature,2847.901426
3,team_aggregate_past_20_game_avg_player_stats_aggregated_by_game_plus_minus_median_diff_vs_opponent_feature,2232.200192
39,team_aggregate_past_5_game_avg_player_stats_aggregated_by_game_ast_pct_var_diff_vs_opponent_feature,2221.466403
25,team_aggregate_past_50_game_min_player_stats_aggregated_by_game_trb_skew_diff_vs_opponent_feature,2207.079988
23,team_aggregate_past_10_game_var_player_stats_aggregated_by_game_ts_pct_avg_diff_vs_opponent_feature,1974.380166
94,team_aggregate_past_1_game_avg_player_stats_aggregated_by_game_blk_pct_max_diff_vs_opponent_feature,1843.831247
7,team_aggregate_past_10_game_max_player_stats_aggregated_by_game_fg_pct_avg_diff_vs_opponent_feature,1837.428143


In [11]:
# LightGBM settings for the 'score_diff' regression target.
lgbm_params = dict(
    boosting_type='gbdt',
    objective='l2',
    metric='mae',
    learning_rate=0.01,
    max_depth=-1,
    num_leaves=127,
)

# Same settings with a larger learning rate, used for the feature-elimination fits.
lgbm_params_fast = {**lgbm_params, 'learning_rate': .1}

# Hold out a test split of the 'score_diff' target and create the model to be fitted later.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y2)
model = Model('lightgbm', lgbm_params)


In [12]:
# Recursive feature elimination for the regressor, scored by validation R^2.
reg_columns = gbm_rfe(
    x_train,
    y_train,
    lgbm_params_fast,
    step_size=.01,
    min_features=4,
    score_type='r2_score',
)
x_train.shape, len(reg_columns)

function: 'fit' starting
entered fit, x shape: (5465, 14436)


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/td/anaconda3/envs/sports_predictor/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-a60330100772>", line 1, in <module>
    reg_columns = gbm_rfe(x_train, y_train, lgbm_params_fast, step_size=.01, min_features=4, score_type = 'r2_score')
  File "<ipython-input-4-baf2edd796d5>", line 9, in gbm_rfe
    model.fit(x_train[features], y_train)
  File "/home/td/Documents/sports_predictor/nba/common.py", line 53, in timed
    result = method(*args, **kw)
  File "<ipython-input-3-9f22ee4bef3a>", line 28, in fit
    verbose_eval=100
  File "/home/td/anaconda3/envs/sports_predictor/lib/python3.7/site-packages/lightgbm/engine.py", line 249, in train
    booster.update(fobj=fobj)
  File "/home/td/anaconda3/envs/sports_predictor/lib/python3.7/site-packages/lightgbm/basic.py", line 1926, in update
    ctypes.byref(is_finished)))
Keyboard

KeyboardInterrupt: 

In [None]:
# Fit the regressor and score rounded predictions on the test split.
# NOTE(review): unlike the classification cell, this trains on every column
# rather than on reg_columns — confirm that is intended.
model.fit(x_train, y_train)
preds = np.rint(model.predict(x_test)).astype(int)
metrics.r2_score(y_test, preds)

In [None]:
# Show the 100 most important features of the fitted model.
model.evaluate().iloc[:100]