In [1]:
import sys
sys.path.append('/home/td/Documents/sports_predictor/nba')
sys.path.append('/home/td/Documents/sports_predictor')

import pandas as pd
import lightgbm
import pandas
from sklearn import metrics, model_selection
from nba.common import (
    timeit,
)
import numpy as np
from nba.data_pipeline import load_general_feature_file
from scipy import stats

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)


Using TensorFlow backend.


In [2]:
cols_to_drop = []

In [3]:

class Model():
    max_iter = 1000000

    lightgbm_max_iter = 10000
    lightgbm_early_stopping_rounds = 100

    def __init__(self, model_type, model_params):
        self.model_type = model_type
        self.model_params = model_params
        self.transformers_dict = dict()

    @timeit
    def fit(self, x, y):
        print('entered fit, x shape: {}'.format(x.shape))
        self.transformers_dict = dict()
        self.columns = x.columns

        if self.model_type == 'lightgbm':
            x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y)
            lgtrain = lightgbm.Dataset(x_train, y_train)
            lgvalid = lightgbm.Dataset(x_val, y_val)

            self.model = lightgbm.train(
                self.model_params,
                lgtrain,
                num_boost_round=self.lightgbm_max_iter,
                valid_sets=[lgtrain, lgvalid],
                valid_names=['train', 'valid'],
                early_stopping_rounds=self.lightgbm_early_stopping_rounds,
                verbose_eval=100
            )

    @timeit
    def predict(self, x):
        if self.model_type == 'lightgbm':
            return self.model.predict(x, num_iteration=self.model.best_iteration)
            

    def evaluate(self):
        if self.model_type == 'lightgbm':
            output = []

            for i, j in zip(self.columns, self.model.feature_importance('gain', iteration=self.model.best_iteration)):
                output.append({'column': i, 'feature_importance': j})
            return pd.DataFrame.from_dict(output).sort_values('feature_importance', ascending=False)
        

In [4]:

def gbm_rfe(x, y, lgbm_params, step_size=.01, min_features=8, score_type = 'accuracy'):
    features = [i for i in x.columns.tolist() if i not in cols_to_drop]
    x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y)
    feature_result_dict = dict()
   
    while len(features) > min_features:
        next_step_num_of_features = max(int(len(features)*(1-step_size)), min_features)
        model = Model('lightgbm', lgbm_params)
        model.fit(x_train[features], y_train)
        
        preds_train = model.predict(x_train[features])
        preds_train = np.rint(preds_train).astype(int)
                
        preds_val = model.predict(x_val[features])
        preds_val = np.rint(preds_val).astype(int)
        
        if score_type == 'accuracy':
            feature_result_dict[tuple(features)] = metrics.accuracy_score(y_val, preds_val)
        if score_type == 'r2_score':
            feature_result_dict[tuple(features)] = metrics.r2_score(y_val, preds_val) 
        importances = model.evaluate().sort_values('feature_importance', ascending = False)
        print(len(features), metrics.accuracy_score(y_train, preds_train), metrics.accuracy_score(y_val, preds_val))
        importances = importances[importances['feature_importance'] > 0]

        features = importances['column'].tolist()[:next_step_num_of_features]
       
    features_selected = None
    max_score = 0
    for k, v in feature_result_dict.items():
        if v > max_score:
            features_selected = list(k)
            max_score = v
    return features_selected


In [5]:

def select_linear_features(x_train_glm, y_train, max_feature_corr):
    selected_cols_to_normal_target = set()
    feature_target_corr = []
    for i in x_train_glm.columns:
        # if 'accident_table' not in i and i not in cols_to_drop:
        if i not in cols_to_drop:
            slope, intercept, r_value, p_value, std_err = stats.linregress(x_train_glm[i], y_train)
            feature_target_corr.append((i, abs(r_value)))
    feature_target_corr = sorted(feature_target_corr, key = lambda x: x[1], reverse = True)
    selected_cols_to_normal_target.add(feature_target_corr[0][0])
    for i in feature_target_corr:
        too_correlated_to_features = False
        for j in selected_cols_to_normal_target:
            slope, intercept, r_value, p_value, std_err = stats.linregress(x_train_glm[i[0]], x_train_glm[j])
            if abs(r_value) > max_feature_corr:
                too_correlated_to_features = True
        if not too_correlated_to_features:
            selected_cols_to_normal_target.add(i[0])

    selected_cols_to_normal_target = list(selected_cols_to_normal_target)
    return selected_cols_to_normal_target


In [6]:
feature_df = load_general_feature_file()
x = feature_df.drop(['win', 'score_diff', 'key'], axis = 1)
y1 = feature_df['win']
y2 = feature_df['score_diff']

function: 'load_general_feature_file' starting
function: 'load_general_feature_file' finished in  18.26 seconds


In [7]:
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    "learning_rate": 0.01,
    "max_depth": -1,
    'num_leaves': 127,
#     "feature_fraction": 0.8,
#     "bagging_fraction": 0.8,
#     'bagging_freq': 1,
#     "seed":1,
#     "data_random_seed":1,
#     "bagging_seed":1,
#     "data_random_seed":1,
}


model = Model('lightgbm', lgbm_params)
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y1)


In [8]:
# classification_columns = select_linear_features(x_train, y_train, .5)
# x_train.shape, len(classification_columns)


In [9]:
# classification_columns = gbm_rfe(x_train[classification_columns], y_train, lgbm_params, step_size=.01, min_features=8, score_type = 'accuracy')
# x_train.shape, len(classification_columns)



In [10]:
model.fit(x_train, y_train)
preds = model.predict(x_test)
preds = np.rint(preds).astype(int)     
metrics.accuracy_score(y_test, preds)

function: 'fit' starting
entered fit, x shape: (7287, 14436)
Training until validation scores don't improve for 100 rounds
[100]	train's binary_error: 0.176944	valid's binary_error: 0.33315
Early stopping, best iteration is:
[77]	train's binary_error: 0.189021	valid's binary_error: 0.324369
function: 'fit' finished in  92.51 seconds
function: 'predict' starting
function: 'predict' finished in  4.53 seconds


0.6426512968299711

In [11]:
model.evaluate().head(100)

Unnamed: 0,column,feature_importance
8523,team_pregame_rating_0_diff_vs_opponent_feature,9970.126385
12484,home_diff_vs_opponent_feature,2744.812397
12137,team_aggregate_past_20_game_avg_player_stats_aggregated_by_game_plus_minus_avg_diff_vs_opponent_feature,1641.61669
5,home,1469.515202
12552,team_pregame_rating_1_diff_vs_opponent_feature,1054.915209
14367,team_aggregate_past_50_game_avg_score_diff_diff_vs_opponent_feature,775.231407
12091,team_aggregate_past_10_game_avg_score_diff_diff_vs_opponent_feature,759.596796
13903,team_aggregate_past_20_game_avg_player_stats_aggregated_by_game_plus_minus_median_diff_vs_opponent_feature,699.212498
14091,team_aggregate_past_100_game_avg_player_stats_aggregated_by_game_plus_minus_median_diff_vs_opponent_feature,586.068699
13190,team_aggregate_past_50_game_avg_player_stats_aggregated_by_game_plus_minus_median_diff_vs_opponent_feature,520.475107


In [12]:
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'l2',
    'metric': 'mae',
    "learning_rate": 0.01,
    "max_depth": -1,
    'num_leaves': 127,
#     "feature_fraction": 0.8,
#     "bagging_fraction": 0.8,
#     'bagging_freq': 1,
#     "seed":1,
#     "data_random_seed":1,
#     "bagging_seed":1,
#     "data_random_seed":1,
}
model = Model('lightgbm', lgbm_params)
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y2)


In [13]:
# reg_columns = select_linear_features(x_train, y_train, .5)
# x_train.shape, len(reg_columns)

In [14]:
# reg_columns = gbm_rfe(x_train, y_train, lgbm_params, step_size=.01, min_features=8, score_type = 'r2_score')
# x_train.shape, len(reg_columns)

In [15]:
model.fit(x_train, y_train)
preds = model.predict(x_test)
preds = np.rint(preds).astype(int)     
metrics.r2_score(y_test, preds)

function: 'fit' starting
entered fit, x shape: (7287, 14436)
Training until validation scores don't improve for 100 rounds
[100]	train's l1: 9.42156	valid's l1: 10.6624
[200]	train's l1: 8.32499	valid's l1: 10.4122
[300]	train's l1: 7.47248	valid's l1: 10.3605
[400]	train's l1: 6.73694	valid's l1: 10.3294
[500]	train's l1: 6.0877	valid's l1: 10.3486
Early stopping, best iteration is:
[405]	train's l1: 6.70187	valid's l1: 10.3293
function: 'fit' finished in  242.95 seconds
function: 'predict' starting
function: 'predict' finished in  4.53 seconds


0.18875806308237408

In [16]:
model.evaluate().head(100)

Unnamed: 0,column,feature_importance
8523,team_pregame_rating_0_diff_vs_opponent_feature,3009380.0
12484,home_diff_vs_opponent_feature,1200653.0
12137,team_aggregate_past_20_game_avg_player_stats_aggregated_by_game_plus_minus_avg_diff_vs_opponent_feature,693305.5
5,home,498325.8
12091,team_aggregate_past_10_game_avg_score_diff_diff_vs_opponent_feature,271138.1
13903,team_aggregate_past_20_game_avg_player_stats_aggregated_by_game_plus_minus_median_diff_vs_opponent_feature,246256.6
7358,team_aggregate_past_20_game_avg_score_diff_diff_vs_opponent_feature,228321.9
12060,team_aggregate_past_10_game_avg_player_stats_aggregated_by_game_plus_minus_avg_diff_vs_opponent_feature,210819.0
9945,team_aggregate_past_10_game_avg_player_stats_aggregated_by_game_plus_minus_median_diff_vs_opponent_feature,113295.3
13190,team_aggregate_past_50_game_avg_player_stats_aggregated_by_game_plus_minus_median_diff_vs_opponent_feature,108185.0
