In [None]:
import sys
sys.path.append('/home/td/Documents/sports_predictor/nba')
sys.path.append('/home/td/Documents/sports_predictor')

import pandas as pd
import lightgbm
import pandas
from sklearn import metrics, model_selection
from nba.common import (
    timeit,
)
import numpy as np
from nba.data_pipeline import load_general_feature_file
from scipy import stats
import copy

# Raise pandas' display limits (rows, columns, total width, per-cell width)
# for the tables rendered later in the notebook.
for _option, _limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_colwidth', 1000),
):
    pd.set_option(_option, _limit)


In [None]:
# Column names that gbm_rfe and select_linear_features must never consider
# as candidate features (empty by default).
cols_to_drop = list()

In [None]:

class Model():
    """Small wrapper around a boosted-tree model; only ``'lightgbm'`` is implemented.

    Parameters
    ----------
    model_type : str
        Backend name. ``fit`` raises ``ValueError`` for anything but ``'lightgbm'``.
    model_params : dict
        Parameter dict forwarded unchanged to ``lightgbm.train``.
    max_iter : int
        Upper bound on the number of boosting rounds.
    early_stopping_rounds : int
        Number of rounds without improvement on the validation split before
        training stops.
    random_state : int or None
        Seed for the internal train/validation split made by ``fit``.
        ``None`` (the default) keeps the split unseeded.
    """

    def __init__(self, model_type, model_params, max_iter = 10000, early_stopping_rounds = 100,
                 random_state = None):
        self.model_type = model_type
        self.model_params = model_params
        # Created here and reset in fit(); not otherwise used by this class.
        self.transformers_dict = dict()
        self.lightgbm_early_stopping_rounds = early_stopping_rounds
        self.lightgbm_max_iter = max_iter
        self.random_state = random_state

    @timeit
    def fit(self, x, y):
        """Train on ``x``/``y``, holding out part of the data for early stopping.

        ``x`` must expose ``.shape`` and ``.columns``; the column labels are
        stored in ``self.columns`` for later use by ``evaluate``.

        Raises
        ------
        ValueError
            If ``self.model_type`` is not a supported backend.
        """
        print('entered fit, x shape: {}'.format(x.shape))
        if self.model_type != 'lightgbm':
            # Previously an unsupported type made fit() a silent no-op.
            raise ValueError('unsupported model_type: {!r}'.format(self.model_type))

        self.transformers_dict = dict()
        self.columns = x.columns

        x_train, x_val, y_train, y_val = model_selection.train_test_split(
            x, y, random_state=self.random_state
        )
        lgtrain = lightgbm.Dataset(x_train, y_train)
        lgvalid = lightgbm.Dataset(x_val, y_val)

        train_kwargs = dict(
            num_boost_round=self.lightgbm_max_iter,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
        )
        if hasattr(lightgbm, 'log_evaluation'):
            # LightGBM 4.0 removed the early_stopping_rounds / verbose_eval
            # keywords of train(); the callbacks are the supported equivalent.
            train_kwargs['callbacks'] = [
                lightgbm.early_stopping(self.lightgbm_early_stopping_rounds),
                lightgbm.log_evaluation(100),
            ]
        else:
            # Older LightGBM releases without the log_evaluation callback.
            train_kwargs['early_stopping_rounds'] = self.lightgbm_early_stopping_rounds
            train_kwargs['verbose_eval'] = 100

        self.model = lightgbm.train(self.model_params, lgtrain, **train_kwargs)

    @timeit
    def predict(self, x):
        """Return raw model outputs for ``x`` using the best boosting iteration.

        Returns ``None`` when ``self.model_type`` is not ``'lightgbm'``.
        """
        if self.model_type == 'lightgbm':
            return self.model.predict(x, num_iteration=self.model.best_iteration)

    def evaluate(self):
        """Return gain-based feature importances, most important first.

        The result is a DataFrame with the columns ``'column'`` and
        ``'feature_importance'``. Returns ``None`` when ``self.model_type``
        is not ``'lightgbm'``.
        """
        if self.model_type == 'lightgbm':
            gains = self.model.feature_importance(
                importance_type='gain', iteration=self.model.best_iteration
            )
            records = [
                {'column': name, 'feature_importance': gain}
                for name, gain in zip(self.columns, gains)
            ]
            # Explicit columns keep the frame sortable even if there are no records.
            importances = pd.DataFrame(records, columns=['column', 'feature_importance'])
            return importances.sort_values('feature_importance', ascending=False)
        

In [None]:

def gbm_rfe(x, y, lgbm_params, step_size=.01, min_features=8, score_type = 'accuracy'):
    """Recursive feature elimination driven by LightGBM gain importance.

    The data is split once into train/validation parts. While more than
    ``min_features`` features remain, a model is fitted on the current
    feature set, scored on the validation part, and the feature set is
    shrunk to the most important features with non-zero gain. Feature sets
    of size ``min_features`` or smaller are never scored themselves.

    Parameters
    ----------
    x : DataFrame of candidate features (columns in ``cols_to_drop`` are ignored).
    y : target aligned with ``x``.
    lgbm_params : dict passed to ``Model`` for every fit.
    step_size : fraction of the remaining features to remove per round.
    min_features : elimination stops once this many (or fewer) features remain.
    score_type : ``'accuracy'`` or ``'r2_score'``; metric applied to the
        predictions after rounding them to the nearest integer.

    Returns
    -------
    list
        The feature names of the best-scoring evaluated subset (the earliest
        one on ties). If no subset was evaluated because there were at most
        ``min_features`` candidates, the candidate list itself is returned.

    Raises
    ------
    ValueError
        If ``score_type`` is not one of the supported metrics.
    """
    scorers = {'accuracy': metrics.accuracy_score, 'r2_score': metrics.r2_score}
    if score_type not in scorers:
        raise ValueError(
            'score_type must be one of {}, got {!r}'.format(sorted(scorers), score_type)
        )
    score_fn = scorers[score_type]

    features = [i for i in x.columns.tolist() if i not in cols_to_drop]
    x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y)
    feature_result_dict = dict()

    while len(features) > min_features:
        # Always drop at least one feature so the loop terminates even when
        # step_size is too small (or non-positive) to shrink the count.
        shrunk = min(int(len(features) * (1 - step_size)), len(features) - 1)
        next_step_num_of_features = max(shrunk, min_features)

        model = Model('lightgbm', lgbm_params, early_stopping_rounds= 10)
        model.fit(x_train[features], y_train)

        preds_train = np.rint(model.predict(x_train[features])).astype(int)
        preds_val = np.rint(model.predict(x_val[features])).astype(int)

        # Report and record the metric that was actually requested.
        train_score = score_fn(y_train, preds_train)
        val_score = score_fn(y_val, preds_val)
        feature_result_dict[tuple(features)] = val_score
        print(len(features), train_score, val_score)

        importances = model.evaluate().sort_values('feature_importance', ascending = False)
        importances = importances[importances['feature_importance'] > 0]
        features = importances['column'].tolist()[:next_step_num_of_features]

    if not feature_result_dict:
        # Nothing was evaluated: fall back to the unreduced candidate list.
        return features

    # max() keeps the first maximal entry and has no floor at zero, so
    # all-negative R^2 scores still yield a result.
    best_features = max(feature_result_dict, key=feature_result_dict.get)
    return list(best_features)


In [None]:

def select_linear_features(x_train_glm, y_train, max_feature_corr):
    """Greedily pick features that correlate with the target but not with each other.

    Candidate columns (those not listed in ``cols_to_drop``) are ranked by
    the absolute Pearson correlation with ``y_train``. Walking down that
    ranking, a column is kept unless its absolute correlation with any
    already-kept column exceeds ``max_feature_corr``.

    Parameters
    ----------
    x_train_glm : DataFrame of numeric candidate features.
    y_train : target values aligned with the rows of ``x_train_glm``.
    max_feature_corr : largest absolute feature-to-feature correlation tolerated.

    Returns
    -------
    list
        Selected column names ordered from strongest to weakest correlation
        with the target; empty when there are no candidate columns.
    """
    def _abs_r(a, b):
        # Only the correlation coefficient of the regression is needed.
        _, _, r_value, _, _ = stats.linregress(a, b)
        return abs(r_value)

    candidates = [col for col in x_train_glm.columns if col not in cols_to_drop]
    if not candidates:
        return []

    ranked = sorted(
        ((col, _abs_r(x_train_glm[col], y_train)) for col in candidates),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # A list (not a set) keeps the output order deterministic across runs.
    selected = []
    for col, _ in ranked:
        too_correlated = any(
            _abs_r(x_train_glm[col], x_train_glm[kept]) > max_feature_corr
            for kept in selected
        )
        if not too_correlated:
            selected.append(col)
    return selected


In [None]:
# Load the feature table, then separate the predictors from the two targets:
# 'win' (used below for classification) and 'score_diff' (used for regression).
feature_df = load_general_feature_file()
x = feature_df.drop(columns=['win', 'score_diff', 'key'])
y1, y2 = feature_df['win'], feature_df['score_diff']

In [None]:
# LightGBM settings for the binary 'win' classifier.
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    "learning_rate": 0.01,
    "max_depth": -1,
    'num_leaves': 127
}

# Same settings with a 10x larger learning rate, used for the many fits
# performed during feature elimination.
lgbm_params_fast = copy.deepcopy(lgbm_params)
lgbm_params_fast['learning_rate'] = .1


model = Model('lightgbm', lgbm_params)
# Fixed seed so the held-out test rows (and the reported score) are the same
# on every Restart & Run All.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y1, random_state=42)


In [None]:
# Run feature elimination for the classifier with the fast parameter set,
# then show the training shape next to the number of features kept.
classification_columns = gbm_rfe(
    x_train,
    y_train,
    lgbm_params_fast,
    step_size=.01,
    min_features=4,
    score_type='accuracy',
)
x_train.shape, len(classification_columns)


In [None]:
# Train on the selected features and report hold-out accuracy; the raw model
# outputs are rounded to the nearest integer to obtain class labels.
model.fit(x_train[classification_columns], y_train)
preds = np.rint(model.predict(x_test[classification_columns])).astype(int)
metrics.accuracy_score(y_test, preds)

In [None]:
# Show up to 100 rows of the gain-based feature importances of the model
# fitted above (evaluate() returns them sorted from most to least important).
model.evaluate().head(100)

In [None]:
# LightGBM settings for the 'score_diff' regressor (L2 objective, MAE as the
# reported metric). These names deliberately replace the classification
# settings defined earlier in the notebook.
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'l2',
    'metric': 'mae',
    "learning_rate": 0.01,
    "max_depth": -1,
    'num_leaves': 127,
}

# Same settings with a 10x larger learning rate, used for the many fits
# performed during feature elimination.
lgbm_params_fast = copy.deepcopy(lgbm_params)
lgbm_params_fast['learning_rate'] = .1

model = Model('lightgbm', lgbm_params)
# Fixed seed so the held-out test rows (and the reported score) are the same
# on every Restart & Run All.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y2, random_state=42)


In [None]:
# Run feature elimination for the regressor with the fast parameter set,
# then show the training shape next to the number of features kept.
reg_columns = gbm_rfe(
    x_train,
    y_train,
    lgbm_params_fast,
    step_size=.01,
    min_features=4,
    score_type='r2_score',
)
x_train.shape, len(reg_columns)

In [None]:
# Train on the features chosen by the elimination step (previously the
# selection was computed but the model was fitted on every column, unlike
# the classification cell) and report hold-out R^2.
model.fit(x_train[reg_columns], y_train)
preds = model.predict(x_test[reg_columns])
# Predictions are rounded to whole numbers before scoring, matching how
# gbm_rfe scored the candidate feature sets.
preds = np.rint(preds).astype(int)
metrics.r2_score(y_test, preds)

In [None]:
# Show up to 100 rows of the gain-based feature importances of the model
# fitted above (evaluate() returns them sorted from most to least important).
model.evaluate().head(100)