In [1]:
from hyperopt import fmin, STATUS_OK, Trials, hp, tpe, rand
import lightgbm
import pandas as pd 
import numpy as np
from pathlib import Path
import os 
import sys 

# Root directory that contains the NBA_Pro_Line_Analytics checkout.
# The NBA_ANALYTICS_HOME environment variable can override it, so the notebook is
# not tied to one machine; when it is unset the original location is used.
home_dir = os.environ.get('NBA_ANALYTICS_HOME', '/Users/Liu')
scripts_dir = os.path.join(home_dir, 'NBA_Pro_Line_Analytics/GBM_model_build')

# Make the project package importable. The membership check keeps re-running this
# cell from stacking duplicate entries onto sys.path.
if scripts_dir not in sys.path:
    sys.path.insert(1, scripts_dir)

import model_build_scripts
from model_build_scripts import helpers
from model_build_scripts.NBA_model_build import NBA_Model_Build

In [2]:
# Name of the model scenario to build. It is passed to NBA_Model_Build together with
# scenarios_config.yml, where this scenario is presumably defined (confirm the YAML location).
model_name = "GBM_tot_score_prediction_model_build_all_features"

In [3]:
# Re-wrap the root as a Path so the config locations can be composed with "/".
home_dir = Path(home_dir)
gbm_build_dir = home_dir / 'NBA_Pro_Line_Analytics' / 'GBM_model_build'

# Despite the "_dir" suffix, both names below point at YAML files.
scenario_dir = gbm_build_dir / 'scenarios_config.yml'
feature_dir = gbm_build_dir / 'features_config.yml'

# Build the scenario object for the chosen model name from the two YAML files.
model_scenario = NBA_Model_Build(
    model_name,
    feature_yaml=feature_dir,
    scenario_yaml=scenario_dir,
)

  config = yaml.load(stream)


In [4]:
# Ask the scenario object to load its data. Later cells read df_train, df_tune and
# df_validate from model_scenario -- presumably populated by this call (confirm in NBA_Model_Build).
model_scenario.load_data()

In [5]:
# Baseline LightGBM settings handed to both the feature-elimination run and the
# hyperparameter search below.
initial_params = dict(
    objective='regression',
    num_boost_round=1000,
    early_stopping_rounds=20,
    max_depth=15,
    seed=model_scenario.seed,  # reuse the seed carried by the scenario object
    bagging_freq=1,
)

In [13]:
# Recursive feature elimination: per the original note, this drops every feature
# whose importance is zero. Returns the fitted post-RFE model and the importances.
post_rfe_model, var_importance = model_scenario.run_rfe(
    model_params=initial_params,
    target=model_scenario.target,
    X_vars=model_scenario.X_vars,
    model_type='regressor',
)

524 features below threshold
The following features will be removed:
['HT_Num_Wins_L4G', 'Num_Players_1plus_ST_L7G_RT', 'HT_AVG_Win_PTdiff_L10G', 'Num_Players_1plus_BL_L6G_RT', 'RT_cnt_within_5_L10G', 'Num_Players_3plus_ST_L2G_HT', 'Num_Players_3plus_ST_L10G_RT', 'Num_Players_1plus_ST_L10G_RT', 'Num_Players_3plus_ST_L3G_HT', 'HT_cnt_loss_6_plus_L5G', 'HT_num_11_plus_pts_wins_L4G', 'RT_cnt_loss_6_plus_L6G', 'Num_Players_1plus_BL_L2G_RT', 'Num_Players_15plus_TOT_L4G_HT', 'PPG_AVG_L6G_RT', 'Num_Players_3plus_TO_L5G_RT', 'Num_Players_5plus_TO_L6G_RT', 'HT_Num_Losses_L8G', 'BL_LG_RT', 'Num_Players_1plus_TO_L4G_HT', 'Num_Players_2plus_BL_L2G_RT', 'RT_num_11_plus_pts_loss_L9G', 'HT_AVG_PTdiff_L5G', 'Num_Players_3plus_ST_L6G_HT', 'Num_Players_25plus_PPG_L3G_RT', 'TS_PCT_L2G_RT', 'Num_Players_5plus_TO_L9G_RT', 'Num_Players_15plus_APG_LG_HT', 'HT_cnt_loss_6_plus_L7G', 'Num_Players_3plus_BL_L10G_HT', 'APG_AVG_L3G_HT', 'HT_Num_Wins_L8G', 'Num_Players_10plus_APG_L8G_HT', 'BL_AVG_L3G_HT', 'HT_AVG_Wi

In [14]:
# Re-applies bagging_freq = 1, the same value initial_params was created with; this is
# a no-op unless the dict was changed in between (e.g. by run_rfe -- TODO confirm).
initial_params['bagging_freq'] = 1

In [15]:
# Hyperparameter tuning (Bayesian optimization). The number of evaluations comes from
# model_scenario.fmin_max_evals -- 200 rounds for the current scenario, per the original note.
best_params, hyperparam_obj = model_scenario.run_hyperopt(
    model_scenario.hyperopt_param_space,
    model_scenario.X_vars,
    model_params=initial_params,
    fmin_max_evals=model_scenario.fmin_max_evals,
    model_type='regressor',
)

100%|██████████| 200/200 [45:55<00:00, 13.78s/trial, best loss: -0.0]  


In [17]:
# Display the best hyperparameters returned by the tuning run (no tuning happens in this cell).
best_params

{'colsample_bytree': 0.6489325172213725,
 'learning_rate': 0.03265285941393413,
 'min_data_in_leaf': 100.0,
 'num_leaves': 49.0,
 'reg_alpha': 0.17948888280944775,
 'reg_lambda': 0.11267775892310361,
 'subsample': 0.9}

In [18]:
# The search returned these two settings as floats (49.0 and 100.0 in the displayed
# result); convert them to integers before they are passed to the final model.
best_params.update(
    {name: int(best_params[name]) for name in ('num_leaves', 'min_data_in_leaf')}
)

In [19]:
# Fixed (non-tuned) settings for the final model. Same keys as initial_params, but
# early stopping is set to 25 rounds here instead of 20.
static_params = dict(
    objective='regression',
    num_boost_round=1000,
    early_stopping_rounds=25,
    max_depth=15,
    seed=model_scenario.seed,  # reuse the seed carried by the scenario object
    bagging_freq=1,
)

In [20]:
# Final regressor: tuned values plus the static settings, with feature importance
# reported as gain.
final_model = lightgbm.LGBMRegressor(
    **best_params,
    **static_params,
    importance_type='gain',
)

In [21]:
# Evaluation pair passed to fit(): the tuning split restricted to the feature names
# carried by the post-RFE model's booster, plus the tuning target column.
tune_features = list(post_rfe_model.booster_.feature_name())
eval_set = [
    (
        model_scenario.df_tune[tune_features],
        model_scenario.df_tune[model_scenario.target],
    )
]

In [23]:
# Fit the final model on the training split, restricted to the post-RFE feature names,
# with the tuning split as eval_set. The fit call stays last so the cell still
# displays the fitted estimator.
train_features = list(post_rfe_model.booster_.feature_name())
X_train = model_scenario.df_train[train_features]
y_train = model_scenario.df_train[model_scenario.target]
final_model.fit(X=X_train, y=y_train, eval_set=eval_set, verbose=False)

LGBMRegressor(bagging_freq=1, colsample_bytree=0.6489325172213725,
              early_stopping_rounds=25, importance_type='gain',
              learning_rate=0.03265285941393413, max_depth=15,
              min_data_in_leaf=100, num_boost_round=1000, num_leaves=49,
              objective='regression', reg_alpha=0.17948888280944775,
              reg_lambda=0.11267775892310361, seed=2454371, subsample=0.9)

In [24]:
# Score the validation split using the same feature names the post-RFE model carries.
validation_features = post_rfe_model.booster_.feature_name()
model_predictions = final_model.predict(model_scenario.df_validate[validation_features])

In [138]:
# Column mapping for the results frame: team names, final scores and the closing
# spread from the validation split, next to the actual outcome and the model output.
# The outcome is looked up through model_scenario.target (the configured target column
# name), exactly as the training and tuning cells do. The previous quoted string
# 'model_scenario.target' asked for a column literally named that.
d = {
    'TEAM_HT': model_scenario.df_validate['TEAM_HT'],
    'Final_Score_HT': model_scenario.df_validate['Final_Score_HT'],
    'TEAM_RT': model_scenario.df_validate['TEAM_RT'],
    'Final_Score_RT': model_scenario.df_validate['Final_Score_RT'],
    'outcome': model_scenario.df_validate[model_scenario.target],
    'ht_closing_spread': model_scenario.df_validate['CLOSING SPREAD_HT'],
    'model_prediction': model_predictions
}

In [139]:
# Turn the column mapping d into a DataFrame.
model_prediction_df = pd.DataFrame(d)

In [142]:
# Cut-off applied to the continuous model output to obtain a 0/1 call.
PREDICTION_THRESHOLD = 0.49

# 1 when the prediction is strictly above the cut-off, otherwise 0. The vectorised
# comparison replaces the element-wise apply(lambda ...) and gives the same values.
model_prediction_df['model_predict'] = (
    model_prediction_df['model_prediction'] > PREDICTION_THRESHOLD
).astype('int64')

In [143]:
# Tally how many rows have the thresholded call equal to the recorded outcome
# (True = match, False = mismatch).
prediction_is_correct = model_prediction_df['model_predict'] == model_prediction_df['outcome']
prediction_is_correct.value_counts()

True     660
False    652
dtype: int64