In [1]:
from hyperopt import fmin, STATUS_OK, Trials, hp, tpe, rand
import lightgbm
import pandas as pd 
import numpy as np
from pathlib import Path
import os 
import sys 

home_dir = '/Users/Liu'
scripts_dir = os.path.join(home_dir, 'NBA_Pro_Line_Analytics/GBM_model_build')
sys.path.insert(1, scripts_dir)

import model_build_scripts
from model_build_scripts import helpers
from model_build_scripts.NBA_model_build import NBA_Model_Build

In [2]:
#Defines the model scenario in ./configs/scenarios_config.yml
model_name = "GBM_three_way_outcome_model_build_all_features"

In [3]:
home_dir = Path(home_dir)
scenario_dir = home_dir / 'NBA_Pro_Line_Analytics' / 'GBM_model_build' / 'scenarios_config.yml'
feature_dir = home_dir / 'NBA_Pro_Line_Analytics' / 'GBM_model_build' / 'features_config.yml'
model_scenario = NBA_Model_Build(model_name, 
                                 feature_yaml = feature_dir, 
                                 scenario_yaml = scenario_dir)

  config = yaml.load(stream)


In [4]:
model_scenario.load_data()

In [5]:
initial_params = {'objective': 'multiclass',
                  'metric': 'multi_logloss',
                  'num_boost_round': 1000,
                  'early_stopping_rounds': 20,
                  'num_classes': 3,
                  'max_depth': 15,
                  'seed': model_scenario.seed
                 }

In [6]:
#Runs Recursive Feature Elimination to Remove all features of 0 Importance
post_rfe_model, var_importance  = model_scenario.run_rfe(model_params= initial_params, 
                                                         target = model_scenario.target, X_vars= model_scenario.X_vars)



288 features below threshold
The following features will be removed:
['Num_Players_3plus_TO_L5G_HT', 'Num_Players_2plus_BL_L7G_RT', 'HT_Win_LG', 'HT_cnt_within_5_L3G', 'Num_Players_1plus_BL_L9G_RT', 'Num_Players_15plus_TOT_L4G_RT', 'Num_Players_25plus_PPG_LG_HT', 'Num_Players_5plus_APG_L3G_RT', 'Num_Players_3plus_ST_L10G_HT', 'Num_Players_5plus_TOT_L6G_RT', 'Num_Players_5plus_TO_L5G_HT', 'Num_Players_5plus_APG_L8G_HT', 'Num_Players_15plus_TOT_L10G_RT', 'Num_Players_3plus_BL_L6G_RT', 'Num_Players_2plus_BL_L8G_HT', 'Num_Players_10plus_APG_L9G_RT', 'Num_Players_10plus_APG_L8G_RT', 'Num_Players_5plus_TO_L7G_RT', 'Num_Players_25plus_PPG_L2G_RT', 'Num_Players_30plus_PPG_L5G_HT', 'Num_Players_15plus_TOT_L8G_RT', 'Num_Players_20plus_PPG_L5G_HT', 'Num_Players_3plus_ST_L3G_RT', 'HT_num_11_plus_pts_wins_L4G', 'Num_Players_15plus_APG_L7G_HT', 'Num_Players_30plus_PPG_L5G_RT', 'Num_Players_3plus_ST_L8G_RT', 'Num_Players_5plus_APG_L6G_RT', 'HT_Num_Wins_L8G', 'RT_cnt_within_5_L8G', 'RT_Num_Losses_L10G

In [7]:
initial_params['bagging_freq'] = 1

In [8]:
#Runs Hyperparameter tuning (Bayesian Optimization) - current scenario set to complete 200 rounds 
best_params, hyperparam_obj = model_scenario.run_hyperopt(model_scenario.hyperopt_param_space, 
                                                          list(post_rfe_model.booster_.feature_name()), 
                                                          model_params = initial_params, 
                                                          fmin_max_evals = model_scenario.fmin_max_evals)

100%|██████████| 200/200 [1:04:48<00:00, 19.44s/trial, best loss: -0.42337959448864404]


In [9]:
#Runs Hyperparameter tuning (Bayesian Optimization) - current scenario set to complete 200 rounds 
best_params

{'colsample_bytree': 0.7956792341863412,
 'learning_rate': 0.024056645123924907,
 'min_data_in_leaf': 90.0,
 'num_leaves': 37.0,
 'reg_alpha': 0.12682563184942588,
 'reg_lambda': 0.4946371233393574,
 'subsample': 0.65}

In [21]:
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['min_data_in_leaf'] = int(best_params['min_data_in_leaf'])

In [22]:
static_params =  {'objective': 'multiclass',
                  'metric': 'multi_logloss',
                  'num_boost_round': 1000,
                  'early_stopping_rounds': 20,
                  'num_classes': 3,
                  'max_depth': 15,
                  'seed': model_scenario.seed
                    }

In [26]:
final_model = lightgbm.LGBMModel(**best_params, **static_params, importance_type = 'gain')

In [27]:
eval_set = [(model_scenario.df_tune[list(post_rfe_model.booster_.feature_name())], 
             model_scenario.df_tune[model_scenario.target])]

In [28]:
final_model.fit(X = model_scenario.df_train[list(post_rfe_model.booster_.feature_name())],
                y = model_scenario.df_train[model_scenario.target],
                eval_set = eval_set)

[1]	valid_0's multi_logloss: 1.07574
Training until validation scores don't improve for 20 rounds
[2]	valid_0's multi_logloss: 1.07403
[3]	valid_0's multi_logloss: 1.07253
[4]	valid_0's multi_logloss: 1.0711
[5]	valid_0's multi_logloss: 1.0696
[6]	valid_0's multi_logloss: 1.06816
[7]	valid_0's multi_logloss: 1.06679
[8]	valid_0's multi_logloss: 1.06542
[9]	valid_0's multi_logloss: 1.06412
[10]	valid_0's multi_logloss: 1.06314
[11]	valid_0's multi_logloss: 1.06206
[12]	valid_0's multi_logloss: 1.06103
[13]	valid_0's multi_logloss: 1.05985
[14]	valid_0's multi_logloss: 1.05875
[15]	valid_0's multi_logloss: 1.05795
[16]	valid_0's multi_logloss: 1.05702
[17]	valid_0's multi_logloss: 1.05611
[18]	valid_0's multi_logloss: 1.05525
[19]	valid_0's multi_logloss: 1.05449
[20]	valid_0's multi_logloss: 1.05369
[21]	valid_0's multi_logloss: 1.05274
[22]	valid_0's multi_logloss: 1.05196
[23]	valid_0's multi_logloss: 1.05132
[24]	valid_0's multi_logloss: 1.05055
[25]	valid_0's multi_logloss: 1.0499
[

KeyboardInterrupt: 