In [1]:
from hyperopt import fmin, STATUS_OK, Trials, hp, tpe, rand
import lightgbm
import pandas as pd 
import numpy as np
from pathlib import Path
import os 
import sys 

home_dir = '/Users/Liu'
scripts_dir = os.path.join(home_dir, 'NBA_Pro_Line_Analytics/GBM_model_build')
sys.path.insert(1, scripts_dir)

import model_build_scripts
from model_build_scripts import helpers
from model_build_scripts.NBA_model_build import NBA_Model_Build

In [2]:
#Defines the model scenario in ./configs/scenarios_config.yml
model_name = "GBM_three_way_outcome_model_build_all_features"

In [3]:
home_dir = Path(home_dir)
scenario_dir = home_dir / 'NBA_Pro_Line_Analytics' / 'GBM_model_build' / 'scenarios_config.yml'
feature_dir = home_dir / 'NBA_Pro_Line_Analytics' / 'GBM_model_build' / 'features_config.yml'
model_scenario = NBA_Model_Build(model_name, 
                                 feature_yaml = feature_dir, 
                                 scenario_yaml = scenario_dir)

  config = yaml.load(stream)


In [4]:
model_scenario.load_data()

In [5]:
initial_params = {'objective': 'multiclass',
                  'metric': 'multi_logloss',
                  'num_boost_round': 1000,
                  'early_stopping_rounds': 20,
                  'num_classes': 3,
                  'max_depth': 15,
                  'seed': model_scenario.seed
                 }

In [6]:
#Runs Recursive Feature Elimination to Remove all features of 0 Importance
post_rfe_model, var_importance  = model_scenario.run_rfe(model_params= initial_params, 
                                                         target = model_scenario.target, X_vars= model_scenario.X_vars)



288 features below threshold
The following features will be removed:
['Num_Players_3plus_TO_L5G_HT', 'Num_Players_2plus_BL_L7G_RT', 'HT_Win_LG', 'HT_cnt_within_5_L3G', 'Num_Players_1plus_BL_L9G_RT', 'Num_Players_15plus_TOT_L4G_RT', 'Num_Players_25plus_PPG_LG_HT', 'Num_Players_5plus_APG_L3G_RT', 'Num_Players_3plus_ST_L10G_HT', 'Num_Players_5plus_TOT_L6G_RT', 'Num_Players_5plus_TO_L5G_HT', 'Num_Players_5plus_APG_L8G_HT', 'Num_Players_15plus_TOT_L10G_RT', 'Num_Players_3plus_BL_L6G_RT', 'Num_Players_2plus_BL_L8G_HT', 'Num_Players_10plus_APG_L9G_RT', 'Num_Players_10plus_APG_L8G_RT', 'Num_Players_5plus_TO_L7G_RT', 'Num_Players_25plus_PPG_L2G_RT', 'Num_Players_30plus_PPG_L5G_HT', 'Num_Players_15plus_TOT_L8G_RT', 'Num_Players_20plus_PPG_L5G_HT', 'Num_Players_3plus_ST_L3G_RT', 'HT_num_11_plus_pts_wins_L4G', 'Num_Players_15plus_APG_L7G_HT', 'Num_Players_30plus_PPG_L5G_RT', 'Num_Players_3plus_ST_L8G_RT', 'Num_Players_5plus_APG_L6G_RT', 'HT_Num_Wins_L8G', 'RT_cnt_within_5_L8G', 'RT_Num_Losses_L10G

In [7]:
initial_params['bagging_freq'] = 1

In [8]:
#Runs Hyperparameter tuning (Bayesian Optimization) - current scenario set to complete 200 rounds 
best_params, hyperparam_obj = model_scenario.run_hyperopt(model_scenario.hyperopt_param_space, 
                                                          list(post_rfe_model.booster_.feature_name()), 
                                                          model_params = initial_params, 
                                                          fmin_max_evals = model_scenario.fmin_max_evals)

100%|██████████| 200/200 [1:04:48<00:00, 19.44s/trial, best loss: -0.42337959448864404]


In [9]:
#Runs Hyperparameter tuning (Bayesian Optimization) - current scenario set to complete 200 rounds 
best_params

{'colsample_bytree': 0.7956792341863412,
 'learning_rate': 0.024056645123924907,
 'min_data_in_leaf': 90.0,
 'num_leaves': 37.0,
 'reg_alpha': 0.12682563184942588,
 'reg_lambda': 0.4946371233393574,
 'subsample': 0.65}

In [21]:
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['min_data_in_leaf'] = int(best_params['min_data_in_leaf'])

In [22]:
static_params =  {'objective': 'multiclass',
                  'metric': 'multi_logloss',
                  'num_boost_round': 1000,
                  'early_stopping_rounds': 20,
                  'num_classes': 3,
                  'max_depth': 15,
                  'seed': model_scenario.seed
                    }

In [26]:
final_model = lightgbm.LGBMModel(**best_params, **static_params, importance_type = 'gain')

In [27]:
eval_set = [(model_scenario.df_tune[list(post_rfe_model.booster_.feature_name())], 
             model_scenario.df_tune[model_scenario.target])]

In [29]:
final_model.fit(X = model_scenario.df_train[list(post_rfe_model.booster_.feature_name())],
                y = model_scenario.df_train[model_scenario.target],
                eval_set = eval_set,
                verbose = False)

LGBMModel(colsample_bytree=0.7956792341863412, early_stopping_rounds=20,
          importance_type='gain', learning_rate=0.024056645123924907,
          max_depth=15, metric='multi_logloss', min_data_in_leaf=90,
          num_boost_round=1000, num_classes=3, num_leaves=37,
          objective='multiclass', reg_alpha=0.12682563184942588,
          reg_lambda=0.4946371233393574, seed=2454371, subsample=0.65,
          verbose=-1)

In [34]:
model_predictions = final_model.predict(model_scenario.df_validate[post_rfe_model.booster_.feature_name()])

In [35]:
y_pred =  [np.argmax(i) for i in model_predictions ]

In [61]:
d = {
    'TEAM_HT' : model_scenario.df_validate['TEAM_HT'],
    'Final_Score_HT' : model_scenario.df_validate['Final_Score_HT'],
    'TEAM_RT' : model_scenario.df_validate['TEAM_RT'],
    'Final_Score_RT': model_scenario.df_validate['Final_Score_RT'],
    'outcome': model_scenario.df_validate['outcome'],
    'model_prediction_0': model_predictions[:, 0],
    'model_prediction_1': model_predictions[:, 1],
    'model_prediction_2': model_predictions[:, 2],
    'model_prediction': y_pred,
}

In [62]:
model_prediction_df = pd.DataFrame(data = d)

In [70]:
(model_prediction_df.outcome == model_prediction_df.model_prediction).value_counts()

False    659
True     653
dtype: int64

In [77]:
model_prediction_65 = model_prediction_df[(model_prediction_df.model_prediction_0 > 0.65) |
                                          (model_prediction_df.model_prediction_1 > 0.65) |
                                          (model_prediction_df.model_prediction_2 > 0.65) ]
model_prediction_60 = model_prediction_df[(model_prediction_df.model_prediction_0 > 0.60) |
                                          (model_prediction_df.model_prediction_1 > 0.60) |
                                          (model_prediction_df.model_prediction_2 > 0.60) ]

In [78]:
(model_prediction_60.outcome == model_prediction_60.model_prediction).value_counts()

True     152
False     79
dtype: int64

In [93]:
model_prediction_60

Unnamed: 0,TEAM_HT,Final_Score_HT,TEAM_RT,Final_Score_RT,outcome,model_prediction_0,model_prediction_1,model_prediction_2,model_prediction
11718,Indiana,94,Houston,98,1,0.159817,0.232409,0.607775,2
11724,Golden State,117,Memphis,101,2,0.125191,0.159691,0.715118,2
11737,Utah,117,Dallas,102,2,0.168482,0.224843,0.606675,2
11739,LA Lakers,114,Minnesota,110,1,0.202192,0.191512,0.606297,2
11743,Golden State,111,Milwaukee,134,0,0.208388,0.188782,0.602829,2
...,...,...,...,...,...,...,...,...,...
12840,Toronto,115,Orlando,96,2,0.096495,0.189980,0.713525,2
12844,Golden State,121,LA Clippers,129,0,0.137720,0.197303,0.664977,2
12845,Houston,100,Utah,93,2,0.118844,0.240399,0.640757,2
12849,Toronto,108,Philadelphia,95,2,0.156479,0.235291,0.608230,2


In [92]:
final_model._Booster.feature_name()

['Num_Players_1plus_BL_L2G_HT',
 'RT_cnt_wins_6_plus_L8G',
 'HT_Num_Wins_L6G',
 'FT_PCT_L9G_HT',
 'RT_AVG_PTdiff_L7G',
 'RT_AVG_Win_PTdiff_L10G',
 'RT_AVG_Loss_PTdiff_L4G',
 'Num_Players_5plus_TOT_L3G_RT',
 'Num_Players_15plus_PPG_L9G_HT',
 'PPG_AVG_L10G_RT',
 'FG_PCT_L6G_RT',
 'Num_Players_20plus_PPG_L10G_RT',
 'BL_AVG_L8G_HT',
 'Num_Players_2plus_BL_L3G_HT',
 'PACE_L7G_HT',
 '3PT_PCT_LG_RT',
 'Num_Players_1plus_BL_LG_RT',
 'POSS_L7G_RT',
 'Num_Players_15plus_PPG_L3G_HT',
 'RT_num_11_plus_pts_loss_L6G',
 'Num_Players_1plus_BL_L5G_HT',
 '3PT_PCT_L10G_RT',
 'TOT_AVG_L2G_RT',
 'APG_AVG_L7G_HT',
 'PPG_AVG_L9G_HT',
 'OEFF_LG_RT',
 'DEFF_LG_RT',
 'Num_Players_10plus_TOT_LG_RT',
 'BL_AVG_L6G_RT',
 'OEFF_L2G_HT',
 'Num_Players_2plus_ST_L2G_HT',
 'Num_Players_2plus_ST_L2G_RT',
 'POSS_L2G_RT',
 'POSS_L4G_HT',
 'HT_num_11_plus_pts_wins_L8G',
 'Num_Players_1plus_TO_L9G_RT',
 'Num_Players_30plus_PPG_L4G_HT',
 'Num_Players_15plus_PPG_L5G_HT',
 'ST_AVG_L5G_RT',
 'Num_Players_10plus_TOT_L5G_RT',
 '3P