In [1]:
from hyperopt import fmin, STATUS_OK, Trials, hp, tpe, rand
import lightgbm
import pandas as pd 
import numpy as np
from pathlib import Path
import os 
import sys 

# Root directory that contains the NBA_Pro_Line_Analytics checkout.
# The optional NBA_HOME_DIR environment variable overrides the original
# hard-coded location so the notebook can run on another machine; when it is
# unset the value is exactly what it was before ('/Users/Liu').
home_dir = os.environ.get('NBA_HOME_DIR', '/Users/Liu')
# Presumably the directory that holds the local model_build_scripts package
# imported right after this block.
scripts_dir = os.path.join(home_dir, 'NBA_Pro_Line_Analytics/GBM_model_build')
# Insert at position 1 (not 0) so the first sys.path entry keeps priority while
# the project scripts are still searched ahead of the rest of the path.
sys.path.insert(1, scripts_dir)

import model_build_scripts
from model_build_scripts import helpers
from model_build_scripts.NBA_model_build import NBA_Model_Build

In [2]:
# Name of the model scenario to build; it is handed to NBA_Model_Build together
# with the scenarios_config.yml file.
# NOTE(review): this comment used to point at ./configs/scenarios_config.yml,
# but the notebook loads GBM_model_build/scenarios_config.yml - confirm which
# location is current.
model_name = "GBM_over_under_outcome_model_build_all_features"

In [3]:
# Promote home_dir to a Path so the config locations can be built with `/`.
home_dir = Path(home_dir)
gbm_build_dir = home_dir / 'NBA_Pro_Line_Analytics' / 'GBM_model_build'

# Despite the *_dir names, both of these point at YAML files, not directories.
scenario_dir = gbm_build_dir / 'scenarios_config.yml'
feature_dir = gbm_build_dir / 'features_config.yml'

# Build the scenario object that drives every later step of the notebook.
model_scenario = NBA_Model_Build(
    model_name,
    feature_yaml=feature_dir,
    scenario_yaml=scenario_dir,
)

  config = yaml.load(stream)


In [4]:
# Load the scenario's data onto model_scenario. Later cells read
# model_scenario.df_train, df_tune and df_validate.
# NOTE(review): presumably those splits are created here - confirm in NBA_Model_Build.
model_scenario.load_data()

In [5]:
# Baseline LightGBM settings shared by the feature-elimination and
# hyperparameter-search steps (both receive this dict as model_params).
# AUC is the evaluation metric at this stage; the seed comes from the scenario.
initial_params = dict(
    objective='binary',
    metric='auc',
    num_boost_round=1000,
    early_stopping_rounds=20,
    max_depth=15,
    seed=model_scenario.seed,
    bagging_freq=1,
)

In [6]:
# Recursive Feature Elimination: drops every feature with 0 importance.
# The call prints how many features fall below the threshold and lists them.
post_rfe_model, var_importance = model_scenario.run_rfe(
    model_params=initial_params,
    target=model_scenario.target,
    X_vars=model_scenario.X_vars,
)



618 features below threshold
The following features will be removed:
['Num_Players_3plus_BL_L7G_HT', 'RT_cnt_wins_6_plus_L5G', 'Num_Players_2plus_BL_L8G_RT', 'POSS_L5G_HT', 'FG_PCT_L9G_HT', 'POSS_L5G_RT', 'Num_Players_2plus_ST_L2G_RT', 'Num_Players_20plus_PPG_L9G_RT', 'Num_Players_3plus_BL_L7G_RT', 'Num_Players_5plus_APG_L4G_HT', 'Num_Players_2plus_ST_LG_HT', 'RT_cnt_within_5_L9G', 'RT_cnt_loss_6_plus_L4G', 'HT_cnt_within_5_LG', '3PT_PCT_L9G_HT', 'POSS_L10G_HT', 'DEFF_LG_RT', 'Num_Players_3plus_ST_L3G_RT', 'HT_AVG_Win_PTdiff_L8G', 'Num_Players_15plus_TOT_L7G_RT', 'Num_Players_5plus_APG_L8G_HT', 'Num_Players_20plus_PPG_L5G_HT', 'Num_Players_20plus_PPG_L4G_RT', 'DEFF_L7G_HT', 'Num_Players_3plus_ST_L5G_HT', 'RT_cnt_wins_6_plus_L2G', 'Num_Players_2plus_BL_L6G_HT', 'Num_Players_5plus_APG_L6G_RT', 'RT_AVG_PTdiff_L10G', 'Num_Players_3plus_TO_L9G_HT', 'RT_cnt_within_5_L4G', 'HT_AVG_Loss_PTdiff_L10G', 'Num_Players_10plus_APG_L2G_HT', 'Num_Players_2plus_BL_LG_HT', 'APG_AVG_L8G_HT', 'RT_num_11_pl

In [7]:
# Sets bagging_freq to 1 ahead of the hyperparameter search.
# NOTE(review): initial_params is already created with 'bagging_freq': 1, so
# this assignment is redundant unless run_rfe mutates the dict - confirm and
# drop this cell if it does not.
initial_params['bagging_freq'] = 1

In [8]:
# Hyperparameter tuning (Bayesian Optimization) - the current scenario is set
# to complete 200 rounds (model_scenario.fmin_max_evals).
# NOTE(review): this passes model_scenario.X_vars, while the final fit uses
# post_rfe_model's feature names - confirm run_rfe updates X_vars, otherwise
# tuning runs on the un-pruned feature set.
# NOTE(review): the recorded run reports "best loss: -0.5"; verify the
# objective is scoring as intended.
best_params, hyperparam_obj = model_scenario.run_hyperopt(
    model_scenario.hyperopt_param_space,
    model_scenario.X_vars,
    model_params=initial_params,
    fmin_max_evals=model_scenario.fmin_max_evals,
)

100%|██████████| 200/200 [11:07<00:00,  3.34s/trial, best loss: -0.5]


In [9]:
# Display the best hyperparameters returned by the search above
# (no tuning is run in this cell).
best_params

{'colsample_bytree': 0.6489325172213725,
 'learning_rate': 0.03265285941393413,
 'min_data_in_leaf': 100.0,
 'num_leaves': 49.0,
 'reg_alpha': 0.17948888280944775,
 'reg_lambda': 0.11267775892310361,
 'subsample': 0.9}

In [10]:
# The search returned these two settings as floats (49.0 and 100.0 in the
# displayed output); cast them to int before they are passed to LightGBM.
for int_param in ('num_leaves', 'min_data_in_leaf'):
    best_params[int_param] = int(best_params[int_param])

In [11]:
# Fixed (non-tuned) settings for the final model. Unlike initial_params, the
# metric here is binary_logloss and early stopping waits 25 rounds.
final_seed = model_scenario.seed
static_params = dict(
    objective='binary',
    metric='binary_logloss',
    num_boost_round=1000,
    early_stopping_rounds=25,
    max_depth=15,
    seed=final_seed,
)

In [12]:
# Final estimator: tuned hyperparameters plus the fixed settings, with feature
# importance reported as gain. A key present in both dicts raises TypeError.
final_model = lightgbm.LGBMModel(
    importance_type='gain',
    **best_params,
    **static_params,
)

In [13]:
# Evaluation data for the final fit: the tuning split restricted to the
# features kept by the post-RFE model, paired with the target column.
tune_features = list(post_rfe_model.booster_.feature_name())
tune_frame = model_scenario.df_tune
eval_set = [(tune_frame[tune_features], tune_frame[model_scenario.target])]

In [14]:
# Fit the final model on the training split using only the post-RFE features;
# eval_set (built in an earlier cell) is supplied for evaluation.
train_features = list(post_rfe_model.booster_.feature_name())
train_frame = model_scenario.df_train
final_model.fit(
    X=train_frame[train_features],
    y=train_frame[model_scenario.target],
    eval_set=eval_set,
    verbose=False,
)

LGBMModel(colsample_bytree=0.6489325172213725, early_stopping_rounds=25,
          importance_type='gain', learning_rate=0.03265285941393413,
          max_depth=15, metric='binary_logloss', min_data_in_leaf=100,
          num_boost_round=1000, num_leaves=49, objective='binary',
          reg_alpha=0.17948888280944775, reg_lambda=0.11267775892310361,
          seed=2454371, subsample=0.9)

In [17]:
# Score the validation split with the final model, using the same post-RFE
# feature columns the model was trained on.
validate_features = post_rfe_model.booster_.feature_name()
model_predictions = final_model.predict(model_scenario.df_validate[validate_features])

In [26]:
# Inspect the 'CLOSING TOTAL_HT' column of the validation split; a later cell
# exposes it as 'over_under_line' in the results table.
model_scenario.df_validate['CLOSING TOTAL_HT']

11578    211.5
11579    220.5
11580    222.0
11581    213.0
11582    204.5
         ...  
12885    213.5
12886    209.5
12887    215.0
12888    217.0
12889    211.5
Name: CLOSING TOTAL_HT, Length: 1312, dtype: float64

In [46]:
# Columns for the results table: output label -> validation-split column.
validate_frame = model_scenario.df_validate
result_columns = {
    'TEAM_HT': 'TEAM_HT',
    'Final_Score_HT': 'Final_Score_HT',
    'TEAM_RT': 'TEAM_RT',
    'Final_Score_RT': 'Final_Score_RT',
    'over_under_line': 'CLOSING TOTAL_HT',
    'outcome': model_scenario.target,
}
d = {label: validate_frame[column] for label, column in result_columns.items()}
# The model's scores go last, after the columns copied from the validation split.
d['model_predictions'] = model_predictions

In [47]:
# Assemble the per-game results table from the columns collected in `d`.
df_results = pd.DataFrame(d)

In [62]:
# Turn the model score into a 0/1 call: 1 when the score is strictly above
# 0.49, otherwise 0.
# NOTE(review): the 0.49 cutoff is a magic number with no stated rationale -
# confirm it is intentional rather than 0.5.
above_cutoff = df_results['model_predictions'] > 0.49
df_results['model_predict'] = above_cutoff.astype('int64')