# GitHub Link

# [Milestone 2 Branch](https://github.com/theAayushbajaj/NHL-Milestone-Project/tree/milestone-2-dev/src)

Resources 

- [1-educative](https://www.educative.io/answers/classification-using-xgboost-in-python) 

- [2-datacamp](https://www.datacamp.com/tutorial/xgboost-in-python)

- [3-mljar](https://mljar.com/blog/xgboost-save-load-python/)

In [119]:
from comet_ml import Experiment
import pickle 
import os 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
# scikit-learn modules and classes
from sklearn.metrics import confusion_matrix, f1_score, \
precision_score, recall_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
# encoding 
from sklearn.preprocessing import OneHotEncoder 
from category_encoders import TargetEncoder
import xgboost as xgb
import utils # functions
import warnings 

In [2]:
warnings.filterwarnings("ignore")
# warnings.filterwarnings("default")

In [3]:
# Seed reused wherever a random state is required, so runs are repeatable.
seed = 200
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [4]:
# Load the baseline-model dataset; the path is relative to the notebook's
# working directory.
df_base = pd.read_csv('baseline_model_data.csv')

In [10]:
# Normalise the column names. utils.renamer builds the rename mapping
# (presumably it replaces whitespace in the names -- TODO confirm in utils),
# which is then applied to df_base in place.
cols_rename_base = utils.renamer(df_base) 
df_base.rename(columns=cols_rename_base, inplace=True)
# Show the resulting column names as a sanity check.
print(df_base.columns)

Index(['season', 'shot_distance', 'shot_angle', 'empty_net', 'is_goal'], dtype='object')


In [13]:
# Hold out the 2020 season as the test set; seasons before 2020 form the
# training pool.
df_base_test = df_base[df_base.season == 2020]
df_base_train = df_base[df_base.season < 2020]

# Feature columns for question 1, and the label column.
cols = ['shot_distance', 'shot_angle']
target_col = 'is_goal'

# features
df_base_train_X = df_base_train.loc[:, cols]
df_base_test_X = df_base_test.loc[:, cols]
# targets -- selected by name rather than by position, so that a change in
# column order cannot silently turn a different column into the label
df_base_train_y = df_base_train.loc[:, target_col]
df_base_test_y = df_base_test.loc[:, target_col]

In [14]:
# 80/20 split of the pre-2020 data. Stratifying on the label keeps the class
# proportions similar in both parts. The *_test part produced here is what
# the model is validated on in the following cells.
X_train, X_test, y_train, y_test = train_test_split(
    df_base_train_X,
    df_base_train_y,
    test_size=0.2,
    stratify=df_base_train_y,
    random_state=seed,
)

In [87]:
# Wrap each split (features + labels) in XGBoost's DMatrix container.
# `test` holds the validation part produced by train_test_split.
train = xgb.DMatrix(data=X_train, label=y_train)
test = xgb.DMatrix(data=X_test, label=y_test)

In [91]:
# Booster hyper-parameters: binary classification with a logistic output.
params = {
    "objective": "binary:logistic",
    "tree_method": "hist",  # alternative noted by the author: "gpu_hist"
    "max_depth": 10,
    "learning_rate": 0.01,
    "seed": seed,
}

In [92]:
# Train for at most 100 boosting rounds, reporting the metric on both the
# training and the validation matrix each round. Training stops early when
# the validation metric has not improved for 20 consecutive rounds.
model = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=100,
    evals=[(train, "train"), (test, "validation")],
    early_stopping_rounds=20,
)

[0]	train-logloss:0.33352	validation-logloss:0.33355
[1]	train-logloss:0.33282	validation-logloss:0.33287
[2]	train-logloss:0.33212	validation-logloss:0.33220
[3]	train-logloss:0.33144	validation-logloss:0.33154
[4]	train-logloss:0.33076	validation-logloss:0.33089
[5]	train-logloss:0.33010	validation-logloss:0.33025
[6]	train-logloss:0.32944	validation-logloss:0.32961
[7]	train-logloss:0.32880	validation-logloss:0.32899
[8]	train-logloss:0.32816	validation-logloss:0.32838
[9]	train-logloss:0.32753	validation-logloss:0.32777
[10]	train-logloss:0.32692	validation-logloss:0.32718
[11]	train-logloss:0.32631	validation-logloss:0.32659
[12]	train-logloss:0.32571	validation-logloss:0.32601
[13]	train-logloss:0.32511	validation-logloss:0.32544
[14]	train-logloss:0.32453	validation-logloss:0.32488
[15]	train-logloss:0.32395	validation-logloss:0.32432
[16]	train-logloss:0.32339	validation-logloss:0.32378
[17]	train-logloss:0.32283	validation-logloss:0.32324
[18]	train-logloss:0.32228	validation-

In [102]:
# Report the best boosting round recorded during training. A value equal to
# num_boost_round - 1 means early stopping never triggered and the
# validation loss was still improving at the last round.
print('best_iteration', model.best_iteration)

best_iteration 99


In [115]:
# Predicted probability of the positive class for every row of the
# validation matrix, using the trees up to and including the best round.
y_prob = model.predict(test, iteration_range=(0, model.best_iteration + 1))
# Turn the probabilities just computed (y_prob) into hard 0/1 labels with a
# 0.5 decision threshold.
y_pred = (y_prob >= 0.5).astype(int)

In [116]:
y_prob

array([0.16531676, 0.11945973, 0.08808973, ..., 0.14233795, 0.15339181,
       0.19194219], dtype=float32)

In [117]:
# Confusion matrix: rows are the true classes, columns the predicted ones.
cfn_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cfn_mat)

[[53946     0]
 [ 5655     0]]


In [122]:
# Summary metrics for the thresholded predictions on the validation split.
# Precision, recall and F1 use scikit-learn's default binary averaging.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
psc = precision_score(y_true=y_test, y_pred=y_pred)
rsc = recall_score(y_true=y_test, y_pred=y_pred)
fsc = f1_score(y_true=y_test, y_pred=y_pred)

In [126]:
# Emit the four headline metrics, one per line.
print(
    f'accuracy {accuracy}',
    f'f1_score {fsc}',
    f'precision_score {psc}',
    f'recall_score {rsc}',
    sep='\n',
)

accuracy 0.9051190416268183
f1_score 0.0
precision_score 0.0
recall_score 0.0


In [127]:
# Per-class precision / recall / F1 table built by classification_report.
print(report)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95     53946
           1       0.00      0.00      0.00      5655

    accuracy                           0.91     59601
   macro avg       0.45      0.50      0.48     59601
weighted avg       0.82      0.91      0.86     59601

