In [None]:
import os
import numpy as np
import pandas as pd
from plt_rcs import *
import hds
# Default every figure to a 4 x 4 size through the rc settings.
# NOTE(review): `plt` is never imported by name in this cell, so it presumably
# arrives via `from plt_rcs import *` — confirm it is matplotlib.pyplot.
plt.rc(group = 'figure', figsize = (4, 4))

In [None]:
os.getcwd()

In [None]:
os.chdir('../../data')

In [None]:
sorted(os.listdir())

In [None]:
# Load the pickled WhiteWine bundle from the working directory.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
objs = pd.read_pickle('WhiteWine.pkl')

In [None]:
# Inject the contents of `objs` into the notebook's global namespace.
# NOTE(review): presumably `objs` is a dict keyed by variable name and this is
# where X_train, X_valid, y_train and y_valid come from — they are not defined
# anywhere else in the visible cells. Confirm against the pickle's contents.
globals().update(objs)

In [None]:
# Short aliases used by every later cell.
X_tr, X_vl, y_tr, y_vl = X_train, X_valid, y_train, y_valid

In [None]:
from xgboost import XGBClassifier

In [None]:
# Baseline classifier: at most 1000 boosting rounds, with early stopping
# configured at 50 rounds.
model = XGBClassifier(n_estimators = 1000, early_stopping_rounds = 50)

In [None]:
# Fit on the training split; the validation split is supplied as the eval_set
# used for early stopping, with per-round logging silenced.
# NOTE(review): the same validation split is later used to report scores and
# curves, so those figures are likely optimistic — consider a separate test set.
model.fit(X = X_tr, y = y_tr, eval_set = [(X_vl, y_vl)], verbose = False)

In [None]:
# Accuracy of the baseline model on the training and validation splits.
# Both values are returned in one labelled dict because a notebook cell only
# displays its last expression; written as two bare statements, the training
# score was computed and then silently discarded.
# Values recorded from an earlier run:
#   train 0.9830805134189031
#   valid 0.8639455782312925
{
    'train': model.score(X = X_tr, y = y_tr),
    'valid': model.score(X = X_vl, y = y_vl),
}

In [None]:
pd.Series(data = model.feature_importances_, 
          index = model.feature_names_in_) \
  .sort_values(ascending = False)

In [None]:
hds.plot.feature_importance(model)

In [None]:
# Number of boosting rounds held by the fitted booster.
# The recorded 109 is consistent with best_iteration (58, zero-based) + 1
# plus the 50 early-stopping rounds configured on the model.
model.get_booster().num_boosted_rounds()
# 109

In [None]:
# Index of the round that scored best on the eval_set.
model.best_iteration
# 58

In [None]:
# Eval-set metric value at the best round.
# NOTE(review): no eval_metric is set explicitly, so this is whatever
# XGBoost's default metric is for the objective — presumably log loss; confirm.
model.best_score
# 0.351749884845728

In [None]:
# Predicted class labels for the validation split.
y_pred = model.predict(X = X_vl)

In [None]:
# Classification metrics from the project's hds helper, comparing the
# validation labels with the predicted labels.
hds.stat.clfmetrics(y_true = y_vl, y_pred = y_pred)

In [None]:
# Predicted class probabilities for the validation split; later cells use
# column 1 as the positive-class score.
y_prob = model.predict_proba(X = X_vl)

In [None]:
# ROC curve of the baseline model on the validation split (hds helper).
hds.plot.roc_curve(y_true = y_vl, y_prob = y_prob)

In [None]:
# Presumably the precision-recall curve, going by the helper's name — confirm.
hds.plot.pr_curve(y_true = y_vl, y_prob = y_prob)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# SMOTE oversampler configured with 5 neighbours and a fixed random_state so
# the resampling is repeatable.
smote = SMOTE(k_neighbors = 5, random_state = 0)

In [None]:
# Resample the training split only; the validation split is not touched, so
# later evaluation still happens on the original class distribution.
X_bal, y_bal = smote.fit_resample(X = X_tr, y = y_tr)

In [None]:
# Same hyperparameters as the baseline model; this one is trained on the
# SMOTE-resampled data.
model_bal = XGBClassifier(n_estimators = 1000, early_stopping_rounds = 50)

In [None]:
# Fit on the resampled training data; the eval_set is still the original,
# unresampled validation split.
model_bal.fit(X = X_bal, y = y_bal, eval_set = [(X_vl, y_vl)], verbose = False)

In [None]:
# Accuracy on the resampled training data.
model_bal.score(X = X_bal, y = y_bal)

In [None]:
# Accuracy on the validation split.
model_bal.score(X = X_vl, y = y_vl)

In [None]:
# Number of boosting rounds held by the fitted booster.
model_bal.get_booster().num_boosted_rounds()

In [None]:
# Index of the round that scored best on the eval_set.
model_bal.best_iteration

In [None]:
# Predicted class labels of the SMOTE-trained model on the validation split.
y_pred_bal = model_bal.predict(X = X_vl)

In [None]:
# Classification metrics for the SMOTE-trained model (hds helper).
hds.stat.clfmetrics(y_true = y_vl, y_pred = y_pred_bal)

In [None]:
# Predicted class probabilities of the SMOTE-trained model on the validation split.
y_prob_bal = model_bal.predict_proba(X = X_vl)

In [None]:
# ROC comparison: red = baseline model, blue = SMOTE-trained model.
# NOTE(review): both calls share one cell, presumably so the helper draws them
# on the same axes — confirm against hds.plot.roc_curve.
hds.plot.roc_curve(y_true = y_vl, y_prob = y_prob, color = 'red')
hds.plot.roc_curve(y_true = y_vl, y_prob = y_prob_bal, color = 'blue')

In [None]:
# Same comparison with hds.plot.pr_curve: red = baseline, blue = SMOTE-trained.
hds.plot.pr_curve(y_true = y_vl, y_prob = y_prob, color = 'red')
hds.plot.pr_curve(y_true = y_vl, y_prob = y_prob_bal, color = 'blue')

In [None]:
y_vl.value_counts()

In [None]:
ratio = 1157 / 313

In [None]:
# Same hyperparameters as the baseline model, plus scale_pos_weight set to the
# class ratio computed in the preceding cell, as an alternative to resampling.
model_wgt = XGBClassifier(n_estimators = 1000, 
                          early_stopping_rounds = 50, 
                          scale_pos_weight = ratio)

In [None]:
# Fit on the original (unresampled) training split, with the validation split
# as the eval_set.
model_wgt.fit(X = X_tr, y = y_tr, eval_set = [(X_vl, y_vl)], verbose = False)

In [None]:
# Accuracy of the weighted model on the training split.
model_wgt.score(X = X_tr, y = y_tr)

In [None]:
# Accuracy of the weighted model on the validation split.
model_wgt.score(X = X_vl, y = y_vl)

In [None]:
# Predicted class labels of the weighted model on the validation split.
y_pred_wgt = model_wgt.predict(X = X_vl)

In [None]:
# Classification metrics for the weighted model (hds helper).
hds.stat.clfmetrics(y_true = y_vl, y_pred = y_pred_wgt)

In [None]:
# Predicted class probabilities of the weighted model on the validation split.
y_prob_wgt = model_wgt.predict_proba(X = X_vl)

In [None]:
# ROC comparison: red = baseline model, blue = class-weighted model.
# NOTE(review): both calls share one cell, presumably so the helper draws them
# on the same axes — confirm against hds.plot.roc_curve.
hds.plot.roc_curve(y_true = y_vl, y_prob = y_prob, color = 'red')
hds.plot.roc_curve(y_true = y_vl, y_prob = y_prob_wgt, color = 'blue')

In [None]:
# Same comparison with hds.plot.pr_curve: red = baseline, blue = class-weighted.
hds.plot.pr_curve(y_true = y_vl, y_prob = y_prob, color = 'red')
hds.plot.pr_curve(y_true = y_vl, y_prob = y_prob_wgt, color = 'blue')

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC of the baseline model on the validation split, scored with the
# probability in column 1 of predict_proba.
roc_auc_score(y_true = y_vl, y_score = y_prob[:, 1])

In [None]:
# ROC AUC of the class-weighted model, computed the same way for comparison.
roc_auc_score(y_true = y_vl, y_score = y_prob_wgt[:, 1])