In [None]:
import os
import numpy as np
import pandas as pd
from plt_rcs import *
import hds
# Default every figure to a 4x4 canvas. `plt` is not imported explicitly here;
# presumably it is re-exported by the star import from plt_rcs -- TODO confirm.
plt.rc('figure', figsize=(4, 4))

In [None]:
os.getcwd()

In [None]:
os.chdir('../../data')

In [None]:
sorted(os.listdir())

In [None]:
objs = pd.read_pickle('WhiteWine.pkl')

In [None]:
globals().update(objs)

In [None]:
%whos

In [None]:
X_tr, X_vl, y_tr, y_vl = X_train, X_valid, y_train, y_valid

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
model = GradientBoostingClassifier(subsample = 0.8, random_state = 0)

In [None]:
model.fit(X = X_tr, y = y_tr)

In [None]:
# Mean accuracy on the training and validation splits.
# Both values are printed explicitly: with IPython's default settings only the
# last bare expression of a cell is displayed, so the training score would
# otherwise be dropped silently. The trailing comments are the values recorded
# from the original run.
print('train accuracy:', model.score(X = X_tr, y = y_tr))
# 0.8780630105017503
print('valid accuracy:', model.score(X = X_vl, y = y_vl))
# 0.8258503401360544

In [None]:
pd.Series(data = model.feature_importances_, index = model.feature_names_in_).sort_values(ascending = False)
# alcohol                0.426315
# volatile acidity       0.106041
# pH                     0.073835
# free sulfur dioxide    0.070229
# density                0.062731
# chlorides              0.058014
# sulphates              0.057779
# residual sugar         0.054972
# fixed acidity          0.048334
# citric acid            0.041750
# dtype: float64

In [None]:
hds.plot.feature_importance(model)

In [None]:
model.n_estimators_

In [None]:
model.oob_scores_

In [None]:
sns.lineplot(x = range(model.n_estimators_), 
            y = model.oob_scores_, 
            color = 'red', 
            linewidth = 0.5)
plt.show()

In [None]:
y_pred = model.predict(X = X_vl)

In [None]:
hds.stat.clfmetrics(y_true = y_vl, y_pred = y_pred)

In [None]:
y_prob = model.predict_proba(X = X_vl)

In [None]:
hds.plot.roc_curve(y_true = y_vl, y_prob = y_prob)

In [None]:
hds.plot.pr_curve(y_true = y_vl, y_prob = y_prob)

In [None]:
model = GradientBoostingClassifier(n_estimators = 3000,
                                   subsample = 0.8, 
                                   random_state = 0)

In [None]:
model.fit(X = X_tr, y = y_tr)

In [None]:
model.score(X = X_tr, y = y_tr)

In [None]:
model.score(X = X_vl, y = y_vl)

In [None]:
# Out-of-bag loss per boosting stage for the 3000-stage model. Despite its
# name, `oob_scores_` holds loss values (lower is better), so the axes are
# labelled to keep the curve from being read as an accuracy score.
# The trailing semicolon suppresses the return-value repr.
sns.lineplot(x = range(model.n_estimators_),
             y = model.oob_scores_,
             color = 'red',
             linewidth = 0.5).set(xlabel = 'Boosting iteration',
                                  ylabel = 'OOB loss');

In [None]:
y_pred = model.predict(X = X_vl)

In [None]:
hds.stat.clfmetrics(y_true = y_vl, y_pred = y_pred)

In [None]:
y_prob = model.predict_proba(X = X_vl)

In [None]:
hds.plot.roc_curve(y_true = y_vl, y_prob = y_prob)

In [None]:
hds.plot.pr_curve(y_true = y_vl, y_prob = y_prob)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
smote = SMOTE(k_neighbors = 5, random_state = 0)

In [None]:
X_bal, y_bal = smote.fit_resample(X = X_tr, y = y_tr)

In [None]:
model_bal = GradientBoostingClassifier(n_estimators = 3000, 
                                       subsample = 0.8, 
                                       random_state = 0)

In [None]:
model_bal.fit(X = X_bal, y = y_bal)

In [None]:
model_bal.score(X = X_bal, y = y_bal)

In [None]:
model_bal.score(X = X_vl, y = y_vl)

In [None]:
# Out-of-bag loss per boosting stage for the SMOTE-trained model. Despite its
# name, `oob_scores_` holds loss values (lower is better), so the axes are
# labelled to keep the curve from being read as an accuracy score.
# The trailing semicolon suppresses the return-value repr.
sns.lineplot(x = range(model_bal.n_estimators_),
             y = model_bal.oob_scores_,
             color = 'red',
             linewidth = 0.5).set(xlabel = 'Boosting iteration',
                                  ylabel = 'OOB loss');

In [None]:
y_pred_bal = model_bal.predict(X = X_vl)

In [None]:
hds.stat.clfmetrics(y_true = y_vl, y_pred = y_pred_bal)

In [None]:
hds.stat.clfmetrics(y_true = y_vl, y_pred = y_pred)

In [None]:
y_prob_bal = model_bal.predict_proba(X = X_vl)

In [None]:
hds.plot.roc_curve(y_true = y_vl, y_prob = y_prob, color = 'red')
hds.plot.roc_curve(y_true = y_vl, y_prob = y_prob_bal, color = 'blue')

In [None]:
hds.plot.pr_curve(y_true = y_vl, y_prob = y_prob, color = 'red')
hds.plot.pr_curve(y_true = y_vl, y_prob = y_prob_bal, color = 'blue')