# Training with scikit-learn

In [None]:
import os
import pickle

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, auc, precision_recall_curve, f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Directory with the prepared dataset; the training rows are in train.csv inside it.
data_dir = 'data_0408_0'
train_file_name = 'train.csv'
train_file_path = os.path.join(data_dir, train_file_name)

In [None]:
# Load the training table, then print its column dtypes and non-null counts.
train = pd.read_csv(filepath_or_buffer=train_file_path)
train.info()

In [None]:
# Peek at ten randomly chosen rows.
train.sample(n=10)

In [None]:
# Share of each label value in the raw data (fractions rather than counts).
train['coupon_used'].value_counts(normalize=True)

In [None]:
# Separate the label column from the features, then hold out 20% of the rows
# for testing. The split is seeded so it is the same on every run.
target = train['coupon_used']
features = train.drop(columns=['coupon_used'])
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Re-assemble the full dataset (training rows followed by test rows) for the
# "whole dataset" evaluations.
# `DataFrame.append` / `Series.append` were deprecated in pandas 1.4 and
# removed in pandas 2.0; `pd.concat` is the supported equivalent and, like
# `append`, keeps the original index labels.
# NOTE: this set contains the training rows, so metrics computed on it are
# optimistic compared with metrics computed on the test split alone.
X_all = pd.concat([X_train, X_test])
y_all = pd.concat([y_train, y_test])

In [None]:
def plot_pr_curve(probs, preds, y, legend=''):
    """Print F1 and PR-AUC for a set of predictions and draw the precision-recall curve.

    Args:
        probs: positive-class scores, passed to ``precision_recall_curve``.
        preds: hard class predictions, passed to ``f1_score``.
        y: true labels; must support boolean-mask indexing (``y[y == 1]``).
        legend: heading printed above the scores (it is not drawn on the plot).
    """
    pr_points = precision_recall_curve(y, probs)
    precision, recall = pr_points[0], pr_points[1]
    f1_ = f1_score(y, preds)
    auc_ = auc(recall, precision)
    print(f'{legend}:\nf1={round(f1_, 3)} auc={round(auc_, 3)}')

    # Baseline drawn as a horizontal line at the share of positive labels.
    positives = y[y == 1]
    no_skill = len(positives) / len(y)
    pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

    # The model's precision-recall curve.
    pyplot.plot(recall, precision, marker='.', label='GBM')

    pyplot.xlabel('Recall')
    pyplot.ylabel('Precision')
    pyplot.legend()
    pyplot.show()

In [None]:
# Hyper-parameters shared by every GradientBoostingClassifier in this notebook.
gbm_params = {
    'n_estimators': 70,
    'max_depth': 10,
    'max_leaf_nodes': 994,
    # Seed the estimator so refitting gives the same model (and the same
    # pickled artefact); the train/test split is already seeded with 1.
    'random_state': 1,
}

## 1. Training with no balancing

In [None]:
# Baseline: fit on the training split as-is, with no resampling.
# `fit` returns the estimator itself, so construction and fitting can be chained.
gbm = GradientBoostingClassifier(**gbm_params).fit(X_train, y_train)
gbm

#### 1.1. Evaluating on the test dataset

In [None]:
# Hard class predictions and column-1 class probabilities for the held-out rows.
# (Column 1 is presumably the coupon_used == 1 class - confirm via gbm.classes_.)
preds = gbm.predict(X_test)
probs = gbm.predict_proba(X_test)[:, 1]

In [None]:
# Precision-recall summary of the baseline model on the test split.
plot_pr_curve(
    probs=probs,
    preds=preds,
    y=y_test,
    legend='GBM trained on an unbalanced dataset, evaluated on the test dataset',
)

In [None]:
# Confusion table (actual vs. predicted) with row and column totals.
pd.crosstab(
    index=y_test,
    columns=preds,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Text report of the per-class classification metrics on the test split.
print(classification_report(y_true=y_test, y_pred=preds))

#### 1.2. Evaluating on the whole dataset

In [None]:
# Same predictions as for the test split, but for every row (train + test).
preds = gbm.predict(X_all)
probs = gbm.predict_proba(X_all)[:, 1]

In [None]:
# Precision-recall summary of the baseline model on the whole dataset.
plot_pr_curve(
    probs=probs,
    preds=preds,
    y=y_all,
    legend='GBM trained on an unbalanced dataset, evaluated on the whole dataset',
)

In [None]:
# Confusion table (actual vs. predicted) with totals, over the whole dataset.
pd.crosstab(
    index=y_all,
    columns=preds,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Text report of the per-class classification metrics over the whole dataset.
print(classification_report(y_true=y_all, y_pred=preds))

#### 1.3. Pickle the model

In [None]:
# Persist the fitted baseline model inside the data directory.
model_path = os.path.join(data_dir, 'pickled_model_gbm_no_balancing')
with open(model_path, 'wb') as model_file:
    pickle.dump(gbm, model_file)

## 2. Training with balancing - SMOTE + Tomek

#### 2.1. Balancing

In [None]:
# NOTE: this step takes a very long time to run.
# SMOTETomek combines SMOTE over-sampling with Tomek-link cleaning.
# The sampler is seeded so the resampled training set (and any model fitted
# on it) is the same on every run, matching the seeded train/test split.
smt = SMOTETomek(random_state=1)
X_train_smt, y_train_smt = smt.fit_resample(X_train, y_train)

In [None]:
# Class balance of the resampled training labels.
# The labels passed to `fit_resample` were a single column (a Series), so the
# resampled labels have no `coupon_used` attribute to select; call
# `value_counts` on them directly.
y_train_smt.value_counts(normalize=True)

#### 2.2. Training

In [None]:
# Same hyper-parameters as the baseline, fitted on the SMOTE+Tomek resampled data.
# `fit` returns the estimator itself, so construction and fitting can be chained.
gbm_smt = GradientBoostingClassifier(**gbm_params).fit(X_train_smt, y_train_smt)
gbm_smt

#### 2.3. Evaluating on the test dataset

In [None]:
# Hard class predictions and column-1 class probabilities for the held-out rows
# (the test split itself is not resampled).
preds = gbm_smt.predict(X_test)
probs = gbm_smt.predict_proba(X_test)[:, 1]

In [None]:
# Precision-recall summary of the SMOTE+Tomek model on the test split.
plot_pr_curve(
    probs=probs,
    preds=preds,
    y=y_test,
    legend='GBM trained on a balanced dataset (SMOTE+Tomek), evaluated on the test dataset',
)

In [None]:
# Confusion table (actual vs. predicted) with row and column totals.
pd.crosstab(
    index=y_test,
    columns=preds,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Text report of the per-class classification metrics on the test split.
print(classification_report(y_true=y_test, y_pred=preds))

#### 2.4. Evaluating on the entire dataset

In [None]:
# Predictions for every original row (train + test), not the resampled rows.
preds = gbm_smt.predict(X_all)
probs = gbm_smt.predict_proba(X_all)[:, 1]

In [None]:
# Precision-recall summary of the SMOTE+Tomek model on the whole dataset.
plot_pr_curve(
    probs=probs,
    preds=preds,
    y=y_all,
    legend='GBM trained on a balanced dataset (SMOTE+Tomek), evaluated on the whole dataset',
)

In [None]:
# Confusion table (actual vs. predicted) with totals, over the whole dataset.
pd.crosstab(
    index=y_all,
    columns=preds,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Text report of the per-class classification metrics over the whole dataset.
print(classification_report(y_true=y_all, y_pred=preds))

#### 2.5. Pickle the model

In [None]:
# Persist the SMOTE+Tomek model inside the data directory.
model_path = os.path.join(data_dir, 'pickled_model_gbm_smote_tomek')
with open(model_path, 'wb') as model_file:
    pickle.dump(gbm_smt, model_file)

## 3. Training with balancing (SMOTE)

#### 3.1. Balancing

In [None]:
# Over-sample the minority class with SMOTE. A float `sampling_strategy` is
# the desired minority/majority ratio after resampling (0.5 = one minority
# sample for every two majority samples).
# The sampler is seeded so the synthetic samples (and any model fitted on
# them) are the same on every run, matching the seeded train/test split.
smote = SMOTE(sampling_strategy=0.5, random_state=1)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

In [None]:
# Class balance of the SMOTE-resampled training labels, as fractions.
# NOTE(review): with sampling_strategy=0.5 this should come out near 2:1 - confirm.
y_train_sm.value_counts(normalize=True)

#### 3.2. Training

In [None]:
# Same hyper-parameters as the baseline, fitted on the SMOTE-resampled data.
# `fit` returns the estimator itself, so construction and fitting can be chained.
gbm_sm = GradientBoostingClassifier(**gbm_params).fit(X_train_sm, y_train_sm)
gbm_sm

#### 3.3. Evaluating on the test dataset

In [None]:
# Hard class predictions and column-1 class probabilities for the held-out rows
# (the test split itself is not resampled).
preds = gbm_sm.predict(X_test)
probs = gbm_sm.predict_proba(X_test)[:, 1]

In [None]:
# Precision-recall summary of the SMOTE model on the test split.
plot_pr_curve(
    probs=probs,
    preds=preds,
    y=y_test,
    legend='GBM trained on a balanced dataset (SMOTE), evaluated on the test dataset',
)

In [None]:
# Confusion table (actual vs. predicted) with row and column totals.
pd.crosstab(
    index=y_test,
    columns=preds,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Text report of the per-class classification metrics on the test split.
print(classification_report(y_true=y_test, y_pred=preds))

#### 3.4. Evaluating on the entire dataset

In [None]:
# Predictions for every original row (train + test), not the resampled rows.
preds = gbm_sm.predict(X_all)
probs = gbm_sm.predict_proba(X_all)[:, 1]

In [None]:
# Precision-recall summary of the SMOTE model on the whole dataset.
plot_pr_curve(
    probs=probs,
    preds=preds,
    y=y_all,
    legend='GBM trained on a balanced dataset (SMOTE), evaluated on the whole dataset',
)

In [None]:
# Confusion table (actual vs. predicted) with totals, over the whole dataset.
pd.crosstab(
    index=y_all,
    columns=preds,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Text report of the per-class classification metrics over the whole dataset.
print(classification_report(y_true=y_all, y_pred=preds))

#### 3.5. Pickle the model

In [None]:
# Persist the SMOTE model inside the data directory.
model_path = os.path.join(data_dir, 'pickled_model_gbm_smote')
with open(model_path, 'wb') as model_file:
    pickle.dump(gbm_sm, model_file)