# Training a model using GBM from scikit-learn

In [2]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import xgboost as xgb 
from sklearn import ensemble
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv('../amex-data/coupon-based/final_data/final_training_data.csv')

In [7]:
df.drop(['coupon_id', 'customer_id'], axis=1, inplace=True)

In [8]:
df.columns

Index(['redemption_status', 'brand', 'category', 'coupon_used_x',
       'discount_mean', 'discount_sum', 'item_counts', 'no_of_customers',
       'price_mean', 'price_sum', 'quantity_mean', 'quantity_sum_x',
       'tran_counts', 'campaign_type', 'campaign_duration', 'age_range',
       'marital_status', 'rented', 'family_size', 'no_of_children',
       'income_bracket', 'mean_discount', 'coupon_used_y', 'day', 'dow',
       'no_of_items', 'month', 'mean_quantity', 'mean_price', 'ddiscount_sum',
       'customer_id_count', 'quantity_sum_y', 'pprice_sum'],
      dtype='object')

In [9]:
train, test = train_test_split(df, test_size=0.3)

## SMOTE oversampling

In [10]:
train.redemption_status.value_counts()

0    54331
1      527
Name: redemption_status, dtype: int64

In [11]:
sm = SMOTE(sampling_strategy=0.2)
train_y = train['redemption_status']
train_x = train.drop(['redemption_status'], axis=1)
train_x, train_y = sm.fit_resample(train_x, train_y)

In [12]:
train_y.value_counts()

0    54331
1    10866
Name: redemption_status, dtype: int64

## Training with GBM
Using parameters from AutoML-trained model

In [13]:
# opcja 1
params = {
    'n_estimators': 191,
    'max_depth': 15,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'loss': 'ls',
    'max_leaf_nodes': 627
}

# opcja 2
# params = {
#     'n_estimators': 252,
#     'max_depth': 15,
#     'min_samples_split': 5,
#     'learning_rate': 0.03,
#     'loss': 'ls',
#     'max_leaf_nodes': 469
# }

In [23]:
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(train_x, train_y)

GradientBoostingRegressor(learning_rate=0.01, max_depth=15, max_leaf_nodes=627,
                          min_samples_split=5, n_estimators=191)

## Testing and evaluating

In [None]:
# test_y = test['redemption_status']
# test_x = test.drop(['redemption_status'], axis=1)

In [None]:
# pred_y = pd.Series(reg.predict(test_x))

In [None]:
# pred_y.head()

In [None]:
# pred_y = pred_y.apply(lambda x: 1 if x > 0.25 else 0)

In [None]:
# pred_y.value_counts()

In [None]:
# confusion_matrix(test_y, pred_y)
# pd.crosstab(test_y, pred_y, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
# print(classification_report(test_y, pred_y))

## Predict on the entire dataset

In [24]:
real_res = df['redemption_status']
all_set = df.drop(['redemption_status'], axis=1)

pred_all = pd.Series(reg.predict(all_set))
pred_all = pred_all.apply(lambda x: 1 if x > 0.10 else 0)
pred_all.value_counts()

0    75547
1     2822
dtype: int64

In [25]:
pd.crosstab(real_res, pred_all, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,75456,2184,77640
1,91,638,729
All,75547,2822,78369


In [26]:
print(classification_report(real_res, pred_all))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99     77640
           1       0.23      0.88      0.36       729

    accuracy                           0.97     78369
   macro avg       0.61      0.92      0.67     78369
weighted avg       0.99      0.97      0.98     78369



# XGboost

In [17]:
model = xgb.XGBClassifier(base_score=0.15)
model.fit(train_x, train_y)





XGBClassifier(base_score=0.15, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
real_res = df['redemption_status']
all_set = df.drop(['redemption_status'], axis=1)

pred_all_xgb = pd.Series(model.predict(all_set))
pred_all_xgb.value_counts()

0    77784
1      585
dtype: int64

In [19]:
pd.crosstab(real_res, pred_all_xgb, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,77541,99,77640
1,243,486,729
All,77784,585,78369


In [21]:
print(classification_report(real_res, pred_all_xgb))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     77640
           1       0.83      0.67      0.74       729

    accuracy                           1.00     78369
   macro avg       0.91      0.83      0.87     78369
weighted avg       1.00      1.00      1.00     78369



## Export model

In [29]:
import os
import pickle

In [30]:
if not os.path.exists('pickled_models'):
    os.mkdir ('pickled_models')

In [32]:
with open('pickled_models/scikit_regressor', 'wb') as f:
    pickle.dump(reg, f)

In [35]:
# Test exported model
with open('pickled_models/scikit_regressor', 'rb') as f:
    preg_model = pickle.load(f)

ppred_all = pd.Series(preg_model.predict(all_set))
ppred_all = ppred_all.apply(lambda x: 1 if x > 0.10 else 0)
print(classification_report(real_res, ppred_all))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99     77640
           1       0.23      0.88      0.36       729

    accuracy                           0.97     78369
   macro avg       0.61      0.92      0.67     78369
weighted avg       0.99      0.97      0.98     78369



In [37]:
ppred_all.loc[ppred_all == 1]

19       1
107      1
138      1
177      1
181      1
        ..
78244    1
78284    1
78298    1
78342    1
78346    1
Length: 2822, dtype: int64