# Training on data with remapped categories

In [1]:
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn import ensemble
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('prepped-data/train.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289552 entries, 0 to 289551
Data columns (total 22 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   redemption_status                289552 non-null  int64  
 1   age_range                        289552 non-null  int64  
 2   marital_status                   289552 non-null  int64  
 3   family_size                      289552 non-null  int64  
 4   no_of_children                   289552 non-null  int64  
 5   income_bracket                   289552 non-null  int64  
 6   gender                           289552 non-null  int64  
 7   mean_discount_used_by_cust       289552 non-null  float64
 8   unique_items_bought_by_cust      289552 non-null  int64  
 9   mean_selling_price_paid_by_cust  289552 non-null  float64
 10  mean_quantity_bought_by_cust     289552 non-null  float64
 11  total_discount_used_by_cust      289552 non-null  float64
 12  to

### Oversampling

In [3]:
data.redemption_status.value_counts(normalize=True)

0    0.989007
1    0.010993
Name: redemption_status, dtype: float64

In [4]:
data_y = data['redemption_status']
data_x = data.drop(['redemption_status'], axis=1)
xtrain, xtest, ytrain, ytest = train_test_split(data_x, data_y)

In [5]:
smote = SMOTE(sampling_strategy=0.3)
xtrain_resampled, ytrain_resampled = smote.fit_resample(xtrain, ytrain)
ytrain_resampled.value_counts(normalize=True)

0    0.769232
1    0.230768
Name: redemption_status, dtype: float64

### Training

In [6]:
params = {
    'n_estimators': 191,
    'max_depth': 15,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'max_leaf_nodes': 627
}
gbm = ensemble.GradientBoostingClassifier(**params)
gbm.fit(xtrain_resampled, ytrain_resampled)

GradientBoostingClassifier(learning_rate=0.01, max_depth=15, max_leaf_nodes=627,
                           min_samples_split=5, n_estimators=191)

### Evaluating

In [7]:
preds = pd.Series(gbm.predict(data_x))

In [8]:
preds.value_counts()

0    287892
1      1660
dtype: int64

In [9]:
data_y.value_counts()

0    286369
1      3183
Name: redemption_status, dtype: int64

In [10]:
pd.crosstab(data_y, preds, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,285596,773,286369
1,2296,887,3183
All,287892,1660,289552


In [11]:
print(classification_report(data_y, preds))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    286369
           1       0.53      0.28      0.37      3183

    accuracy                           0.99    289552
   macro avg       0.76      0.64      0.68    289552
weighted avg       0.99      0.99      0.99    289552



In [12]:
proba = pd.Series(gbm.predict_proba(data_x)[:,1])

In [13]:
preds_02 = proba.apply(lambda x: 1 if x > 0.2 else 0)

In [14]:
pd.crosstab(data_y, preds_02, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,280176,6193,286369
1,1281,1902,3183
All,281457,8095,289552


In [15]:
print(classification_report(data_y, preds_02))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    286369
           1       0.23      0.60      0.34      3183

    accuracy                           0.97    289552
   macro avg       0.62      0.79      0.66    289552
weighted avg       0.99      0.97      0.98    289552



## Pickle model

In [16]:
import os
import pickle

In [17]:
if not os.path.exists('pickled_models'):
    os.mkdir ('pickled_models')

In [18]:
with open('pickled_models/scikit_classifier', 'wb') as f:
    pickle.dump(gbm, f)