# Training a model using GBM from scikit-learn

In [22]:
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn import ensemble
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [13]:
df = pd.read_csv('final_data/final_training_data.csv')

In [14]:
train, test = train_test_split(train_df, test_size=0.3)

## SMOTE oversampling

In [16]:
train.redemption_status.value_counts()

0    54371
1      487
Name: redemption_status, dtype: int64

In [15]:
sm = SMOTE(sampling_strategy=0.2)
train_y = train['redemption_status']
train_x = train.drop(['redemption_status'], axis=1)
train_x, train_y = sm.fit_resample(train_x, train_y)

In [18]:
train_y.value_counts()

0    54371
1    10874
Name: redemption_status, dtype: int64

## Training with GBM
Using parameters from AutoML-trained model

In [38]:
params = {
    'n_estimators': 191,
    'max_depth': 15,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'loss': 'ls',
    'max_leaf_nodes': 627
}

In [39]:
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(train_x, train_y)

GradientBoostingRegressor(learning_rate=0.01, max_depth=15, max_leaf_nodes=627,
                          min_samples_split=5, n_estimators=191)

## Testing and evaluating

In [40]:
test_y = test['redemption_status']
test_x = test.drop(['redemption_status'], axis=1)

In [41]:
pred_y = pd.Series(reg.predict(test_x))

In [42]:
pred_y.head()

0    0.024848
1    0.024848
2    0.026183
3    0.025061
4    0.025588
dtype: float64

In [43]:
pred_y = pred_y.apply(lambda x: 1 if x > 0.66 else 0)

In [44]:
# confusion_matrix(test_y, pred_y)
pd.crosstab(test_y, pred_y, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6939,28,6967
1,78,1,79
All,7017,29,7046


In [45]:
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     23269
           1       0.36      0.16      0.22       242

    accuracy                           0.99     23511
   macro avg       0.67      0.58      0.61     23511
weighted avg       0.98      0.99      0.99     23511

