# AmEx Training using AutoML and Oversampling

Oversampling inspiration from https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/

In [103]:
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.combine import SMOTETomek, SMOTEENN

In [52]:
# Connect this session to the H2O cluster; the recorded output shows it attaching
# to an instance already running at http://localhost:54321.
h2o.init()

  "This may interfere with your H2O Connection." % name)
  "This may interfere with your H2O Connection." % name)


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 54 mins
H2O_cluster_timezone:,Europe/Warsaw
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.4
H2O_cluster_version_age:,24 days
H2O_cluster_name:,H2O_from_python_ugolowic_jpz9si
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.662 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [53]:
# Load the prepared training set (features plus the `redemption_status` label).
data = pd.read_csv(filepath_or_buffer='final_data/final_training_data.csv')

In [54]:
# Absolute number of rows per class of the target.
data['redemption_status'].value_counts()

0    77640
1      729
Name: redemption_status, dtype: int64

In [55]:
# Relative share of each class of the target.
data['redemption_status'].value_counts(normalize=True)

0    0.990698
1    0.009302
Name: redemption_status, dtype: float64

In [56]:
# Separate the feature columns (data_x) from the target column (data_y).
data_x = data.drop(columns=['redemption_status'])
data_y = data['redemption_status']

In [98]:
# Load the test set straight into H2O so the trained leaders can score it.
test = h2o.import_file(path='final_data/final_test_data.csv')

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Preparing data using various oversampling algorithms

### SMOTE

In [57]:
# SMOTE with its default sampling strategy (the recorded output shows a 50/50
# class balance). A fixed random_state makes the synthetic samples repeatable
# across kernel restarts, matching the seeded ADASYN / SMOTETomek cells.
smt = SMOTE(random_state=130)
data_x_sm, data_y_sm = smt.fit_resample(data_x, data_y)

In [58]:
# Default SMOTE balances the classes 50/50, which is far from the original
# ~0.9% positive rate; a smaller sampling_strategy is tried in the next cell.
data_y_sm.value_counts(normalize=True)

1    0.5
0    0.5
Name: redemption_status, dtype: float64

In [59]:
# SMOTE with sampling_strategy=0.2: the recorded output shows roughly one
# positive for every five negatives (~16.7% positives) after resampling.
# random_state is fixed so the resampled set is reproducible, like the
# seeded ADASYN / SMOTETomek cells.
smt = SMOTE(sampling_strategy=0.2, random_state=130)
data_x_sm_02, data_y_sm_02 = smt.fit_resample(data_x, data_y)
data_y_sm_02.value_counts(normalize=True)

0    0.833333
1    0.166667
Name: redemption_status, dtype: float64

### ADASYN

In [60]:
# ADASYN oversampling with sampling_strategy=0.2, seeded for repeatability.
ada = ADASYN(random_state=130, sampling_strategy=0.2)
data_x_ada_02, data_y_ada_02 = ada.fit_resample(X=data_x, y=data_y)
# Class shares after resampling.
data_y_ada_02.value_counts(normalize=True)

0    0.832842
1    0.167158
Name: redemption_status, dtype: float64

### SMOTE + Tomek

In [61]:
# SMOTE oversampling combined with Tomek-link cleaning, seeded for repeatability.
smtom = SMOTETomek(random_state=139, sampling_strategy=0.2)
data_x_tomek_02, data_y_tomek_02 = smtom.fit_resample(X=data_x, y=data_y)
# Class shares after resampling.
data_y_tomek_02.value_counts(normalize=True)

0    0.836972
1    0.163028
Name: redemption_status, dtype: float64

### SMOTE + ENN

In [62]:
# SMOTE oversampling combined with Edited-Nearest-Neighbours cleaning.
# random_state is fixed so the resampled set (and every model trained on it)
# is reproducible, matching the seeded ADASYN / SMOTETomek cells.
smenn = SMOTEENN(sampling_strategy=0.2, random_state=130)
data_x_enn_02, data_y_enn_02 = smenn.fit_resample(data_x, data_y)
data_y_enn_02.value_counts(normalize=True)

0    0.861739
1    0.138261
Name: redemption_status, dtype: float64

## Performance comparison using AutoML

### SMOTE

In [63]:
# Re-attach the resampled labels and upload the combined table to H2O.
# Note: this adds the label column to data_x_sm_02 itself, so that frame is
# no longer features-only after this cell.
data_x_sm_02['redemption_status'] = data_y_sm_02
h2o_data_sm_02 = h2o.H2OFrame(python_obj=data_x_sm_02)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [64]:
# Name of the response column, reused by the later training cells.
y = 'redemption_status'
# Convert the target to a categorical (factor) column.
h2o_data_sm_02['redemption_status'] = h2o_data_sm_02['redemption_status'].asfactor()
# Predictors are all columns except the response.
x = [column for column in h2o_data_sm_02.columns if column != y]

In [65]:
# Train up to 10 AutoML models on the SMOTE-resampled frame with a fixed seed.
aml = H2OAutoML(seed=1, max_models=10)
aml.train(training_frame=h2o_data_sm_02, y=y, x=x)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [66]:
# Leaderboard of the models trained on the SMOTE-resampled data.
# NOTE(review): train() received no separate validation/leaderboard frame, so
# these metrics are presumably cross-validation scores computed on the
# oversampled data and may be optimistic — confirm on a held-out,
# non-resampled set.
lb_sm = aml.leaderboard
lb_sm.head()

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_5_AutoML_20210225_165106,0.999502,0.0186934,0.997909,0.0141937,0.0703995,0.00495609
GBM_4_AutoML_20210225_165106,0.999457,0.0189456,0.997776,0.0134274,0.0697612,0.00486662
StackedEnsemble_AllModels_AutoML_20210225_165106,0.999434,0.0226206,0.997658,0.0137622,0.0717617,0.00514974
StackedEnsemble_BestOfFamily_AutoML_20210225_165106,0.999419,0.0224902,0.997472,0.0135433,0.0716506,0.00513381
GBM_3_AutoML_20210225_165106,0.999268,0.0209003,0.997155,0.0147347,0.0727992,0.00529972
DRF_1_AutoML_20210225_165106,0.999187,0.0563899,0.996245,0.0151404,0.101024,0.0102058
GBM_2_AutoML_20210225_165106,0.999135,0.022326,0.996716,0.0157586,0.0750374,0.00563061
XGBoost_1_AutoML_20210225_165106,0.998988,0.0247944,0.996164,0.0191847,0.0798934,0.00638296
XGBoost_2_AutoML_20210225_165106,0.998962,0.0253108,0.995965,0.0182445,0.0805656,0.00649082
GBM_1_AutoML_20210225_165106,0.9988,0.0261319,0.995385,0.0171561,0.0796025,0.00633656




In [67]:
# Keep a handle on the best SMOTE model: `aml` is rebound by the next experiment.
leader_sm = aml.leader

In [99]:
# Score the test set with the SMOTE leader and show the share of each predicted class.
pred_sm = leader_sm.predict(test)
pred_sm.as_data_frame()['predict'].value_counts(normalize=True)

gbm prediction progress: |████████████████████████████████████████████████| 100%


0    0.992255
1    0.007745
Name: predict, dtype: float64

### ADASYN

In [68]:
# Build the ADASYN training frame: attach the labels, upload to H2O and make
# the target categorical. Note: the first statement adds the label column to
# data_x_ada_02 itself.
data_x_ada_02['redemption_status'] = data_y_ada_02
h2o_data_ada_02 = h2o.H2OFrame(python_obj=data_x_ada_02)
h2o_data_ada_02['redemption_status'] = h2o_data_ada_02['redemption_status'].asfactor()
# Predictors are all columns except the response `y` (set in an earlier cell).
x = [column for column in h2o_data_ada_02.columns if column != y]
# Same AutoML budget and seed as the other experiments.
aml = H2OAutoML(seed=1, max_models=10)
aml.train(training_frame=h2o_data_ada_02, y=y, x=x)

Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%


In [69]:
# Leaderboard of the models trained on the ADASYN-resampled data.
# NOTE(review): train() received no separate validation/leaderboard frame, so
# these metrics are presumably cross-validation scores computed on the
# oversampled data and may be optimistic — confirm on a held-out,
# non-resampled set.
lb_ada = aml.leaderboard
lb_ada.head()

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_5_AutoML_20210225_172027,0.999432,0.0193585,0.997727,0.0134042,0.0713448,0.00509008
StackedEnsemble_AllModels_AutoML_20210225_172027,0.999428,0.0233448,0.997541,0.0143598,0.073241,0.00536424
StackedEnsemble_BestOfFamily_AutoML_20210225_172027,0.99942,0.0230424,0.997393,0.013488,0.0726295,0.00527504
GBM_4_AutoML_20210225_172027,0.999418,0.0197991,0.997629,0.0150459,0.0716215,0.00512964
GBM_3_AutoML_20210225_172027,0.999259,0.0211704,0.997146,0.0154954,0.0735977,0.00541662
DRF_1_AutoML_20210225_172027,0.999146,0.0532698,0.995976,0.0141788,0.10104,0.010209
GBM_2_AutoML_20210225_172027,0.99912,0.0223341,0.996711,0.0151368,0.0750379,0.00563069
XGBoost_2_AutoML_20210225_172027,0.998921,0.0256518,0.995949,0.0179606,0.0814322,0.0066312
XGBoost_1_AutoML_20210225_172027,0.998904,0.0252466,0.995933,0.0178063,0.0799535,0.00639256
GBM_1_AutoML_20210225_172027,0.998754,0.0259034,0.995596,0.0189606,0.0802157,0.00643456




In [70]:
# Keep a handle on the best ADASYN model: `aml` is rebound by the next experiment.
leader_ada = aml.leader

In [100]:
# Score the test set with the ADASYN leader and show the share of each predicted class.
pred_ada = leader_ada.predict(test)
pred_ada.as_data_frame()['predict'].value_counts(normalize=True)

gbm prediction progress: |████████████████████████████████████████████████| 100%


0    0.989587
1    0.010413
Name: predict, dtype: float64

### SMOTE + Tomek

In [71]:
# Build the SMOTE + Tomek training frame: attach the labels, upload to H2O and
# make the target categorical. Note: the first statement adds the label column
# to data_x_tomek_02 itself.
data_x_tomek_02['redemption_status'] = data_y_tomek_02
h2o_data_tomek_02 = h2o.H2OFrame(python_obj=data_x_tomek_02)
h2o_data_tomek_02['redemption_status'] = h2o_data_tomek_02['redemption_status'].asfactor()
# Predictors are all columns except the response `y` (set in an earlier cell).
x = [column for column in h2o_data_tomek_02.columns if column != y]
# Same AutoML budget and seed as the other experiments.
aml = H2OAutoML(seed=1, max_models=10)
aml.train(training_frame=h2o_data_tomek_02, y=y, x=x)

Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%


In [72]:
# Leaderboard of the models trained on the SMOTE + Tomek resampled data.
# NOTE(review): train() received no separate validation/leaderboard frame, so
# these metrics are presumably cross-validation scores computed on the
# oversampled data and may be optimistic — confirm on a held-out,
# non-resampled set.
lb_tomek = aml.leaderboard
lb_tomek.head()

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_4_AutoML_20210225_174742,0.999535,0.0176272,0.998045,0.0131909,0.067369,0.00453858
GBM_5_AutoML_20210225_174742,0.999505,0.0177264,0.997981,0.0125581,0.0681404,0.00464312
StackedEnsemble_BestOfFamily_AutoML_20210225_174742,0.999483,0.0213904,0.997847,0.0125461,0.0694555,0.00482407
StackedEnsemble_AllModels_AutoML_20210225_174742,0.999469,0.0215139,0.997906,0.0129138,0.0697782,0.00486899
GBM_3_AutoML_20210225_174742,0.999319,0.0193487,0.997396,0.0145358,0.0700341,0.00490478
DRF_1_AutoML_20210225_174742,0.999209,0.04983,0.996108,0.0139146,0.0983134,0.00966552
GBM_2_AutoML_20210225_174742,0.999167,0.0209618,0.996933,0.0158281,0.0724659,0.0052513
XGBoost_1_AutoML_20210225_174742,0.999095,0.0227212,0.996572,0.0177141,0.0766384,0.00587344
XGBoost_2_AutoML_20210225_174742,0.999042,0.0239419,0.996219,0.0187734,0.0785926,0.0061768
XGBoost_3_AutoML_20210225_174742,0.998858,0.0243212,0.995957,0.0180819,0.0783373,0.00613673




In [73]:
# Keep a handle on the best SMOTE + Tomek model: `aml` is rebound by the next experiment.
leader_tomek = aml.leader

In [101]:
# Score the test set with the SMOTE + Tomek leader and show the share of each predicted class.
pred_tomek = leader_tomek.predict(test)
pred_tomek.as_data_frame()['predict'].value_counts(normalize=True)

gbm prediction progress: |████████████████████████████████████████████████| 100%


0    0.987895
1    0.012105
Name: predict, dtype: float64

### SMOTE + ENN

In [74]:
# Build the SMOTE + ENN training frame: attach the labels, upload to H2O and
# make the target categorical. Note: the first statement adds the label column
# to data_x_enn_02 itself.
data_x_enn_02['redemption_status'] = data_y_enn_02
h2o_data_enn_02 = h2o.H2OFrame(python_obj=data_x_enn_02)
h2o_data_enn_02['redemption_status'] = h2o_data_enn_02['redemption_status'].asfactor()
# Predictors are all columns except the response `y` (set in an earlier cell).
x = [column for column in h2o_data_enn_02.columns if column != y]
# Same AutoML budget and seed as the other experiments.
aml = H2OAutoML(seed=1, max_models=10)
aml.train(training_frame=h2o_data_enn_02, y=y, x=x)

Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%


In [75]:
# Leaderboard of the models trained on the SMOTE + ENN resampled data.
# NOTE(review): train() received no separate validation/leaderboard frame, so
# these metrics are presumably cross-validation scores computed on the
# oversampled data and may be optimistic — confirm on a held-out,
# non-resampled set.
lb_enn = aml.leaderboard
lb_enn.head()

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_5_AutoML_20210225_181503,0.999766,0.0112397,0.998782,0.00952398,0.0536275,0.00287591
GBM_4_AutoML_20210225_181503,0.999743,0.011255,0.998834,0.00832442,0.0532021,0.00283046
StackedEnsemble_AllModels_AutoML_20210225_181503,0.999719,0.0142188,0.998579,0.00950856,0.0553215,0.00306047
StackedEnsemble_BestOfFamily_AutoML_20210225_181503,0.999704,0.0140184,0.998475,0.00978918,0.0549231,0.00301655
GBM_3_AutoML_20210225_181503,0.999675,0.0122479,0.998461,0.00972541,0.0550445,0.00302989
GBM_2_AutoML_20210225_181503,0.999625,0.0132983,0.998186,0.0107208,0.0574437,0.00329978
DRF_1_AutoML_20210225_181503,0.999538,0.0396007,0.997298,0.010839,0.0845004,0.00714032
XGBoost_1_AutoML_20210225_181503,0.999461,0.0162006,0.997439,0.0134833,0.0642422,0.00412706
GBM_1_AutoML_20210225_181503,0.99945,0.0155388,0.997479,0.0115811,0.0617412,0.00381198
XGBoost_2_AutoML_20210225_181503,0.999394,0.0177069,0.997069,0.0132413,0.0668524,0.00446925




In [77]:
# Keep a handle on the best SMOTE + ENN model, independent of later changes to `aml`.
leader_enn = aml.leader

In [102]:
# Score the test set with the SMOTE + ENN leader and show the share of each predicted class.
pred_enn = leader_enn.predict(test)
pred_enn.as_data_frame()['predict'].value_counts(normalize=True)

gbm prediction progress: |████████████████████████████████████████████████| 100%


0    0.996356
1    0.003644
Name: predict, dtype: float64

## Generate predictions on the test set

In [88]:
# Reload the test set into H2O. The same file is already imported into `test`
# earlier in the notebook, so on a top-to-bottom run this is a redundant reload.
test = h2o.import_file('final_data/final_test_data.csv')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [89]:
# Preview the first rows of the test frame to check the feature columns.
test.head()

coupon_id,customer_id,brand,category,coupon_used_x,discount_mean,discount_sum,item_counts,no_of_customers,price_mean,price_sum,quantity_mean,quantity_sum_x,tran_counts,campaign_type,campaign_duration,age_range,marital_status,rented,family_size,no_of_children,income_bracket,mean_discount,coupon_used_y,day,dow,no_of_items,month,mean_quantity,mean_price,ddiscount_sum,customer_id_count,quantity_sum_y,pprice_sum
869,967,1075,6,59,-0.738141,-2288.55,72,20.4028,138.225,553742.0,1.13038,4162,3655,0,32,2,1,0,1,0,5,-1.55056,81,14,4,658,5,12.2203,116.037,-2111.86,1362,16644,158043.0
389,1566,57,11,36,-1.99762,-1243.08,33,13.1818,165.314,104871.0,1.03848,731,676,1,70,1,0,0,2,0,9,-0.559929,31,1,5,1214,6,106.124,95.9067,-1489.97,2661,282396,255208.0
981,510,1335,6,107,-0.703917,-2518.38,19,38.3684,100.892,119132.0,1.20549,1495,1307,0,32,1,1,0,2,1,1,-1.45234,56,13,4,1019,5,110.536,85.0182,-2384.75,1642,181500,139600.0
1069,361,1996,6,47,-0.928593,-1351.66,74,18.7432,181.726,383205.0,1.02916,2138,2042,1,32,0,1,0,1,0,3,0.0,0,6,4,323,6,38.7365,88.204,0.0,573,22196,50540.9
498,811,209,6,84,-4.86612,-1729.35,18,19.4444,113.856,64051.1,1.11,619,567,1,32,3,0,0,2,0,5,-0.0675057,3,15,5,720,6,1.31084,85.6419,-89.04,1319,1729,112962.0
44,1498,1124,6,96,-0.453697,-2178.17,95,23.5053,73.5302,261342.0,1.35929,5077,3712,1,32,3,0,0,2,0,5,-0.248955,4,7,1,509,5,35.4685,98.4193,-162.07,651,23090,64071.0
713,682,765,11,6,-5.66682,-391.82,13,6.53846,288.826,18709.4,1.02298,96,90,0,55,3,0,0,2,0,5,-1.25585,30,1,2,529,5,1.08784,101.456,-929.33,740,805,75077.8
1079,1186,2024,6,27,-0.26243,-717.67,62,11.5161,98.6714,102104.0,1.08138,1205,1086,0,32,3,0,0,2,0,5,-0.022038,1,14,3,721,4,167.245,119.514,-26.71,1212,202701,144851.0
199,1023,56,6,1,-0.296833,-17.81,20,30.65,49.246,48436.3,1.25077,1239,949,0,55,0,1,1,4,3,1,-0.182917,11,16,4,1230,6,4.48099,75.0086,-500.46,2736,12260,205224.0
583,315,458,6,43,-2.22524,-890.5,5,29.4,101.713,21409.4,1.1288,289,240,0,32,2,1,0,1,0,3,-1.16578,19,18,3,448,5,654.847,84.1069,-905.81,777,508816,65351.0




In [90]:
# Score the test set with the SMOTE + ENN leader. `leader_enn` is referenced
# explicitly instead of `aml.leader`: `aml` is rebound by every experiment
# above, so `aml.leader` silently depends on which training cell ran last.
# On a top-to-bottom run both refer to the same model.
preds = leader_enn.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [95]:
# Preview the predictions: predicted label plus the per-class probabilities p0 / p1.
preds.head()

predict,p0,p1
0,0.106508,0.893492
0,0.999988,1.15025e-05
0,0.589452,0.410548
0,0.999993,7.43459e-06
0,0.999999,1.11999e-06
0,0.999991,9.117e-06
0,0.999275,0.00072518
0,0.999965,3.47853e-05
0,0.999999,1.0308e-06
0,0.974443,0.0255567




In [97]:
# Share of each predicted class on the test set.
preds.as_data_frame()['predict'].value_counts(normalize=True)

0    0.996356
1    0.003644
Name: predict, dtype: float64