In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import scipy.stats as stats

In [24]:
df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0,retained,created,firstorder,lastorder,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,favday,city,eopen,tenure,recency
0,0,2012-09-28,2013-11-08,2013-11-08,29,100.0,3.448276,14.52,0.0,0,0,0,Monday,DEL,29,1.112329,5.065753
1,1,2010-12-19,2011-01-04,2014-01-19,95,92.631579,10.526316,83.69,0.181641,1,1,1,Friday,DEL,89,3.087671,4.868493
2,0,2010-10-22,2011-03-28,2011-03-28,0,0.0,0.0,54.96,0.0,0,0,0,Thursday,BOM,0,0.430137,7.684932
3,1,2010-11-27,2010-11-29,2013-01-28,30,90.0,13.333333,111.91,0.00885,0,0,0,Monday,BOM,27,2.172603,5.843836
4,1,2008-11-17,2010-12-10,2014-01-14,46,80.434783,15.217391,175.1,0.141176,1,1,0,Wednesday,DEL,38,5.161644,4.882192


In [25]:
df['created'] = df['created'].apply(lambda x: pd.to_datetime(str(x)))
df['firstorder'] = df['firstorder'].apply(lambda x: pd.to_datetime(str(x)))
df['lastorder'] = df['lastorder'].apply(lambda x: pd.to_datetime(str(x)))

In [26]:
df.dtypes

retained               int64
created       datetime64[ns]
firstorder    datetime64[ns]
lastorder     datetime64[ns]
esent                  int64
eopenrate            float64
eclickrate           float64
avgorder             float64
ordfreq              float64
paperless              int64
refill                 int64
doorstep               int64
favday                object
city                  object
eopen                  int64
tenure               float64
recency              float64
dtype: object

In [27]:
df1 = df.drop(['eopenrate','avgorder','tenure'],axis=1)

In [28]:
num_data = df.select_dtypes(np.number)

In [183]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, recall_score, classification_report, precision_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from xgboost import XGBClassifier

In [29]:
favday_df = pd.get_dummies(df1[['favday','city']],drop_first=True)
favday_df.head()

Unnamed: 0,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BOM,city_DEL,city_MAA
0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,1,0,0
3,1,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,1,0


In [30]:
df1 = pd.concat([num_data,favday_df,df1[['paperless','refill','doorstep']]],axis=1)
df1.head()

Unnamed: 0,retained,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,eopen,...,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BOM,city_DEL,city_MAA,paperless.1,refill.1,doorstep.1
0,0,29,100.0,3.448276,14.52,0.0,0,0,0,29,...,0,0,0,0,0,1,0,0,0,0
1,1,95,92.631579,10.526316,83.69,0.181641,1,1,1,89,...,0,0,0,0,0,1,0,1,1,1
2,0,0,0.0,0.0,54.96,0.0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
3,1,30,90.0,13.333333,111.91,0.00885,0,0,0,27,...,0,0,0,0,1,0,0,0,0,0
4,1,46,80.434783,15.217391,175.1,0.141176,1,1,0,38,...,0,0,0,1,0,1,0,1,1,0


In [31]:
X = df1.drop('retained',axis=1)
y = df1['retained']

sc = StandardScaler()
sc_data = sc.fit_transform(X)
X_sc = pd.DataFrame(sc_data,columns=X.columns)
X_sc.head()

Unnamed: 0,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,eopen,tenure,...,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BOM,city_DEL,city_MAA,paperless.1,refill.1,doorstep.1
0,0.070237,2.50329,-0.214185,-1.220879,-0.305422,-1.409335,-0.330809,-0.198011,2.547955,0.866589,...,-0.090043,-0.451048,-0.534848,-0.433334,-0.7894,1.56243,-0.610868,-1.409335,-0.330809,-0.198011
1,4.020476,2.254823,0.457053,0.610978,1.526047,0.709554,3.022894,5.050218,9.443172,3.385594,...,-0.090043,-0.451048,-0.534848,-0.433334,-0.7894,1.56243,-0.610868,0.709554,3.022894,5.050218
2,-1.665474,-0.86876,-0.541198,-0.14989,-0.305422,-1.409335,-0.330809,-0.198011,-0.784734,-0.003358,...,-0.090043,2.217058,-0.534848,-0.433334,1.266785,-0.640029,-0.610868,-1.409335,-0.330809,-0.198011
3,0.130089,2.166085,0.723254,1.35834,-0.216193,-1.409335,-0.330809,-0.198011,2.318114,2.218676,...,-0.090043,-0.451048,-0.534848,-0.433334,1.266785,-0.640029,-0.610868,-1.409335,-0.330809,-0.198011
4,1.087723,1.843541,0.901926,3.031826,1.11805,0.709554,3.022894,-0.198011,3.582237,6.030374,...,-0.090043,-0.451048,-0.534848,2.307691,-0.7894,1.56243,-0.610868,0.709554,3.022894,-0.198011


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, random_state = 10, test_size = 0.3)

In [33]:
df1['retained']

0        0
1        1
2        0
3        1
4        1
        ..
25360    1
25361    1
25362    1
25363    1
25364    1
Name: retained, Length: 25365, dtype: int64

# Gaussian NB 

## Hyper Parameter tuning

In [163]:
nb = GaussianNB()
nb_model = nb.fit(X_train,y_train)
pred = nb_model.predict(X_test)

In [39]:
nb.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [164]:
#Preliminar modeling
from sklearn.model_selection import RepeatedStratifiedKFold

cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=999)

from sklearn.model_selection import cross_validate
pre_score = cross_validate(estimator = nb,
                             X = X_train, 
                             y = y_train,
                             scoring = ['accuracy','recall','roc_auc','precision','f1'],
                             cv = cv_method,
                             verbose = 1,
                          return_train_score=True
                          )

#print('Naive-Bayes mean score: %5.3f' %np.mean(pre_score))
#print(pre_score)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    3.0s finished


In [47]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

In [165]:
for k,v in pre_score.items():
    print(k,':',round(np.mean(v),3))

fit_time : 0.033
score_time : 0.055
test_accuracy : 0.883
train_accuracy : 0.882
test_recall : 0.886
train_recall : 0.886
test_roc_auc : 0.923
train_roc_auc : 0.924
test_precision : 0.964
train_precision : 0.964
test_f1 : 0.923
train_f1 : 0.923


In [166]:
nb.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [167]:
from sklearn.naive_bayes import GaussianNB

np.random.seed(999)

nb_classifier = GaussianNB()

params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}

gs_NB = GridSearchCV(estimator=nb_classifier, 
                     param_grid=params_NB, 
                     cv=cv_method,
                     verbose=1, 
                     scoring = 'f1')

#Data_transformed = PowerTransformer().fit_transform(Data)

gs_NB.fit(X_train, y_train)

Fitting 15 folds for each of 100 candidates, totalling 1500 fits


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=999),
             estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             scoring='f1', verbose=1)

In [70]:
gs_NB.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=999),
             estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             scoring=['accuracy'], verbose=1)>

In [100]:
gs_NB.best_params_

{'var_smoothing': 1.0}

In [168]:
# Based on F1
nb = GaussianNB(var_smoothing = 1.0)
nb.fit(X_train, y_train)
y_test_pred = nb.predict(X_test)
y_train_pred = nb.predict(X_train)
print('Precision Score', round(precision_score(y_test, y_test_pred),4),
                                    '\nRecall Score', round(recall_score(y_test, y_test_pred),4),
                                    '\nAccuracy Score', round(accuracy_score(y_test, y_test_pred),4),
                                    '\nf1-score', round(f1_score(y_test, y_test_pred),4),
                                    '\nAUC Score' , round(roc_auc_score(y_test, y_test_pred),4))

Precision Score 0.9408 
Recall Score 0.934 
Accuracy Score 0.9004 
f1-score 0.9374 
AUC Score 0.8506


In [169]:
print('Precision Score', round(precision_score(y_train, y_train_pred),4),
                                    '\nRecall Score', round(recall_score(y_train, y_train_pred),4),
                                    '\nAccuracy Score', round(accuracy_score(y_train, y_train_pred),4),
                                    '\nf1-score', round(f1_score(y_train, y_train_pred),4),
                                    '\nAUC Score' , round(roc_auc_score(y_train, y_train_pred),4))

Precision Score 0.9441 
Recall Score 0.9305 
Accuracy Score 0.9005 
f1-score 0.9372 
AUC Score 0.856


In [170]:
nb = GaussianNB(var_smoothing = 1.0)
nb.fit(X_train, y_train)
y_test_pred = nb.predict(X_test)
y_train_pred = nb.predict(X_train)
print('Precision Score', round(precision_score(y_test, y_test_pred),4),
                                    '\nRecall Score', round(recall_score(y_test, y_test_pred),4),
                                    '\nAccuracy Score', round(accuracy_score(y_test, y_test_pred),4),
                                    '\nf1-score', round(f1_score(y_test, y_test_pred),4),
                                    '\nAUC Score' , round(roc_auc_score(y_test, y_test_pred),4))

Precision Score 0.9408 
Recall Score 0.934 
Accuracy Score 0.9004 
f1-score 0.9374 
AUC Score 0.8506


In [171]:
print('Precision Score', round(precision_score(y_train, y_train_pred),4),
                                    '\nRecall Score', round(recall_score(y_train, y_train_pred),4),
                                    '\nAccuracy Score', round(accuracy_score(y_train, y_train_pred),4),
                                    '\nf1-score', round(f1_score(y_train, y_train_pred),4),
                                    '\nAUC Score' , round(roc_auc_score(y_train, y_train_pred),4))

Precision Score 0.9441 
Recall Score 0.9305 
Accuracy Score 0.9005 
f1-score 0.9372 
AUC Score 0.856


In [182]:
from sklearn.metrics import plot_roc_curve,roc_curve
y_pred = nb.predict_proba(X_test)
fpr,tpr,thres = roc_curve(y_test,y_test_pred)
thresholds = pd.DataFrame({'fpr':fpr,'tpr':tpr,'thres':thres})
thresholds

Unnamed: 0,fpr,tpr,thres
0,0.0,0.0,2
1,0.232877,0.934013,1
2,1.0,1.0,0


In [185]:
print(classification_report(y_test,y_test_pred))
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.75      0.77      0.76      1533
           1       0.94      0.93      0.94      6077

    accuracy                           0.90      7610
   macro avg       0.84      0.85      0.85      7610
weighted avg       0.90      0.90      0.90      7610



              precision    recall  f1-score   support

           0       0.74      0.78      0.76      3575
           1       0.94      0.93      0.94     14180

    accuracy                           0.90     17755
   macro avg       0.84      0.86      0.85     17755
weighted avg       0.90      0.90      0.90     17755



# Bagging Classifier with decision tree

In [91]:
bg = BaggingClassifier()
bg_model = bg.fit(X_train,y_train)
pred = bg_model.predict(X_test)

In [93]:
bg.get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [95]:

cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=999)

from sklearn.model_selection import cross_validate
pre_score = cross_validate(estimator = bg,
                             X = X_train, 
                             y = y_train,
                             scoring = ['accuracy','recall','roc_auc','precision','f1'],
                             cv = cv_method,
                             verbose = 1,
                          return_train_score=True
                          )

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   20.8s finished


In [97]:
for k,v in pre_score.items():
    print(k,':',round(np.mean(v),3))

fit_time : 1.109
score_time : 0.065
test_accuracy : 0.961
train_accuracy : 0.997
test_recall : 0.98
train_recall : 0.998
test_roc_auc : 0.97
train_roc_auc : 1.0
test_precision : 0.972
train_precision : 0.998
test_f1 : 0.976
train_f1 : 0.998


In [143]:
bg = BaggingClassifier()

params_bg = {
 'max_features': range(1,21,3),
 #'max_samples': [1,2,3,4],
 #'n_estimators': [5,10],
    'random_state':[100]
 }

gs_bg = GridSearchCV(estimator=bg, 
                     param_grid=params_bg, 
                     cv=cv_method,
                     verbose=1, 
                     scoring = 'accuracy')

#Data_transformed = PowerTransformer().fit_transform(Data)

gs_bg.fit(X_train, y_train)

Fitting 15 folds for each of 7 candidates, totalling 105 fits


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=999),
             estimator=BaggingClassifier(),
             param_grid={'max_features': range(1, 21, 3),
                         'random_state': [100]},
             scoring='accuracy', verbose=1)

In [144]:
gs_bg.best_params_

{'max_features': 19, 'random_state': 100}

In [145]:
bg = BaggingClassifier(max_features= 19, random_state= 100)
bg.fit(X_train, y_train)
y_test_pred = nb.predict(X_test)
y_train_pred = nb.predict(X_train)

In [146]:
print('Precision Score', round(precision_score(y_train, y_train_pred),4),
                                    '\nRecall Score', round(recall_score(y_train, y_train_pred),4),
                                    '\nAccuracy Score', round(accuracy_score(y_train, y_train_pred),4),
                                    '\nf1-score', round(f1_score(y_train, y_train_pred),4),
                                    '\nAUC Score' , round(roc_auc_score(y_train, y_train_pred),4))

Precision Score 0.9441 
Recall Score 0.9305 
Accuracy Score 0.9005 
f1-score 0.9372 
AUC Score 0.856


In [147]:
print('Precision Score', round(precision_score(y_test, y_test_pred),4),
                                    '\nRecall Score', round(recall_score(y_test, y_test_pred),4),
                                    '\nAccuracy Score', round(accuracy_score(y_test, y_test_pred),4),
                                    '\nf1-score', round(f1_score(y_test, y_test_pred),4),
                                    '\nAUC Score' , round(roc_auc_score(y_test, y_test_pred),4))

Precision Score 0.9408 
Recall Score 0.934 
Accuracy Score 0.9004 
f1-score 0.9374 
AUC Score 0.8506


# Bagging classifier with NB

In [187]:
bg = BaggingClassifier()
gnb = GaussianNB(var_smoothing = 1.0)

bag_model=BaggingClassifier(base_estimator=gnb, n_estimators=10, bootstrap=True)
bag_model=bag_model.fit(X_train,y_train)
y_test_pred=bag_model.predict(X_test)
y_train_pred = nb.predict(X_train)

#Data_transformed = PowerTransformer().fit_transform(Data)

#gs_bg.fit(X_train, y_train)

In [134]:
bg.get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [138]:
print('Precision Score', round(precision_score(y_train, y_train_pred),4),
                                    '\nRecall Score', round(recall_score(y_train, y_train_pred),4),
                                    '\nAccuracy Score', round(accuracy_score(y_train, y_train_pred),4),
                                    '\nf1-score', round(f1_score(y_train, y_train_pred),4),
                                    '\nAUC Score' , round(roc_auc_score(y_train, y_train_pred),4))

Precision Score 0.9441 
Recall Score 0.9305 
Accuracy Score 0.9005 
f1-score 0.9372 
AUC Score 0.856


In [139]:
print('Precision Score', round(precision_score(y_test, y_test_pred),4),
                                    '\nRecall Score', round(recall_score(y_test, y_test_pred),4),
                                    '\nAccuracy Score', round(accuracy_score(y_test, y_test_pred),4),
                                    '\nf1-score', round(f1_score(y_test, y_test_pred),4),
                                    '\nAUC Score' , round(roc_auc_score(y_test, y_test_pred),4))

Precision Score 0.9426 
Recall Score 0.9319 
Accuracy Score 0.9003 
f1-score 0.9372 
AUC Score 0.8534


In [188]:
print(classification_report(y_test,y_test_pred))
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.74      0.78      0.76      1533
           1       0.94      0.93      0.94      6077

    accuracy                           0.90      7610
   macro avg       0.84      0.85      0.85      7610
weighted avg       0.90      0.90      0.90      7610

              precision    recall  f1-score   support

           0       0.74      0.78      0.76      3575
           1       0.94      0.93      0.94     14180

    accuracy                           0.90     17755
   macro avg       0.84      0.86      0.85     17755
weighted avg       0.90      0.90      0.90     17755



In [140]:
GaussianNB?

In [148]:
GridSearchCV?

In [149]:
param = {'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 'n_estimators':[100,250,500,750,1000,1250,1500,1750]}

In [152]:
gb_tuning = GridSearchCV(estimator =GradientBoostingClassifier(max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10), 
            param_grid = param, scoring='accuracy',n_jobs=4, cv=5)
gb_tuning.fit(X_train,y_train)

print("accuracy after Hyperparameter tuning :",gb_tuning.grid_scores_),
print("tuned hyper-parameters :(best parameters) ",gb_tuning.best_params_), 
print("Best Score :",gb_tuning.best_score_)

AttributeError: 'GridSearchCV' object has no attribute 'grid_scores_'

In [160]:
    gb_tuning.best_index_

20