In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import scipy.stats as stats
from sklearn.metrics import confusion_matrix,classification_report,precision_score,f1_score,recall_score,roc_auc_score,accuracy_score
from sklearn.metrics import plot_roc_curve

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, recall_score, classification_report 

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier

In [3]:
df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0,retained,created,firstorder,lastorder,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,favday,city,eopen,tenure,recency
0,0,2012-09-28,2013-11-08,2013-11-08,29,100.0,3.448276,14.52,0.0,0,0,0,Monday,DEL,29.0,406,1849
1,1,2010-12-19,2011-01-04,2014-01-19,95,92.631579,10.526316,83.69,0.181641,1,1,1,Friday,DEL,88.0,1127,1777
2,0,2010-10-22,2011-03-28,2011-03-28,0,0.0,0.0,54.96,0.0,0,0,0,Thursday,BOM,0.0,157,2805
3,1,2010-11-27,2010-11-29,2013-01-28,30,90.0,13.333333,111.91,0.00885,0,0,0,Monday,BOM,27.0,793,2133
4,1,2008-11-17,2010-12-10,2014-01-14,46,80.434783,15.217391,175.1,0.141176,1,1,0,Wednesday,DEL,37.0,1884,1782


In [4]:
df.dtypes

retained        int64
created        object
firstorder     object
lastorder      object
esent           int64
eopenrate     float64
eclickrate    float64
avgorder      float64
ordfreq       float64
paperless       int64
refill          int64
doorstep        int64
favday         object
city           object
eopen         float64
tenure          int64
recency         int64
dtype: object

In [5]:
df['created'] = df['created'].apply(lambda x: pd.to_datetime(str(x)))
df['firstorder'] = df['firstorder'].apply(lambda x: pd.to_datetime(str(x)))
df['lastorder'] = df['lastorder'].apply(lambda x: pd.to_datetime(str(x)))

In [6]:
df.dtypes

retained               int64
created       datetime64[ns]
firstorder    datetime64[ns]
lastorder     datetime64[ns]
esent                  int64
eopenrate            float64
eclickrate           float64
avgorder             float64
ordfreq              float64
paperless              int64
refill                 int64
doorstep               int64
favday                object
city                  object
eopen                float64
tenure                 int64
recency                int64
dtype: object

In [7]:
# As per the statistical tests conducted, avgorder and tenure variables are not significant.
# So we can ignore the variables. From vif values, we can see that eopenrate can be dropeed
cat_cols = ['paperless','refill','doorstep','favday','city']
num_cols = ['esent','eopen','eclickrate','ordfreq','recency']

In [8]:
df1 = df.copy()

In [9]:
favday_df = pd.get_dummies(df1[['favday','city']],drop_first=True)
favday_df.head()

Unnamed: 0,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BOM,city_DEL,city_MAA
0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,1,0,0
3,1,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,1,0


In [10]:
df1 = pd.concat([df1[num_cols],favday_df,df1[['paperless','refill','doorstep','retained']]],axis=1)
df1.head()

Unnamed: 0,esent,eopen,eclickrate,ordfreq,recency,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BOM,city_DEL,city_MAA,paperless,refill,doorstep,retained
0,29,29.0,3.448276,0.0,1849,1,0,0,0,0,0,0,1,0,0,0,0,0
1,95,88.0,10.526316,0.181641,1777,0,0,0,0,0,0,0,1,0,1,1,1,1
2,0,0.0,0.0,0.0,2805,0,0,0,1,0,0,1,0,0,0,0,0,0
3,30,27.0,13.333333,0.00885,2133,1,0,0,0,0,0,1,0,0,0,0,0,1
4,46,37.0,15.217391,0.141176,1782,0,0,0,0,0,1,0,1,0,1,1,0,1


In [11]:
df1.head()

Unnamed: 0,esent,eopen,eclickrate,ordfreq,recency,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BOM,city_DEL,city_MAA,paperless,refill,doorstep,retained
0,29,29.0,3.448276,0.0,1849,1,0,0,0,0,0,0,1,0,0,0,0,0
1,95,88.0,10.526316,0.181641,1777,0,0,0,0,0,0,0,1,0,1,1,1,1
2,0,0.0,0.0,0.0,2805,0,0,0,1,0,0,1,0,0,0,0,0,0
3,30,27.0,13.333333,0.00885,2133,1,0,0,0,0,0,1,0,0,0,0,0,1
4,46,37.0,15.217391,0.141176,1782,0,0,0,0,0,1,0,1,0,1,1,0,1


In [12]:
X = df1.drop('retained',axis=1)
y=df1['retained']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10, test_size = 0.3)

In [13]:
from sklearn.linear_model import LogisticRegression
regression = LogisticRegression()
regression.fit(X_train, y_train )
print("Train Accuracy:",regression.score(X_train, y_train))
print("Test Accuracy:",regression.score(X_test, y_test))

Train Accuracy: 0.9394536750211208
Test Accuracy: 0.9369250985545335


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [14]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
print("Train Accuracy:",gb.score(X_train, y_train))
print("Test Accuracy:",gb.score(X_test, y_test))

Train Accuracy: 0.965418192058575
Test Accuracy: 0.9614980289093298


In [15]:
score_card = pd.DataFrame(columns=['Model','Precision Score','Recall Score','Accuracy Score','f1-score','AUC Score'])

def scores(model_name,y_test,y_pred):
    precision = round(precision_score(y_test,y_pred),2)
    recall    = round(recall_score(y_test,y_pred),2)
    accuracy  = round(accuracy_score(y_test,y_pred),2)   
    f1        = round(f1_score(y_test,y_pred),2)
    auc       = round(roc_auc_score(y_test, y_pred),4)
    global score_card
    score_card = score_card.append({'Model' : model_name,                              
                                    'Precision Score': precision_score(y_test, y_pred),
                                    'Recall Score': recall_score(y_test, y_pred),
                                    'Accuracy Score': accuracy_score(y_test, y_pred),
                                    'f1-score': f1_score(y_test, y_pred),
                                    'AUC Score' : roc_auc_score(y_test, y_pred)
                                    },ignore_index = True)
    return score_card.sort_values(by='f1-score',ascending= False).reset_index(drop=True)
    
# Logistic Regression
def log_reg(X_train,X_test, y_train,y_test):
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    pred = lr.predict(X_test)
    name = 'Logistic Regression'
    log_score = scores(name,y_test,pred)
    train_pred = lr.predict(X_train)
    return log_score




#Gradient Boosting Classifier
def gboost(X_train,X_test, y_train,y_test):
    gb = GradientBoostingClassifier()
    gb_model = gb.fit(X_train,y_train)
    pred = gb_model.predict(X_test)
    name = 'Gradient Boosting'
    gb_score = scores(name,y_test,pred)
    return gb_score



In [16]:
gboost(X_train, X_test, y_train, y_test)

Unnamed: 0,Model,Precision Score,Recall Score,Accuracy Score,f1-score,AUC Score
0,Gradient Boosting,0.967054,0.985355,0.961498,0.976119,0.926141


In [17]:
log_reg(X_train,X_test, y_train,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Model,Precision Score,Recall Score,Accuracy Score,f1-score,AUC Score
0,Gradient Boosting,0.967054,0.985355,0.961498,0.976119,0.926141
1,Logistic Regression,0.957496,0.963798,0.936925,0.960636,0.897098


In [18]:
LR = LogisticRegression()
GB = GradientBoostingClassifier()

In [19]:
LR.fit(X_train, y_train)
pred = LR.predict(X_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1533
           1       0.96      0.96      0.96      6077

    accuracy                           0.94      7610
   macro avg       0.91      0.90      0.90      7610
weighted avg       0.94      0.94      0.94      7610



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
LR.fit(X_train, y_train)
pred = LR.predict(X_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1533
           1       0.96      0.96      0.96      6077

    accuracy                           0.94      7610
   macro avg       0.91      0.90      0.90      7610
weighted avg       0.94      0.94      0.94      7610



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
y_pred_train = LR.predict(X_train)
y_prob_train = LR.predict_proba(X_train)[:,1]

y_pred_test = LR.predict(X_test)
y_prob_test = LR.predict_proba(X_test)[:,1]

In [21]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [22]:
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,classification_report,roc_curve

In [42]:
print('Train data score for Logistic Regression Before HyperParameter Tuning')
print('Confusion matrix: \n',confusion_matrix(y_train,y_pred_train))
print('Accuracy Score: ',accuracy_score(y_train,y_pred_train))
print('roc_auc_score: ',roc_auc_score(y_train,y_prob_train))
print('Recall Score: ',recall_score(y_train,y_pred_train))
print('f1 Score: ',f1_score(y_train,y_pred_train))
print('Precision: ',precision_score(y_train,y_pred_train))

Train data score for Logistic Regression Before HyperParameter Tuning
Confusion matrix: 
 [[ 2961   614]
 [  461 13719]]
Accuracy Score:  0.9394536750211208
roc_auc_score:  0.966419373292434
Recall Score:  0.9674894217207334
f1 Score:  0.9622978992038719
Precision:  0.9571617944603363


In [43]:
print('Test data score Logistic Regression Before HyperParameter Tuning')
print('Confusion matrix: \n ',confusion_matrix(y_test,y_pred_test))
print('Accuracy Score: ',accuracy_score(y_test,y_pred_test))
print('roc_auc_score: ',roc_auc_score(y_test,y_prob_test))
print('Recall Score: ',recall_score(y_test,y_pred_test))
print('f1 Score: ',f1_score(y_test,y_pred_test))
print('Precision: ',precision_score(y_test,y_pred_test))

Test data score Logistic Regression Before HyperParameter Tuning
Confusion matrix: 
  [[1273  260]
 [ 220 5857]]
Accuracy Score:  0.9369250985545335
roc_auc_score:  0.9649724598678773
Recall Score:  0.9637979266085239
f1 Score:  0.9606363785468264
Precision:  0.957495504332189


In [48]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
LR =LogisticRegression()
LR_cv=GridSearchCV(LR,grid,cv=10)
LR_cv.fit(X_train,y_train)

print("tuned hyperparameters :(best parameters) ",LR_cv.best_params_)
print("accuracy of LR after Hyperparameter tuning:",LR_cv.best_score_)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

tuned hyperparameters :(best parameters)  {'C': 0.1, 'penalty': 'l2'}
accuracy of LR after Hyperparameter tuning: 0.9404672947595483


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [45]:
y_pred_train_hp = LR_cv.predict(X_train)
y_prob_train_hp = LR_cv.predict_proba(X_train)[:,1]

y_pred_test_hp = LR_cv.predict(X_test)
y_prob_test_hp = LR_cv.predict_proba(X_test)[:,1]

In [46]:
print('Train data score for Logistic Regression After HyperParameter Tuning')
print('Confusion matrix: \n',confusion_matrix(y_train,y_pred_train_hp))
print('Accuracy Score: ',accuracy_score(y_train,y_pred_train_hp))
print('roc_auc_score: ',roc_auc_score(y_train,y_prob_train_hp))
print('Recall Score: ',recall_score(y_train,y_pred_train_hp))
print('f1 Score: ',f1_score(y_train,y_pred_train_hp))
print('Precision: ',precision_score(y_train,y_pred_train_hp))

Train data score for Logistic Regression After HyperParameter Tuning
Confusion matrix: 
 [[ 2942   633]
 [  423 13757]]
Accuracy Score:  0.9405237961137708
roc_auc_score:  0.9669084892540464
Recall Score:  0.9701692524682651
f1 Score:  0.9630381519075953
Precision:  0.9560111188325225


In [47]:
print('Test data score Logistic Regression After HyperParameter Tuning')
print('Confusion matrix: \n ',confusion_matrix(y_test,y_pred_test_hp))
print('Accuracy Score: ',accuracy_score(y_test,y_pred_test_hp))
print('roc_auc_score: ',roc_auc_score(y_test,y_prob_test_hp))
print('Recall Score: ',recall_score(y_test,y_pred_test_hp))
print('f1 Score: ',f1_score(y_test,y_pred_test_hp))
print('Precision: ',precision_score(y_test,y_pred_test_hp))

Test data score Logistic Regression After HyperParameter Tuning
Confusion matrix: 
  [[1265  268]
 [ 203 5874]]
Accuracy Score:  0.938107752956636
roc_auc_score:  0.9658026408428215
Recall Score:  0.9665953595524107
f1 Score:  0.9614534740977165
Precision:  0.9563660045587756


In [70]:
#Hyper-Parameter Tuning - LogisticReg
LR_cv.fit(X_train, y_train)
pred = LR_cv.predict(X_test)
print(classification_report(y_test,pred))

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

              precision    recall  f1-score   support

           0       0.86      0.83      0.84      1533
           1       0.96      0.97      0.96      6077

    accuracy                           0.94      7610
   macro avg       0.91      0.90      0.90      7610
weighted avg       0.94      0.94      0.94      7610



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [74]:
#Hyper-Parameter Tuning - LogisticRegression
LR_cv.fit(X_test, y_test)
pred = LR_cv.predict(X_train)
print(classification_report(y_train,pred))

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

           0       0.86      0.83      0.84      3575
           1       0.96      0.96      0.96     14180

    accuracy                           0.94     17755
   macro avg       0.91      0.90      0.90     17755
weighted avg       0.94      0.94      0.94     17755



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [26]:
GB.fit(X_train, y_train)
pred = GB.predict(X_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.94      0.87      0.90      1533
           1       0.97      0.99      0.98      6077

    accuracy                           0.96      7610
   macro avg       0.95      0.93      0.94      7610
weighted avg       0.96      0.96      0.96      7610



In [27]:
y_pred_train_gb = GB.predict(X_train)
y_prob_train_gb = GB.predict_proba(X_train)[:,1]

y_pred_test_gb = GB.predict(X_test)
y_prob_test_gb = GB.predict_proba(X_test)[:,1]

In [57]:
print('Train data score for Gradient Boosting Before HyperParameter Tuning')
print('Confusion matrix: \n',confusion_matrix(y_train,y_pred_train_gb))
print('Accuracy Score: ',accuracy_score(y_train,y_pred_train_gb))
print('roc_auc_score: ',roc_auc_score(y_train,y_prob_train_gb))
print('f1 Score: ',f1_score(y_train,y_pred_train_gb))
print('Precision: ',precision_score(y_train,y_pred_train_gb))

print('Recall Score: ',recall_score(y_train,y_pred_train_gb))

Train data score for Gradient Boosting Before HyperParameter Tuning
Confusion matrix: 
 [[ 3130   445]
 [  169 14011]]
Accuracy Score:  0.965418192058575
roc_auc_score:  0.9851610462879855
f1 Score:  0.9785584578851794
Precision:  0.9692169341449917
Recall Score:  0.9880818053596615


In [52]:
print('Test data score Gradient Boosting Before HyperParameter Tuning')
print('Confusion matrix: \n ',confusion_matrix(y_test,y_pred_test_gb))
print('Accuracy Score: ',accuracy_score(y_test,y_pred_test_gb))
print('roc_auc_score: ',roc_auc_score(y_test,y_prob_test_gb))
print('f1 Score: ',f1_score(y_test,y_pred_test_gb))
print('Precision: ',precision_score(y_test,y_pred_test_gb))
print('Recall Score: ',recall_score(y_test,y_pred_test_gb))

Test data score Gradient Boosting Before HyperParameter Tuning
Confusion matrix: 
  [[1329  204]
 [  89 5988]]
Accuracy Score:  0.9614980289093298
roc_auc_score:  0.9803579116923165
f1 Score:  0.9761186730784905
Precision:  0.9670542635658915


In [30]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier
from scipy.stats import randint as sp_randint

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
### HYperParameter Tuning

In [31]:
#print("tuned hpyerparameters :(best parameters) ",gb_cv.best_params_)
#print("accuracy after Hyperparameter tuning :",gb_cv.best_score_)

In [32]:
param = {'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 'n_estimators':[100,250,500,750,1000,1250,1500,1750]}

In [34]:
gb_tuning = GridSearchCV(estimator =GradientBoostingClassifier(max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10), 
            param_grid = param, scoring='f1',n_jobs=4,cv=5)
gb_tuning.fit(X_train,y_train)


print("tuned hyper-parameters :(best parameters) ",gb_tuning.best_params_), 


tuned hyper-parameters :(best parameters)  {'learning_rate': 0.1, 'n_estimators': 750}


(None,)

In [63]:
print("accuracy of GB after Hyperparameter tuning:",gb_tuning.best_score_)

accuracy of GB after Hyperparameter tuning: 0.9788710468648952


In [66]:
y_pred_train_gbhp = gb_tuning.predict(X_train)
y_prob_train_gbhp = gb_tuning.predict_proba(X_train)[:,1]

y_pred_test_gbhp = gb_tuning.predict(X_test)
y_prob_test_gbhp = gb_tuning.predict_proba(X_test)[:,1]

In [67]:
print('Train data score for Gradient Boosting After HyperParameter Tuning')
print('Confusion matrix: \n',confusion_matrix(y_train,y_pred_train_gbhp))
print('Accuracy Score: ',accuracy_score(y_train,y_pred_train_gbhp))
print('roc_auc_score: ',roc_auc_score(y_train,y_prob_train_gbhp))
print('f1 Score: ',f1_score(y_train,y_pred_train_gbhp))
print('Precision: ',precision_score(y_train,y_pred_train_gbhp))
print('Recall Score: ',recall_score(y_train,y_pred_train_gbhp))

Train data score for Gradient Boosting After HyperParameter Tuning
Confusion matrix: 
 [[ 3384   191]
 [   61 14119]]
Accuracy Score:  0.9858068149816953
roc_auc_score:  0.9985147701381835
f1 Score:  0.9911547911547912
Precision:  0.9866526904262753
Recall Score:  0.9956981664315938


In [68]:
print('Test data score Gradient Boosting After HyperParameter Tuning')
print('Confusion matrix: \n ',confusion_matrix(y_test,y_pred_test_gbhp))
print('Accuracy Score: ',accuracy_score(y_test,y_pred_test_gbhp))
print('roc_auc_score: ',roc_auc_score(y_test,y_prob_test_gbhp))
print('f1 Score: ',f1_score(y_test,y_pred_test_gbhp))
print('Precision: ',precision_score(y_test,y_pred_test_gbhp))
print('Recall Score: ',recall_score(y_test,y_pred_test_gbhp))

Test data score Gradient Boosting After HyperParameter Tuning
Confusion matrix: 
  [[1352  181]
 [  98 5979]]
Accuracy Score:  0.9633377135348226
roc_auc_score:  0.9812984399703695
f1 Score:  0.9772002941897523
Precision:  0.9706168831168831
Recall Score:  0.983873621852888


In [71]:
#Hyper-Parameter Tuning - Gradient Boosting
gb_tuning.fit(X_train, y_train)
pred = gb_tuning.predict(X_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.93      0.88      0.91      1533
           1       0.97      0.98      0.98      6077

    accuracy                           0.96      7610
   macro avg       0.95      0.93      0.94      7610
weighted avg       0.96      0.96      0.96      7610



In [75]:
#Hyper-Parameter Tuning - Gradient Boosting
gb_tuning.fit(X_test, y_test)
pred = gb_tuning.predict(X_train)


In [76]:
print(classification_report(y_train,pred))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91      3575
           1       0.97      0.99      0.98     14180

    accuracy                           0.96     17755
   macro avg       0.96      0.93      0.94     17755
weighted avg       0.96      0.96      0.96     17755



In [77]:
from sklearn.metrics import plot_confusion_matrix

In [78]:
print('Logistic Regression :\n')
print(plot_confusion_matrix(LR,'Logistic Regression',X_train,X_test,y_train,y_test))
print('*'*85)


print('Gradient Boosting :\n')
print(plot_confusion_matrix(GB,'Gradient Boosting',X_train,X_test,y_train,y_test))
print('*'*85)



Logistic Regression :



NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
print('Logistic Regression :\n')
print(get_train_report(LR,X_train,y_train))
print(get_test_report(LR,X_test,y_test))
print('*'*85)



print('Gradient Boosting :\n')
print(get_train_report(GB,X_train,y_train))
print(get_test_report(GB,X_test,y_test))
print('*'*85)



In [55]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

param = [{'alpha': np.arange(0,100,0.1)}]
gd = GridSearchCV(estimator = Ridge(),param_grid = param,scoring = 'r2',cv = 3)

gd.fit(X,y)

GridSearchCV(cv=3, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid=[{'alpha': array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,
        1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,
        2.2,  2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,
        3...
       94.6, 94.7, 94.8, 94.9, 95. , 95.1, 95.2, 95.3, 95.4, 95.5, 95.6,
       95.7, 95.8, 95.9, 96. , 96.1, 96.2, 96.3, 96.4, 96.5, 96.6, 96.7,
       96.8, 96.9, 97. , 97.1, 97.2, 97.3, 97.4, 97.5, 97.6, 97.7, 97.8,
       97.9, 98. , 98.1, 98.2, 98.3, 98.4, 98.5, 98.6, 98.7, 98.8, 98.9,
       99. , 99.1, 99.2, 99.3, 99.4, 99.5, 99.6, 99.7, 99.8, 99.9])}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
     

In [71]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif['Variables'] = X_train.columns
vif['vif'] = [variance_inflation_factor(X_train.values,i) for i in range(X_train.shape[1])]
vif

Unnamed: 0,Variables,vif
0,esent,4.224116
1,eopen,2.124148
2,eclickrate,1.586146
3,ordfreq,1.112784
4,recency,6.641108
5,favday_Monday,2.112319
6,favday_Saturday,1.236667
7,favday_Sunday,1.063136
8,favday_Thursday,1.774605
9,favday_Tuesday,2.035919
