#**Imports and Pre-requisites** 

In [1]:
from google.colab import drive
drive.mount("/gdrive")
%cd /gdrive/My Drive/CIS_508/Colab Notebooks/Projects/4.Target Marketing
! pwd

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/CIS_508/Colab Notebooks/Projects/4.Target Marketing
/gdrive/My Drive/CIS_508/Colab Notebooks/Projects/4.Target Marketing


In [0]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter

from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as SMOTE_Pipeline

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

import warnings
warnings.filterwarnings("ignore")

# **EDA and Data Preprocessing**

In [3]:
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
train.shape, test.shape

((4521, 17), (45211, 17))

In [4]:
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [5]:
Xtrain = train.drop(columns = ['y']).copy()
ytrain = train['y']
Xtrain.shape, ytrain.shape

((4521, 16), (4521,))

In [6]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 16 columns):
age          4521 non-null int64
job          4521 non-null object
marital      4521 non-null object
education    4521 non-null object
default      4521 non-null object
balance      4521 non-null int64
housing      4521 non-null object
loan         4521 non-null object
contact      4521 non-null object
day          4521 non-null int64
month        4521 non-null object
duration     4521 non-null int64
campaign     4521 non-null int64
pdays        4521 non-null int64
previous     4521 non-null int64
poutcome     4521 non-null object
dtypes: int64(7), object(9)
memory usage: 565.2+ KB


In [7]:
Xtrain.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [8]:
Xtrain.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64

In [9]:
cols = Xtrain.columns
num_cols = [x for x in cols if Xtrain[x].dtypes in ('int64', 'float64')]
cat_cols = [x for x in cols if Xtrain[x].dtypes not in ('int64', 'float64')]
num_cols, cat_cols

(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'],
 ['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month',
  'poutcome'])

In [10]:
len(cols), len(num_cols), len(cat_cols)

(16, 7, 9)

In [0]:
le = LabelEncoder()
temp = ytrain.copy()
ytrain = pd.DataFrame(le.fit_transform(temp), columns = ['Target'], index = temp.index)

In [12]:
cat_transformer = Pipeline(steps = [
                                    ('ohe', OneHotEncoder(sparse = False, handle_unknown = 'ignore'))
                                  ])
cat_transformer

Pipeline(memory=None,
         steps=[('ohe',
                 OneHotEncoder(categories='auto', drop=None,
                               dtype=<class 'numpy.float64'>,
                               handle_unknown='ignore', sparse=False))],
         verbose=False)

In [13]:
pp = ColumnTransformer(transformers = [('cat', cat_transformer, cat_cols)],
                       remainder = 'passthrough')
pp

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cat',
                                 Pipeline(memory=None,
                                          steps=[('ohe',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='ignore',
                                                                sparse=False))],
                                          verbose=False),
                                 ['job', 'marital', 'education', 'default',
                                  'housing', 'loan', 'contact', 'month',
                                  'poutcome'])],
                  verbose=False)

# **SMOTE to handle imbalanced class problem**

In [14]:
Counter(ytrain['Target'])

Counter({0: 4000, 1: 521})

In [0]:
sm = SMOTE(ratio = 0.9, random_state = 6)

# **Model Building**

## Model 1: XGBoost Classifier

In [0]:
model1 = SMOTE_Pipeline(steps = [('pp', pp),
                                 ('sm', sm),
                                 ('estimator', XGBClassifier())
                  ])

In [17]:
model1.fit(Xtrain, ytrain)

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('ohe',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=False))],
                                                     

In [0]:
params1 = {
    'sm__ratio':[0.5, 0.9],
    'estimator__n_estimators': stats.randint(150, 1000),
    'estimator__learning_rate': stats.uniform(0.01, 0.6),
    'estimator__subsample': stats.uniform(0.3, 0.9),
    'estimator__max_depth': [3, 4, 5, 6, 7, 8, 9],
    'estimator__colsample_bytree': stats.uniform(0.5, 0.9),
    'estimator__min_child_weight': [1, 2, 3, 4]
    }

In [19]:
clf1 = RandomizedSearchCV(model1, params1, n_iter = 60, scoring = 'roc_auc', cv = 5, n_jobs = -1, random_state = 6, verbose = True)
clf1.fit(Xtrain, ytrain)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  7.4min


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('pp',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='passthrough',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('cat',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('ohe',
                                                                                                OneHotEncoder(categories='auto',
                                                                                              

In [20]:
clf1.best_score_

0.9066822115384616

In [21]:
clf1.best_params_

{'estimator__colsample_bytree': 0.756634609641987,
 'estimator__learning_rate': 0.010511485659043997,
 'estimator__max_depth': 7,
 'estimator__min_child_weight': 2,
 'estimator__n_estimators': 807,
 'estimator__subsample': 0.7705035712356856,
 'sm__ratio': 0.9}

## Model 2: Gradient Boosting Classifier

In [0]:
model2 = SMOTE_Pipeline(steps = [
                                 ('pp', pp),
                                 ('sm', sm),
                                 ('estimator', GradientBoostingClassifier())
                                 ])

In [23]:
model2.fit(Xtrain, ytrain)

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('ohe',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=False))],
                                                     

In [0]:
params2 = {
    'estimator__learning_rate': [0.99, 0.5, 0.1, 0.01],
    'estimator__n_estimators': [100, 200],
    'estimator__max_depth': [3, 10], 
    'estimator__min_samples_split': [5, 10],
    'estimator__subsample': [0.9, 0.5, 0.1],
    'estimator__max_features': ['sqrt', 'log2'], 
    'estimator__random_state': [6],
    'estimator__warm_start': [True],
    'estimator__verbose': [True]
}

In [25]:
clf2 = RandomizedSearchCV(model2, params2, cv = 5, verbose = True, random_state = 6, n_iter = 60, scoring = 'roc_auc')
clf2.fit(Xtrain, ytrain)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.1206           0.2285            0.25s
         2           0.9001           0.2171            0.25s
         3           0.7915           0.1240            0.25s
         4           0.7142           0.0876            0.25s
         5           0.6040           0.0706            0.24s
         6           0.5960           0.0511            0.24s
         7           0.5609           0.0309            0.23s
         8           0.5039           0.0225            0.23s
         9           0.5431           0.0110            0.22s
        10           0.4657           0.0305            0.22s
        20           0.3212           0.0000            0.18s
        30           2.7716          -0.0129            0.16s
        40           0.2642          -0.0219            0.13s
        50           2.9245           0.0013            0.11s
       

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        40           0.2489          -0.0005            0.61s
        50           0.2346          -0.0043            0.57s
        60           0.2101          -0.0083            0.53s
        70           0.2080           0.0001            0.49s
        80           0.1946          -0.0025            0.45s
        90           0.1799          -0.0024            0.41s
       100           0.1787          -0.0030            0.37s
       200      117006.9529          -0.0010            0.00s
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.1563           0.2183            0.75s
         2           0.9316           0.2321            0.78s
         3           0.7918           0.1273            0.77s
         4           0.7334           0.0526            0.75s
         5           0.6428           0.0800            0.76s
         6           0.6066           0.0534            0.76s
    

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  4.0min finished


        30           0.6170           0.0117            1.06s
        40           0.5335           0.0066            0.97s
        50           0.4732           0.0048            0.91s
        60           0.4261           0.0044            0.84s
        70           0.4011           0.0048            0.77s
        80           0.3678           0.0019            0.71s
        90           0.3478           0.0014            0.65s
       100           0.3320           0.0027            0.59s
       200           0.2424          -0.0003            0.00s


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('pp',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='passthrough',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('cat',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('ohe',
                                                                                                OneHotEncoder(categories='auto',
                                                                                              

In [26]:
clf2.best_score_

0.9065751144688645

In [27]:
clf2.best_params_

{'estimator__learning_rate': 0.1,
 'estimator__max_depth': 3,
 'estimator__max_features': 'sqrt',
 'estimator__min_samples_split': 5,
 'estimator__n_estimators': 200,
 'estimator__random_state': 6,
 'estimator__subsample': 0.9,
 'estimator__verbose': True,
 'estimator__warm_start': True}

# Model 3: MLP Classifier

In [0]:
model3 = SMOTE_Pipeline(steps = [
                                 ('pp', pp),
                                 ('sm', sm),
                                 ('estimator', MLPClassifier())
                                 ])

In [29]:
model3.fit(Xtrain, ytrain)

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('ohe',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=False))],
                                                     

In [0]:
params3 = {
    'estimator__hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'estimator__activation': ['tanh', 'relu'],
    'estimator__solver': ['sgd', 'adam'],
    'estimator__alpha': [0.0001, 0.05],
    'estimator__learning_rate': ['constant','adaptive'],
}

In [31]:
clf3 = RandomizedSearchCV(model3, params3, cv =5, n_iter = 60, verbose = True, random_state = 6, scoring = 'roc_auc')
clf3.fit(Xtrain, ytrain)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 26.1min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('pp',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='passthrough',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('cat',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('ohe',
                                                                                                OneHotEncoder(categories='auto',
                                                                                              

In [32]:
clf3.best_score_

0.8661834706959706

In [33]:
clf3.best_params_

{'estimator__activation': 'tanh',
 'estimator__alpha': 0.0001,
 'estimator__hidden_layer_sizes': (100,),
 'estimator__learning_rate': 'adaptive',
 'estimator__solver': 'adam'}

# Model 4: SVC Classifier

In [0]:
model4 = SMOTE_Pipeline(steps = [
                                 ('pp', pp),
                                 ('sm', sm),
                                 ('estimator', LinearSVC())
])

In [35]:
model4.fit(Xtrain, ytrain)

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('ohe',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=False))],
                                                     

In [0]:
params4 = {
    'estimator__penalty':['l1', 'l2'],
    'estimator__loss':['hinge', 'squared_hinge'],
    'estimator__C': [0.1, 1, 10, 100, 1000],
    'estimator__tol': [1, 0.1, 0.01, 0.001, 0.0001]
}

In [37]:
clf4 = RandomizedSearchCV(model4, params4, cv = 5, n_iter = 50, scoring = 'roc_auc', verbose = True, random_state = 6)
clf4.fit(Xtrain, ytrain)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  1.7min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('pp',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='passthrough',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('cat',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('ohe',
                                                                                                OneHotEncoder(categories='auto',
                                                                                              

In [38]:
clf4.best_score_

0.8157305860805861

In [39]:
clf4.best_params_

{'estimator__C': 10,
 'estimator__loss': 'hinge',
 'estimator__penalty': 'l2',
 'estimator__tol': 0.01}

# **Model Selection**

In [50]:
test.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [51]:
Xtest = test.drop(columns = ['y'])
ytest = test['y']
Xtest.shape, ytest.shape

((45211, 16), (45211,))

In [52]:
temp = ytest.copy()
ytest = pd.DataFrame(le.transform(temp), columns = ['Target'], index = temp.index)
ytest.head()

Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0


In [59]:
clf1_y_pred = clf1.predict(Xtest)
print("----------------XGBoost: Test Metrics------------------")
print("\nConfusion Matrix:\n", confusion_matrix(ytest, clf1_y_pred))
print("\nAUC:",roc_auc_score(ytest, clf1_y_pred))

----------------XGBoost: Test Metrics------------------

Confusion Matrix:
 [[38668  1254]
 [ 2782  2507]]

AUC: 0.7212956975309643


In [60]:
clf2_y_pred = clf2.predict(Xtest)
print("----------------GBM Classifier: Test Metrics------------------")
print("\nConfusion Matrix:\n", confusion_matrix(ytest, clf2_y_pred))
print("\nAUC:",roc_auc_score(ytest, clf2_y_pred))

----------------GBM Classifier: Test Metrics------------------

Confusion Matrix:
 [[38576  1346]
 [ 2888  2401]]

AUC: 0.7101226527671481


In [62]:
clf3_y_pred = clf3.predict(Xtest)
print("----------------XGBoost: Test Metrics------------------")
print("\nConfusion Matrix:\n", confusion_matrix(ytest, clf3_y_pred))
print("\nAUC:",roc_auc_score(ytest, clf3_y_pred))

----------------XGBoost: Test Metrics------------------

Confusion Matrix:
 [[33442  6480]
 [ 1156  4133]]

AUC: 0.8095583229801421


In [63]:
clf4_y_pred = clf4.predict(Xtest)
print("----------------XGBoost: Test Metrics------------------")
print("\nConfusion Matrix:\n", confusion_matrix(ytest, clf4_y_pred))
print("\nAUC:",roc_auc_score(ytest, clf4_y_pred))

----------------XGBoost: Test Metrics------------------

Confusion Matrix:
 [[34765  5157]
 [ 2426  2863]]

AUC: 0.7060676311812383
