In [1]:
import pandas as pd
bank_df = pd.read_csv( 'bank.csv')
bank_df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
0,30,unemployed,married,primary,no,1787,no,no,1,0,no
1,33,services,married,secondary,no,4789,yes,yes,1,4,no
2,35,management,single,tertiary,no,1350,yes,no,1,1,no
3,30,management,married,tertiary,no,1476,yes,yes,4,0,no
4,59,blue-collar,married,secondary,no,0,yes,no,1,0,no


In [2]:
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                4521 non-null   int64 
 1   job                4521 non-null   object
 2   marital            4521 non-null   object
 3   education          4521 non-null   object
 4   default            4521 non-null   object
 5   balance            4521 non-null   int64 
 6   housing-loan       4521 non-null   object
 7   personal-loan      4521 non-null   object
 8   current-campaign   4521 non-null   int64 
 9   previous-campaign  4521 non-null   int64 
 10  subscribed         4521 non-null   object
dtypes: int64(4), object(7)
memory usage: 388.6+ KB


In [3]:
# Dealing with imbalanced datasets
bank_df.subscribed.value_counts()

no     4000
yes     521
Name: subscribed, dtype: int64

In [4]:
## Importing resample from *sklearn.utils* package.
from sklearn.utils import resample

# Separate the case of yes-subscribes and no-subscribes
bank_subscribed_no = bank_df[bank_df.subscribed == 'no']
bank_subscribed_yes = bank_df[bank_df.subscribed == 'yes']

##Upsample the yes-subscribed cases.
df_minority_upsampled = resample(bank_subscribed_yes,
replace=True, # sample with replacement
n_samples=2000)

# Combine majority class with upsampled minority class
new_bank_df = pd.concat([bank_subscribed_no, df_minority_upsampled])

In [5]:
bank_subscribed_yes

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
13,20,student,single,secondary,no,502,no,no,1,0,yes
30,68,retired,divorced,secondary,no,4189,no,no,2,0,yes
33,32,management,single,tertiary,no,2536,yes,no,6,0,yes
34,49,technician,married,tertiary,no,1235,no,no,3,0,yes
36,78,retired,divorced,primary,no,229,no,no,1,0,yes
...,...,...,...,...,...,...,...,...,...,...,...
4494,26,technician,single,secondary,no,668,yes,no,3,0,yes
4503,60,self-employed,married,primary,no,362,no,yes,6,0,yes
4504,42,blue-collar,single,secondary,no,1080,yes,yes,3,4,yes
4505,32,admin.,single,secondary,no,620,yes,no,3,0,yes


In [6]:
new_bank_df[new_bank_df.duplicated()]

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
3475,32,technician,single,secondary,no,0,no,no,2,0,no
4037,42,management,divorced,tertiary,no,0,no,no,2,0,no
4103,46,management,divorced,tertiary,no,25,no,no,2,0,no
4402,41,entrepreneur,married,primary,no,39,yes,no,2,0,yes
3818,53,services,divorced,secondary,no,765,yes,no,3,7,yes
...,...,...,...,...,...,...,...,...,...,...,...
3789,47,management,married,tertiary,no,0,no,no,1,0,yes
944,40,blue-collar,married,secondary,no,1451,no,no,1,0,yes
1904,28,student,single,tertiary,no,0,no,no,2,7,yes
3005,60,admin.,married,secondary,no,3735,no,no,1,9,yes


In [7]:
df_minority_upsampled

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
1698,31,management,single,tertiary,no,3283,no,no,1,1,yes
1151,44,admin.,married,secondary,no,205,no,no,1,0,yes
2972,56,admin.,married,secondary,no,45,no,no,1,0,yes
4402,41,entrepreneur,married,primary,no,39,yes,no,2,0,yes
2122,48,unemployed,married,secondary,no,817,no,no,1,4,yes
...,...,...,...,...,...,...,...,...,...,...,...
3789,47,management,married,tertiary,no,0,no,no,1,0,yes
944,40,blue-collar,married,secondary,no,1451,no,no,1,0,yes
1904,28,student,single,tertiary,no,0,no,no,2,7,yes
3005,60,admin.,married,secondary,no,3735,no,no,1,9,yes


In [8]:
bank_df.balance.value_counts()

0       357
2        24
1        15
4        13
5        11
       ... 
539       1
1041      1
464       1
7702      1
1137      1
Name: balance, Length: 2353, dtype: int64

In [9]:
from sklearn.utils import shuffle
new_bank_df = shuffle(new_bank_df)
new_bank_df

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
4196,48,blue-collar,married,secondary,no,480,yes,no,1,2,yes
1067,44,management,married,secondary,no,8262,yes,no,16,0,no
725,31,management,married,tertiary,no,1010,yes,no,3,9,no
3115,39,self-employed,married,tertiary,no,585,yes,no,1,3,no
3219,72,retired,married,primary,no,763,no,no,2,0,yes
...,...,...,...,...,...,...,...,...,...,...,...
1360,31,technician,single,secondary,no,1014,yes,no,4,0,no
2361,37,services,divorced,secondary,no,532,yes,no,1,0,no
961,41,blue-collar,married,secondary,no,552,yes,no,1,0,no
4035,47,technician,married,secondary,no,2246,yes,no,1,0,no


In [10]:
# Assigning list of all column names in the DataFrame
X_features = list( new_bank_df.columns )
# Remove the response variable from the list
X_features.remove( 'subscribed' )
X_features

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing-loan',
 'personal-loan',
 'current-campaign',
 'previous-campaign']

In [11]:
## get_dummies() will convert all the columns with data type as objects
encoded_bank_df = pd.get_dummies( new_bank_df[X_features], drop_first = True )
X = encoded_bank_df

In [12]:
# Encoding the subscribed column and assigning to Y
Y = new_bank_df.subscribed.map( lambda x: int( x == 'yes') )

In [13]:
list(map(lambda x: x**2,[0,1,2,3,4]))

[0, 1, 4, 9, 16]

In [14]:
X.head()

Unnamed: 0,age,balance,current-campaign,previous-campaign,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,...,job_unemployed,job_unknown,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,default_yes,housing-loan_yes,personal-loan_yes
4196,48,480,1,2,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
1067,44,8262,16,0,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
725,31,1010,3,9,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
3115,39,585,1,3,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0
3219,72,763,2,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [15]:
Y.head()

4196    1
1067    0
725     0
3115    0
3219    1
Name: subscribed, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split( X,Y,test_size = 0.3,random_state = 42 )

In [17]:
## Importing Adaboost classifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
## Initializing base classifier
dt_clf = DecisionTreeClassifier(criterion = 'gini',max_depth = 1 )
logreg_clf = LogisticRegression(solver='lbfgs',max_iter=200)
## Initilizing adaboost classifier with 50 classifers
ada_dt = AdaBoostClassifier(dt_clf, n_estimators=50)
ada_logreg = AdaBoostClassifier(logreg_clf, n_estimators=50)
## Fitting adaboost model to training set
ada_dt.fit(train_X, train_y )
ada_logreg.fit(train_X, train_y )

AdaBoostClassifier(base_estimator=LogisticRegression(max_iter=200))

In [18]:
#print(ada_dt.feature_importances_)

from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

predicted_classes_ada_dt = ada_dt.predict(test_X)
conf_mat_ada_dt = confusion_matrix(test_y,predicted_classes_ada_dt)
accuracy_ada_dt = accuracy_score(test_y,predicted_classes_ada_dt)
ada_roc_auc_dt = roc_auc_score(test_y, ada_dt.predict_proba(test_X)[:,1])

predicted_classes_ada_logreg = ada_logreg.predict(test_X)
conf_mat_ada_logreg = confusion_matrix(test_y,predicted_classes_ada_logreg)
accuracy_ada_logreg = accuracy_score(test_y,predicted_classes_ada_logreg)
ada_roc_auc_logreg = roc_auc_score(test_y, ada_logreg.predict_proba(test_X)[:,1])

print("Confusion matrix (AdaBoost-dt):")
print(conf_mat_ada_dt)
print("accuracy score (AdaBoost-dt)", accuracy_ada_dt)
print("AUC (AdaBoost-dt)", ada_roc_auc_dt)

print("Confusion matrix (AdaBoost-logreg):")
print(conf_mat_ada_logreg)
print("accuracy score (AdaBoost-logreg)", accuracy_ada_logreg)
print("AUC (AdaBoost-logreg)", ada_roc_auc_logreg)

Confusion matrix (AdaBoost-dt):
[[1093  116]
 [ 404  187]]
accuracy score (AdaBoost-dt) 0.7111111111111111
AUC (AdaBoost-dt) 0.714903312578112
Confusion matrix (AdaBoost-logreg):
[[1179   30]
 [ 528   63]]
accuracy score (AdaBoost-logreg) 0.69
AUC (AdaBoost-logreg) 0.6737203629294672


In [19]:
#Gradient Boosting
## Importing Gradient Boosting classifier
from sklearn.ensemble import GradientBoostingClassifier
## Initializing Gradient Boosting with 500 estimators and max depth as 10.
gboost_clf = GradientBoostingClassifier( n_estimators=500, max_depth=10)
## Fitting gradient boosting model to training set
gboost_clf.fit(train_X, train_y )

GradientBoostingClassifier(max_depth=10, n_estimators=500)

In [20]:
predicted_classes_gboost = gboost_clf.predict(test_X)
conf_mat_gboost = confusion_matrix(test_y,predicted_classes_gboost)
accuracy_gboost = accuracy_score(test_y,predicted_classes_gboost)
gboost_roc_auc = roc_auc_score(test_y, gboost_clf.predict_proba(test_X)[:,1])

print("Confusion matrix (gBoost):")
print(conf_mat_gboost)
print("accuracy score (gBoost)", accuracy_gboost)
print("AUC (gBoost)", gboost_roc_auc)

Confusion matrix (gBoost):
[[1162   47]
 [  22  569]]
accuracy score (gBoost) 0.9616666666666667
AUC (gBoost) 0.9821222388767828


In [21]:
#10 fold cross validation - to validate if there is overfitting
import numpy as np
from sklearn.model_selection import cross_val_score
gboost_clf = GradientBoostingClassifier( n_estimators=500, max_depth=10)
cv_scores = cross_val_score( gboost_clf, train_X, train_y, cv = 10, scoring = 'roc_auc' )
print( cv_scores )
print( "Mean Accuracy: ", np.mean(cv_scores), " with standard deviation of: ",np.std(cv_scores))

[0.96319174 0.97170747 0.98792547 0.97323267 0.95803147 0.96878416
 0.97452909 0.95917537 0.97376649 0.96045918]
Mean Accuracy:  0.9690803117174575  with standard deviation of:  0.008718973268286329


In [23]:
import xgboost as xgb
#https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters
#pip install xgboost
xgboost_clf = xgb.XGBClassifier(n_estimators=500, max_depth=20)
#print(xgboost_clf)

xgboost_clf.fit(train_X, train_y )


  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=20, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
predicted_classes_xgboost = xgboost_clf.predict(test_X)
conf_mat_xgboost = confusion_matrix(test_y,predicted_classes_xgboost)
accuracy_xgboost = accuracy_score(test_y,predicted_classes_xgboost)
xgboost_roc_auc = roc_auc_score(test_y, xgboost_clf.predict_proba(test_X)[:,1])

print("Confusion matrix (XgBoost):")
print(conf_mat_xgboost)
print("accuracy score (XgBoost)", accuracy_xgboost)
print("AUC (XgBoost)", xgboost_roc_auc)

Confusion matrix (XgBoost):
[[1120   89]
 [  19  572]]
accuracy score (XgBoost) 0.94
AUC (XgBoost) 0.9775933180223338


In [25]:
#10 fold cross validation
import numpy as np
from sklearn.model_selection import cross_val_score
xgboost_clf = xgb.XGBClassifier( n_estimators=500, max_depth=10)
cv_scores = cross_val_score( xgboost_clf, train_X, train_y, cv = 10, scoring = 'roc_auc' )
print( cv_scores )
print( "Mean Accuracy: ", np.mean(cv_scores), " with standard deviation of: ",np.std(cv_scores))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0.96179364 0.9613615  0.98304482 0.97343603 0.95088843 0.95777727
 0.96451359 0.95360838 0.96296296 0.94413265]
Mean Accuracy:  0.9613519266854154  with standard deviation of:  0.010526740049233551


In [26]:
# Voting Ensemble for Classification

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# create the sub models
estimators = []
model1 = LogisticRegression(solver='lbfgs',max_iter=400)
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC(gamma='auto',probability=True)
estimators.append(('svm', model3))
# create the ensemble model
Votingensemble = VotingClassifier(estimators,voting ='soft')
results = model_selection.cross_val_score(Votingensemble, train_X, train_y, cv=10, scoring = 'roc_auc')

print(results.mean())

0.9620388452338153


In [27]:
#SVM
from sklearn.svm import SVC
model3 = SVC(gamma='auto',probability=True)
results = model_selection.cross_val_score(model3, train_X, train_y, cv=10, scoring = 'roc_auc')
print(results.mean())

0.9374201782672957
