In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Data Visulaization
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,classification_report,precision_score,recall_score,accuracy_score
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv('bank-additional-full.csv',sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
print('shape: ', df.shape)

shape:  (41188, 21)


In [4]:
#checking for null
df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

Exploring Categorical data

In [5]:
#job column
print('Jobs unque values: ', df['job'].unique())

Jobs unque values:  ['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student']


In [6]:
#martial column
print('Marital unique values: ', df['marital'].unique())

Marital unique values:  ['married' 'single' 'divorced' 'unknown']


In [7]:
#Education column
print('Education unique values: ', df['education'].unique())

Education unique values:  ['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate']


In [8]:
#default column
print('Default unique values: ', df['default'].unique())

Default unique values:  ['no' 'unknown' 'yes']


In [9]:
#housing
print('Housing: ', df['housing'].unique())

Housing:  ['no' 'yes' 'unknown']


In [10]:
#loan
print('Loan: ', df['loan'].unique())

Loan:  ['no' 'yes' 'unknown']


In [11]:
df.education.value_counts()

university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: education, dtype: int64

In [12]:
#removing unknown with respect to education column
df = df[df.education != 'unknown']

In [13]:
#education column after removing unknown
df.education.value_counts()

university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
illiterate                18
Name: education, dtype: int64

In [14]:
#removing unknown with respect to job and marital columns
df = df[df.job != 'unknown']
df = df[df.marital != 'unknown']

In [15]:
#shape after removing unknowns
df.shape

(39191, 21)

In [16]:
#Label Encoding
label_encoding = LabelEncoder()

df['job'] = label_encoding.fit_transform(df['job'])
df['marital'] = label_encoding.fit_transform(df['marital'])
df['education'] = label_encoding.fit_transform(df['education'])
df['day_of_week'] = label_encoding.fit_transform(df['day_of_week'])

In [17]:
# converting categorical values into numerical values
bank = df
bank["housing"] = bank["housing"].map({"no":0,"yes":1})
bank["loan"] = bank["loan"].map({"no":0,"yes":1})
bank["y"] = bank["y"].map({"no":0,"yes":1})
bank.month = pd.to_datetime(bank.month, format = "%b").dt.month

In [18]:
#null values
bank.isnull().sum()

age                 0
job                 0
marital             0
education           0
default             0
housing           946
loan              946
contact             0
month               0
day_of_week         0
duration            0
campaign            0
pdays               0
previous            0
poutcome            0
emp.var.rate        0
cons.price.idx      0
cons.conf.idx       0
euribor3m           0
nr.employed         0
y                   0
dtype: int64

In [19]:
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,0,no,0.0,0.0,telephone,5,1,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,unknown,0.0,0.0,telephone,5,1,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,no,1.0,0.0,telephone,5,1,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,no,0.0,0.0,telephone,5,1,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,no,0.0,1.0,telephone,5,1,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [20]:
#dropping nullvalues and columns which aren't required
bank.drop(["poutcome", "contact"], axis = 1, inplace = True)
bank.dropna(inplace = True)

In [21]:
bank.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,month,day_of_week,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,0,no,0.0,0.0,5,1,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,unknown,0.0,0.0,5,1,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,no,1.0,0.0,5,1,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,no,0.0,0.0,5,1,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,no,0.0,1.0,5,1,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
5,45,7,1,2,unknown,0.0,0.0,5,1,198,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
6,59,0,1,5,no,0.0,0.0,5,1,139,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
8,24,9,2,5,no,1.0,0.0,5,1,380,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
9,25,7,2,3,no,1.0,0.0,5,1,50,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
11,25,7,2,3,no,1.0,0.0,5,1,222,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0


In [23]:
bank.shape

(38245, 19)

In [24]:
#converting default into numerical values
bank["default"] = bank["default"].map({"no":0,"yes":1})
bank.isnull().sum()

age                  0
job                  0
marital              0
education            0
default           7757
housing              0
loan                 0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

In [30]:
#KNN imputation on default column
from sklearn.impute import KNNImputer
imputed_matrix = KNNImputer(n_neighbors=3).fit_transform(bank)
impute_df = pd.DataFrame(imputed_matrix, columns = bank.columns)
impute_df['default'] = impute_df['default'].\
                map(lambda x: 1 if x>0.5 else 0).astype('int')
impute_df

Unnamed: 0,age,job,marital,education,default,housing,loan,month,day_of_week,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56.0,3.0,1.0,0.0,0,0.0,0.0,5.0,1.0,261.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0
1,57.0,7.0,1.0,3.0,0,0.0,0.0,5.0,1.0,149.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0
2,37.0,7.0,1.0,3.0,0,1.0,0.0,5.0,1.0,226.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0
3,40.0,0.0,1.0,1.0,0,0.0,0.0,5.0,1.0,151.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0
4,56.0,7.0,1.0,3.0,0,0.0,1.0,5.0,1.0,307.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38240,73.0,5.0,1.0,5.0,0,1.0,0.0,11.0,0.0,334.0,1.0,999.0,0.0,-1.1,94.767,-50.8,1.028,4963.6,1.0
38241,46.0,1.0,1.0,5.0,0,0.0,0.0,11.0,0.0,383.0,1.0,999.0,0.0,-1.1,94.767,-50.8,1.028,4963.6,0.0
38242,56.0,5.0,1.0,6.0,0,1.0,0.0,11.0,0.0,189.0,2.0,999.0,0.0,-1.1,94.767,-50.8,1.028,4963.6,0.0
38243,44.0,9.0,1.0,5.0,0,0.0,0.0,11.0,0.0,442.0,1.0,999.0,0.0,-1.1,94.767,-50.8,1.028,4963.6,1.0


In [31]:
impute_df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

Target column

In [32]:
y = impute_df['y']

In [33]:
impute_df.drop(columns=['y'],inplace=True)
impute_df.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,month,day_of_week,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56.0,3.0,1.0,0.0,0,0.0,0.0,5.0,1.0,261.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0
1,57.0,7.0,1.0,3.0,0,0.0,0.0,5.0,1.0,149.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0
2,37.0,7.0,1.0,3.0,0,1.0,0.0,5.0,1.0,226.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0
3,40.0,0.0,1.0,1.0,0,0.0,0.0,5.0,1.0,151.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0
4,56.0,7.0,1.0,3.0,0,0.0,1.0,5.0,1.0,307.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0


In [37]:
#Target column distirbution
y.value_counts()

0.0    33987
1.0     4258
Name: y, dtype: int64

Balancing the data using SMOTE:

In [38]:
smt = SMOTE()
impute_df, y = smt.fit_sample(impute_df, y)
np.bincount(y)

array([33987, 33987])

In [39]:
#train-test split
X_train,X_test,y_train,y_test = train_test_split(impute_df,y,test_size=0.3,random_state=42,stratify=y)

In [40]:
print('Shape of Training set : ' , [X_train.shape,y_train.shape])
print('Shape of Test set : ' , [X_test.shape,y_test.shape])

Shape of Training set :  [(47581, 18), (47581,)]
Shape of Test set :  [(20393, 18), (20393,)]


#### Logistic Regression

In [53]:
reg_clf = LogisticRegression().fit(X_train,y_train)

reg_pred = reg_clf.predict(X_test)
 
print('F1 score score : ' , f1_score(y_test,reg_pred))
F1_score_log = f1_score(y_test,reg_pred)

print("Accuracy:",accuracy_score(y_test, reg_pred))
Accuracy_log = accuracy_score(y_test, reg_pred)
print("Precision:",precision_score(y_test, reg_pred))
Precision_log = precision_score(y_test, reg_pred)
print("Recall:",recall_score(y_test, reg_pred))
Recall_log = recall_score(y_test, reg_pred)

F1 score score :  0.8532553035844915
Accuracy: 0.8524493698818222
Precision: 0.848578911630614
Recall: 0.8579835229501765


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [51]:
#confusion matrix
log_cf_mat = confusion_matrix(y_test,reg_pred)
log_cf_mat

array([[8636, 1561],
       [1448, 8748]])

#### Decision Tree Classifier

In [54]:
#Decision tree
tree_clf = DecisionTreeClassifier().fit(X_train,y_train)

tree_pred = tree_clf.predict(X_test)

print('F1 score score : ' , f1_score(y_test,tree_pred))
F1_score_tree = f1_score(y_test,tree_pred)

print("Accuracy:",accuracy_score(y_test, tree_pred))
Accuracy_tree = accuracy_score(y_test, tree_pred)

print("Precision:",precision_score(y_test, tree_pred))
Precision_tree = precision_score(y_test, tree_pred)

print("Recall:",recall_score(y_test, tree_pred))
Recall_tree = recall_score(y_test, tree_pred)


F1 score score :  0.9286937796701413
Accuracy: 0.9285539155592605
Precision: 0.926834033408225
Recall: 0.9305610043154178


In [64]:
#confusion matrix
tree_mat = confusion_matrix(y_test, tree_pred)
tree_mat

array([[9448,  749],
       [ 708, 9488]])

#### RandomForest Classifier

In [59]:
rf_clf = RandomForestClassifier(n_estimators = 1000,random_state=55).fit(X_train,y_train)

rf_pred = rf_clf.predict(X_test)

print('F1 score score : ' , f1_score(y_test,rf_pred))
F1_score_rf = f1_score(y_test, rf_pred)

print("Accuracy:",accuracy_score(y_test, rf_pred))
Accuracy_rf = accuracy_score(y_test, rf_pred)

print("Precision:",precision_score(y_test, rf_pred))
Precision_rf = precision_score(y_test, rf_pred)

print("Recall:",recall_score(y_test, rf_pred))
Recall_rf = recall_score(y_test, rf_pred)

F1 score score :  0.9534646243617798
Accuracy: 0.9530721325945177
Precision: 0.9455106567653583
Recall: 0.9615535504119262


In [60]:
#confusion matrix
rf_mat = confusion_matrix(y_test, rf_pred)
rf_mat

array([[9632,  565],
       [ 392, 9804]])

#### XtremeGradient boosting

In [61]:
alg = XGBClassifier(learning_rate = 0.1, n_estimators = 2400, max_depth = 10,
                        min_child_weight = 3, gamma = 0.2, subsample = 0.6, colsample_bytree = 1.0,
                        objective ='binary:logistic', nthread = 8, scale_pos_weight = 1, seed = 42)
alg.fit(X_train,y_train)
alg_pred = alg.predict(X_test)

print('F1 score score : ' , f1_score(y_test,alg_pred))
F1_score_xgb = f1_score(y_test,alg_pred)

print("Accuracy:",accuracy_score(y_test, alg_pred))
Accuracy_xgb = accuracy_score(y_test, alg_pred)

print("Precision:",precision_score(y_test, alg_pred))
Precision_xgb = precision_score(y_test, alg_pred)

print("Recall:",recall_score(y_test, alg_pred))
Recall_xgb = recall_score(y_test, alg_pred)

F1 score score :  0.9479740361919748
Accuracy: 0.9481194527533958
Precision: 0.9505917159763314
Recall: 0.9453707336210279


In [62]:
#confusion matrix
xgb_cf = confusion_matrix(y_test, alg_pred)
xgb_cf

array([[9696,  501],
       [ 557, 9639]])

In [63]:
Models= pd.DataFrame({
'F1 score' : [F1_score_log,F1_score_tree,F1_score_rf,F1_score_xgb]
,'Accuracy' : [Accuracy_log,Accuracy_tree,Accuracy_rf,Accuracy_xgb]
,'Precision' : [Precision_log,Precision_tree,Precision_rf,Precision_xgb]
,'Recall' : [Recall_log,Recall_tree,Recall_rf,Recall_xgb]}
,index=['Logistic Regression','Decision Tree','Random Forest', 'XGB'])
Models

Unnamed: 0,F1 score,Accuracy,Precision,Recall
Logistic Regression,0.853255,0.852449,0.848579,0.857984
Decision Tree,0.928694,0.928554,0.926834,0.930561
Random Forest,0.953465,0.953072,0.945511,0.961554
XGB,0.947974,0.948119,0.950592,0.945371
