In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Location of the bank marketing dataset (CSV file hosted on GitHub)
url = (
    'https://raw.githubusercontent.com/'
    'tuhin-datascience/data/main/bank-full.csv'
)

In [None]:
# Load the semicolon-delimited file at `url` into a DataFrame
df = pd.read_csv(url, delimiter=';')

In [None]:
# Preview the first five rows of the data frame
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe(include=[np.number])

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [None]:
# Summary of the categorical (object-dtype) columns: count, unique, top, freq
df.describe(include=['object'])

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
count,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211
unique,12,3,4,2,2,2,3,12,4,2
top,blue-collar,married,secondary,no,yes,no,cellular,may,unknown,no
freq,9732,27214,23202,44396,25130,37967,29285,13766,36959,39922


In [None]:
# Distinct values that occur in the 'job' column, in order of first appearance
pd.unique(df['job'])

array(['management', 'technician', 'entrepreneur', 'blue-collar',
       'unknown', 'retired', 'admin.', 'services', 'self-employed',
       'unemployed', 'housemaid', 'student'], dtype=object)

In [None]:
# How often each 'job' value occurs, most frequent first
df['job'].value_counts(sort=True, ascending=False)

blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64

In [None]:
# Walk through the categorical (object-dtype) columns and, for each one,
# show its distinct values followed by how often each value occurs
categorical_names = [name for name in df.columns if df.dtypes[name] == 'object']
for name in categorical_names:
    column = df[name]
    print(name)
    print(column.unique())
    print(column.value_counts())
    print('----------------------------------')

job
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64
----------------------------------
marital
['married' 'single' 'divorced']
married     27214
single      12790
divorced     5207
Name: marital, dtype: int64
----------------------------------
education
['tertiary' 'secondary' 'unknown' 'primary']
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64
----------------------------------
default
['no' 'yes']
no     44396
yes      815
Name: default, dtype: int64
----------------------------------
housing
['yes' 'no']
yes    25130
no     20081
Name: housing

In [None]:
# Split the data into the target y (the 'y' column, dependent variable)
# and the features X (every other column, independent variables)
y = df['y']
X = df.drop(columns=['y'])

In [None]:
# Preview the first five rows of the feature frame
X.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown


In [None]:
# Shape of the feature frame X (the independent variables) as (rows, columns)
X.shape

(45211, 16)

In [None]:
# One-hot encode the 'housing' column; drop_first=True drops the first level's
# indicator, which leaves a single 'yes' column
housing_dummy = pd.get_dummies(data=df['housing'], drop_first=True)
print(housing_dummy)

       yes
0        1
1        1
2        1
3        1
4        0
...    ...
45206    0
45207    0
45208    0
45209    0
45210    0

[45211 rows x 1 columns]


In [None]:
# One-hot encode every categorical (object-dtype) feature column.
# Each feature is encoded with drop_first=True and its indicator columns are
# prefixed with the source column name, so levels shared by several features
# ('unknown', 'yes') no longer produce duplicate column labels.
categorical_columns = [col for col in X.columns if X.dtypes[col] == 'object']
for col in categorical_columns:
    print(col)

# Placeholder that turns into an all-NaN column labelled 0 after the concat.
# It is kept only because a later cell removes it with X.drop(0, axis=1);
# the explicit dtype avoids the "default dtype for empty Series" warning.
placeholder = pd.Series([], dtype='float64')

# Build all indicator blocks first and concatenate once, instead of growing
# the frame with a concat on every loop iteration.
encoded_blocks = [
    pd.get_dummies(df[col], prefix=col, drop_first=True)
    for col in categorical_columns
]
dummies = pd.concat([placeholder, *encoded_blocks], axis=1)

print(dummies)
dummies.shape

job
marital
education
default
housing
loan
contact
month
poutcome
        0  blue-collar  entrepreneur  housemaid  ...  sep  other  success  unknown
0     NaN            0             0          0  ...    0      0        0        1
1     NaN            0             0          0  ...    0      0        0        1
2     NaN            0             1          0  ...    0      0        0        1
3     NaN            1             0          0  ...    0      0        0        1
4     NaN            0             0          0  ...    0      0        0        1
...    ..          ...           ...        ...  ...  ...    ...      ...      ...
45206 NaN            0             0          0  ...    0      0        0        1
45207 NaN            0             0          0  ...    0      0        0        1
45208 NaN            0             0          0  ...    0      0        1        0
45209 NaN            1             0          0  ...    0      0        0        1
45210 NaN            

  """Entry point for launching an IPython kernel.


(45211, 36)

In [None]:
# Strip every categorical (object-dtype) column from X in a single in-place drop,
# leaving only the numeric features
object_columns = X.select_dtypes(include='object').columns
X.drop(columns=object_columns, inplace=True)

X.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0


In [None]:
# Append the dummy-encoded columns to the numeric columns, side by side
X = pd.concat([X, dummies], axis='columns')

In [None]:
# Preview the first five rows of the combined feature frame
X.head(n=5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,0,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed,unknown,married,single,secondary,tertiary,unknown.1,yes,yes.1,yes.2,telephone,unknown.2,aug,dec,feb,jan,jul,jun,mar,may,nov,oct,sep,other,success,unknown.3
0,58,2143,5,261,1,-1,0,,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [None]:
# Shape of X after the dummy-encoded columns were appended
X.shape

(45211, 43)

In [None]:
# Remove the all-NaN placeholder column labelled 0 that came in with `dummies`.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run once the column is already gone.
X = X.drop(columns=[0], errors='ignore')

In [None]:
 y = pd.get_dummies(y, drop_first=True)

In [None]:
# Preview the first five rows of the encoded dependent variable
y.head(n=5)

Unnamed: 0,yes
0,0
1,0
2,0
3,0
4,0


In [None]:
# Shape of the encoded target y as (rows, columns)
y.shape

(45211, 1)

In [None]:
# Hold out 20% of the records as a test set for evaluating the model;
# random_state pins the shuffle so the same split is produced on every run
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Report the shape of each split: train/test features, then train/test targets
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(36168, 42)
(9043, 42)
(36168, 1)
(9043, 1)


In [None]:
# Import the StandardScaler class used to standardise the features
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the training features, then replace them with their
# standardised values
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)


In [None]:
# Print the standardised training features (now a NumPy array, see output)
print(X_train)


[[ 1.51506117 -0.23163698 -0.69851397 ... -0.20517128 -0.18404781
   0.47153594]
 [-0.36959746  0.10603762 -0.81871672 ... -0.20517128 -0.18404781
   0.47153594]
 [-0.55806332  0.47063537 -1.66013596 ... -0.20517128 -0.18404781
  -2.12072912]
 ...
 [-0.55806332 -0.34030268 -0.57831122 ... -0.20517128 -0.18404781
   0.47153594]
 [ 2.74008927 -0.24526074  1.34493276 ... -0.20517128 -0.18404781
  -2.12072912]
 [ 0.47849892 -0.42658649 -0.21770297 ... -0.20517128 -0.18404781
   0.47153594]]


In [None]:
# Standardise the test features with the scaler `sc` that was fitted on the
# training set (transform only, the scaler is not re-fitted here)
X_test = sc.transform(X_test)

In [None]:
# Import the LogisticRegression classifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Define the logistic regression classifier and fit it on the training data.
# y_train is a one-column DataFrame; np.ravel flattens it to the 1-D label
# array that fit() expects, which avoids scikit-learn's DataConversionWarning.
lr = LogisticRegression()
classifier = lr.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


In [None]:
# Predict the dependent variable for the test features with the fitted model
y_pred = classifier.predict(X_test)

In [None]:
# Import the classification metrics used to evaluate the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Confusion matrix of the test-set predictions (rows: actual class, columns: predicted class)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[7754  226]
 [ 690  373]]


In [None]:
# Per-class precision, recall and F1-score on the test set
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7980
           1       0.62      0.35      0.45      1063

    accuracy                           0.90      9043
   macro avg       0.77      0.66      0.70      9043
weighted avg       0.88      0.90      0.89      9043



In [None]:
# Overall accuracy: the fraction of test records that were classified correctly
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8987061815769103


### **K Fold Cross Validation**