# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

# Read Data

In [2]:
# Load bank-full.csv into a DataFrame; the file is semicolon-delimited, hence sep=";".
df = pd.read_csv("bank-full.csv", sep = ";")

In [3]:
# Preview the raw data as loaded (pandas truncates the display to the first/last rows).
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [4]:
# Inspect column dtypes and non-null counts to see which columns need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


# Label Encoding On Data

In [5]:
# Split the columns by dtype: object (text) columns must be encoded to integers,
# the remaining columns are already numeric.
catagorical_columns = df.dtypes[df.dtypes == 'object']
numerical_columns = df.dtypes[df.dtypes != 'object']
print(catagorical_columns)

# Fit a *separate* LabelEncoder for every text column and keep each fitted
# encoder. Re-fitting one shared encoder would discard the label -> code mapping
# of every column except the last, making it impossible to decode the integers
# later. Use label_encoders[column].classes_ / .inverse_transform(...) to recover
# the original labels.
label_encoders = {}
for column in catagorical_columns.index:
    label_encoder = LabelEncoder()
    # NOTE: this overwrites the text column in df with its integer codes.
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

job          object
marital      object
education    object
default      object
housing      object
loan         object
contact      object
month        object
poutcome     object
y            object
dtype: object


# Define X & y

In [6]:
# Features: every column except the last one.
X = df.iloc[:,0:-1]
# Target: the last column of df (the encoded "y" column).
y = df.iloc[:,-1]

In [7]:
# Preview the feature matrix (all columns are integer-valued after encoding).
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,9,977,3,-1,0,3
45207,71,5,0,0,0,1729,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,5715,0,0,0,17,9,1127,5,184,3,2
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3


In [8]:
# Preview the encoded target vector.
y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int32

# Train/Test Split

In [9]:
# Hold out 20% of the rows as a test set; random_state=0 makes the shuffle reproducible.
# NOTE(review): the split is not stratified although the target is imbalanced
# (7980 vs 1063 rows per class in the test set) - consider passing stratify=y.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [10]:
# Preview the training features (row labels are the shuffled original index).
X_train

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
13932,57,0,0,1,0,658,0,0,0,10,5,724,1,-1,0,3
9894,37,11,1,3,0,1699,0,0,2,9,6,63,1,-1,0,3
39946,35,9,0,1,0,2823,1,0,0,2,6,102,4,96,2,0
9217,35,0,1,1,0,214,1,1,2,5,6,247,1,-1,0,3
4124,38,7,2,2,0,323,1,0,2,19,8,138,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30403,35,4,2,2,0,995,0,0,0,5,3,39,1,-1,0,3
21243,35,4,1,2,0,750,1,0,0,18,1,233,12,-1,0,3
42613,35,4,1,2,0,323,0,0,0,11,4,261,2,-1,0,3
43567,70,5,1,1,0,616,0,0,0,27,0,149,2,182,1,0


In [11]:
# Preview the held-out test features.
X_test

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
14001,37,4,1,2,0,1403,1,0,0,10,5,91,15,-1,0,3
32046,36,7,2,1,0,-253,1,0,0,14,0,1073,1,-1,0,3
13318,47,1,1,0,0,-406,0,1,1,8,5,243,5,-1,0,3
42991,57,3,1,0,0,501,1,1,0,11,3,105,7,188,2,0
14237,33,1,1,1,1,-406,1,1,0,14,5,332,2,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18448,53,1,1,0,0,252,0,0,0,31,5,727,4,-1,0,3
6654,28,4,2,2,0,1996,1,1,2,28,8,55,1,-1,0,3
17106,28,7,1,1,0,7,0,0,0,25,5,50,3,-1,0,3
9025,59,5,1,1,0,0,0,0,2,5,6,812,1,-1,0,3


In [12]:
# Preview the training targets.
y_train

13932    1
9894     0
39946    0
9217     0
4124     0
        ..
30403    0
21243    0
42613    1
43567    0
2732     0
Name: y, Length: 36168, dtype: int32

In [13]:
# Preview the held-out test targets.
y_test

14001    0
32046    1
13318    0
42991    0
14237    0
        ..
18448    1
6654     0
17106    0
9025     1
13171    0
Name: y, Length: 9043, dtype: int32

# Logistic Regression Using Sklearn

In [14]:
# scikit-learn baseline. max_iter is raised to 5000 - presumably so the solver
# converges on the unscaled features; confirm whether scaling would be preferable.
model = LogisticRegression(max_iter=5000)

In [15]:
# Fit the baseline on the training split only.
model.fit(X_train,y_train)

In [16]:
# Predict class labels for the held-out test rows.
y_pred = model.predict(X_test)

In [17]:
# Confusion matrix for the baseline: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

array([[7813,  167],
       [ 842,  221]], dtype=int64)

In [18]:
# Overall accuracy of the baseline on the test set.
# Accuracy alone is misleading here: the classes are imbalanced (see the report below).
accuracy_score(y_test,y_pred)

0.8884219838549154

In [20]:
# Per-class precision/recall/F1; print() is needed because the report is a formatted string.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      7980
           1       0.57      0.21      0.30      1063

    accuracy                           0.89      9043
   macro avg       0.74      0.59      0.62      9043
weighted avg       0.86      0.89      0.86      9043



# Logistic Regression Using Gradient Descent

In [21]:
# Re-create the split with the same arguments (test_size=0.2, random_state=0) as the
# earlier split, so the gradient-descent model is trained and evaluated on the same
# partition as the scikit-learn baseline.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [22]:
class LogisticRegressionGD:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learningRate : float, default=0.01
        Step size applied to every gradient update.
    num_it : int, default=5000
        Number of gradient-descent iterations performed by ``fitGD``.

    Attributes
    ----------
    features : numpy.ndarray or None
        Learned weight vector (one entry per input column); ``None`` until
        ``fitGD`` has been called.
    bias : float or None
        Learned intercept; ``None`` until ``fitGD`` has been called.

    Notes
    -----
    No feature scaling is performed. With a fixed learning rate, columns on
    very different scales make the sigmoid saturate and slow convergence, so
    standardising the inputs beforehand is advisable.
    """

    def __init__(self, learningRate=0.01,num_it = 5000):
        self.learningRate = learningRate
        self.num_it = num_it
        self.features = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        The computation only ever exponentiates non-positive numbers, so it
        cannot overflow for large ``|z|`` and needs no warning suppression
        (the process-wide warnings filters are left untouched).

        Parameters
        ----------
        z : float or array-like
            Linear scores.

        Returns
        -------
        numpy.float64 or numpy.ndarray
            Probabilities in ``[0, 1]`` with the same shape as ``z``.
        """
        z = np.asarray(z, dtype=float)
        # exp(-|z|) lies in (0, 1]; pick the algebraically equivalent form per sign.
        decay = np.exp(-np.abs(z))
        out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # out[()] turns a 0-d result back into a scalar and leaves arrays unchanged.
        return out[()]

    def fitGD(self,X,y):
        """Fit weights and bias by minimising the mean log-loss.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
            Numeric training features (ndarray or DataFrame).
        y : array-like of shape (num_samples,)
            Binary targets encoded as 0/1.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` disagree on the number of rows.
        """
        # Work on plain ndarrays: avoids pandas index alignment and per-iteration
        # DataFrame overhead inside the training loop.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        num_samples,num_features = X.shape
        if num_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != num_samples:
            raise ValueError(
                f"X has {num_samples} rows but y has {y.shape[0]} entries"
            )

        self.features = np.zeros(num_features)
        self.bias = 0.0

        for _ in range(self.num_it):
            linear_model = np.dot(X,self.features) + self.bias
            residual = self.sigmoid(linear_model) - y

            # Gradients of the mean log-loss w.r.t. the weights and the bias.
            w = (1/num_samples)*np.dot(X.T,residual)
            b = (1/num_samples)*np.sum(residual)
            self.features -= self.learningRate*w
            self.bias -= self.learningRate*b

    def predict(self,X):
        """Predict 0/1 class labels using a 0.5 probability threshold.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
            Numeric features with the same column layout used in ``fitGD``.

        Returns
        -------
        numpy.ndarray
            Integer array of predicted labels (1 where probability > 0.5).

        Raises
        ------
        RuntimeError
            If called before ``fitGD``.
        """
        if self.features is None or self.bias is None:
            raise RuntimeError("fitGD must be called before predict")
        linear_model = np.dot(np.asarray(X, dtype=float),self.features) + self.bias
        y_pred = self.sigmoid(linear_model)
        return (y_pred > 0.5).astype(int)

    def accuracy(self, y_test,y_pred):
        """Return the fraction of positions where ``y_test`` equals ``y_pred``.

        Both inputs are compared positionally as flat arrays, so pandas Series
        with differing indexes are handled.
        """
        y_test = np.asarray(y_test).ravel()
        y_pred = np.asarray(y_pred).ravel()
        accuracy = np.sum(y_test == y_pred)/len(y_test)
        return accuracy

In [23]:
# Instantiate the hand-written gradient-descent model.
# NOTE: this rebinds `model`, replacing the scikit-learn estimator created earlier.
model = LogisticRegressionGD(learningRate=0.01,num_it=5000)

In [24]:
# Train with gradient descent on the training split (features are passed unscaled).
model.fitGD(X_train,y_train)

In [25]:
# Predict test labels with the gradient-descent model (rebinds y_pred from the baseline).
y_pred = model.predict(X_test)

In [26]:
# Confusion matrix for the gradient-descent model: rows are true classes, columns are predicted classes.
confusion_matrix(y_test,y_pred)

array([[7681,  299],
       [ 864,  199]], dtype=int64)

In [27]:
# Test-set accuracy, computed by the class's own accuracy() helper.
model.accuracy(y_test,y_pred)

0.8713922370894615

In [28]:
# Per-class precision/recall/F1 for the gradient-descent model, for comparison with the baseline report.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93      7980
           1       0.40      0.19      0.25      1063

    accuracy                           0.87      9043
   macro avg       0.65      0.57      0.59      9043
weighted avg       0.84      0.87      0.85      9043

