In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("dataset/diadata.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome,Race
0,6,148,72,33.6,0.627,50,1,White
1,1,85,66,26.6,0.351,31,0,White
2,8,183,64,23.3,0.672,32,1,White
3,1,89,66,28.1,0.167,21,0,White
4,0,137,40,43.1,2.288,33,1,White


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               729 non-null    int64  
 1   Glucose                   729 non-null    int64  
 2   BloodPressure             729 non-null    int64  
 3   BMI                       729 non-null    float64
 4   DiabetesPedigreeFunction  729 non-null    float64
 5   Age                       729 non-null    int64  
 6   Outcome                   729 non-null    int64  
 7   Race                      729 non-null    object 
dtypes: float64(2), int64(5), object(1)
memory usage: 45.7+ KB


In [4]:
le = LabelEncoder()
df["Race_int"] = le.fit_transform(df["Race"])

df.pop("Race")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome,Race_int
0,6,148,72,33.6,0.627,50,1,2
1,1,85,66,26.6,0.351,31,0,2
2,8,183,64,23.3,0.672,32,1,2
3,1,89,66,28.1,0.167,21,0,2
4,0,137,40,43.1,2.288,33,1,2


In [5]:
y = df.pop("Outcome")
X= df.copy()

In [6]:
class LogisticRegressionBGD:
    def __init__(self,learning_rate=0.01, epochs = 300):
        self.coef_ = None
        self.lr = learning_rate
        self.epochs = epochs
        
    def sigmoid(self, arr):
        return 1/(1+np.exp(-1*arr))
        
    def fit(self,X,y):  
              
        X = np.insert(np.array(X),0,1,axis = 1)
        self.coef_ = np.ones(X.shape[1])
        
        for i in range(self.epochs):            
        
            y_hat = self.sigmoid(np.dot(X,self.coef_))
            
            coef_der = np.dot((y - y_hat),X)/X.shape[0]
            self.coef_ = self.coef_ + self.lr*coef_der
            
    def predict(self, X):
        X = np.insert(np.array(X),0,1,axis = 1)
        y_hat = self.sigmoid(np.dot(X,self.coef_))
        return y_hat.ravel()

In [7]:
from sklearn import metrics
log_reg = LogisticRegressionBGD(0.001,10000)

In [8]:
log_reg.fit(X,y)
y_hat = log_reg.predict(X)
metrics.roc_auc_score(y,y_hat)

0.7408441547617064

In [9]:
logr = LogisticRegression(max_iter=10000,n_jobs=-1,verbose=2,tol=0.001)
logr.fit(X,y)
y_hat_lo = logr.predict(X)
metrics.roc_auc_score(y,y_hat_lo)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.0s finished


0.7262831519111838