In [27]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import copy
from sklearn.metrics import accuracy_score

# Step 1: Creating Dumy Dataset 

In [28]:
x,y = make_classification(n_features=4,n_classes=2)
X_train_dumy,X_test_dumy,y_train_dumy,y_test_dumy = train_test_split(x,y,test_size=0.1)

# Step 2: Standardization

In [29]:
def Zscore_standarize_function(x):
    mu = np.mean(x,axis =0)
    sigma = np.std(x,axis=0)
    
    x_norm = (x-mu)/sigma
    
    return x_norm

In [30]:
X_test_dumy = Zscore_standarize_function(X_test_dumy)
X_train_dumy = Zscore_standarize_function(X_train_dumy)

# Step 3: Initializing Parameters

In [31]:
n , m = x.shape

print('number of features:',m)
print('number of samples:',n)

w_tmp_dumy = np.random.rand(4,)
b_tmp_dumy= np.random.randint(1)
lr_dumy = 0.001
num_iters_dumy = 1000


number of features: 4
number of samples: 100


# Step 4: Some Functions needed for fitting the algorithm

In [32]:
#Sigmoid Function

def sigmoid(z):
    g = 1/(1 + np.exp(-z))
    
    return g


#Cost Function

def compute_logistic_cost_function(x,y,w,b):
    m= x.shape[0]
    
    cost = 0
    for i in range(m):
        z = np.dot(x[i],w)+b
        f = sigmoid(z)
        cost += -y[i]*np.log(f) - (1-y[i])*np.log(1-f)
    
    cost = cost/m
    return cost

# Step 5: Fitting the Algorithm using Gradient Descent 

In [33]:
def compute_gradient_logistic(X, y, w, b): 
    m,n = X.shape
    dj_dw = np.zeros((n,))                           
    dj_db = 0.

    for i in range(m):
        f_wb_i = sigmoid(np.dot(X[i],w) + b)          
        err_i  = f_wb_i  - y[i]                       
        for j in range(n):
            dj_dw[j] = dj_dw[j] + err_i * X[i,j]      
        dj_db = dj_db + err_i
    dj_dw = dj_dw/m                                   
    dj_db = dj_db/m                                  
        
    return dj_db, dj_dw  

def gradient_descent(X, y, w_in, b_in, alpha, num_iters): 
    J_history = []
    w = copy.deepcopy(w_in)  
    b = b_in
    
    for i in range(num_iters):
        dj_db, dj_dw = compute_gradient_logistic(X, y, w, b)   

        w = w - alpha * dj_dw               
        b = b - alpha * dj_db               
      
        if i<100000:      # prevent resource exhaustion 
            J_history.append( compute_logistic_cost_function(X, y, w, b) )

        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]}   ")
        
    return w, b, J_history         

In [34]:
w_out_dumy , b_out_dumy , _ = gradient_descent(X_train_dumy,y_train_dumy,w_tmp_dumy,b_tmp_dumy,lr_dumy , num_iters_dumy)
print(f"\nupdated parameters: w:{w_out_dumy}, b:{b_out_dumy}")

Iteration    0: Cost 0.9041044371546386   
Iteration  100: Cost 0.8622888576679208   
Iteration  200: Cost 0.823482877640195   
Iteration  300: Cost 0.7876210332471498   
Iteration  400: Cost 0.7546092313483208   
Iteration  500: Cost 0.724328354909744   
Iteration  600: Cost 0.6966386778886507   
Iteration  700: Cost 0.6713848057351323   
Iteration  800: Cost 0.6484008152477022   
Iteration  900: Cost 0.627515263555542   

updated parameters: w:[0.61648149 0.72113495 0.60491557 0.62875885], b:0.012174224903258877


# Step 6: Prediction

In [35]:
def predict(X, w, b) :
    m, n = X.shape
    p = np.zeros(m)    
    for i in range(m) :
        f_wb = sigmoid(np.dot(X[i], w) + b)
        p[i] = 1 if f_wb > 0.5 else 0
    return p

In [36]:
y_pred_dumy = predict(X_test_dumy,w_out_dumy,b_out_dumy)
print(f"Test Accuracy = {accuracy_score(y_test_dumy, y_pred_dumy) * 100}") 

Y_pred2 = predict(X_train_dumy,w_out_dumy,b_out_dumy)
print('Train Accuracy: %f'%(np.mean(Y_pred2 == y_train_dumy) * 100))

Test Accuracy = 90.0
Train Accuracy: 70.000000


# Step 7: Apply on Real dataset

In [37]:
dataset = pd.read_csv('framingham.csv')
dataset.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [38]:
# check for null values
print(dataset.isnull().sum())
print(dataset.shape)

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64
(4238, 16)


### fill the null values

In [39]:
mean_eduacation  = dataset['education'].mean()
dataset['education'].fillna(value=mean_eduacation,inplace=True)

dataset['cigsPerDay'].fillna(value=0.0,inplace=True)

np.random.seed(3)
dataset['BPMeds'].fillna(value=np.random.randint(0,2), inplace=True)

mean_totchol = dataset['totChol'].mean()
dataset['totChol'].fillna(value=mean_totchol,inplace=True)

mean_eduacation  = dataset['BMI'].mean()
dataset['BMI'].fillna(value=mean_eduacation,inplace=True)

mean_eduacation  = dataset['glucose'].mean()
dataset['glucose'].fillna(value=mean_eduacation,inplace=True)

mean_value_rate = dataset['heartRate'].mean()
dataset['heartRate'].fillna(value=mean_value_rate, inplace=True)

dataset.isnull().sum()


male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [40]:
y = dataset['TenYearCHD']
X = dataset.drop('TenYearCHD', axis = 1)
X = X.to_numpy()
y = y.to_numpy()

In [41]:
# normalization
x = Zscore_standarize_function(X)

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.1)

In [42]:
## initializing parameters 
n , m = x.shape

print('number of features:',m)
print('number of samples:',n)

w_tmp = np.random.rand(15,)
b_tmp = np.random.randint(1)
lr = 0.001
num_iters = 1000


number of features: 15
number of samples: 4238


### fit the model

In [43]:
w_out , b_out , _ = gradient_descent(x_train,y_train,w_tmp,b_tmp,lr,num_iters)
print(f"\nupdated parameters: w:{w_out}, b:{b_out}")

Iteration    0: Cost nan   


  cost += -y[i]*np.log(f) - (1-y[i])*np.log(1-f)
  cost += -y[i]*np.log(f) - (1-y[i])*np.log(1-f)


Iteration  100: Cost 1.1766692429500234   
Iteration  200: Cost 4.065935715328967   
Iteration  300: Cost 2.8551401691100353   
Iteration  400: Cost 5.9671799310727405   
Iteration  500: Cost 2.3489462446601186   
Iteration  600: Cost 0.8091376047197567   
Iteration  700: Cost 3.5299625806634127   
Iteration  800: Cost 9.423100637497322   
Iteration  900: Cost 1.2162822089283933   

updated parameters: w:[ 0.31708515  0.14398034  0.07797935  0.48733481  0.07520919  0.49418045
  0.30931967  0.86478449  0.37480672 -0.07674173  0.09403801 -0.20845362
  0.14774636 -0.16381149  0.02088486], b:-0.011919414348499498


### predict

In [44]:
y_pred = predict(x_test,w_out,b_out)
print(f"Test Accuracy = {accuracy_score(y_test, y_pred) * 100}") 

Y_pred2 = predict(x_train,w_out,b_out)
print('Train Accuracy: %f'%(np.mean(Y_pred2 == y_train) * 100))

Test Accuracy = 83.9622641509434
Train Accuracy: 84.897745
