1.	Write python programs to predict diabetes using logistic regression. Implement the algorithm using library and without using library. Implement batch gradient descent. Find accuracy, precision, recall, F1-score, and specificity and compare both strategies (Use diabetes.csv). Assume train/test split is 70:30.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data handeling
from sklearn.model_selection import train_test_split # splitting data in train/test set
from scipy.special import expit


In [None]:
def sigmoid(z):
  return 1.0 / (1 + np.exp(-z))

def predict(features, weights):
  '''
  Returns 1D array of probabilities
  that the class label == 1
  '''
  z = np.dot(features, weights)
  return sigmoid(z)

In [None]:
def update_weights(features, labels, weights, lr):
    '''
    Vectorized Gradient Descent

    Features:(200, 3)
    Labels: (200, 1)
    Weights:(3, 1)
    '''
    N = len(features)

    #1 - Get Predictions
    predictions = predict(features, weights)


    #2 Transpose features from (200, 3) to (3, 200)
    # So we can multiply w the (200,1)  cost matrix.
    # Returns a (3,1) matrix holding 3 partial derivatives --
    # one for each feature -- representing the aggregate
    # slope of the cost function across all observations
    gradient = np.dot(features.T,  predictions - labels)

    #3 Take the average cost derivative for each feature
    gradient /= N

    #4 - Multiply the gradient by our learning rate
    gradient *= lr

    #5 - Subtract from our weights to minimize cost
    weights -= gradient

    return weights

In [None]:
def decision_boundary(prob):
  return 1 if prob >= .5 else 0

In [None]:
def classify(predictions):
  '''
  input  - N element array of predictions between 0 and 1
  output - N element array of 0s (False) and 1s (True)
  '''
  decision_boundarys = np.vectorize(decision_boundary)
  return decision_boundarys(predictions)

In [None]:
def train(features, labels, weights, lr, iters):
    for i in range(iters):
        weights = update_weights(features, labels, weights, lr)

    return weights

In [None]:
def accuracy(predicted_labels, actual_labels):
    diff = predicted_labels - actual_labels
    return 1.0 - (float(np.count_nonzero(diff)) / len(diff))

def precision(predicted_labels, actual_labels):
    connected_result = zip(predicted_labels, actual_labels.values)
    both_ones = 0
    first_one = 0
    for prediction, actual in connected_result:
        if (prediction[0] == 1) & (actual[0] == 1):
            both_ones += 1
        if (prediction[0] == 1) & (actual[0] != 1):
            first_one += 1

    TP = both_ones
    FP = first_one

    return (TP)/ (TP+FP)

def recall(predicted_labels, actual_labels):
    connected_result = zip(predicted_labels, actual_labels.values)
    both_ones = 0
    total_ones = 0
    for prediction, actual in connected_result:
        if (prediction[0] == 1) & (actual[0] == 1):
            both_ones += 1
        if (actual[0] == 1):
            total_ones += 1

    return both_ones / total_ones

def f1_score(predicted_labels, actual_labels):
    precision_score = precision(predicted_labels, actual_labels)
    recall_score = recall(predicted_labels, actual_labels)

    return (2 * precision_score * recall_score) / (precision_score + recall_score)

def specificity(predicted_labels, actual_labels):
    connected_result = zip(predicted_labels, actual_labels.values)
    both_zeroes = 0
    ones_zeroes = 0
    for prediction, actual in connected_result:
        if (prediction[0] == 0) & (actual[0] == 0):
            both_zeroes += 1
        if (prediction[0] == 1 & actual[0] == 0):
            ones_zeroes += 1

    return both_zeroes / (both_zeroes + ones_zeroes)


In [None]:
df= pd.read_csv("Diabetes.csv")

features = ['Pragnency', 'Glucose', 'Blod Pressure', 'Skin Thikness', 'Insulin', 'BMI', 'DFP', 'Age']
x = df.loc[:, features]
y = df.loc[:, ['Diabetes']]



X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.3,random_state=0)

In [None]:
weights = np.zeros([8, 1])
weights = train(X_train, Y_train, weights, 0.1, 1000) #train(features, labels, weights, lr, iters)
predictions = predict(X_test, weights)
classifications = classify(predictions)

accuracy_result = accuracy(classifications, Y_test)
precision_result = precision(classifications, Y_test)
recall_result = recall(classifications, Y_test)
f1_result = f1_score(classifications, Y_test)
specificity_result = specificity(classifications, Y_test)

print("Accuracy = {0}".format(str(accuracy_result)))
print("Precision = {0}".format(str(precision_result)))
print("Recall = {0}".format(str(recall_result)))
print("F1 Score = {0}".format(str(f1_result)))
print("Specificity Score = {0}".format(str(specificity_result)))

  


Accuracy = 0.4025974025974026
Precision = 0.3382352941176471
Recall = 0.9583333333333334
F1 Score = 0.5
Specificity Score = 0.5
