In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [73]:
# Read the raw dataset once and keep an untouched snapshot, so later cells
# can always refer back to the data exactly as it was loaded.
DATA_PATH = "health.csv"
data = pd.read_csv(DATA_PATH)
data_copy = data.copy(deep=True)

In [74]:
# Separate the target column from the feature columns.
Y = data['Outcome']
X = data.drop(columns='Outcome')

Pre-Processing Steps

1. Check for any missing values

In [75]:
# Count nulls per column, then report only the columns that actually have gaps.
missing_values = data.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]

for column, count in columns_with_gaps.items():
    print(f'Column: {column}, Missing Values: {count}')


2. Drop the duplicates

In [76]:
# Keep the first occurrence of every fully identical row; the original index
# labels of the surviving rows are preserved.
data = data.drop_duplicates()

3. Standardization of the data (zero mean and unit variance for each numeric column)

In [None]:
# Work on the feature columns only; the label must not be standardized.
data = data.drop('Outcome', axis=1)

# Y was extracted before duplicate rows were dropped, so it can contain more
# rows than the de-duplicated frame. Restrict it to the surviving index labels
# so that features and labels stay row-aligned (and equally long).
Y = Y.loc[data.index]

num_data = data.select_dtypes(include=['number'])
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on the full dataset before the
# train/validation/test split, so test-set statistics leak into the scaling.
# Consider fitting on the training split only.
data[num_data.columns] = scaler.fit_transform(data[num_data.columns])

# Take the feature snapshot AFTER scaling: the split and the model below use X,
# so copying before the transform would silently train on unscaled values.
X = data.copy()

# Re-attach the label so `data` holds the complete pre-processed table.
data = pd.concat([data, Y], axis=1)
print(data)

In [78]:
# Stage 1: hold out 20% of the rows as the final test set.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
# Stage 2: carve 10% of the remaining rows out as a validation set.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_trainval, Y_trainval, test_size=0.1, random_state=42
)

Logistic Regression

In [79]:
def sigmoid(z):
  """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

  Args:
    z: scalar, NumPy array or pandas Series of real values.

  Returns:
    Values in [0, 1] with the same shape/container type as `z`.
  """
  # logaddexp(0, -z) == log(1 + exp(-z)) computed stably, so exponentiating its
  # negation gives the sigmoid without ever forming exp(-z) directly (which
  # overflows to inf for large negative z and emits a RuntimeWarning).
  return np.exp(-np.logaddexp(0, -z))

def cost(old_y, new_y):
  """Binary cross-entropy (negative log-likelihood), summed over all samples.

  Args:
    old_y: true labels in {0, 1} (array-like).
    new_y: predictions in [0, 1], given in the same order as `old_y`. Boolean
      hard predictions are accepted and treated as 0.0 / 1.0.

  Returns:
    A non-negative float; smaller means a better fit.
  """
  eps = 1e-15
  y_true = np.asarray(old_y, dtype=float)
  # Keep predictions strictly inside (0, 1) so log() never yields -inf / nan.
  y_prob = np.clip(np.asarray(new_y, dtype=float), eps, 1 - eps)
  log_likelihood = np.sum(y_true*np.log(y_prob) + (1 - y_true)*np.log(1 - y_prob))
  # The loss is the NEGATIVE log-likelihood; without the sign flip the value
  # is <= 0 and cannot be shown on a log-scaled axis.
  return -log_likelihood

def predict(X, parameters):
  """Return the sigmoid of the linear score `X.dot(parameters)`.

  `X` may be a single feature vector or a whole feature matrix; the result
  has one value per row of `X`.
  """
  linear_score = X.dot(parameters)
  return sigmoid(linear_score)

def accuracy(y_true, y_pred):
    """Fraction of positions at which `y_pred` equals `y_true`."""
    n_matches = np.sum(y_true == y_pred)
    return n_matches / len(y_true)

In [None]:
def learning_rate_schedule(iteration):
    """Inverse-time decay factor: 1 at iteration 0, then 1/2, 1/3, ..."""
    decay_denominator = iteration + 1.0
    return 1.0 / decay_denominator

# Sweep over base learning rates: train a logistic-regression model with
# stochastic gradient descent for each one, plot its learning curves, and
# collect test-set metrics for the comparison table printed at the end.
alpha_values = [1.0, 0.1, 0.01, 0.001]
results = []

# Seeded generator so the per-epoch shuffles (and therefore every curve and
# metric below) are reproducible across runs.
rng = np.random.default_rng(42)

# Plain NumPy copies of the training split: indexing these inside the
# per-sample loop avoids a pandas .iloc lookup for every single update.
X_train_arr = X_train.to_numpy(dtype=float)
Y_train_arr = Y_train.to_numpy(dtype=float)

# One entry per subplot: (legend/title label, y-axis label, colour, log y-axis?)
panel_specs = [
    ('Training Loss', 'Loss', 'C0', True),
    ('Validation Loss', 'Loss', 'orange', True),
    ('Training Accuracy', 'Accuracy', 'green', False),
    ('Validation Accuracy', 'Accuracy', 'red', False),
]

# NOTE(review): the model has no intercept term; it relies on the features
# being centred. Confirm this is intended.
for alpha in alpha_values:
    # Reset the parameters and the history lists for each alpha value
    initial_parameters = np.zeros(X.shape[1])
    training_loss = []
    validation_loss = []
    training_accuracy = []
    validation_accuracy = []

    for i in range(0, 1000):
        # The step size only depends on the epoch, so compute it once here.
        learning_rate = alpha * learning_rate_schedule(i)

        # One epoch: visit every training sample once, in a fresh random order.
        for j in rng.permutation(len(X_train_arr)):
            x_j = X_train_arr[j]
            y_j = Y_train_arr[j]

            # Log-loss gradient for one sample is (sigmoid(w.x) - y) * x.
            # It must use the predicted probability: thresholding to 0/1 first
            # turns this into a perceptron update that ignores confidence.
            prob_j = predict(x_j, initial_parameters)
            gradient = (prob_j - y_j) * x_j
            initial_parameters -= learning_rate * gradient

        # Training loss (on probabilities) and accuracy (on 0.5-thresholded labels)
        train_prob = predict(X_train, initial_parameters)
        training_loss.append(cost(Y_train, train_prob))
        training_accuracy.append(accuracy(Y_train, train_prob > 0.5))

        # Validation loss and accuracy, computed the same way as for training.
        # The loss needs probabilities; hard 0/1 predictions make log() blow up.
        val_prob = predict(X_val, initial_parameters)
        validation_loss.append(cost(Y_val, val_prob))
        validation_accuracy.append(accuracy(Y_val, val_prob > 0.5))

    # 2x2 grid of learning curves; losses use a log-scaled y axis (which can
    # only display positive values).
    histories = [training_loss, validation_loss, training_accuracy, validation_accuracy]
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    for ax, history, (label, ylabel, color, log_scale) in zip(axes.flat, histories, panel_specs):
        draw = ax.semilogy if log_scale else ax.plot
        draw(history, label=label, color=color)
        ax.set_xlabel('Iteration')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.set_title(f'{label} vs. Iteration (alpha={alpha})')

    fig.tight_layout()
    plt.show()

    # Final evaluation on the held-out test split. Cast to int so the
    # predictions have the same label type as Y_test.
    # NOTE(review): comparing alphas on the test set makes it part of model
    # selection; consider choosing alpha on the validation split instead.
    Y_test_pred = (predict(X_test, initial_parameters) > 0.5).astype(int)

    cm = confusion_matrix(Y_test, Y_test_pred)
    acc = accuracy_score(Y_test, Y_test_pred)
    precision = precision_score(Y_test, Y_test_pred)
    recall = recall_score(Y_test, Y_test_pred)
    f1 = f1_score(Y_test, Y_test_pred)

    results.append([alpha, acc, precision, recall, f1])

    print("Confusion Matrix")
    print(cm)

# Display the results table
headers = ["Alpha", "Accuracy", "Precision", "Recall", "F1 Score"]
print(tabulate(results, headers, tablefmt="fancy_grid"))

Replacing the sigmoid (logistic) function with tanh (the hyperbolic tangent function)

In [None]:
def tanh(x):
  """Element-wise hyperbolic tangent of `x` (thin wrapper over NumPy)."""
  activation = np.tanh(x)
  return activation

def derivative_tanh(x):
  """Derivative of tanh evaluated at `x`: d/dx tanh(x) = 1 - tanh(x)**2.

  Args:
    x: scalar or array-like input at which to evaluate the derivative.

  Returns:
    Values in [0, 1] with the same shape as `x`; equals 1 at x = 0.
  """
  # Expressed through np.tanh instead of exp(2x), so large |x| cannot
  # overflow to inf/inf = nan.
  return 1 - np.tanh(x)**2

def mean_squared_error(old_y, new_y):
    """Average of the squared differences between `new_y` and `old_y`."""
    squared_residuals = (new_y - old_y)**2
    inverse_count = 1/len(old_y)
    return inverse_count*np.sum(squared_residuals)

In [None]:
loss = []
accuracy = []

initial_parameters = np.zeros(X.shape[1])
alpha = 0.1

for i in range(0,1000):
  x = X.dot(initial_parameters)
  pred_y = tanh(x)