In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, roc_auc_score, roc_curve

In [4]:
# Load the dataset
data = pd.read_csv("/content/HeartDisease.csv")

# Fill missing values....... fill with mean median mode depending on data type

numerical_cols = ['age','education','cigsPerDay','totChol','sysBP','diaBP','BMI','heartRate','glucose']
categorical_cols = ['male','currentSmoker','BPMeds','prevalentStroke','prevalentHyp','diabetes','HeartDisease']
# For numerical columns
# numerical_cols = ['totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'cigsPerDay', 'age']
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

# For categorical columns
# categorical_cols = ['male', 'education', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes']
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

train_data = data

# Separate features and target variable if you have one (uncomment and adjust if needed)
X_train = train_data.drop('HeartDisease', axis=1)
Y_train = train_data['HeartDisease']


# Check the sizes
print(f"Training set size: {X_train.shape[0]}")

Training set size: 4238


In [5]:
#converting the pandas dataframe to numpy array

X_train = X_train.values
Y_train = Y_train.values


In [6]:
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

In [7]:
def accuracy(predictions, groundTruth):
  return np.mean(predictions == groundTruth)

In [8]:
class LogisticRegression():

    def __init__(self, learningRate, epochs, classThresh = 0.5):
        self.learningRate = learningRate
        self.epochs = epochs
        self.weights = None
        self.bias = None
        self.classThresh = classThresh

        self.costList = None
        self.accuracyList = None

    def getLoss(self):
      return self.costList

    def getAccuracy(self):
      return self.accuracyList

    def gradientDescent(self, X, Y):
        m, n = X.shape
        self.weights = np.ones(n) #initialising the weights as 0s
        self.bias = 1 #initialising the bias as zero

        self.costList = []
        self.accuracyList = []

        epsilon = 1e-10  # Small constant to prevent log(0)

        for _ in range(self.epochs):

            hypothesis = np.dot(X, self.weights) + self.bias
            Y_prediction = sigmoid(hypothesis)

            # Cost Function J(w, b), cross entropy loss
            cost = -(1/m) * np.sum(Y * np.log(Y_prediction + epsilon) + (1 - Y) * np.log(1 - Y_prediction + epsilon))

            #Accuracy
            classification = [0 if y<= self.classThresh else 1 for y in Y_prediction]
            acc = accuracy(classification, Y)

            self.costList.append(cost)
            self.accuracyList.append(acc)

            #calculating the gradients (partial derivative of the cost function J(w, b) wrt weights and bias)
            dw = (1/m) * np.dot(X.T, (Y_prediction - Y))
            db = (1/m) * np.sum(Y_prediction - Y)

            self.weights = self.weights - self.learningRate * dw
            self.bias = self.bias - self.learningRate * db

            # if _ % (self.epochs // 10) == 0:  # Print cost every 10% of iterations
            #     print(f"Cost after {_} iterations: {cost} & accuracy : {acc}")


    def classify(self, X):
        m, n = X.shape

        hypothesis = np.dot(X, self.weights) + self.bias
        Y_prediction = sigmoid(hypothesis)

        classLabels = [0 if y <= self.classThresh else 1 for y in Y_prediction]

        return classLabels

In [9]:
def fit_min_max_scaling(X):
    return (X - X.min()) / (X.max() - X.min())

In [17]:
# spliting the dataset into k folds
def createKFolds(X, Y, k):
    # Shuffle data
    indices = np.arange(X.shape[0])
    # np.random.shuffle(indices)

    X_shuffled = X[indices]
    Y_shuffled = Y[indices]

    # Split into k approximately equal-sized folds
    fold_size = len(X) // k
    folds_X = []
    folds_Y = []

    for i in range(k):
        start = i * fold_size
        # Handling last fold
        end = min((i + 1) * fold_size, len(X))

        folds_X.append(X_shuffled[start:end])
        folds_Y.append(Y_shuffled[start:end])

    return folds_X, folds_Y


def kFoldCrossValidation(X, Y, learningRate, epochs, k=5, classThresh=0.5):
    # Create k folds
    folds_X, folds_Y = createKFolds(X, Y, k)

    losses = []
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for i in range(k):
        # Prepare training and validation data for the ith fold
        X_train = np.vstack([folds_X[j] for j in range(k) if j != i])
        Y_train = np.hstack([folds_Y[j] for j in range(k) if j != i])

        X_val = folds_X[i]
        Y_val = folds_Y[i]

        X_train = fit_min_max_scaling(X_train)
        X_val = fit_min_max_scaling(X_val)

        # Initialize the logistic regression model
        model = LogisticRegression(learningRate, epochs, classThresh)

        # Train the model using Batch Gradient Descent
        model.gradientDescent(X_train, Y_train)

        # Get predictions for the validation set
        Y_pred = np.array(model.classify(X_val))

        # Calculate validation loss and accuracy
        epsilon = 1e-10
        val_loss = -(1/len(Y_val)) * np.sum(Y_val * np.log(Y_pred + epsilon) + (1 - Y_val) * np.log(1 - Y_pred + epsilon))
        val_accuracy = accuracy(Y_pred, Y_val)

        precisions.append(precision_score(Y_val, Y_pred, zero_division=0))
        recalls.append(recall_score(Y_val, Y_pred, zero_division=0))
        f1_scores.append(f1_score(Y_val, Y_pred, zero_division=0))

        # Store the performance metrics for this fold
        losses.append(val_loss)
        accuracies.append(val_accuracy)

        print(f"Fold {i + 1}: Loss = {val_loss:.4f}, Accuracy = {val_accuracy:.4f}")
        print(f"Fold {i + 1}: Precision = {precisions[-1]:.4f}, Recall = {recalls[-1]:.4f}, F1 Score = {f1_scores[-1]:.4f}")
        print()

    # Compute and print average loss and accuracy over all folds

    print("--------------%%%%%----------------")
    print(f"Mean Loss: {np.mean(losses):.4f}")
    print(f"Std Loss: {np.std(losses):.4f}")
    print()

    print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
    print(f"Std Accuracy: {np.std(accuracies):.4f}")
    print()

    print(f"Mean Precision: {np.mean(precisions):.4f}")
    print(f"Std Precision: {np.std(precisions):.4f}")
    print()

    print(f"Mean Recall: {np.mean(recalls):.4f}")
    print(f"Std Recall: {np.std(recalls):.4f}")
    print()

    print(f"Mean F1 Score: {np.mean(f1_scores):.4f}")
    print(f"Std F1 Score: {np.std(f1_scores):.4f}")

In [25]:
learningRate = 0.01
epochs = 500
ct = 0.5

kFoldCrossValidation(X_train, Y_train, learningRate, epochs, k=5, classThresh=ct)


Fold 1: Loss = 3.7787, Accuracy = 0.8359
Fold 1: Precision = 1.0000, Recall = 0.0211, F1 Score = 0.0414

Fold 2: Loss = 3.1535, Accuracy = 0.8630
Fold 2: Precision = 0.0000, Recall = 0.0000, F1 Score = 0.0000

Fold 3: Loss = 3.5884, Accuracy = 0.8442
Fold 3: Precision = 0.4583, Recall = 0.0846, F1 Score = 0.1429

Fold 4: Loss = 3.5069, Accuracy = 0.8477
Fold 4: Precision = 0.0000, Recall = 0.0000, F1 Score = 0.0000

Fold 5: Loss = 3.4253, Accuracy = 0.8512
Fold 5: Precision = 0.5556, Recall = 0.0394, F1 Score = 0.0735

--------------%%%%%----------------
Mean Loss: 3.4906
Std Loss: 0.2053

Mean Accuracy: 0.8484
Std Accuracy: 0.0089

Mean Precision: 0.4028
Std Precision: 0.3762

Mean Recall: 0.0290
Std Recall: 0.0314

Mean F1 Score: 0.0516
Std F1 Score: 0.0534
