In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Load data

In [2]:
# NOTE(review): absolute path (looks like Google Colab's /content directory);
# adjust DATA_PATH when running the notebook elsewhere.
DATA_PATH = "/content/heart.csv"
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


# Preprocessing

In [3]:
# Count the NaN entries in every column and show the tally
missing_values = df.isna().sum()
print("Missing values per column:", missing_values, sep="\n")

# Rows with missing values (if any) are dropped in the next cell


Missing values per column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [4]:
# Rebind instead of mutating in place: the frame object displayed by the earlier
# cell stays untouched, and re-running this cell remains idempotent.
df = df.dropna()

# Separate features and the target variable

In [5]:
# Label column first, then every remaining column as the feature matrix
y = df['target']
X = df.drop(columns=['target'])


In [6]:

# Define categorical and numerical columns
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Define the column transformer: standardize the numeric columns and one-hot
# encode the categorical ones.
# sparse_threshold=0 makes fit_transform always return a dense ndarray. With the
# default (0.3) the stacked output becomes a SciPy sparse matrix whenever its
# density drops below the threshold, and the hand-written model below relies on
# np.dot with dense arrays.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ],
    sparse_threshold=0)

# Apply transformations
# NOTE(review): the transformer is fitted on the full dataset before the
# train/test split, so the scaler statistics also include the test rows.
# Fitting on the training rows only would avoid that leakage.
X_processed = preprocessor.fit_transform(X)





In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Define sigmoid function
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    The naive formula evaluates ``exp(-z)``, which overflows (RuntimeWarning,
    intermediate ``inf``) for large negative ``z``. Here only ``exp(-|z|)`` is
    evaluated, which always lies in (0, 1], and the algebraically equivalent
    form ``exp(z) / (1 + exp(z))`` is used for negative inputs.

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray with the same
        shape as ``z``, holding values in [0, 1].
    """
    z = np.asarray(z)
    # exp(-|z|) can underflow to 0 but never overflow
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # [()] turns a 0-d result back into a NumPy scalar and leaves arrays as they are
    return result[()]

In [8]:
# Starting point for gradient descent: every weight and the bias are zero
def initialize_parameters(features):
    """Create the initial logistic-regression parameters.

    Args:
        features: Number of input features (rows of the weight vector).

    Returns:
        Tuple ``(W, b)`` where ``W`` is a ``(features, 1)`` array of zeros and
        ``b`` is the integer 0.
    """
    weights = np.zeros(shape=(features, 1))
    bias = 0
    return weights, bias

In [9]:
def compute_cost_and_gradient(X, y, W, b):
    """Binary cross-entropy cost and its gradients for logistic regression.

    Args:
        X: Feature matrix with one row per sample.
        y: Binary targets with one entry per sample. Any layout with the right
            number of elements is accepted (1-D, column vector, pandas Series);
            it is reshaped to match the activations.
        W: Weight array, e.g. ``(n_features, 1)`` as produced by
            ``initialize_parameters``.
        b: Scalar bias.

    Returns:
        Tuple ``(cost, dW, db)``: the mean cross-entropy, the gradient with
        respect to ``W`` and the gradient with respect to ``b``.

    Raises:
        ValueError: If ``y`` does not contain exactly one value per activation.
    """
    Z = np.dot(X, W) + b
    A = sigmoid(Z)

    # Align the targets with the activations. Without this, a 1-D ``y`` against
    # an (m, 1) ``A`` broadcasts ``A - y`` to (m, m) and silently yields wrong
    # gradients.
    y = np.asarray(y).reshape(A.shape)
    m = y.shape[0]

    # Keep probabilities away from exactly 0 and 1 so log() stays finite when
    # the sigmoid saturates.
    eps = 1e-15
    A_safe = np.clip(A, eps, 1 - eps)

    cost = -np.mean(y * np.log(A_safe) + (1 - y) * np.log(1 - A_safe))
    dW = np.dot(X.T, (A - y)) / m
    db = np.mean(A - y)
    return cost, dW, db

In [10]:
def train(X, y, learning_rate, num_iterations, debug=False):
    """Fit logistic-regression parameters with batch gradient descent.

    Args:
        X: Feature matrix; ``X.shape[1]`` sets the number of weights.
        y: Targets, passed unchanged to ``compute_cost_and_gradient``.
        learning_rate: Step size applied to each gradient.
        num_iterations: Number of full-batch updates to run.
        debug: When true, print the cost on every 100th iteration
            (starting with iteration 0).

    Returns:
        Tuple ``(W, b)`` with the learned weights and bias.
    """
    n_features = X.shape[1]
    W, b = initialize_parameters(n_features)

    for i in range(num_iterations):
        cost, grad_W, grad_b = compute_cost_and_gradient(X, y, W, b)

        # Step against the gradient; W is updated in place, b is rebound
        W -= learning_rate * grad_W
        b = b - learning_rate * grad_b

        if i % 100 == 0 and debug:
            print(f'Iteration {i}, Cost: {cost}')

    return W, b

In [11]:
def predict(X, W, b, threshold=0.5):
    """Predict binary class labels from a fitted logistic-regression model.

    Args:
        X: Feature matrix with one row per sample.
        W: Weight array compatible with ``np.dot(X, W)``.
        b: Scalar bias.
        threshold: Probability cut-off; a sample is labelled 1 when its
            predicted probability is greater than or equal to this value.
            Defaults to 0.5, the previously hard-coded cut-off.

    Returns:
        Integer array of 0/1 labels with the same shape as ``np.dot(X, W)``.
    """
    Z = np.dot(X, W) + b
    A = sigmoid(Z)
    return (A >= threshold).astype(int)

In [17]:
# Fit the model; the targets are reshaped into an (m, 1) column first
y_train_column = y_train.to_numpy().reshape(-1, 1)
W, b = train(X_train, y_train_column, learning_rate=0.01, num_iterations=1000, debug=True)

Iteration 0, Cost: 0.6931471805599453
Iteration 100, Cost: 0.545718321642919
Iteration 200, Cost: 0.4812686589626463
Iteration 300, Cost: 0.44555387123099993
Iteration 400, Cost: 0.4226388981888261
Iteration 500, Cost: 0.40646796389152534
Iteration 600, Cost: 0.394300107077326
Iteration 700, Cost: 0.384724778068689
Iteration 800, Cost: 0.37694119864112396
Iteration 900, Cost: 0.37045848552974225


In [18]:
def evaluate(X, y, W, b):
    """Classification accuracy of the model on ``(X, y)``.

    Args:
        X: Feature matrix with one row per sample.
        y: True binary labels; 1-D arrays, (m, 1) column vectors and pandas
            Series are all accepted.
        W: Weight array passed to ``predict``.
        b: Scalar bias passed to ``predict``.

    Returns:
        Fraction of samples whose predicted label equals the true label.
    """
    # Flatten both sides: comparing a flat prediction vector with an (m, 1)
    # target would broadcast to an (m, m) matrix and report a wrong accuracy.
    y_pred = np.ravel(predict(X, W, b))
    y_true = np.ravel(y)
    accuracy = np.mean(y_pred == y_true)
    return accuracy

In [19]:
# Accuracy on the data the model was fitted on, and on the held-out rows
train_accuracy = evaluate(X_train, y_train, W, b)
test_accuracy = evaluate(X_test, y_test, W, b)

print(
    f"Training Accuracy: {train_accuracy * 100:.2f}%",
    f"Test Accuracy: {test_accuracy * 100:.2f}%",
    sep="\n",
)

Training Accuracy: 85.95%
Test Accuracy: 90.16%


In [32]:
# Build a confusion matrix and unpack its four cells
def compute_confusion_metrics(y_true, y_pred):
    """Return the binary confusion-matrix counts for 0/1 labels.

    ``labels=[0, 1]`` pins the matrix to 2x2. Without it, the matrix collapses
    to 1x1 when only one class occurs in the inputs, and indexing ``[1, 1]``
    raises ``IndexError``.

    Args:
        y_true: True 0/1 labels (any array-like; flattened before use).
        y_pred: Predicted 0/1 labels (any array-like; flattened before use).

    Returns:
        Tuple ``(TP, TN, FP, FN)``.
    """
    conf_matrix = confusion_matrix(np.ravel(y_true), np.ravel(y_pred), labels=[0, 1])
    # Row = actual class, column = predicted class, so row-major order is TN, FP, FN, TP
    TN, FP, FN, TP = conf_matrix.ravel()
    return TP, TN, FP, FN



In [33]:
# Evaluate the model: precision, recall and F1-score
def compute_metrics(TP, TN, FP, FN):
    """Derive precision, recall and F1-score from confusion-matrix counts.

    Any ratio whose denominator is zero is reported as 0 instead of raising.
    ``TN`` is accepted for a uniform call signature but is not used.

    Returns:
        Tuple ``(precision, recall, f1_score)``.
    """
    predicted_positive = TP + FP
    actual_positive = TP + FN

    if predicted_positive != 0:
        precision = TP / predicted_positive
    else:
        precision = 0

    if actual_positive != 0:
        recall = TP / actual_positive
    else:
        recall = 0

    denominator = precision + recall
    if denominator != 0:
        f1_score = 2 * precision * recall / denominator
    else:
        f1_score = 0

    return precision, recall, f1_score

In [28]:
# Hard 0/1 predictions for the held-out rows
y_pred_test = predict(X=X_test, W=W, b=b)

In [29]:
# Confusion-matrix counts first, then precision / recall / F1 derived from them
confusion_counts = compute_confusion_metrics(y_test, y_pred_test)
TP, TN, FP, FN = confusion_counts
precision, recall, f1_score = compute_metrics(*confusion_counts)


In [30]:
# Print confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
print(f"Confusion Matrix:\n{conf_matrix}")


Confusion Matrix:
[[27  2]
 [ 4 28]]


In [31]:
# Report the three scores, one per line, with four decimals
print(
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1_score:.4f}",
    sep="\n",
)

Precision: 0.9333
Recall: 0.8750
F1-Score: 0.9032


# Challenges

Challenges Faced
Data Cleaning
Handling Missing Values:
 One of the first steps was to check for and handle missing values. In this dataset there are no missing values, so no rows had to be dropped.
Categorical Encoding and Scaling:
 One-hot encoding was applied to the categorical variables and standard scaling to the numerical variables.
Feature Scaling:
The numerical features were standardized to have a mean of 0 and a standard deviation of 1.





# Model performance

Model Implementation
From Scratch Implementation:
 Building the logistic regression function from scratch was challenging, because every matrix operation had to be performed correctly and efficiently.

 Gradient Descent :
Choosing an appropriate learning rate was essential. A learning rate that was too high could cause the model to overshoot the optimal solution, while a rate that was too low would result in slow convergence.

Care was required when building the matrix operations and when ensuring that the sigmoid function and the cost calculation were implemented correctly.

# Insights Gained from Model Performance

The model achieved approximately 90% accuracy on the testing set, indicating it performs well in predicting the presence of heart disease. The training accuracy was similar (about 86%), suggesting the model generalizes well to unseen data rather than overfitting.

Precision, Recall, and F1-Score:

Precision: Approximately 0.93, indicating that when the model predicts heart disease, it is correct 93% of the time.

Recall: Approximately 0.875, meaning the model successfully identified about 88% (28 of 32) of the actual heart disease cases.

F1-Score: Approximately 0.90, balancing precision and recall and indicating overall good performance.

# Conclusion

Implementing logistic regression from scratch on this heart disease dataset shows the importance of data preprocessing, categorical encoding, and feature scaling. The model's performance metrics suggest that it is a reasonably reliable predictor of heart disease.

 The process also highlighted the value of understanding the mathematical concepts behind machine learning. The findings suggest that logistic regression is an effective method for predicting heart disease.

 Handling categorical data and feature scaling are particularly important for models such as this one that are trained with gradient descent.