In [53]:
#imports needed
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [54]:
# Load the heart-disease dataset into a DataFrame and preview the first rows.
# Dataset source: https://www.kaggle.com/datasets/deesyalovely/heart-disease-dataset
dataFrame = pd.read_csv("../data/heart-disease-dataset.csv")
dataFrame.head()



Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
# Data cleaning: report missing values, drop duplicate rows, then impute
# missing numeric *feature* values with the per-column median.
print(dataFrame.isnull().sum())

# Drop duplicates (returns a new frame; the name is simply rebound to it)
dataFrame = dataFrame.drop_duplicates()

# A row without a label cannot be used for supervised training, and filling the
# label with a median would invent a class, so such rows are removed instead.
dataFrame = dataFrame.dropna(subset=["HeartDisease"])

# Get only numeric columns
numeric_cols = dataFrame.select_dtypes(include=["number"]).columns

# Impute feature columns only; the target column is deliberately excluded.
numeric_feature_cols = numeric_cols.difference(["HeartDisease"])

# NOTE: the medians are computed over the whole dataset, i.e. before the
# train/test split performed further down in the notebook.
feature_medians = dataFrame[numeric_feature_cols].median()

# fillna with a Series fills each column named in the Series index with its own
# value and leaves every other column untouched. Building a new frame (instead
# of assigning into a column subset) also avoids SettingWithCopyWarning.
dataFrame = dataFrame.fillna(feature_medians)

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [56]:
# One-hot encode the text-valued columns. drop_first=True omits the first level
# of every column, so each category set is represented by k-1 indicator columns.
categorical_cols = [
    'Sex',
    'ChestPainType',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
]

data_encoded = pd.get_dummies(
    data=dataFrame,
    columns=categorical_cols,
    drop_first=True,
)
data_encoded.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,1,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,0,True,False,True,False,True,False,False,False,True


In [57]:
# Separate the label column from the predictor columns.
target_column = "HeartDisease"
y = data_encoded[target_column]
X = data_encoded.drop(columns=[target_column])

In [58]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [60]:
# Build the four candidate classifiers.
log_model = LogisticRegression(max_iter=2000)
rf_model = RandomForestClassifier(random_state=42)
knn_model = KNeighborsClassifier()
# probability=True makes SVC shuffle the data internally to fit its probability
# estimates; seeding it keeps predict_proba reproducible between runs, in line
# with the seed already used for the split and the random forest.
svm_model = SVC(probability=True, random_state=42)

# TRAIN MODELS
# The random forest is fitted on the raw features; the other three are fitted
# on the standardized features. Prediction must use the matching representation.
log_model.fit(X_train_scaled, y_train)  # scaled
rf_model.fit(X_train, y_train)          # unscaled
knn_model.fit(X_train_scaled, y_train)  # scaled
svm_model.fit(X_train_scaled, y_train)  # scaled

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [61]:
# Predict on the held-out rows, giving each model the same feature
# representation (scaled or raw) that it was trained on.
log_pred = log_model.predict(X_test_scaled)
rf_pred = rf_model.predict(X_test)
knn_pred = knn_model.predict(X_test_scaled)
svm_pred = svm_model.predict(X_test_scaled)

# Accuracy scores, printed in a fixed order
for label, predictions in (
    ("Logistic Regression", log_pred),
    ("Random Forest", rf_pred),
    ("KNN", knn_pred),
    ("SVM", svm_pred),
):
    print(f"{label}:", accuracy_score(y_test, predictions))

Logistic Regression: 0.8532608695652174
Random Forest: 0.875
KNN: 0.8532608695652174
SVM: 0.875


In [62]:
# One row per candidate: display name, fitted estimator, test-set predictions.
evaluated = [
    ("Logistic Regression", log_model, log_pred),
    ("Random Forest", rf_model, rf_pred),
    ("KNN", knn_model, knn_pred),
    ("SVM", svm_model, svm_pred),
]

# Per-class precision / recall / f1 for every candidate
for title, _, predictions in evaluated:
    print(f"{title} Report:\n", classification_report(y_test, predictions))

# Keep each estimator together with its test accuracy so the export step
# can pick the best-scoring one.
models = {
    title: (estimator, accuracy_score(y_test, predictions))
    for title, estimator, predictions in evaluated
}

Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184

Random Forest Report:
               precision    recall  f1-score   support

           0       0.85      0.86      0.85        77
           1       0.90      0.89      0.89       107

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.88      0.88      0.88       184

KNN Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted 

In [64]:
# Export the fitted scaler, the best-scoring model, and a small metadata file
# that tells consumers how the two artifacts are meant to be used together.

import json
import os
import joblib

# Make sure the output directory exists
model_dir = "../model"
os.makedirs(model_dir, exist_ok=True)

# Save the scaler
scaler_path = os.path.join(model_dir, "scaler.pkl")
joblib.dump(scaler, scaler_path)


# Get the best model based on accuracy score. On a tie max() keeps the first
# entry in dict order. The score comes from the test split, so it is also the
# selection criterion and not an independent estimate.
best_name, (best_model, best_score) = max(models.items(), key=lambda x: x[1][1])
model_path = os.path.join(model_dir, f"{best_name.replace(' ', '_').lower()}.pkl")
joblib.dump(best_model, model_path)
print("Best model:", best_name, "Accuracy:", best_score)

# The random forest was trained on unscaled features while the other models
# were trained on scaler output. Record that next to the artifacts so nobody
# applies scaler.pkl in front of a model that never saw scaled inputs.
unscaled_models = {"Random Forest"}
model_info = {
    "model_name": best_name,
    "model_file": os.path.basename(model_path),
    "scaler_file": os.path.basename(scaler_path),
    "test_accuracy": float(best_score),
    "requires_scaling": best_name not in unscaled_models,
    # Column order the model (and the scaler) were fitted with.
    "feature_columns": [str(column) for column in X_train.columns],
}
info_path = os.path.join(model_dir, "model_info.json")
with open(info_path, "w", encoding="utf-8") as info_file:
    json.dump(model_info, info_file, indent=2)


print("Scaler saved to:", scaler_path)
print("Model saved to:", model_path)
print("Model info saved to:", info_path)

Best model: Random Forest Accuracy: 0.875
Scaler saved to: ../model\scaler.pkl
Model saved to: ../model\random_forest.pkl
