# 1. Problem Definition

## Problem Statement
The aim of this project was to develop, train, and deploy a machine learning model that predicts whether a user is at risk of heart disease.

In [59]:
# imports needed
import glob
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [None]:
# Import data: load every CSV file found directly inside the data folder.
#
# Sources of the datasets:
# https://www.kaggle.com/datasets/deesyalovely/heart-disease-dataset
# https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset

data_path = "../data/*.csv"
# glob returns its matches in arbitrary order; sorting keeps the positions in
# df_list (printed later as "Dataset 0", "Dataset 1", ...) stable across runs.
files = sorted(glob.glob(data_path))
if not files:
    # Fail here with a clear message instead of inside pd.concat below.
    raise FileNotFoundError(f"No CSV files matched {data_path!r}")

df_list = []
for file in files:
    df = pd.read_csv(file)
    df_list.append(df)
    print(f"Loaded {file} with shape {df.shape}")

# First look only: concat aligns on column names, so files whose headers
# differ end up side by side with NaN padding.
dataFrame = pd.concat(df_list, axis=0, ignore_index=True)

print("Combined dataset shape:", dataFrame.shape)
dataFrame.head()



Loaded ../data\heart-disease-dataset.csv with shape (918, 12)
Loaded ../data\heart.csv with shape (1025, 14)
Combined dataset shape: (1943, 26)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,...,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,40.0,M,ATA,140.0,289.0,0.0,Normal,172.0,N,0.0,...,,,,,,,,,,
1,49.0,F,NAP,160.0,180.0,0.0,Normal,156.0,N,1.0,...,,,,,,,,,,
2,37.0,M,ATA,130.0,283.0,0.0,ST,98.0,N,0.0,...,,,,,,,,,,
3,48.0,F,ASY,138.0,214.0,0.0,Normal,108.0,Y,1.5,...,,,,,,,,,,
4,54.0,M,NAP,150.0,195.0,0.0,Normal,122.0,N,0.0,...,,,,,,,,,,


In [61]:
# Check differences in column names between the loaded datasets.
for position, frame in enumerate(df_list):
    column_names = list(frame.columns)
    print(f"Dataset {position} columns:", column_names)

Dataset 0 columns: ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']
Dataset 1 columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']


In [62]:
# Rename the columns of df1 so they match the header names used by df0.
# Each pair is (name in heart.csv, name in heart-disease-dataset.csv).
rename_map = dict([
    ('age', 'Age'),
    ('sex', 'Sex'),
    ('cp', 'ChestPainType'),
    ('trestbps', 'RestingBP'),
    ('chol', 'Cholesterol'),
    ('fbs', 'FastingBS'),
    ('restecg', 'RestingECG'),
    ('thalach', 'MaxHR'),
    ('exang', 'ExerciseAngina'),
    ('oldpeak', 'Oldpeak'),
    ('slope', 'ST_Slope'),
    ('target', 'HeartDisease'),
])

# df0 already carries the target schema and is loaded as-is.
df0 = pd.read_csv("../data/heart-disease-dataset.csv")

# df1 uses abbreviated lower-case headers; translate them right after loading.
# Columns missing from rename_map keep their original names.
df1 = pd.read_csv("../data/heart.csv").rename(columns=rename_map)

In [63]:
# Remove the df1 columns that have no counterpart in df0.
# errors='ignore' keeps the cell safe to re-run once they are already gone.
extra_columns = ['ca', 'thal']
df1 = df1.drop(columns=extra_columns, errors='ignore')

In [64]:
# Convert the number-coded columns of df1 to the string labels used by df0.


def decode_column(series, mapping):
    """Translate the numeric codes in ``series`` into labels using ``mapping``.

    Returns ``series`` unchanged when every non-null value is already one of
    the labels, so re-running this cell does not wipe the column.

    Raises:
        ValueError: if a non-null value has no entry in ``mapping``. A plain
            ``Series.map`` would silently turn such a value into NaN.
    """
    present = series.dropna()
    if present.isin(list(mapping.values())).all():
        return series  # already decoded on an earlier run

    decoded = series.map(mapping)
    unknown = series[series.notna() & decoded.isna()].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Unexpected values in column {series.name!r}: {unknown.tolist()}"
        )
    return decoded


# NOTE(review): the code-to-label assignments below are taken as given;
# confirm them against the documentation of the heart.csv dataset.

# Sex: 0 = F, 1 = M
sex_map = {0: 'F', 1: 'M'}

# ChestPainType
cp_map = {
    0: 'TA',
    1: 'ATA',
    2: 'NAP',
    3: 'ASY'
}

# RestingECG
ecg_map = {
    0: 'Normal',
    1: 'ST',
    2: 'LVH'
}

# ExerciseAngina
angina_map = {0: 'N', 1: 'Y'}

# ST_Slope
slope_map = {
    0: 'Up',
    1: 'Flat',
    2: 'Down'
}

column_maps = {
    'Sex': sex_map,
    'ChestPainType': cp_map,
    'RestingECG': ecg_map,
    'ExerciseAngina': angina_map,
    'ST_Slope': slope_map,
}
for column, mapping in column_maps.items():
    df1[column] = decode_column(df1[column], mapping)

In [65]:
# Print both headers again to confirm the two schemas now line up.
for label, frame in (("Dataset 0:", df0), ("Dataset 1:", df1)):
    print(label, list(frame.columns))

Dataset 0: ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']
Dataset 1: ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']


In [66]:
# Stack the two aligned datasets row-wise into one frame with a fresh index.
aligned_frames = [df0, df1]
dataFrame = pd.concat(aligned_frames, ignore_index=True)

print("Final combined dataset shape:", dataFrame.shape)

Final combined dataset shape: (1943, 12)


In [67]:
# Export the combined dataset
output_path = "../data/exported/combined_heart_dataset.csv"
# to_csv does not create missing folders, so make sure the target one exists
# (same approach as the model export at the end of the notebook).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
dataFrame.to_csv(output_path, index=False)

print("Combined dataset saved to:", output_path)

Combined dataset saved to: ../data/exported/combined_heart_dataset.csv


In [68]:
# Check for missing values, drop duplicate rows, impute numeric gaps and
# one-hot encode the categorical features.
print(dataFrame.isnull().sum())

# Drop duplicates
deduplicated = dataFrame.drop_duplicates()

# Get only numeric columns
numeric_cols = deduplicated.select_dtypes(include=["number"]).columns

# If any missing values are found in the numeric columns, fill them with the
# median of that feature. Passing the medians as a Series fills only the
# columns named in it and returns a new frame, which avoids assigning into the
# filtered result of drop_duplicates (a SettingWithCopyWarning trigger).
medians = deduplicated[numeric_cols].median()
dataFrame = deduplicated.fillna(medians)

# Translate string features into boolean indicator columns. drop_first=True
# omits the first level of each feature to avoid a redundant column.
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

data_encoded = pd.get_dummies(dataFrame, columns=categorical_cols, drop_first=True)
data_encoded.head()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,1,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,0,True,False,True,False,True,False,False,False,True


In [76]:
# Export the cleaned, one-hot encoded dataset
output_cleaned_path = "../data/exported/cleaned_heart_dataset.csv"
# to_csv does not create missing folders, so make sure the target one exists.
os.makedirs(os.path.dirname(output_cleaned_path), exist_ok=True)
data_encoded.to_csv(output_cleaned_path, index=False)
print("Cleaned dataset saved to:", output_cleaned_path)

Cleaned dataset saved to: ../data/exported/cleaned_heart_dataset.csv


In [69]:
# Separate the label column from the feature columns.
target_column = "HeartDisease"
y = data_encoded[target_column]
X = data_encoded.drop(columns=[target_column])

In [70]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [71]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [72]:
# Candidate classifiers.
log_model = LogisticRegression(max_iter=2000)
rf_model = RandomForestClassifier(random_state=42)
knn_model = KNeighborsClassifier()
# With probability=True, SVC shuffles the data internally to build its
# probability estimates; seeding it makes those estimates reproducible.
svm_model = SVC(probability=True, random_state=42)

# TRAIN MODELS: every model is fitted on the same scaled training matrix.
for estimator in (log_model, rf_model, knn_model, svm_model):
    estimator.fit(X_train_scaled, y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [73]:
# Predict the held-out labels with each fitted model (same scaled test matrix).
log_pred, rf_pred, knn_pred, svm_pred = (
    fitted.predict(X_test_scaled)
    for fitted in (log_model, rf_model, knn_model, svm_model)
)

# Accuracy scores
for label, predicted in (
    ("Logistic Regression:", log_pred),
    ("Random Forest:", rf_pred),
    ("KNN:", knn_pred),
    ("SVM:", svm_pred),
):
    print(label, accuracy_score(y_test, predicted))

Logistic Regression: 0.8155737704918032
Random Forest: 0.8442622950819673
KNN: 0.8483606557377049
SVM: 0.8360655737704918


In [74]:
# Pair every fitted model with its test-set predictions, keyed by display name.
evaluated = {
    "Logistic Regression": (log_model, log_pred),
    "Random Forest": (rf_model, rf_pred),
    "KNN": (knn_model, knn_pred),
    "SVM": (svm_model, svm_pred),
}

# Per-class precision / recall / F1 for each model.
for model_name, (_, predicted) in evaluated.items():
    print(f"{model_name} Report:\n", classification_report(y_test, predicted))

# Save (model, test accuracy) per name so the best one can be picked at export.
models = {
    model_name: (estimator, accuracy_score(y_test, predicted))
    for model_name, (estimator, predicted) in evaluated.items()
}

Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.80      0.76      0.78       103
           1       0.83      0.86      0.84       141

    accuracy                           0.82       244
   macro avg       0.81      0.81      0.81       244
weighted avg       0.81      0.82      0.81       244

Random Forest Report:
               precision    recall  f1-score   support

           0       0.86      0.76      0.80       103
           1       0.84      0.91      0.87       141

    accuracy                           0.84       244
   macro avg       0.85      0.83      0.84       244
weighted avg       0.85      0.84      0.84       244

KNN Report:
               precision    recall  f1-score   support

           0       0.83      0.81      0.82       103
           1       0.86      0.88      0.87       141

    accuracy                           0.85       244
   macro avg       0.85      0.84      0.84       244
weighted 

In [75]:
# Export the best model and the scaler used to prepare its inputs.
# (os and joblib come from the import cell at the top of the notebook.)

# Make sure the output directory exists
model_dir = "../model"
os.makedirs(model_dir, exist_ok=True)

# Save the scaler
scaler_path = os.path.join(model_dir, "scaler.pkl")
joblib.dump(scaler, scaler_path)

# Get the best model based on the accuracy score stored in `models`;
# on a tie, max() keeps the entry that was inserted first.
best_name, (best_model, best_score) = max(models.items(), key=lambda item: item[1][1])
model_path = os.path.join(model_dir, "bestModel.pkl")
joblib.dump(best_model, model_path)

print("Best model:", best_name, "Accuracy:", best_score)
print("Scaler saved to:", scaler_path)
print("Model saved to:", model_path)

Best model: KNN Accuracy: 0.8483606557377049
Scaler saved to: ../model\scaler.pkl
Model saved to: ../model\bestModel.pkl
