# ECE 539 Project: Analysis and Prediction of Heart Disease (Main Code)

In [246]:
# IMPORT LIBRARIES AND SET WORKING DIRECTORY

DIR = './'

'''
# To use Google Colab, uncomment this section and set DIR (above) to appropriate directory path
from google.colab import drive
drive.mount('/content/drive')
DIR = '/content/drive/...'
'''

import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

## Data Fetching

In [247]:
# data from https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset
# possible alternate: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction?select=heart.csv

data = pd.read_csv(os.path.join(DIR, 'heart.csv'))

## Data Processing

In the original data, some features consist string values. We need to convert them to numbers. In this case, we use one hot encoding to ensure unbiased analysis.

In [248]:
# encoder = OneHotEncoder(handle_unknown='ignore')

Sex_df = pd.DataFrame(encoder.fit_transform(data[['Sex']]).toarray())
ChestPainType_df = pd.DataFrame(encoder.fit_transform(data[['ChestPainType']]).toarray())
RestingECG_df =  pd.DataFrame(encoder.fit_transform(data[['RestingECG']]).toarray())
ExerciseAngina_df = pd.DataFrame(encoder.fit_transform(data[['ExerciseAngina']]).toarray())
ST_Slope_df = pd.DataFrame(encoder.fit_transform(data[['ST_Slope']]).toarray())

data = pd.concat([data, Sex_df, ChestPainType_df, RestingECG_df, ExerciseAngina_df, ST_Slope_df], axis=1)
data = pd.concat([data, ChestPainType_df, ], axis=1)

data.drop('Sex', axis=1, inplace=True)
data.drop('ChestPainType', axis=1, inplace=True)
data.drop('RestingECG', axis=1, inplace=True)
data.drop('ExerciseAngina', axis=1, inplace=True)
data.drop('ST_Slope', axis=1, inplace=True)

In [249]:
# Normalize appropriate features

cols_to_norm = ['RestingBP', 'Cholesterol', 'MaxHR']
data[cols_to_norm] = data[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

data = data.to_numpy()

In [250]:
# Separate features and labels

X = data[:,:-1]
y = data[:,-1]

In [251]:
# Split data
# train/test/validation = 60/20/20
# Random state set to get the same results

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, shuffle=True)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=2)

## Data Analysis

### Logistic Regression

In [252]:
logreg_clf = LogisticRegression(fit_intercept=True, solver='newton-cg').fit(X_train, y_train)
print('Weights:\n', logreg_clf.coef_[0], '\n')
print('Intercept:\n', logreg_clf.intercept_[0])

Weights:
 [ 0.00755064  0.02020315  0.01540529  0.13198464  0.15333062  0.20602664
 -0.05958815 -0.00947965  0.00948037 -1.51974648 -1.15257729 -1.26707124
  3.93939573  0.08457147 -0.14754425  0.0629735   0.34166038 -0.34165965
  0.03861582 -0.04776402  0.00914893 -1.51974648 -1.15257729 -1.26707124] 

Intercept:
 -2.9771726185105747


In [253]:
cross_val_score(logreg_clf, X_valid, y_valid)

array([0.93333333, 0.96666667, 1.        , 1.        , 0.96551724])

### K-Nearest Neighbors

### Decision Tree