# Heart failure predictor

#### SVC model using multiple features to predict a person's risk of heart failure

Importing required dependencies

In [1]:
import pandas as pd
import pickle 
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

Reading the dataset

In [2]:
heart_data = pd.read_csv("heart.csv")

Getting a general feel for the data

In [3]:
heart_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
heart_data.shape

(918, 12)

In [5]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


Checking if empty cells (null values) exist in dataset

In [6]:
heart_data.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [7]:
heart_data['Cholesterol'].value_counts()

Cholesterol
0      172
254     11
223     10
220     10
230      9
      ... 
392      1
316      1
153      1
466      1
131      1
Name: count, Length: 222, dtype: int64

In [8]:
# Cholesterol is recorded as 0 for 172 of the 918 rows (see the value counts above),
# so the column is excluded from the feature set.
heart_data = heart_data.drop("Cholesterol", axis="columns")

Count occurrences of the different categories in the object-valued columns

In [9]:
# Print the category frequencies of every string-valued column, one table per column,
# each followed by a blank line.
for col in ('Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'):
    print(heart_data[col].value_counts(), end='\n\n')

Sex
M    725
F    193
Name: count, dtype: int64

ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: int64

RestingECG
Normal    552
LVH       188
ST        178
Name: count, dtype: int64

ExerciseAngina
N    547
Y    371
Name: count, dtype: int64

ST_Slope
Flat    460
Up      395
Down     63
Name: count, dtype: int64



Encoding object values to integers using LabelEncoder

In [10]:
# Fitted LabelEncoder for each categorical column, keyed by column name (filled in below).
label_codes = {}
# Columns that hold string categories (dtype object) and must be integer-encoded.
object_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [11]:
# Resulting integer codes (LabelEncoder numbers the categories in sorted order):
# Sex -> F:0, M:1
# ChestPainType -> ASY:0, ATA:1, NAP:2, TA:3
# RestingECG -> LVH:0, Normal:1, ST:2
# ExerciseAngina -> N:0, Y:1
# ST_Slope ->  Down:0 ,Flat:1, Up:2

# Fit one encoder per categorical column, replace the column with its integer codes,
# and keep the fitted encoder so new inputs can be encoded the same way later.
for col in object_cols:
    label_encode = LabelEncoder()
    heart_data[col] = label_encode.fit_transform(heart_data[col])
    label_codes[col] = label_encode

# Persist the fitted encoders. The file name must match, including case, the name the
# prediction cell opens ("label_codes.pkl"); a differently-cased name only works on
# case-insensitive filesystems.
with open("label_codes.pkl", 'wb') as f:
    pickle.dump(label_codes, f)

In [12]:
heart_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,0,1,172,0,0.0,2,0
1,49,0,2,160,0,1,156,0,1.0,1,1
2,37,1,1,130,0,2,98,0,0.0,2,0
3,48,0,0,138,0,1,108,1,1.5,1,1
4,54,1,2,150,0,1,122,0,0.0,2,0


Creating feature set (X) with all feature values and target (Y) with all the output values from the dataset

In [13]:
# Target vector: the HeartDisease label. Feature matrix: every remaining column.
Y = heart_data['HeartDisease']
X = heart_data.drop(columns=['HeartDisease'])

In [14]:
print(X)

     Age  Sex  ChestPainType  RestingBP  FastingBS  RestingECG  MaxHR  \
0     40    1              1        140          0           1    172   
1     49    0              2        160          0           1    156   
2     37    1              1        130          0           2     98   
3     48    0              0        138          0           1    108   
4     54    1              2        150          0           1    122   
..   ...  ...            ...        ...        ...         ...    ...   
913   45    1              3        110          0           1    132   
914   68    1              0        144          1           1    141   
915   57    1              0        130          0           1    115   
916   57    0              1        130          0           0    174   
917   38    1              2        138          0           1    173   

     ExerciseAngina  Oldpeak  ST_Slope  
0                 0      0.0         2  
1                 0      1.0         1  


In [15]:
print(Y)

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64


Using StandardScaler to standardize the dataset values

In [16]:
ss = StandardScaler()

In [17]:
# Columns to standardize to zero mean and unit variance.
scaled_cols = ['Sex', 'Oldpeak', 'Age', 'RestingBP', 'MaxHR']

# Work on a copy so X keeps the unscaled values and X_scaled is not merely an alias of X.
X_scaled = X.copy()

# StandardScaler standardizes each column independently, so a single fit over all five
# columns yields the same values as fitting them one at a time. Unlike refitting the
# scaler per column, it leaves `ss` holding the mean and scale of every column, so the
# same transform can be applied to new data with ss.transform(...).
X_scaled[scaled_cols] = ss.fit_transform(X[scaled_cols])

In [18]:
X_scaled

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,-1.433140,0.515952,1,0.410909,0,1,1.382928,0,-0.832432,2
1,-0.478484,-1.938163,2,1.491752,0,1,0.754157,0,0.105664,1
2,-1.751359,0.515952,1,-0.129513,0,2,-1.525138,0,-0.832432,2
3,-0.584556,-1.938163,0,0.302825,0,1,-1.132156,1,0.574711,1
4,0.051881,0.515952,2,0.951331,0,1,-0.581981,0,-0.832432,2
...,...,...,...,...,...,...,...,...,...,...
913,-0.902775,0.515952,3,-1.210356,0,1,-0.188999,0,0.293283,1
914,1.536902,0.515952,0,0.627078,1,1,0.164684,0,2.357094,1
915,0.370100,0.515952,0,-0.129513,0,1,-0.857069,1,0.293283,1
916,0.370100,-1.938163,1,-0.129513,0,0,1.461525,0,-0.832432,1


In [19]:
# Put the standardized features and the target side by side in one frame...
heart_data = pd.concat((X_scaled, Y), axis="columns")

# ...and write that frame (features plus HeartDisease, without the index) to disk.
heart_data.to_csv('heart_data_preprocessed.csv', index=False)

Splitting into training and test data

In [20]:
# Hold out 10% of the rows for testing. stratify=Y keeps the HeartDisease class ratio the
# same in both splits, and random_state=2 makes the split reproducible.
# NOTE(review): X_scaled was standardized using all rows before this split, so the test
# rows contributed to the scaling statistics; consider fitting the scaler on X_train only.
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.1, stratify=Y ,random_state=2)

In [21]:
print(X.shape, X_scaled.shape, X_train.shape, X_test.shape)

(918, 10) (918, 10) (826, 10) (92, 10)


Training the classifier model using SVC

In [22]:
# Fit a support-vector classifier with a linear kernel on the training split.
classifier = svm.SVC(kernel='linear')
classifier.fit(X_train, Y_train)

Calculating the accuracy of model with training data

In [23]:
# Predict on the rows the model was fitted on and score those predictions.
# accuracy_score takes (y_true, y_pred) in that order: ground truth first.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [24]:
print(f"Accuracy of model in training data: {training_data_accuracy*100:.2f} %")

Accuracy of model in training data: 85.71 %


Measuring accuracy of the model using test data

In [25]:
# Predict on the held-out rows and score those predictions.
# accuracy_score takes (y_true, y_pred) in that order: ground truth first.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [26]:
print(f"Accuracy of model in test data: {test_data_accuracy*100:.2f} %")

Accuracy of model in test data: 88.04 %


Checking output for a random sample of input values

In [27]:

# Read back the pickled dict of fitted LabelEncoders (one entry per categorical column).
with open("label_codes.pkl", 'rb') as encoder_file:
    label_codes = pickle.load(encoder_file)

# Columns that are standardized before training, and a cache of their statistics.
_SCALED_COLS = ['Sex', 'Oldpeak', 'Age', 'RestingBP', 'MaxHR']
# {column: (mean, std)}; filled on first use so the raw dataset is read at most once.
_scaling_stats = {}


def _training_scaling_stats():
    """Return ``{column: (mean, std)}`` for the columns standardized before training.

    The statistics are recomputed from the raw dataset ("heart.csv") rather than
    taken from the preprocessing cell's scaler object, so this helper does not
    depend on that object's state. Categorical columns are integer-encoded with
    the saved label encoders first, and the population standard deviation
    (ddof=0) is used, which is what StandardScaler uses.
    """
    if not _scaling_stats:
        raw = pd.read_csv("heart.csv")
        for col in _SCALED_COLS:
            values = raw[col]
            if col in label_codes:
                values = label_codes[col].transform(values)
            values = pd.Series(values, dtype="float64")
            std = values.std(ddof=0)
            # A constant column has std 0; StandardScaler leaves such a column unscaled.
            _scaling_stats[col] = (values.mean(), std if std > 0 else 1.0)
    return _scaling_stats


def format_input(new_data):
    """Turn one raw patient record into a single-row feature frame for the classifier.

    The model is trained on label-encoded, standardized features, so the same
    preprocessing is applied here: numeric fields are parsed to floats, the
    categorical fields are encoded with the saved label encoders, and the
    standardized columns are converted to z-scores using the training statistics.

    Parameters
    ----------
    new_data : sequence
        Eleven values in dataset order: Age, Sex, ChestPainType, RestingBP,
        Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak,
        ST_Slope. Cholesterol (index 4) is ignored because it is not a feature.

    Returns
    -------
    pandas.DataFrame
        One row whose columns are in the same order as the training features.

    Raises
    ------
    IndexError
        If fewer than eleven values are given.
    ValueError
        If a numeric field cannot be parsed or a category label was not seen
        when the encoders were fitted.
    """
    # Tolerate stray whitespace around the values (e.g. "49, F, ATA").
    fields = [str(value).strip() for value in new_data]

    arr = pd.DataFrame({
        "Age": [float(fields[0])],
        "Sex": [fields[1]],
        "ChestPainType": [fields[2]],
        "RestingBP": [float(fields[3])],
        # fields[4] is Cholesterol, which is not part of the feature set.
        "FastingBS": [float(fields[5])],
        "RestingECG": [fields[6]],
        "MaxHR": [float(fields[7])],
        "ExerciseAngina": [fields[8]],
        "Oldpeak": [float(fields[9])],
        "ST_Slope": [fields[10]],
    })

    # Encode the categorical strings with the encoders fitted on the training data.
    for col in object_cols:
        arr[col] = label_codes[col].transform(arr[col])

    # Apply the same standardization the training features received.
    for col, (mean, std) in _training_scaling_stats().items():
        arr[col] = (arr[col] - mean) / std

    return arr

def prediction(new_data):
    """Classify one comma-separated patient record with the in-memory classifier.

    ``new_data`` is a single string of comma-separated values in dataset order.
    It is split on commas, converted to a feature row by ``format_input``, and
    passed to ``classifier``. Returns a human-readable risk message.
    """
    fields = new_data.split(',')
    features = format_input(fields)

    predicted_label = classifier.predict(features)[0]

    return "High risk of heart disease" if predicted_label == 1 else "Low risk of heart disease"

In [28]:
print(prediction('49,F,ATA,110,208,0,Normal,160,N,0,Up'))

Low risk of heart disease


In [29]:
print(prediction('57,M,ASY,150,255,0,Normal,92,Y,3,Flat'))

High risk of heart disease


In [30]:
# Serialize the fitted classifier to disk. The context manager guarantees the file is
# flushed and closed before the next cell reads it back.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [31]:
# Load the classifier back from disk; the context manager closes the file handle
# instead of leaving it open.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [32]:

def prediction(new_data):
    """Classify one comma-separated patient record with the model reloaded from disk.

    This rebinds the name ``prediction`` so that it uses ``loaded_model``. The
    input string is split on commas and converted to a feature row by
    ``format_input``. Returns a human-readable risk message.
    """
    features = format_input(new_data.split(','))
    predicted_label = loaded_model.predict(features)[0]

    if predicted_label != 1:
        return "Low risk of heart disease"
    return "High risk of heart disease"

print(prediction('49,F,ATA,110,208,0,Normal,160,N,0,Up'))

Low risk of heart disease


In [33]:
print(loaded_model)

SVC(kernel='linear')
