<a href="https://colab.research.google.com/github/thammathara/Python_Project/blob/main/ml_heart_failure_prediction_with_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Read Data**

In [64]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Load the heart-failure dataset; the relative path is resolved against the
# current working directory.
heart = pd.read_csv(filepath_or_buffer="heart.csv")

# Preview the first five records as the cell's displayed output.
heart.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# **Check & clean data**

In [26]:
# Enforce the "no missing values" observation instead of relying on reading the
# printed summary: if a different or corrupted CSV is loaded, this fails loudly
# here rather than propagating NaNs into scaling and model fitting.
# NOTE(review): a non-null count does not rule out placeholder values (e.g.
# zeros standing in for missing measurements) — consider heart.describe().
assert heart.notna().all().all(), "heart.csv contains missing values"

# Summarise column dtypes and non-null counts.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [27]:
# ST_seg: 1 when ST_Slope is 'Up', 0 for every other slope value (e.g. 'Flat').
heart['ST_seg'] = heart['ST_Slope'].eq('Up').astype(int)

# Gender: 1 when Sex is 'F' (female), 0 otherwise (male, per the original encoding note).
heart['Gender'] = heart['Sex'].eq('F').astype(int)

In [28]:
# Show the last five rows to check the ST_seg and Gender indicator columns.
heart.tail(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ST_seg,Gender
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1,0,0
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1,0,0
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1,0,0
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1,0,1
917,38,M,NAP,138,175,0,Normal,173,N,0.0,Up,0,1,0


# **Split Data**

In [29]:
# Feature matrix: drop the target plus the raw string-valued (object dtype)
# columns, leaving only the numeric columns and the derived 0/1 indicators.
non_feature_cols = ["Sex", "HeartDisease", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]
X = heart.drop(columns=non_feature_cols)

# Target vector: the HeartDisease label.
y = heart["HeartDisease"]

# Hold out 20% of the rows for testing (80% for training). The fixed
# random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [31]:
# Dimensions of the training feature matrix.
X_train.shape # (n_rows, n_columns)

(734, 8)

In [45]:
# Learn the per-feature mean and standard deviation from the training split
# only; the test split is transformed with the same fitted scaler so that no
# test-set statistics leak into preprocessing.
sc_x = StandardScaler().fit(X_train)

# NOTE: X_train / X_test are rebound to the scaled NumPy arrays, so re-running
# this cell without first re-running the split cell would scale scaled data.
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)

# Peek at the first ten standardised training rows.
print(X_train[:10])

[[-1.24506731 -0.70898547  0.372803    1.84260945  2.28435288 -0.09706109
  -0.84792072 -0.54060477]
 [-1.8862362  -0.16628515  0.08614581 -0.5427086   1.65224147 -0.83628643
   1.17935554 -0.54060477]
 [ 0.25099346  0.91911549  0.12313384  1.84260945 -0.44162756  0.08774524
  -0.84792072 -0.54060477]
 [-1.77937472 -0.16628515  0.10463982 -0.5427086   0.22999081 -0.83628643
   1.17935554  1.84978019]
 [-0.28331396 -0.70898547 -1.84647842  1.84260945 -1.27127378 -0.83628643
  -0.84792072 -0.54060477]
 [ 0.14413197  0.64776533  0.44677904 -0.5427086  -1.58732949  1.01177691
  -0.84792072 -0.54060477]
 [-0.17645248  1.73316597 -1.84647842  1.84260945 -0.56014845  0.08774524
   1.17935554 -0.54060477]
 [-0.60389841  1.46181581  0.63171916 -0.5427086  -1.31078075  0.08774524
  -0.84792072 -0.54060477]
 [ 0.25099346 -0.16628515  0.52075509  1.84260945  0.22999081 -0.28186743
  -0.84792072 -0.54060477]
 [-1.99309769 -0.70898547 -0.36695748 -0.5427086   1.92879021 -0.83628643
   1.17935554  1.

# **Train The Model**

In [48]:
# Build and fit the logistic-regression classifier on the training features;
# fit() returns the estimator itself, so construction and fitting are chained.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

# Use the fitted model to predict class labels for the held-out test features.
y_pred = classifier.predict(X_test)

# **Evaluation Metrics**

In [49]:
# confusion_matrix comes from the notebook's import cell; keeping all imports
# there means this cell has no hidden dependency of its own.
# Layout: rows are the true classes, columns are the predicted classes, so for
# labels (0, 1) the matrix reads [[TN, FP], [FN, TP]].
cm = confusion_matrix(y_test, y_pred)

print ("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[66 11]
 [25 82]]


**Out of 184 test samples:\
Correct predictions: True Negative + True Positive = 66 + 82 = 148\
Incorrect predictions: False Positive + False Negative = 11 + 25 = 36\
Performance measure – Accuracy = 148 / 184 ≈ 0.804**

In [54]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy : ", accuracy)

Accuracy :  0.8043478260869565


**Classification report on the test data (20% of the dataset)**

In [63]:
# Per-class precision, recall and F1 on the held-out test split.
class_labels = ['No HeartDisease 0', 'HeartDisease 1']  # display names for labels 0 and 1
report = classification_report(y_test, y_pred, digits=2, target_names=class_labels)
print(report)

                   precision    recall  f1-score   support

No HeartDisease 0       0.73      0.86      0.79        77
   HeartDisease 1       0.88      0.77      0.82       107

         accuracy                           0.80       184
        macro avg       0.80      0.81      0.80       184
     weighted avg       0.82      0.80      0.81       184

