Importing the dependencies

In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data collection and processing

In [65]:
# loading csv data to a pandas dataframe
heart_data = pd.read_csv('/content/heart.csv')

In [66]:
Sex_list_names = list(heart_data['Sex'].value_counts().index)
Sex_list_values = [x for x in range(len(Sex_list_names))]

RestingECG_list_names = list(heart_data['RestingECG'].value_counts().index)
RestingECG_list_values = [x for x in range(len(RestingECG_list_names))]

ChestPainType_list_names = list(heart_data['ChestPainType'].value_counts().index)
ChestPainType_list_values = [x for x in range(len(ChestPainType_list_names))]

ExerciseAngina_list_names = list(heart_data['ExerciseAngina'].value_counts().index)
ExerciseAngina_list_values = [x for x in range(len(ExerciseAngina_list_names))]

ST_Slope_list_names = list(heart_data['ST_Slope'].value_counts().index)
ST_Slope_list_values = [x for x in range(len(ST_Slope_list_names))]

In [67]:
heart_data.replace(Sex_list_names,Sex_list_values, inplace= True)
heart_data.replace(RestingECG_list_names,RestingECG_list_values, inplace= True)
heart_data.replace(ChestPainType_list_names,ChestPainType_list_values, inplace= True)
heart_data.replace(ExerciseAngina_list_names,ExerciseAngina_list_values, inplace= True)
heart_data.replace(ST_Slope_list_names,ST_Slope_list_values, inplace= True)

In [86]:
print(dict(zip(Sex_list_names,Sex_list_values)))
print(dict(zip(RestingECG_list_names,RestingECG_list_values)))
print(dict(zip(ChestPainType_list_names,ChestPainType_list_values)))
print(dict(zip(ExerciseAngina_list_names,ExerciseAngina_list_values)))
print(dict(zip(ST_Slope_list_names,ST_Slope_list_values)))

{'M': 0, 'F': 1}
{'Normal': 0, 'LVH': 1, 'ST': 2}
{'ASY': 0, 'NAP': 1, 'ATA': 2, 'TA': 3}
{'N': 0, 'Y': 1}
{'Flat': 0, 'Up': 1, 'Down': 2}


In [68]:
# print first 5 rows of the dataset
heart_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,2,140,289,0,0,172,0,0.0,1,0
1,49,1,1,160,180,0,0,156,0,1.0,0,1
2,37,0,2,130,283,0,2,98,0,0.0,1,0
3,48,1,0,138,214,0,0,108,1,1.5,0,1
4,54,0,1,150,195,0,0,122,0,0.0,1,0


In [69]:
# print the last 5 rows of the dataset
heart_data.tail()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
913,45,0,3,110,264,0,0,132,0,1.2,0,1
914,68,0,0,144,193,1,0,141,0,3.4,0,1
915,57,0,0,130,131,0,0,115,1,1.2,0,1
916,57,1,2,130,236,0,1,174,0,0.0,0,1
917,38,0,1,138,175,0,0,173,0,0.0,1,0


In [70]:
# number of rows and columns in the dataset
heart_data.shape

(918, 12)

In [71]:
# getting some info about the datset
heart_data.info

<bound method DataFrame.info of      Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0     40    0              2        140          289          0           0   
1     49    1              1        160          180          0           0   
2     37    0              2        130          283          0           2   
3     48    1              0        138          214          0           0   
4     54    0              1        150          195          0           0   
..   ...  ...            ...        ...          ...        ...         ...   
913   45    0              3        110          264          0           0   
914   68    0              0        144          193          1           0   
915   57    0              0        130          131          0           0   
916   57    1              2        130          236          0           1   
917   38    0              1        138          175          0           0   

     MaxHR  Exercis

In [72]:
# checling the missing values
heart_data.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [73]:
# statistical measures of the data
heart_data.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,0.21024,0.748366,132.396514,198.799564,0.233115,0.592593,136.809368,0.404139,0.887364,0.567538,0.553377
std,9.432617,0.407701,0.931031,18.514154,109.384145,0.423046,0.79367,25.460334,0.490992,1.06657,0.618959,0.497414
min,28.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,0.0,0.0,120.0,173.25,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,0.0,0.0,130.0,223.0,0.0,0.0,138.0,0.0,0.6,0.0,1.0
75%,60.0,0.0,1.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


In [74]:
# checking the distribution of target variable
heart_data['HeartDisease'].value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

1--> Defective heart
0--> Healthy heart

Splitting the features and Heart disease

In [75]:
X = heart_data.drop(columns='HeartDisease', axis=1)
Y = heart_data['HeartDisease']

In [76]:
print (X)

     Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0     40    0              2        140          289          0           0   
1     49    1              1        160          180          0           0   
2     37    0              2        130          283          0           2   
3     48    1              0        138          214          0           0   
4     54    0              1        150          195          0           0   
..   ...  ...            ...        ...          ...        ...         ...   
913   45    0              3        110          264          0           0   
914   68    0              0        144          193          1           0   
915   57    0              0        130          131          0           0   
916   57    1              2        130          236          0           1   
917   38    0              1        138          175          0           0   

     MaxHR  ExerciseAngina  Oldpeak  ST_Slope  
0  

In [77]:
print (Y)

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64


Splitting the data into training data and testing data

In [78]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=3)

In [79]:
print(X.shape, X_train.shape, X_test.shape)

(918, 11) (734, 11) (184, 11)


Model training

Logistic Regression

In [80]:
model = LogisticRegression()

In [81]:
X_train.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
35,32,0,2,125,254,0,0,155,0,0.0,1
173,49,0,1,140,187,0,0,172,0,0.0,1
256,55,1,2,130,394,0,1,150,0,0.0,1
1,49,1,1,160,180,0,0,156,0,1.0,0
621,56,0,1,130,256,1,1,142,1,0.6,0


In [82]:
# training the LogisticRegression model with training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

Model evaluation

Accuracy score

In [83]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [84]:
training_data_accuracy

0.8719346049046321

In [88]:
print('Accuracy on training data : ', training_data_accuracy)

Accuracy on training data :  0.8719346049046321


In [89]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [91]:
print('Accuracy on test data : ', test_data_accuracy)

Accuracy on test data :  0.8152173913043478


Building a predictive system

In [95]:
input_data = (173,49,0,1,140,187,0,0,172,0,0.0)

# change the input data as numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predecting for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The person does not have a Heart Disease')
else:
    print('The person has a heart disease')
    




[1]
The person has a heart disease


  "X does not have valid feature names, but"


In [97]:
input_data = (917,38,0,1,138,175,0,0,173,0,0.0)

# change the input data as numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predecting for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The person does not have a Heart Disease')
else:
    print('The person has a heart disease')

[1]
The person has a heart disease


  "X does not have valid feature names, but"
