### Importing the Dependencies

In [27]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


### Data Collection and Analysis

In [28]:
# Loading the dataset to a pandas dataframe
heart_disease_dataset = pd.read_csv('datasets/heart_disease.csv')

In [29]:
heart_disease_dataset.head()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [30]:
heart_disease_dataset.shape

(1190, 12)

In [31]:
heart_disease_dataset.describe()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
count,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0
mean,53.720168,0.763866,3.232773,132.153782,210.363866,0.213445,0.698319,139.732773,0.387395,0.922773,1.62437,0.528571
std,9.358203,0.424884,0.93548,18.368823,101.420489,0.409912,0.870359,25.517636,0.48736,1.086337,0.610459,0.499393
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,3.0,120.0,188.0,0.0,0.0,121.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,4.0,130.0,229.0,0.0,0.0,140.5,0.0,0.6,2.0,1.0
75%,60.0,1.0,4.0,140.0,269.75,0.0,2.0,160.0,1.0,1.6,2.0,1.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,1.0


In [32]:
heart_disease_dataset.columns

Index(['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
       'fasting blood sugar', 'resting ecg', 'max heart rate',
       'exercise angina', 'oldpeak', 'ST slope', 'target'],
      dtype='object')

In [33]:
# Checking for missing values
heart_disease_dataset.isnull().sum()

age                    0
sex                    0
chest pain type        0
resting bp s           0
cholesterol            0
fasting blood sugar    0
resting ecg            0
max heart rate         0
exercise angina        0
oldpeak                0
ST slope               0
target                 0
dtype: int64

In [34]:
heart_disease_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB


In [35]:
heart_disease_dataset['target'].value_counts()

target
1    629
0    561
Name: count, dtype: int64

0 ---> Normal

1 --> Heart Disease

In [36]:
heart_disease_dataset.groupby('target').mean()

Unnamed: 0_level_0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,51.124777,0.623886,2.777184,129.793226,231.657754,0.11943,0.631016,150.894831,0.139037,0.464706,1.297683
1,56.034976,0.888712,3.63911,134.259141,191.372019,0.297297,0.758347,129.777424,0.608903,1.33132,1.915739


In [37]:
# Separating the data and labels
X = heart_disease_dataset.drop(columns='target', axis=1)
y = heart_disease_dataset['target']

In [38]:
X.head()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope
0,40,1,2,140,289,0,0,172,0,0.0,1
1,49,0,3,160,180,0,0,156,0,1.0,2
2,37,1,2,130,283,0,1,98,0,0.0,1
3,48,0,4,138,214,0,0,108,1,1.5,2
4,54,1,3,150,195,0,0,122,0,0.0,1


In [39]:
y.head()

0    0
1    1
2    0
3    1
4    0
Name: target, dtype: int64

### Data Standardization

In [40]:
scalar = StandardScaler()

In [41]:
scalar.fit(X)

In [42]:
standardized_data = scalar.transform(X)

In [43]:
standardized_data

array([[-1.46672783,  0.55599543, -1.31835093, ..., -0.79521891,
        -0.84979236, -1.02321701],
       [-0.50460037, -1.79857595, -0.24893198, ..., -0.79521891,
         0.07111913,  0.61558278],
       [-1.78743698,  0.55599543, -1.31835093, ..., -0.79521891,
        -0.84979236, -1.02321701],
       ...,
       [ 0.35062404,  0.55599543,  0.82048698, ...,  1.25751537,
         0.25530143,  0.61558278],
       [ 0.35062404, -1.79857595, -1.31835093, ..., -0.79521891,
        -0.84979236,  0.61558278],
       [-1.68053393,  0.55599543, -0.24893198, ..., -0.79521891,
        -0.84979236, -1.02321701]])

In [44]:
# X = standardized_data
y = heart_disease_dataset['target']

### Train Test Split

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=2)

In [46]:
print(X.shape, X_train.shape, X_test.shape)

(1190, 11) (833, 11) (357, 11)


### Training the Model

In [None]:
# using logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
# Accuracy score on the test data
y_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train_prediction, y_train)
print('Accuracy Score of the training data using logistic regression model:', training_data_accuracy)
# Accuracy score on the test data
y_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test_prediction, y_test)
print('Accuracy Score of the test data using logistic regression model:', test_data_accuracy)
# Evaluate the model
print(classification_report(y_test, y_test_prediction))

Accuracy Score of the training data: 0.8163265306122449
Accuracy Score of the test data: 0.8319327731092437
              precision    recall  f1-score   support

           0       0.79      0.87      0.83       168
           1       0.87      0.80      0.83       189

    accuracy                           0.83       357
   macro avg       0.83      0.83      0.83       357
weighted avg       0.84      0.83      0.83       357



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# using KNeighborsClassifier model
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
# Accuracy score on the test data
y_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train_prediction, y_train)
print('Accuracy Score of the training data using KNeighborsClassifier model:', training_data_accuracy)
# Accuracy score on the test data
y_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test_prediction, y_test)
print('Accuracy Score of the test data using KNeighborsClassifier model:', test_data_accuracy)
# Evaluate the model
print(classification_report(y_test, y_test_prediction))

Accuracy Score of the training data: 0.8547418967587035
Accuracy Score of the test data: 0.7310924369747899
              precision    recall  f1-score   support

           0       0.70      0.74      0.72       168
           1       0.76      0.72      0.74       189

    accuracy                           0.73       357
   macro avg       0.73      0.73      0.73       357
weighted avg       0.73      0.73      0.73       357



In [None]:
# using svc model
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_train, y_train)
# Accuracy score on the test data
y_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train_prediction, y_train)
print('Accuracy Score of the training data using svc model:', training_data_accuracy)
# Accuracy score on the test data
y_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test_prediction, y_test)
print('Accuracy Score of the test data using svc model:', test_data_accuracy)
# Evaluate the model
print(classification_report(y_test, y_test_prediction))

Accuracy Score of the training data: 0.8271308523409364
Accuracy Score of the test data: 0.8515406162464986
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       168
           1       0.89      0.83      0.85       189

    accuracy                           0.85       357
   macro avg       0.85      0.85      0.85       357
weighted avg       0.85      0.85      0.85       357



In [None]:
# using decision tree model
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
# Accuracy score on the test data
y_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train_prediction, y_train)
print('Accuracy Score of the training data using decision tree model:', training_data_accuracy)
# Accuracy score on the test data
y_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test_prediction, y_test)
print('Accuracy Score of the test data using decision tree model:', test_data_accuracy)
# Evaluate the model
print(classification_report(y_test, y_test_prediction))

Accuracy Score of the training data: 1.0
Accuracy Score of the test data: 0.876750700280112
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       168
           1       0.89      0.88      0.88       189

    accuracy                           0.88       357
   macro avg       0.88      0.88      0.88       357
weighted avg       0.88      0.88      0.88       357



In [None]:
# using GradientBoostingClassifier model
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=100)
model.fit(X_train, y_train)
# Accuracy score on the test data
y_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train_prediction, y_train)
print('Accuracy Score of the training data using GradientBoostingClassifier model:', training_data_accuracy)
# Accuracy score on the test data
y_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test_prediction, y_test)
print('Accuracy Score of the test data using GradientBoostingClassifier model:', test_data_accuracy)
# Evaluate the model
print(classification_report(y_test, y_test_prediction))

Accuracy Score of the training data: 0.9471788715486195
Accuracy Score of the test data: 0.9103641456582633
              precision    recall  f1-score   support

           0       0.89      0.92      0.91       168
           1       0.93      0.90      0.91       189

    accuracy                           0.91       357
   macro avg       0.91      0.91      0.91       357
weighted avg       0.91      0.91      0.91       357



In [None]:
# using GaussianNB model
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)

# Accuracy score on the test data
y_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train_prediction, y_train)
print('Accuracy Score of the training data using GaussianNB model:', training_data_accuracy)
# Accuracy score on the test data
y_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test_prediction, y_test)
print('Accuracy Score of the test data using GaussianNB model:', test_data_accuracy)
# Evaluate the model
print(classification_report(y_test, y_test_prediction))

Accuracy Score of the training data: 0.8307322929171669
Accuracy Score of the test data: 0.84593837535014
              precision    recall  f1-score   support

           0       0.80      0.89      0.85       168
           1       0.89      0.80      0.85       189

    accuracy                           0.85       357
   macro avg       0.85      0.85      0.85       357
weighted avg       0.85      0.85      0.85       357



In [53]:
model = RandomForestClassifier(class_weight='balanced', random_state=42)

In [54]:
# Trainnig the support vector machine classifier
model.fit(X_train, y_train)

In [55]:
# Accuracy score on the training data
y_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train_prediction, y_train)

### Model Evaluation

### Accuracy Score

In [56]:
'Accuracy Score of the training data:', training_data_accuracy

('Accuracy Score of the training data:', 1.0)

In [57]:
# Accuracy score on the test data
y_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test_prediction, y_test)

In [58]:
'Accuracy Score of the test data:', test_data_accuracy

('Accuracy Score of the test data:', 0.9299719887955182)

In [59]:
# Compute F1-Score
f1 = f1_score(y_test, y_test_prediction)
print(f"F1-Score: {f1:.4f}")



F1-Score: 0.9340


In [60]:

# Compute recall
recall = recall_score(y_test, y_test_prediction)
print(f"Recall: {recall:.4f}")



Recall: 0.9365


In [61]:
# Compute precision
precision = precision_score(y_test, y_test_prediction)
print(f"Precision: {precision:.4f}")

Precision: 0.9316


In [62]:
# Evaluate the model
print(classification_report(y_test, y_test_prediction))

              precision    recall  f1-score   support

           0       0.93      0.92      0.93       168
           1       0.93      0.94      0.93       189

    accuracy                           0.93       357
   macro avg       0.93      0.93      0.93       357
weighted avg       0.93      0.93      0.93       357



This is a classification report that shows the performance metrics for a binary classification model, evaluating it on two classes (labeled 0 and 1).

Here is a breakdown of the provided metrics:

### Class 0:
- **Precision**: 0.93 — The model correctly identified 93% of the instances that it predicted as class 0.
- **Recall**: 0.92 — The model correctly identified 92% of the actual class 0 instances.
- **F1-score**: 0.92 — This is the harmonic mean of precision and recall for class 0, indicating the balance between the two metrics.
- **Support**: 168 — This is the number of actual instances of class 0 in the dataset.

### Class 1:
- **Precision**: 0.93 — The model correctly identified 93% of the instances that it predicted as class 1.
- **Recall**: 0.94 — The model correctly identified 94% of the actual class 1 instances.
- **F1-score**: 0.93 — This indicates a balance between precision and recall for class 1.
- **Support**: 189 — This is the number of actual instances of class 1 in the dataset.

### Overall:
- **Accuracy**: 0.93 — The model correctly predicted the class for 93% of all instances in the dataset.
- **Macro Average**: This is the average of the precision, recall, and F1-score calculated across both classes, treating both classes equally. In this case, the values are:
  - Precision: 0.93
  - Recall: 0.93
  - F1-score: 0.93
- **Weighted Average**: This is the average of the precision, recall, and F1-score across both classes, weighted by the number of instances in each class. In this case, the values are:
  - Precision: 0.93
  - Recall: 0.93
  - F1-score: 0.93

### Conclusion:
The model has performed well on both classes, achieving high precision, recall, and F1-scores, all around 0.93, which indicates a balanced performance without major bias toward one class. The accuracy of 93% further reflects the overall effectiveness of the model.

### Saving the Trained Model

In [63]:
import pickle

In [64]:
filename = 'models/heart_disease_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [65]:
# Loading the saved model
loaded_model = pickle.load(open('models/heart_disease_model.sav', 'rb'))

In [66]:
input_data = (64,1,4,128,263,0,0,105,1,0.2,2)
input_data_as_numpy_array = np.asarray(input_data)
reshaped_input_data = input_data_as_numpy_array.reshape(1, -1)

prediction = loaded_model.predict(reshaped_input_data)

print(prediction)

if (prediction[0] == 0):
    print('The person does not have a heart disease')
else:
    print('The person has heart disease')

[0]
The person does not have a heart disease




In [67]:
heart_disease_dataset['chest pain type'].value_counts()

chest pain type
4    625
3    283
2    216
1     66
Name: count, dtype: int64