In [53]:
# importing all the neccessary libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [54]:
# Loading the heart dataset
data=pd.read_csv("heart.csv")

In [55]:
# Displaying Random 5 samples
data.sample(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
897,55,F,ASY,128,205,0,ST,130,Y,2.0,Flat,1
374,61,M,ASY,125,0,0,Normal,105,Y,0.0,Down,1
742,52,F,NAP,136,196,0,LVH,169,N,0.1,Flat,0
652,59,M,TA,160,273,0,LVH,125,N,0.0,Up,1
324,46,M,ASY,100,0,1,ST,133,N,-2.6,Flat,1


In [56]:
# Getting dataset summary
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [57]:
# Getting dataset statistics
data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [58]:
data.describe(include="all")

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918,918,918.0,918.0,918.0,918,918.0,918,918.0,918,918.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,725,496,,,,552,,547,,460,
mean,53.510893,,,132.396514,198.799564,0.233115,,136.809368,,0.887364,,0.553377
std,9.432617,,,18.514154,109.384145,0.423046,,25.460334,,1.06657,,0.497414
min,28.0,,,0.0,0.0,0.0,,60.0,,-2.6,,0.0
25%,47.0,,,120.0,173.25,0.0,,120.0,,0.0,,0.0
50%,54.0,,,130.0,223.0,0.0,,138.0,,0.6,,1.0
75%,60.0,,,140.0,267.0,0.0,,156.0,,1.5,,1.0


In [59]:
# Checking the null values
data.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [60]:
# Checking the duplicates
data.duplicated().sum()

np.int64(0)

In [61]:
# Checking the number of unique values in each feature
data.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [62]:
cat_col = data.select_dtypes(include='object').columns

#### <b>Converting Categorical Variables to Numeric</b>
* Sex : M = 0, F = 1
* ChestPainType : ATA = 0, NAP = 1, ASY = 2, TA = 3
* RestingECG : Normal = 0, ST = 1, LVH = 2
* ExerciseAngina : N = 0, Y = 1
* ST_Slope : UP = 0, Flat = 1, Down = 2

In [63]:
for col in cat_col:
    print(col)
    print((data[col].unique()),list(range(data[col].nunique())))
    data[col].replace((data[col].unique()), range(data[col].nunique()), inplace=True)
    print('*'*90)
    print()

Sex
['M' 'F'] [0, 1]
******************************************************************************************

ChestPainType
['ATA' 'NAP' 'ASY' 'TA'] [0, 1, 2, 3]
******************************************************************************************

RestingECG
['Normal' 'ST' 'LVH'] [0, 1, 2]
******************************************************************************************

ExerciseAngina
['N' 'Y'] [0, 1]
******************************************************************************************

ST_Slope
['Up' 'Flat' 'Down'] [0, 1, 2]
******************************************************************************************



In [64]:
data['Cholesterol'].value_counts()

Cholesterol
0      172
254     11
220     10
223     10
204      9
      ... 
353      1
278      1
157      1
176      1
131      1
Name: count, Length: 222, dtype: int64

Cholesterol cannot be 0, it states that the reading was not properly noted.

#### <b>Imputing the 0 values in cholesterol column with KNN imputer</b>

In [65]:
data['Cholesterol'].replace(0,np.nan,inplace=True)

In [66]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
after_impute = imputer.fit_transform(data)
data = pd.DataFrame(after_impute, columns=data.columns)

Doing the same for Resting Blood Pressure

In [67]:
data['RestingBP'][data['RestingBP']==0]

449    0.0
Name: RestingBP, dtype: float64

In [68]:
from sklearn.impute import KNNImputer
data['RestingBP'].replace(0,np.nan,inplace=True)
imputer = KNNImputer(n_neighbors=3)
after_impute = imputer.fit_transform(data)
data = pd.DataFrame(after_impute, columns=data.columns)

#### <b>Change columns type to int</b>

In [69]:
withoutOldPeak = data.columns
withoutOldPeak = withoutOldPeak.drop('Oldpeak')
data[withoutOldPeak] = data[withoutOldPeak].astype('int32')

#### <b>Data Visualization</b>

In [70]:
!pip install plotly



You should consider upgrading via the 'C:\Users\rahul\OneDrive\Desktop\HeartDiseasePredictor\hd_env310\Scripts\python.exe -m pip install --upgrade pip' command.


In [71]:
import plotly.express as px

In [72]:
px.line(data.corr()['HeartDisease'][:-1].sort_values())

#### <b> Age and HeartDisease Distribution</b>

In [73]:
px.sunburst(data,path=['HeartDisease', 'Age'])

In [74]:
px.histogram(data, x='Age', color='HeartDisease')

In [75]:
# Percentage of HeartDisease data distribution
px.pie(data, names='HeartDisease', title='Percentage of HeartDisease classes distribution')

#### <b>Sex vs Heart Disease</b>

In [76]:
px.histogram(data, x='Sex', color='HeartDisease')

#### <b>ChestPainType vs Heart Disease</b>

In [77]:
px.histogram(data, x='ChestPainType', color='HeartDisease')
# ChestPainType : ATA = 0, NAP = 1, ASY = 2, TA = 3

#### <b>RestingBP vs Heart Disease</b>

In [78]:
px.sunburst(data,path=['HeartDisease', 'RestingBP'])

#### <b>FastingBS vs Heart Disease</b>

In [79]:
px.histogram(data, x='FastingBS', color='HeartDisease')

#### <b>MaxHR vs Heart Disease</b>

In [80]:
px.sunburst(data,path=['HeartDisease', 'MaxHR'])

In [81]:
px.violin(data, x='HeartDisease', y='MaxHR', color='HeartDisease')

#### <b>OldPeak vs Heart Disease</b>

In [82]:
px.violin(data, x='HeartDisease', y='Oldpeak', color='HeartDisease')

#### <b>ST_Slope vs Heart Disease</b>

In [83]:
px.histogram(data, x='ST_Slope', color='HeartDisease')

#### <b>ExerciseAngina vs Heart Disease</b>

In [84]:
px.histogram(data, x='ExerciseAngina', color='HeartDisease')

#### <b>Train Test Split</b>

In [85]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test= train_test_split(
    data.drop('HeartDisease',axis=1),
    data['HeartDisease'],
    test_size=0.2,
    random_state=42,
    stratify=data['HeartDisease']
)

### <b>Training Models</b>

#### <b>Logistic Regression</b>

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

solver = ['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']
best_solver=''
test_score= np.zeros(6)
for i, n in enumerate(solver):
    lr=LogisticRegression(solver=n).fit(x_train, y_train)
    test_score[i] = lr.score(x_test, y_test)
    if lr.score(x_test, y_test)== test_score.max():
        best_solver=n
print(best_solver)
lr = LogisticRegression(solver=best_solver)
lr.fit(x_train, y_train)
lr_pred = lr.predict(x_test)
print(f'LogisticRegression score: {accuracy_score(y_test, lr_pred)}')

newton-cholesky
LogisticRegression score: 0.8586956521739131


In [87]:
import pickle
with open('LogisticR.pkl','wb') as f:
    pickle.dump(lr,f)

#### <b>Support Vector Machine (SVM)</b>

In [88]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

kernels = {'linear':0,'poly':0, 'rbf':0, 'sigmoid':0}
best=''
for i in kernels:
    svm = SVC(kernel=i)
    svm.fit(x_train, y_train)
    yhat = svm.predict(x_test)
    kernels[i]=f1_score(y_test, yhat, average="weighted")
    if kernels[i] == max(kernels.values()):
        best=i
print(best)
svm=SVC(kernel=best)
svm.fit(x_train, y_train)
svm_pred = svm.predict(x_test)
print(f'SVM f1_score kernel({best}): {f1_score(y_test,svm_pred, average="weighted")}')

linear
SVM f1_score kernel(linear): 0.8422922535440344


In [89]:
with open('SVM.pkl','wb') as f:
    pickle.dump(svm,f)

#### <b>Decision Tree Classifier</b>

In [90]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
# Base model with class balancing
dtree = DecisionTreeClassifier(class_weight='balanced')

# Expanded parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'random_state': [0, 42]
}

# 5-fold stratified CV to preserve class distribution
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search with F1 scoring (good for imbalanced datasets)
grid_search = GridSearchCV(
    estimator=dtree,
    param_grid=param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)

# Fit grid search
grid_search.fit(x_train, y_train)

# Best model with optimal parameters
best_dtree = grid_search.best_estimator_

# Fit best model
best_dtree.fit(x_train, y_train)

# Predict and evaluate
dtc_pred = best_dtree.predict(x_test)

print("Optimized Decision Tree Accuracy:", accuracy_score(y_test, dtc_pred))


Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Optimized Decision Tree Accuracy: 0.8260869565217391


In [91]:
with open('Dtree.pkl','wb') as f:
    pickle.dump(best_dtree,f)

#### <b>Random Forest Classifier</b>

In [92]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

# Create the base RandomForest model
rfc = RandomForestClassifier(class_weight='balanced')

# Expanded hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [6, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'random_state': [42]
}

# Use stratified k-fold for balanced validation
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search with f1 score (you can switch to 'roc_auc' if needed)
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search.fit(x_train, y_train)

# Best model with optimal params
best_rfc = grid_search.best_estimator_

# Fit the best model on training data
best_rfc.fit(x_train, y_train)

# Predict on test data
rfc_pred = best_rfc.predict(x_test)

# Evaluate
print("Random Forest Accuracy:", accuracy_score(y_test, rfc_pred))

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Random Forest Accuracy: 0.8641304347826086


In [93]:
with open('RandomForest.pkl','wb') as f:
    pickle.dump(best_rfc,f)