# Library Imports


In [1]:
# Import necessary libraries
import numpy as np
import pandas  as pd
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Loading the Dataset

In [2]:
# Load the dataset
df=pd.read_csv("heart_2020_cleaned.csv")

In [3]:
df.head(89)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,No,27.46,Yes,No,No,7.0,15.0,No,Female,75-79,White,No,No,Good,7.0,No,No,No
85,No,23.78,No,No,No,0.0,5.0,No,Female,70-74,White,No,Yes,Excellent,8.0,No,No,No
86,No,34.75,Yes,No,No,0.0,0.0,No,Female,45-49,White,No,No,Very good,7.0,No,No,Yes
87,No,22.67,No,No,No,0.0,3.0,No,Female,80 or older,Black,No,Yes,Good,8.0,No,No,No


In [4]:
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [6]:
print(df.shape)

(319795, 18)


# Data Preprocessing

In [7]:
#Data mapping is a technique used to transform different types of data 
#to a common format that is suitable for machine learning models to improve 
#the model's performance. For example, BMI, a continuous variable, can be mapped
#to four categories: Normal weight BMI, Underweight BMI, Overweight BMI, and
#Obese. This mapping helps the model to better understand the patterns in the
#data and make more accurate predictions.


In [8]:
target=df["HeartDisease"]
df.drop(["HeartDisease"], axis=1, inplace=True)


### Categorical Features Conversion

In [9]:
df.replace("Yes",1,inplace=True)
df.replace("No",0,inplace=True)

In [10]:
df.AgeCategory.unique()

array(['55-59', '80 or older', '65-69', '75-79', '40-44', '70-74',
       '60-64', '50-54', '45-49', '18-24', '35-39', '30-34', '25-29'],
      dtype=object)

In [11]:
df.replace("18-24",0,inplace=True)
df.replace("25-29",1,inplace=True)
df.replace("30-34",2,inplace=True)
df.replace("35-39",3,inplace=True)
df.replace("40-44",4,inplace=True)
df.replace("45-49",5,inplace=True)
df.replace("50-54",6,inplace=True)
df.replace("55-59",7,inplace=True)
df.replace("60-64",8,inplace=True)
df.replace("65-69",9,inplace=True)
df.replace("70-74",10,inplace=True)
df.replace("75-79",11,inplace=True)
df.replace("80 or older",13,inplace=True)

In [12]:
df.Diabetic.unique()

array([1, 0, 'No, borderline diabetes', 'Yes (during pregnancy)'],
      dtype=object)

In [13]:
df.replace("No, borderline diabetes",2,inplace=True)
df.replace("Yes (during pregnancy)",3,inplace=True)

In [14]:
df.GenHealth.unique()


array(['Very good', 'Fair', 'Good', 'Poor', 'Excellent'], dtype=object)

In [15]:
df.replace("Excellent",0,inplace=True)
df.replace("Good",1,inplace=True)
df.replace("Fair",2,inplace=True)
df.replace("Very good",3,inplace=True)
df.replace("Poor",4,inplace=True)

In [16]:
df.Race.unique()

array(['White', 'Black', 'Asian', 'American Indian/Alaskan Native',
       'Other', 'Hispanic'], dtype=object)

In [17]:
df.replace("White",0,inplace=True)
df.replace("Other",1,inplace=True)
df.replace("Black",2,inplace=True)
df.replace("Hispanic",3,inplace=True)
df.replace("Asian",4,inplace=True)
df.replace("American Indian/Alaskan Native",5,inplace=True)

In [18]:
df.Sex.unique()

array(['Female', 'Male'], dtype=object)

In [19]:

df.replace("Female",0,inplace=True)
df.replace("Male",1,inplace=True)

###  BMI Categorization

In [20]:
df.BMI.unique()

array([16.6 , 20.34, 26.58, ..., 62.42, 51.46, 46.56])

In [21]:
df['BMI'].mask(df['BMI']  < 18.5, 0, inplace=True)
df['BMI'].mask(df['BMI'].between(18.5,25), 1, inplace=True)
df['BMI'].mask(df['BMI'].between(25,30), 2, inplace=True)
df['BMI'].mask(df['BMI']  > 30, 3, inplace=True)

In [22]:
df.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0.0,1,0,0,3.0,30.0,0,0,7,0,1,1,3,5.0,1,0,1
1,1.0,0,0,1,0.0,0.0,0,0,13,0,0,1,3,7.0,0,0,0
2,2.0,1,0,0,20.0,30.0,0,1,9,0,1,1,2,8.0,1,0,0
3,1.0,0,0,0,0.0,0.0,0,0,11,0,0,0,1,6.0,0,0,1
4,1.0,0,0,0,28.0,0.0,1,0,4,0,0,1,3,8.0,0,0,0


# Train-Test Split

In [23]:
# Split the data into training and testing
X_train,X_test,y_train,y_test = train_test_split(df,target,test_size=50,random_state=2)

# Training the Logistic Regression Model

In [24]:
# Train a logistic regression model on the training set
LogRegModel=LogisticRegression()

In [25]:
LogRegModel.fit(X_train, y_train)

In [26]:
# Make predictions on the test data
y_pred = LogRegModel.predict(X_test)


# Evaluate Model Performance

In [27]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.9200


In [28]:
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[46  0]
 [ 4  0]]


In [29]:
from sklearn.metrics import classification_report

# Generate a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

          No       0.92      1.00      0.96        46
         Yes       0.00      0.00      0.00         4

    accuracy                           0.92        50
   macro avg       0.46      0.50      0.48        50
weighted avg       0.85      0.92      0.88        50



# Saving the Model with Pickle

In [30]:
# Save the model using pickle
with open('LogRegModel.pkl', 'wb') as f:
    pickle.dump(LogRegModel, f)