In [1]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Loading the dataset
dataset = pd.read_csv('Cardiovascular_Disease.csv')

In [3]:
#Defining independent variables in x and dependent variable (target) in y
x = dataset.iloc[:,:18].values
y = dataset.iloc[:,-1].values

In [4]:
#Checking for missing values
missing_values = dataset.isnull().sum()
print('Missing values count:', missing_values)

Missing values count: General_Health                  0
Checkup                         0
Exercise                        0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
Heart_Disease                   0
dtype: int64


In [5]:
#Applying Ordinal Encoding on the 1st column

from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()

encoded_column = oe.fit_transform(dataset[['General_Health']])
dataset['General_Health'] = encoded_column

#viewing the Ordinal Encoded values
dataset[['General_Health']].head(5)

Unnamed: 0,General_Health
0,3.0
1,4.0
2,4.0
3,3.0
4,2.0


In [6]:
#Applying Label Encoding on all the required columns
from sklearn.preprocessing import LabelEncoder

#Initializing and fitting LabelEncoder for required columns

columns_to_LabelEncode = ['Exercise','Skin_Cancer','Other_Cancer','Depression','Diabetes','Arthritis','Sex','Smoking_History']

encoded_columns = {}

for column in columns_to_LabelEncode:
    encoded_columns[column] = LabelEncoder().fit_transform(dataset[column])
    
#Updating the dataset with encoded values
for column in encoded_columns:
    dataset[column] = encoded_columns[column]
    
#Seperately handling the target (dependent) variable
target_column = 'Heart_Disease'
dataset[target_column] = LabelEncoder().fit_transform(dataset[target_column])

#Viewing the LabelEncoded data
dataset[['Exercise','Skin_Cancer','Other_Cancer','Depression','Diabetes','Arthritis','Sex','Smoking_History','Heart_Disease']].head(5)


Unnamed: 0,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Smoking_History,Heart_Disease
0,0,0,0,0,0,1,0,1,0
1,0,0,0,0,1,0,0,0,1
2,1,0,0,0,1,0,0,0,0
3,1,0,0,0,1,0,1,0,1
4,0,0,0,0,0,0,1,1,0


In [7]:
#Applying One Hot Encoding on all the required columns
from sklearn.preprocessing import OneHotEncoder

columns_to_OneHotEncode = ['Checkup', 'Age_Category']

#Initializign the OneHotEncoder
ohencoder = OneHotEncoder(drop='first', sparse=False)

#Fitting and transforming the selected columns
encoded_columns = ohencoder.fit_transform(dataset[columns_to_OneHotEncode])

#Creating a new dataset with the encoded column
encoded_dataset = pd.DataFrame(encoded_columns, columns=ohencoder.get_feature_names_out(columns_to_OneHotEncode))

#Displaying the encoded data
print(encoded_dataset)

#Concatenating the original dataset with the encoded dataset
dataset = pd.concat([dataset, encoded_dataset], axis=1)



        Checkup_Never  Checkup_Within the past 2 years  \
0                 0.0                              1.0   
1                 0.0                              0.0   
2                 0.0                              0.0   
3                 0.0                              0.0   
4                 0.0                              0.0   
...               ...                              ...   
308849            0.0                              0.0   
308850            0.0                              0.0   
308851            0.0                              0.0   
308852            0.0                              0.0   
308853            0.0                              0.0   

        Checkup_Within the past 5 years  Checkup_Within the past year  \
0                                   0.0                           0.0   
1                                   0.0                           1.0   
2                                   0.0                           1.0   
3          

In [8]:
#Redefining independent and dependent variables by including the encoded data in independent variable
df1 = dataset.iloc[:,0]     # Considering 'General Health' column
df2 = dataset.iloc[:,2:9]   # Excluding 'Checkup' and 'Age Category' columns as they have categorical data 
df3 = dataset.iloc[:,12:18] # Excluding 'Height_(cm)', 'Weight_(kg)' columns as we have BMI column
df4 = dataset.iloc[:,19:-1] # Including all One Hot Encoded columns after the independent variable

x = pd.concat([df1, df2, df3, df4], axis=1)
y = dataset.iloc[:,18]
data = pd.concat([x, y], axis=1) # 'data' now consists of the cleaned and final data.

In [9]:
#Performing Feature Selection with Mutual Information
from sklearn.feature_selection import SelectKBest, mutual_info_classif

#Selecting the top k features based on mutual information
selector = SelectKBest(score_func=mutual_info_classif, k=7)
x = selector.fit_transform(x, y) #Replacing the dependent variables with the top k features

#Getting the indices of selected features and their corresponding scores
selected_indices = selector.get_support(indices=True)
selected_scores = selector.scores_
print(data.columns[selected_indices])

Index(['General_Health', 'Exercise', 'Diabetes', 'Arthritis', 'Sex',
       'Smoking_History', 'Checkup_Within the past year'],
      dtype='object')


In [10]:
#Splitting data into Training Data and Testing Data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=1)

In [11]:
#Standardizing the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [13]:
from sklearn.naive_bayes import GaussianNB
#Creating a Gaussian Naïve Bayes classifier
gnb = GaussianNB()

#Training the model using the training data
gnb.fit(x_train, y_train)

In [14]:
#Predicting the results of the test set
y_pred = gnb.predict(x_test)

In [15]:
#Calculating Accuracy Percent
from sklearn.metrics import accuracy_score
a = accuracy_score(y_test, y_pred)
print('Accuracy percent: '+str(a*100)+'%')

Accuracy percent: 87.99760405368214%


In [16]:
#Calculating Precision Score
from sklearn.metrics import precision_score
p = precision_score(y_test, y_pred)
print(p)

0.2356122890523583


In [17]:
#Calculating Recall Score
from sklearn.metrics import recall_score
recall_score(y_test, y_pred)

0.21911468812877263

In [18]:
#Calculating F1 Score
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print(f1)

0.22706422018348624


In [30]:
#Accessing the estimated prior probabilities of each class
prior_probs = gnb.class_prior_
class_labels = gnb.classes_
print("Prior Class Label & Probabilities:", dict(zip(class_labels, prior_probs)))

Prior Class Label & Probabilities: {0: 0.9190514928182028, 1: 0.0809485071817972}


In [31]:
#Calculating class probabilities for the test data
class_probs = gnb.predict_proba(x_test)

#Display the class probabilities for the first few instances
print("Class Probabilities:", class_probs[:5]) # Print probabilities for the first 5 instances

Class Probabilities: [[0.98777418 0.01222582]
 [0.90839878 0.09160122]
 [0.98641301 0.01358699]
 [0.84731306 0.15268694]
 [0.66861663 0.33138337]]
