In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the dataset from the CSV file in the working directory and show its
# (rows, columns) shape.
df=pd.read_csv('Heart Disease.csv')
df.shape

(319795, 18)

# Removing duplicate rows

In [3]:
# Discard exact duplicate rows (the original index labels are kept), then
# show the remaining (rows, columns) shape.
df = df.drop_duplicates()
df.shape

(301717, 18)

In [4]:
# Count how many rows fall into each HeartDisease class after de-duplication.
df.HeartDisease.value_counts()

No     274456
Yes     27261
Name: HeartDisease, dtype: int64

# Encoding the dataset with LabelEncoder


In [5]:
# Create the LabelEncoder instance that the encoding cell below reuses for
# every non-numeric column.
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()

In [6]:
# Use the public pandas.api.types entry point rather than the private
# pandas.core.dtypes.common module, whose location is not a stable API.
from pandas.api.types import is_numeric_dtype

# Replace every non-numeric column with integer codes; numeric columns are
# left untouched. The single `label` encoder is re-fitted for each column, so
# after the loop it only remembers the classes of the last encoded column.
for column in df.columns:
    if not is_numeric_dtype(df[column]):
        df[column] = label.fit_transform(df[column])

# Separating x and y

In [7]:
# Target vector is the HeartDisease column; the feature matrix is every
# other column.
y = df['HeartDisease']
x = df.drop(columns='HeartDisease')

In [8]:
# Display the feature matrix.
x

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,16.60,1,0,0,3,30,0,0,7,5,2,1,4,5,1,0,1
1,20.34,0,0,1,0,0,0,0,12,5,0,1,4,7,0,0,0
2,26.58,1,0,0,20,30,0,1,9,5,2,1,1,8,1,0,0
3,24.21,0,0,0,0,0,0,0,11,5,0,0,2,6,0,0,1
4,23.71,0,0,0,28,0,1,0,4,5,0,1,4,8,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,27.41,1,0,0,7,0,1,1,8,3,2,0,1,6,1,0,0
319791,29.84,1,0,0,0,0,0,1,3,3,0,1,4,5,1,0,0
319792,24.24,0,0,0,0,0,0,0,5,3,0,1,2,6,0,0,0
319793,32.81,0,0,0,0,0,0,0,1,3,0,0,2,12,0,0,0


In [9]:
# Display the encoded target vector.
y

0         0
1         0
2         0
3         0
4         0
         ..
319790    1
319791    0
319792    0
319793    0
319794    0
Name: HeartDisease, Length: 301717, dtype: int32

# Handling imbalanced data


In [10]:
# Class counts of the encoded target (0 / 1).
df.HeartDisease.value_counts()

0    274456
1     27261
Name: HeartDisease, dtype: int64

In [11]:
# Share of each class in percent, based on the class counts displayed by the
# previous cell (274456 rows of class 0, 27261 rows of class 1).
count_no, count_yes = 274456, 27261
count_total = count_no + count_yes
not_HeartDisease = (count_no / count_total) * 100
HeartDisease = (count_yes / count_total) * 100

In [12]:
# Report the percentage of rows without heart disease.
print('No HeartDisease {} percent of the total people'.format(not_HeartDisease))

No HeartDisease 90.96471196518591 percent of the total people


In [13]:
# Report the percentage of rows with heart disease.
print('HeartDisease {} percent of the total people'.format(HeartDisease))

HeartDisease 9.03528803481408 percent of the total people


In [14]:
# Split the frame into one subset per encoded target class.
HeartDisease_no = df.loc[df['HeartDisease'] == 0]
HeartDisease_yes = df.loc[df['HeartDisease'] == 1]

In [15]:
# Shape of the class-0 subset.
HeartDisease_no.shape

(274456, 18)

In [16]:
# Shape of the class-1 subset.
HeartDisease_yes.shape

(27261, 18)

# The classes are heavily imbalanced, so we balance the dataset by applying random oversampling



In [17]:
# Build the random over-sampler; the fixed random_state keeps the resampling
# reproducible across runs.
from imblearn.over_sampling import RandomOverSampler
random_over_sampler = RandomOverSampler(random_state =100)

In [18]:
# Over-sample the full feature matrix / target and display the resampled
# features.
# NOTE(review): resampling is applied to the whole dataset, and the
# train/test split further down is taken from x_new / y_new. Duplicated rows
# can therefore appear in both the training and the test portion, which may
# inflate the reported scores. Consider splitting first and resampling only
# the training portion.
x_new,y_new=random_over_sampler.fit_resample(x,y)
x_new

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,16.60,1,0,0,3,30,0,0,7,5,2,1,4,5,1,0,1
1,20.34,0,0,1,0,0,0,0,12,5,0,1,4,7,0,0,0
2,26.58,1,0,0,20,30,0,1,9,5,2,1,1,8,1,0,0
3,24.21,0,0,0,0,0,0,0,11,5,0,0,2,6,0,0,1
4,23.71,0,0,0,28,0,1,0,4,5,0,1,4,8,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548907,35.43,0,0,0,0,0,0,0,12,5,0,1,4,5,0,0,1
548908,23.01,1,0,0,0,20,0,1,8,5,0,1,4,6,0,0,0
548909,30.80,0,0,0,0,0,0,0,10,4,2,0,2,5,0,0,0
548910,44.63,0,0,0,0,7,1,0,9,2,2,0,2,8,0,0,0


In [19]:
# Display the resampled target vector.
y_new

0         0
1         0
2         0
3         0
4         0
         ..
548907    1
548908    1
548909    1
548910    1
548911    1
Name: HeartDisease, Length: 548912, dtype: int32

In [20]:
# Shape of the resampled feature matrix.
x_new.shape

(548912, 17)

In [21]:
# Shape of the resampled target vector.
y_new.shape

(548912,)

In [22]:
# Class counts after resampling, to check that both classes are now equal in size.
y_new.value_counts()

0    274456
1    274456
Name: HeartDisease, dtype: int64

# Bernoulli Naive Bayes with holdout validation on the randomly oversampled data


In [24]:
# Import the split helper and create a Bernoulli Naive Bayes classifier with
# its default settings.
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import BernoulliNB
bnb=BernoulliNB()

In [25]:
# Hold out 30% of the resampled data for testing (fixed seed for a
# reproducible split), then fit the classifier on the remaining 70%.
over_xtrain, over_xtest, over_ytrain, over_ytest = train_test_split(
    x_new,
    y_new,
    test_size=0.30,
    random_state=42,
)
bnb.fit(over_xtrain, over_ytrain)

BernoulliNB()

In [26]:
# Score the fitted classifier on the held-out test portion and display the result.
OverSampleing_dataset_accuracy9= bnb.score(over_xtest,over_ytest)
OverSampleing_dataset_accuracy9

0.7020901903154111

# Bernoulli Naive Bayes with K-fold cross-validation on the randomly oversampled data


In [27]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold splitter for cross-validation on the resampled data.
# Shuffling is required here: the displayed y_new starts with class 0 and
# ends in a run of class 1, so contiguous, unshuffled folds would give
# strongly skewed (or single-class) test folds. The fixed seed keeps the
# fold assignment reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Cross-validate the classifier on the resampled data with the `kfold`
# splitter and display the per-fold scores.
OverSampleingDataset_kfold_result9 = cross_val_score(
    bnb,
    x_new,
    y_new,
    cv=kfold,
)
OverSampleingDataset_kfold_result9

In [None]:
# Mean of the per-fold K-fold scores.
OverSampleingDataset_kfold_result9.mean()

# Bernoulli Naive Bayes with stratified K-fold cross-validation on the oversampled data

In [None]:
# 10-fold stratified splitter (default settings otherwise) for the
# cross-validation cell below.
from sklearn.model_selection import StratifiedKFold
Stf_Kfold = StratifiedKFold(n_splits = 10)

In [None]:
# Cross-validate the classifier on the resampled data with the stratified
# splitter and display the per-fold scores.
# The original call passed the undefined name `x9_new` and the literal `9`
# as the target; the resampled features / target are `x_new` / `y_new`.
OverSampleingDataset_Sf_kfold_result9 = cross_val_score(bnb, x_new, y_new, cv=Stf_Kfold)
OverSampleingDataset_Sf_kfold_result9

In [None]:
# Mean of the per-fold stratified K-fold scores.
OverSampleingDataset_Sf_kfold_result9.mean()

# Confusion-matrix performance metrics for the oversampled data


In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# `plot_roc_curve` was removed in scikit-learn 1.2. Fall back to the
# replacement API under the same name so the ROC cell keeps working on both
# old and new releases (both accept estimator, X, y plus plotting kwargs).
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

# Fit the classifier on the holdout training portion before predicting.
bnb.fit(over_xtrain, over_ytrain)

In [None]:
# Predicted class for every row of the holdout test portion.
# The original referenced `mnb`, which is never defined in this notebook;
# the fitted Bernoulli Naive Bayes model is `bnb`.
pred_y9 = bnb.predict(over_xtest)
pred_y9

In [None]:
# Display the true labels of the holdout test portion.
over_ytest

In [None]:
# Same true labels as a plain NumPy array, for side-by-side comparison with pred_y9.
np.array(over_ytest) #actual class/y

In [None]:
# Print the classification report comparing the true test labels with the predictions.
print(classification_report(over_ytest,pred_y9))

In [None]:
# Confusion matrix of true test labels (first argument) against predictions
# (second argument); displayed as the cell output.
cm9=confusion_matrix(over_ytest,pred_y9)
cm9

# Heatmap visualization of the confusion matrix for Bernoulli NB


In [None]:
# Draw the confusion matrix as an annotated heatmap.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') would render large counts in scientific notation.
sns.heatmap(cm9, annot=True, fmt='d')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')

In [None]:
# Unpack the 2x2 confusion matrix into its four counts.
# scikit-learn puts true labels on rows and predictions on columns, with
# classes in sorted order (0, 1), so the flattened order is tn, fp, fn, tp.
# The original unpacked it as tp, fn, fp, tn, which swapped the roles of the
# two classes in every metric derived from these names.
tn, fp, fn, tp = confusion_matrix(over_ytest, pred_y9).ravel()

In [None]:
# Display the count bound to `tp` by the unpacking cell above.
tp

In [None]:
# Display the count bound to `fn` by the unpacking cell above.
fn

In [None]:
# Display the count bound to `fp` by the unpacking cell above.
fp

In [None]:
# Display the count bound to `tn` by the unpacking cell above.
tn

# Accuracy


In [None]:
# Accuracy: the tp and tn counts divided by the sum of all four counts.
(tp+tn)/(tp+tn+fn+fp)

In [None]:
#precision
# Precision (positive predictive value): tp / (tp + fp).
ppv = tp/(tp+fp)
print(ppv)

In [None]:
#Recall
# Recall (true positive rate): tp / (tp + fn).
trp = tp/(tp+fn)
print(trp)

In [None]:
#1-Specificity
# False positive rate (1 - specificity): fp / (fp + tn).
fpr = fp/(fp+tn)
print(fpr)

In [None]:
#F1-score
# F1 score: harmonic mean of precision (ppv) and recall (trp).
f1_score = (2*ppv*trp)/(ppv+trp)
print(f1_score)

In [None]:
#Specificity
# Specificity (true negative rate): tn / (tn + fp).
tnr = tn/(tn+fp)
print(tnr)

In [None]:
#CSI
# Critical success index: tp / (tp + fn + fp).
csi = tp/(tp+fn+fp)
print(csi)

In [None]:
#FDR
# False discovery rate: fp / (tp + fp).
fdr = fp/(tp+fp)
print(fdr)

# ROC curve and AUC for Bernoulli NB


In [None]:
# ROC curve of the fitted classifier on the holdout test portion (red),
# with the (0, 0) -> (1, 1) diagonal drawn on the same axes as a reference.
plot_roc_curve(
    bnb,
    over_xtest,
    over_ytest,
    color='red',
)
plt.plot([0, 1], [0, 1])

# Tuning for Bernoulli NB