In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
df=pd.read_excel('/kaggle/input/heart-disease-dataset/Heart Disease.xlsx')
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.duplicated()

In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(df.corr(),annot=True)

In [None]:
#sns.countplot(df.HeartDisease)
sns.countplot(x='Smoking',hue='HeartDisease',data=df)

# NB: The Dataset is not balanced.

In [None]:
data = df[df['HeartDisease']=='Yes']
data

In [None]:
smoke=data['Smoking'].value_counts()
smoke

In [None]:
labels=["No",'Yes']

In [None]:
plt.figure(figsize=(5,5))
plt.pie(smoke,labels=labels,autopct='%2.2f%%',shadow=True)#autopct enables you to display the percent value using Python string formatting
plt.legend(title='Smoking',loc='lower right')
plt.show()

In [None]:
def bar_chart(feature):
    """Plot grouped bars of `feature` category counts, split by heart-disease status.

    Parameters
    ----------
    feature : str
        Column of the notebook-level ``df`` whose categories are counted.

    Notes
    -----
    Reads ``df`` from the notebook namespace and compares ``HeartDisease``
    against the raw 'Yes'/'No' strings, so it must run before that column is
    label-encoded. Draws a new figure and returns None.
    """
    without_disease=df[df['HeartDisease']=='No'][feature].value_counts()
    with_disease=df[df['HeartDisease']=='Yes'][feature].value_counts()
    counts=pd.DataFrame([without_disease,with_disease])
    # The two rows are heart-disease status (No / Yes), not survival outcomes.
    counts.index=['No heart disease','Heart disease']
    counts.plot(kind='bar',figsize=(10,5),title=f'{feature} by heart-disease status')

In [None]:
bar_chart('Smoking')

In [None]:
sns.distplot(df['BMI']) #A Distplot or distribution plot, depicts the variation in the data distribution.

In [None]:
def kde(x):
    """Plot kernel density estimates of column `x`, one curve per HeartDisease value.

    Parameters
    ----------
    x : str
        Name of a column of the notebook-level ``df``.

    Notes
    -----
    Reads ``df`` from the notebook namespace, shows the figure and returns None.
    """
    facet=sns.FacetGrid(df,hue="HeartDisease",aspect=4)
    # `fill` replaces kdeplot's deprecated `shade` keyword.
    facet.map(sns.kdeplot,x,fill=True)
    # Clamp the x-axis to the observed range of the column.
    facet.set(xlim=(df[x].min(),df[x].max()))
    facet.add_legend()
    plt.show()

In [None]:
# Per-class density of each numeric column, one figure per cell.
kde('BMI')

In [None]:
kde('PhysicalHealth')

In [None]:
kde('MentalHealth')

In [None]:
kde('SleepTime')

In [None]:
# Category counts of AlcoholDrinking, split by heart-disease status.
bar_chart('AlcoholDrinking')

In [None]:
AlcoholDrinking=data['AlcoholDrinking'].value_counts()
AlcoholDrinking

In [None]:
plt.figure(figsize=(5,5))
plt.pie(AlcoholDrinking,labels=labels,autopct='%2.2f%%',shadow=True)#autopct enables you to display the percent value using Python string formatting
plt.legend(title='',loc='lower right')
plt.show()

In [None]:
bar_chart('Stroke')

In [None]:
Stroke=data['Stroke'].value_counts()
Stroke

In [None]:
plt.figure(figsize=(5,5))
plt.pie(Stroke,labels=labels,autopct='%2.1f%%',shadow=True)
circle=plt.Circle(xy=(0,0),radius=0.75,facecolor='white')
plt.gca().add_artist(circle)
plt.legend(title='Stroke',loc='upper right')
plt.show()

In [None]:
bar_chart('DiffWalking')

In [None]:
DiffWalking=data['DiffWalking'].value_counts()

In [None]:
plt.figure(figsize=(5,5))
plt.pie(DiffWalking,labels=labels,explode=[0,0.025] ,autopct='%2.1f%%',shadow=True)
circle=plt.Circle(xy=(0,0),radius=0.75,facecolor='white')
plt.gca().add_artist(circle)
plt.legend(title='DiffWalking',loc='upper right',bbox_to_anchor=(1.2,1))
plt.show()

In [None]:
Sex=df['Sex'].value_counts()
Sex
labels=['Female','Male']
plt.figure(figsize=(6,6))
plt.pie(Sex,labels=labels,explode=[0,0.05] ,autopct='%2.1f%%',shadow=True)
circle=plt.Circle(xy=(0,0),radius=0.75,facecolor='white')
plt.gca().add_artist(circle)
plt.legend(title='Sex',loc='upper right')
plt.show()

In [None]:
Race=df['Race'].value_counts()
Race

In [None]:
labels=['White','Hispanic','Black','Other','Asian','American Indian/Alaskan Native']
plt.figure(figsize=(5,5))
plt.pie(Race,labels=labels,autopct='%2.1f%%',shadow=True)
circle=plt.Circle(xy=(0,0),radius=0.65,facecolor='white')
plt.gca().add_artist(circle)
plt.legend(title='Race',loc='upper right',framealpha=0.5,fontsize=8,bbox_to_anchor=(2,1))
plt.show()

In [None]:
PhysicalActivity=df['PhysicalActivity'].value_counts()
PhysicalActivity

In [None]:
labels=['Yes','No']
plt.figure(figsize=(5,5))
plt.pie(PhysicalActivity,labels=labels,explode=[0,0.05] ,autopct='%2.1f%%',shadow=True)
circle=plt.Circle(xy=(0,0),radius=0.75,facecolor='white')
plt.gca().add_artist(circle)
plt.legend(title='Physical Activity',loc='upper right',bbox_to_anchor=(1.15,1))
plt.show()

In [None]:
bar_chart('Asthma')


In [None]:
SkinCancer=data['SkinCancer'].value_counts()
SkinCancer

In [None]:
plt.figure(figsize=(5,5))
plt.pie(SkinCancer,labels=labels,explode=[0,0.05] ,autopct='%2.1f%%',shadow=True)
circle=plt.Circle(xy=(0,0),radius=0.75,facecolor='white')
plt.gca().add_artist(circle)
plt.legend(title='Skin Cancer',loc='upper right',bbox_to_anchor=(1.1,1))
plt.show()

In [None]:
 bar_chart('SkinCancer')

# Data preprocessing for balancing


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error,accuracy_score

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le=LabelEncoder()

In [None]:
list=['HeartDisease','Smoking', 'AlcoholDrinking', 'Stroke','DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']
for i in list:
    df[i]=le.fit_transform(df[i])

In [None]:
df.head()

In [None]:
x=df.drop(columns=['HeartDisease'])
y=df['HeartDisease']

In [None]:
x.sample(10)

# Undersampling

In [None]:
from imblearn.under_sampling import NearMiss

In [None]:
# Resample x / y with imblearn's NearMiss under-sampler (default settings).
NearMiss_obj = NearMiss()
new_x , new_y = NearMiss_obj.fit_resample(x,y)

In [None]:
new_y

In [None]:
# Class counts after resampling.
new_y.value_counts()

In [None]:
df2=pd.DataFrame(new_x)
df2.head()


In [None]:
df3=pd.DataFrame(new_y)
df3.head()

In [None]:
# Resampled features and target side by side; df4 is the frame the later
# "balanced" EDA cells read.
df4=pd.concat([df2,df3],axis=1)
df4

# Feature ranking


In [None]:
df4.nunique()

In [None]:
from sklearn.ensemble import ExtraTreesClassifier # Decison tree

In [None]:
extr = ExtraTreesClassifier()

In [None]:
extr.fit(new_x,new_y)

In [None]:
feature_importance = extr.feature_importances_
feature_importance  

In [None]:
imp = pd.DataFrame(feature_importance, columns=['Gain_Score'])
imp.head(10)

In [None]:
new_x.columns

In [None]:
cols = pd.DataFrame(new_x.columns, columns=['Feature_Names'])
cols.head(10)

In [None]:
gains = pd.concat([cols,imp],axis=1)
gains

In [None]:
newx = gains.nlargest(18,'Gain_Score')
newx

In [None]:
sns.barplot(x='Gain_Score',y='Feature_Names',data=newx)


In [None]:
features = pd.Series(extr.feature_importances_, index = x.columns)
plt.figure(figsize=(14,6))
features.nlargest(10).plot(kind='barh', color='g')

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2, f_regression #chi2 only when features are non negative

In [None]:
# Univariate scoring of each feature with f_classif.
model2 = SelectKBest(score_func=f_classif)


In [None]:
# `feature_score` is whatever fit() returns; the cells below only read its scores_.
feature_score = model2.fit(new_x,new_y)


In [None]:
feature_score.scores_

In [None]:
# NOTE: `cols` is rebound here; it previously held the feature-name frame.
cols = pd.DataFrame(feature_score.scores_ , columns=['Feature_Scores'])
cols

In [None]:
col2 = pd.DataFrame(new_x.columns, columns=['Feature_Names'])
col2.head()

In [None]:
# Feature names next to their scores (rows align by position).
scores = pd.concat([col2,cols],axis=1)
scores

In [None]:
# Rows sorted by score, highest first (at most 18 rows are kept).
new = scores.nlargest(18,'Feature_Scores')
new

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x='Feature_Scores',y='Feature_Names',data=new)
plt.title('Feature ranking using SelectKBest classifier',fontsize=18)


# PCA

In [None]:
#FEATURE SCALING 
from sklearn.preprocessing import MinMaxScaler

In [None]:
mmx = MinMaxScaler() 

In [None]:
# Scale the resampled features with MinMaxScaler before PCA.
scaled_x = mmx.fit_transform(new_x)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the scaled features onto three principal components.
pca = PCA(n_components=3)
x_pca = pca.fit_transform(scaled_x)

In [None]:
x_pca

In [None]:
# NOTE: `features` is rebound here; it previously held the importance Series.
features = pd.DataFrame(x_pca, columns=['pca1','pca2','pca3'])

In [None]:
import plotly.express as pl
# NOTE(review): points are coloured by pca1, which is already the x-axis;
# colouring by the target class may be what was intended — confirm.
pl.scatter_3d(features, x='pca1',y='pca2',z='pca3', color='pca1')

# EDA for Balanced Data


In [None]:
# Same Smoking-by-HeartDisease count plot as on the raw data, now on the resampled frame df4.
sns.countplot(x='Smoking',hue='HeartDisease',data=df4)


In [None]:
def KDE(x):
    """Plot kernel density estimates of column `x` of ``df4``, one curve per HeartDisease value.

    Parameters
    ----------
    x : str
        Name of a column of the notebook-level ``df4`` frame.

    Notes
    -----
    Reads ``df4`` from the notebook namespace, shows the figure and returns None.
    """
    facet=sns.FacetGrid(df4,hue="HeartDisease",aspect=4)
    # `fill` replaces kdeplot's deprecated `shade` keyword.
    facet.map(sns.kdeplot,x,fill=True)
    # Clamp the x-axis to the observed range of the column.
    facet.set(xlim=(df4[x].min(),df4[x].max()))
    facet.add_legend()
    plt.show()

In [None]:
# Per-class densities on the resampled frame df4, one figure per cell.
KDE('BMI')

In [None]:
KDE('PhysicalHealth')

In [None]:
KDE('SleepTime')

In [None]:
# Pairwise correlations of the resampled frame.
df4.corr()

In [None]:
plt.figure(figsize=(16,12))
# The correlation matrix is recomputed here rather than reused from the previous cell.
sns.heatmap(df4.corr(),annot=True)

# Split Dataset into Train and Test


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 75% of the resampled rows go to training; random_state fixes the shuffle.
xtrain,xtest,ytrain,ytest=train_test_split(new_x,new_y,train_size=0.75,random_state=42)

In [None]:
xtest.head(50)

In [None]:
ytest.head(50)

In [None]:
from sklearn.tree import DecisionTreeClassifier
import xgboost 
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier

In [None]:
# The stochastic estimators share the seed used for the train/test split so the
# reported accuracies are reproducible across kernel restarts.
clf=DecisionTreeClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
ada=AdaBoostClassifier(random_state=42)
neigh=KNeighborsClassifier(n_neighbors= 50)
log=LogisticRegression()
svm = SVC(kernel='linear')
rt = RandomForestClassifier(random_state=42)

In [None]:
def model(xtrain,ytrain,xtest,ytest):
    """Fit the seven notebook-level classifiers and collect their accuracies.

    Parameters
    ----------
    xtrain, ytrain
        Training features and labels; every estimator is fitted on these.
    xtest, ytest
        Held-out features and labels used for the test accuracy.

    Returns
    -------
    tuple
        ``(model_name, accuracy, train_accuracy)`` — three lists aligned by
        position: display name, score on the test split, score on the
        training split.

    Notes
    -----
    Uses the estimators ``log``, ``neigh``, ``ada``, ``clf``, ``xgb``, ``svm``
    and ``rt`` from the notebook namespace and fits them in place.
    """
    # Each display name is paired with its own estimator so a score can never be
    # read from the wrong model (the XGBoost test accuracy was previously taken
    # from the decision tree).
    estimators=[
        ('LogisticRegression',log),
        ('KNNClaccification',neigh),
        ('AdaBoostClassifier',ada),
        ('DecisionTreeClassifier',clf),
        ('XGBClassifier',xgb),
        ('SVM',svm),
        ('RandomForestClassifier',rt),
    ]
    model_name=[]
    accuracy=[]
    train_accuracy=[]

    for name,estimator in estimators:
        estimator.fit(xtrain,ytrain)
        model_name.append(name)
        accuracy.append(estimator.score(xtest,ytest))
        train_accuracy.append(estimator.score(xtrain,ytrain))

    return model_name,accuracy,train_accuracy
    

In [None]:
model_name,test,train=model(xtrain,ytrain,xtest,ytest)

In [None]:
test

In [None]:
ada.predict([['26.54','0','0','0','0','0','0','1','6','5','0','1','0','7','0','0','0']])#Actual value of row no 13501 is 0

In [None]:
ada.predict([['34.33','1','0','0','0','0','1','0','11','5','0','1','3','7','0','1','1']])#Actual value of row no 53055 is 1

In [None]:
svm.predict([['34.33','1','0','0','0','0','1','0','11','5','0','1','3','7','0','1','1']])#SVM shows the same result

In [None]:
rt.predict([['34.33','1','0','0','0','0','1','0','11','5','0','1','3','7','0','1','1']])#Random Forest shows the same result

In [None]:
ada.score(xtest,ytest)

In [None]:
def dataframe(y):
    """Pair the seven algorithm display names with the accuracy values in `y`.

    Parameters
    ----------
    y : sequence
        One accuracy value per algorithm, in the order of the names below.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'Algorithm' and 'Accuracy'.
    """
    algorithm_names=[
        'LogisticRegression',
        'KNNClaccification',
        'AdaBoostClassifier',
        'DecisionTreeClassifier',
        'XGBClassifier',
        'SVM',
        'RandomForestClassifier',
    ]
    return pd.DataFrame({'Algorithm':algorithm_names,'Accuracy':y})


In [None]:
data=dataframe(test)

In [None]:
plt.figure(figsize=(16,8))

plt.title('Testing Accuracy Comparison')
sns.barplot(y="Algorithm",x='Accuracy',data=data,palette='Set2')


In [None]:
data2=dataframe(train)

In [None]:
plt.figure(figsize=(16,8))

plt.title('Trainging Accuracy Comparison')
sns.barplot(y="Algorithm",x='Accuracy',data=data2,palette='Set2')


In [None]:
train

In [None]:
sns.lineplot(data=data, x="Algorithm", y="Accuracy")
sns.lineplot(data=data2, x="Algorithm", y="Accuracy")
plt.ylabel("Accuracy")
plt.legend(labels=["Test_accuracy","Train_accuracy"], title = "Accuracy")
plt.show()

In [None]:
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix,roc_curve

In [None]:
# Predictions of every fitted classifier on the held-out split.
pred_log=log.predict(xtest)
pred_dclf=clf.predict(xtest)
pred_xgb=xgb.predict(xtest)
pred_ada=ada.predict(xtest)
pred_neigh=neigh.predict(xtest)
pred_svm=svm.predict(xtest)
pred_rfc=rt.predict(xtest)


In [None]:
# One classification report per model, comparing ytest with that model's predictions.
print("Report of Logistic Regression:\n\n",classification_report(ytest, pred_log))
print("Report of XGboost Classification:\n\n",classification_report(ytest, pred_xgb))
print("Report of Adaboost Classificaiton:\n\n",classification_report(ytest, pred_ada))
print("Report of Knearest Neighbour Classificaiton:\n\n",classification_report(ytest, pred_neigh))
print("Report of Support Vector Machine Classification:\n\n",classification_report(ytest, pred_svm))
print("Report of Random Forest Classification:\n\n",classification_report(ytest, pred_rfc))

print("Report of Decision Tree:\n\n",classification_report(ytest, pred_dclf))

# Confusion Matrix


In [None]:
cm_log=confusion_matrix(ytest,pred_log)
cm_dclf=confusion_matrix(ytest,pred_dclf)
cm_xgb=confusion_matrix(ytest,pred_xgb)
cm_ada=confusion_matrix(ytest,pred_ada)
cm_neigh=confusion_matrix(ytest,pred_neigh)
cm_svm=confusion_matrix(ytest,pred_svm)
cm_rfc=confusion_matrix(ytest,pred_rfc)

In [None]:
label_con=['True','False']

In [None]:
conf_mat=[cm_log,cm_dclf,cm_xgb,cm_ada,cm_neigh,cm_svm,cm_rfc]

In [None]:
len(conf_mat)


In [None]:
algo=['Logistic Regression','Decision Tree Classification','XGboost Classification','Adaboost Classificaiton','Knearest Neighbour Classificaiton','Support Vector Machine Classification','Random Forest Classification']

In [None]:
plt.figure(figsize=(12,28))
for i in range(1,8):

    plt.subplot(5,2,i)
    sns.heatmap(conf_mat[i-1],cmap='summer',annot=True,xticklabels=label_con,yticklabels=label_con)
    plt.xlabel('Predicted values')
    plt.ylabel('Actual values')
    plt.title(f'Confusion Matrix for {algo[i-1]}',color='black')

# ROC Curve


In [None]:
from sklearn.metrics import plot_roc_curve


In [None]:
list=[clf,xgb,ada,neigh,log,svm,rt]
type(list[0])

In [None]:
plt.figure(figsize=(12,28))
for i in range(1,8):
    plot_roc_curve(list[i-1], xtest, ytest)
    plt.plot([0,1],[0,1])

# Gradio deployment

In [None]:
df4.head()

In [None]:
rt.predict([['34.33','1','0','0','0','0','1','0','11','5','0','1','3','7','0','1','1']])#Random Forest shows the same result

In [None]:
ada.predict([['34.33','1','0','0','0','0','1','0','11','5','0','1','3','7','0','1','1']])#Actual value of row no 53055 is 1

In [None]:
clf.predict([['26.54','0','0','0','0','0','0','1','6','5','0','1','0','7','0','0','0']])#Actual value of row no 13501 is 0

In [None]:
!pip install gradio
import gradio as gr


In [None]:
def heart(BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer):
    """Predict the HeartDisease class for one respondent with the random forest ``rt``.

    The seventeen arguments are forwarded, in signature order, as a single
    feature row. Presumably they must already be encoded the same way as the
    training columns — confirm against the preprocessing cells.

    Returns whatever ``rt.predict`` returns for that one-row input.
    """
    row=[
        BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,
        DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,
        GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,
    ]
    return rt.predict([row])
    

In [None]:
heart(26.54,0,0,0,0,0,0,1,6,5,0,1,0,7,0,0,0)

In [None]:
interface = gr.Interface(
  fn = heart, #function = heart
  inputs = ['number','number','number','number','number','number','number','number','number',
            'number','number','number','number','number','number','number','number'],
  outputs = ['text']  


).launch(share=True)