In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Load the heart-disease records from 'heart.csv' (path is relative to the working directory).
df=pd.read_csv('heart.csv')

In [None]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

#### Column Details

age: person's age

gender: The person's gender (1 = male, 0 = female) 

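cp: The chest pain experienced (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic). Note: the analysis below filters on cp codes 0–3, so the loaded file appears to store these four categories with zero-based codes. 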

trestbps: The person's resting blood pressure (mm Hg on admission to the hospital) 

chol: The person's cholesterol measurement in mg/dl 

fbs: The person's fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false) 

restecg: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria) 

thalach: The person's maximum heart rate achieved 

exang: Exercise induced angina (1 = yes; 0 = no) 

oldpeak: ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot) 

slope: the slope of the peak exercise ST segment (Value 1: upsloping, Value 2: flat, Value 3: downsloping) 

ca: The number of major vessels (0-3) 

thal: A blood disorder called thalassemia (3 = normal; 6 = fixed defect; 7 = reversible defect) 

target: Heart disease (0 = no, 1 = yes)

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# How many rows fall into each target class.
df["target"].value_counts()

In [None]:
"""
Plotting the target values. The target value of "0" refers 
to no disease and a value of "1" refers to disease.
"""

# Class balance of the label: 0 = no heart disease, 1 = heart disease.
target_ax = sns.countplot(x="target", data=df)
target_ax.set(
    xlabel="Target (0=no disease, 1=disease)",
    ylabel="Count",
    title="Target class distribution",
)
# plt.show() renders the figure and keeps the Axes repr out of the cell
# output, matching the other plotting cells in this notebook.
plt.show()

In [None]:
"""
Plotting number of males vs number of females. Value "1"
represents a male and "0" represents female.
"""

# Number of patients per gender code.
gender_ax = sns.countplot(data=df, x="gender")
gender_ax.set_xlabel("Gender (0=female, 1=male)")
plt.show()

In [None]:
# Split the male patients (gender == 1) by diagnosis.
is_male = df["gender"] == 1

# Males with heart disease (target == 1).
male_disease = df[is_male & (df["target"] == 1)]

# Males without heart disease (target == 0).
male_no_disease = df[is_male & (df["target"] == 0)]

for subset, label in ((male_disease, "male_disease"), (male_no_disease, "male_no_disease")):
    print(len(subset), label)

In [None]:
# Bar chart comparing the number of males with and without heart disease.
a, b = len(male_disease), len(male_no_disease)
male_ax = sns.barplot(x=['male_disease ', 'male_no_disease'], y=[a, b])
male_ax.set_xlabel('Male and Target')
male_ax.set_ylabel('Count')
male_ax.set_title('Disease in Males')
plt.show()

In [None]:
# Split the female patients (gender == 0) by diagnosis.
is_female = df["gender"] == 0

# Females with heart disease (target == 1).
female_disease = df[is_female & (df["target"] == 1)]

# Females without heart disease (target == 0).
female_no_disease = df[is_female & (df["target"] == 0)]

for subset, label in ((female_disease, "female_disease"), (female_no_disease, "female_no_disease")):
    print(len(subset), label)

In [None]:
# Bar chart comparing the number of females with and without heart disease.
c, d = len(female_disease), len(female_no_disease)
female_ax = sns.barplot(x=['female_disease ', 'female_no_disease'], y=[c, d])
female_ax.set_xlabel('Female and Target')
female_ax.set_ylabel('Count')
female_ax.set_title('Disease in Females')
plt.show()

In [None]:
"""
Let's do chest pain analysis
"""
# Number of patients recorded under each chest-pain code.
df.cp.value_counts()

In [None]:
"""
Plotting chest pain according to severity
"""
# Bar chart of how often each chest-pain code occurs.
cp_ax = sns.countplot(data=df, x='cp')
cp_ax.set_xlabel(" Chest type")
cp_ax.set_ylabel("Count")
cp_ax.set_title("Chest type Vs count plot")
plt.show()

In [None]:
"""
Within each chest pain category, let's see how many had 
the disease and how many didn't have the disease.
"""
# For chest-pain codes 0 and 1, count patients without (target 0) and
# with (target 1) heart disease. The labels spell the codes out in words.
number_words = {0: "zero", 1: "one"}
for cp_code in (0, 1):
    for target_code in (0, 1):
        n_patients = len(df[(df.cp == cp_code) & (df.target == target_code)])
        print(n_patients, f"=cp_{number_words[cp_code]}_target_{number_words[target_code]}")

In [None]:
# Side-by-side disease counts for chest-pain codes 0 and 1.
# One loop draws both panels on explicit Axes instead of repeating the
# plotting code against pyplot's implicit "current axes".
fig, axes = plt.subplots(1, 2)
for ax, cp_code in zip(axes, (0, 1)):
    # Patients with this chest-pain code, split by diagnosis.
    target_0 = len(df[(df.cp == cp_code) & (df.target == 0)])
    target_1 = len(df[(df.cp == cp_code) & (df.target == 1)])
    sns.barplot(x=["target_0", "target_1"], y=[target_0, target_1], ax=ax)
    ax.set_ylabel("Count")
    ax.set_title(f"Chest_type_{cp_code} Vs count plot")

# Leave room so the right panel's y-label does not collide with the left panel.
fig.tight_layout()
# Render the figure and keep the trailing Text repr out of the cell output,
# as the other plotting cells in this notebook do.
plt.show()

In [None]:
# For chest-pain codes 2 and 3, count patients without (target 0) and
# with (target 1) heart disease. The labels spell the codes out in words.
number_words = {0: "zero", 1: "one", 2: "two", 3: "three"}
for cp_code in (2, 3):
    for target_code in (0, 1):
        n_patients = len(df[(df.cp == cp_code) & (df.target == target_code)])
        print(n_patients, f"=cp_{number_words[cp_code]}_target_{number_words[target_code]}")

In [None]:
# Side-by-side disease counts for chest-pain codes 2 and 3.
# One loop draws both panels on explicit Axes instead of repeating the
# plotting code against pyplot's implicit "current axes".
fig, axes = plt.subplots(1, 2)
for ax, cp_code in zip(axes, (2, 3)):
    # Patients with this chest-pain code, split by diagnosis.
    target_0 = len(df[(df.cp == cp_code) & (df.target == 0)])
    target_1 = len(df[(df.cp == cp_code) & (df.target == 1)])
    sns.barplot(x=["target_0", "target_1"], y=[target_0, target_1], ax=ax)
    ax.set_ylabel("Count")
    ax.set_title(f"Chest_type_{cp_code} Vs count plot")

# Leave room so the right panel's y-label does not collide with the left panel.
fig.tight_layout()
# Render the figure and keep the trailing Text repr out of the cell output,
# as the other plotting cells in this notebook do.
plt.show()

In [None]:
"""
Exploring relationship between age and cholesterol.
"""
# Mean cholesterol for every distinct age, computed in one grouped pass
# rather than re-filtering the whole frame once per age.
chol_by_age = df.groupby('age')['chol']

# Number of non-null cholesterol readings per age.
age_chol_values = chol_by_age.count().values

# groupby sorts its keys by default, so the index is the ascending list of
# ages and the means line up with it one-to-one.
mean_chol_by_age = chol_by_age.mean()
age_unique = mean_chol_by_age.index.tolist()
mean_chol = mean_chol_by_age.tolist()
    

In [None]:
# Point plot of mean cholesterol against age.
label_style = {'fontsize': 15, 'color': 'blue'}

plt.figure(figsize=(10, 5))
chol_ax = sns.pointplot(x=age_unique, y=mean_chol, color='red', alpha=0.8)
chol_ax.set_xlabel('age', **label_style)
chol_ax.set_ylabel('chol', **label_style)
chol_ax.set_title('age vs chol', **label_style)
# Tilt the age tick labels so neighbouring values stay readable.
plt.xticks(rotation=45)
chol_ax.grid()
plt.show()

In [None]:
"""
Since the "cp", "thal" and "slope" columns are categorical
variables, we will convert them into dummy variables.
"""

# One-hot encode the categorical code columns. drop_first=True omits the
# first level of each variable, which is implied by the remaining indicators.
# dtype=int keeps the indicators as 0/1 integers: newer pandas releases
# default to boolean dummies, which would mix bool columns into the otherwise
# numeric feature matrix that is scaled arithmetically later on.
cp = pd.get_dummies(df['cp'], prefix="cp", drop_first=True, dtype=int)
thal = pd.get_dummies(df['thal'], prefix="thal", drop_first=True, dtype=int)
slope = pd.get_dummies(df['slope'], prefix="slope", drop_first=True, dtype=int)


In [None]:
# Append the dummy columns to the original frame column-wise and preview the result.
frames_to_join = (df, cp, thal, slope)
data = pd.concat(frames_to_join, axis="columns")
data.head()

In [None]:
"""
Let's drop old "cp", "thal" and "slope" columns
"""
# Replace the raw "cp", "thal" and "slope" codes with their dummy columns.
# The frame is rebuilt from its inputs instead of being dropped in place, so
# this cell can be re-run without a KeyError for already-removed columns.
# The resulting columns and their order are the same as before.
encoded_columns = ["cp", "thal", "slope"]
data = pd.concat([df.drop(columns=encoded_columns), cp, thal, slope], axis=1)
data.head()

In [None]:
# Separate the label column from the feature matrix.
y = data['target']
x = data.drop(columns='target')

In [None]:
# Dimensions of the feature matrix: (rows, feature columns).
print(x.shape)

In [None]:
# Pairwise correlation between the feature columns.
x.corr()

In [None]:
"""
Normalizing x
"""

# Min-max scale every feature to the [0, 1] range.
# Cast to float first so the arithmetic is plain floating-point math even if
# some columns arrive as booleans (e.g. dummy indicators).
x = x.astype(float)
feature_min = x.min()
feature_range = x.max() - feature_min
# A constant column has zero range; dividing by 1 instead maps it to 0.0
# rather than producing NaN from 0/0.
x = (x - feature_min) / feature_range.replace(0, 1)
# NOTE(review): the minima/maxima are taken over all rows, before the
# train/test split further down, so the held-out rows influence the scaling.
# Fit the scaling on the training rows only if a strict evaluation is needed.
x.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression with C=1, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
logi = LogisticRegression(penalty='l2', C=1).fit(x_train, y_train)

# Score of the fitted model on the held-out split.
logi.score(x_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic-regression predictions on the test split.
logi_predictions = logi.predict(x_test)
cm_lg = confusion_matrix(y_test, logi_predictions)
print(cm_lg)

In [None]:
"""
Decision Tree
"""

from sklearn.tree import DecisionTreeClassifier

# Fix the seed so repeated runs grow the same tree and report the same
# metrics; the value matches the seed used for the train/test split.
dtree = DecisionTreeClassifier(random_state=2)
dtree.fit(x_train, y_train)

In [None]:
# Decision-tree predictions for the held-out rows; the bare name on the last line displays them.
predict=dtree.predict(x_test)
predict

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Text report of the decision-tree predictions against the true test labels.
tree_report = classification_report(y_test, predict)
print(tree_report)

# Confusion matrix for the same predictions.
cm_tree = confusion_matrix(y_test, predict)
print(cm_tree)

In [None]:
from sklearn.metrics import accuracy_score

# Decision-tree accuracy on the test split, scaled by 100 to read as a percentage.
tree_accuracy_pct = accuracy_score(y_test, predict) * 100
print("Accuracy is:", tree_accuracy_pct)

In [None]:
"""
Random Forest
"""
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest. The seed is fixed so repeated runs build the same
# forest and report the same metrics; the value matches the seed used for
# the train/test split.
rfc = RandomForestClassifier(n_estimators=100, random_state=2)
rfc.fit(x_train, y_train)

In [None]:
# Random-forest predictions for the held-out rows.
rfc_predict=rfc.predict(x_test)

In [None]:
# Import both metrics here: classification_report was previously only in
# scope because an earlier cell had imported it.
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the random-forest predictions on the test split.
cm_rf = confusion_matrix(y_test, rfc_predict)
print(cm_rf)

# Text report for the same predictions.
print(classification_report(y_test, rfc_predict))

In [None]:
# accuracy_score was previously only in scope because an earlier cell had
# imported it; import it here so the cell does not depend on that cell.
from sklearn.metrics import accuracy_score

# Random-forest accuracy on the test split, scaled by 100 to read as a percentage.
print("Accuracy is:", accuracy_score(y_test, rfc_predict) * 100)