In [127]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Read the CSV file and keep a copy of it, so the original data is not lost if further changes are made.

In [128]:
# Load the heart-disease CSV from the Kaggle input directory.
hrt = pd.read_csv('/kaggle/input/heart-disease-cleveland-uci/heart_cleveland_upload.csv')
# Keep an untouched copy: `hrt` itself is reassigned by the outlier-removal cells below.
hrt1 = hrt.copy()

Let's check that there are no null values, and look at the summary statistics and the first five rows.

In [129]:
# Column dtypes / non-null counts, transposed summary statistics, and the first rows.
# NOTE(review): DataFrame.info() prints its report and returns None, so the first
# element of the displayed tuple is None.
hrt.info(), hrt.describe().T, hrt.head()

**OBSERVATIONS**
* There are no null values in the dataset.

Let's understand the data first.

There are 13 attributes

* age: age in years
* sex: sex (1 = male; 0 = female)
* cp: chest pain type
    -- Value 0: typical angina
    -- Value 1: atypical angina
    -- Value 2: non-anginal pain
    -- Value 3: asymptomatic
* trestbps: resting blood pressure (in mm Hg on admission to the hospital)
* chol: serum cholesterol in mg/dl
* fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* restecg: resting electrocardiographic results
    -- Value 0: normal
    -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
* thalach: maximum heart rate achieved
* exang: exercise induced angina (1 = yes; 0 = no)
* oldpeak = ST depression induced by exercise relative to rest
* slope: the slope of the peak exercise ST segment
    -- Value 0: upsloping
    -- Value 1: flat
    -- Value 2: downsloping
* ca (coronary calcium scan): number of major vessels (0-3) colored by fluoroscopy
* thal (thalassemia): 0 = normal; 1 = fixed defect; 2 = reversible defect
* condition (the label): 0 = no disease, 1 = disease

# **OUTLIERS REMOVAL**

Now let's clean the data by removing its outliers.
* Only the columns with continuous values are considered for outlier removal.
* Outliers are values that are rare and lie outside the typical range (the box plot determines this range and shows the points lying outside it).

**Resting Blood Pressure**

In [130]:
# Distribution of resting blood pressure before outlier removal.
# The column is passed as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, not `x`.
sns.boxplot(x=hrt['trestbps'])

In [131]:
# Remove the rows whose resting blood pressure is above 170, then re-plot to
# confirm the remaining distribution.
hrt = hrt.drop(hrt[(hrt['trestbps']>170)].index)
# Keyword `x=` keeps the call valid on seaborn versions where the first
# positional argument is `data`.
sns.boxplot(x=hrt['trestbps'])

**Cholesterol**

In [132]:
# Distribution of serum cholesterol before outlier removal.
# The column is passed as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, not `x`.
sns.boxplot(x=hrt['chol'])

In [133]:
# Remove the rows whose cholesterol is above 350, then re-plot to confirm the
# remaining distribution.
hrt = hrt.drop(hrt[(hrt['chol']>350)].index)
# Keyword `x=` keeps the call valid on seaborn versions where the first
# positional argument is `data`.
sns.boxplot(x=hrt['chol'])

**Thalach (max heart rate achieved)**

In [134]:
# Distribution of maximum heart rate achieved before outlier removal.
# The column is passed as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, not `x`.
sns.boxplot(x=hrt['thalach'])

In [135]:
# Remove the rows whose maximum heart rate is below 80, then re-plot to confirm
# the remaining distribution.
hrt = hrt.drop(hrt[(hrt['thalach']<80)].index)
# Keyword `x=` keeps the call valid on seaborn versions where the first
# positional argument is `data`.
sns.boxplot(x=hrt['thalach'])

**Old Peak**

In [136]:
# Distribution of oldpeak (ST depression) before outlier removal.
# The column is passed as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, not `x`.
sns.boxplot(x=hrt['oldpeak'])

In [137]:
# Remove the rows whose oldpeak is above 4, then re-plot to confirm the
# remaining distribution.
hrt = hrt.drop(hrt[(hrt['oldpeak']>4)].index)
# Keyword `x=` keeps the call valid on seaborn versions where the first
# positional argument is `data`.
sns.boxplot(x=hrt['oldpeak'])

****

# **CORRELATION OF COLUMNS**

Let's first compute the correlations between all the columns to see which columns are strongly correlated with each other.

In [138]:
# Pairwise correlation matrix of the columns of the outlier-filtered frame.
hrt.corr()

In [139]:
# Annotated heatmap of the correlation matrix on a 10x12 inch canvas.
corr_matrix = hrt.corr()
fig, ax = plt.subplots(figsize=(10, 12))
sns.heatmap(corr_matrix, annot=True, robust=True, ax=ax)

**OBSERVATIONS**
* From above map, we can conclude that columns having correlation of more than 0.35 should be considered as a strong relation.
* The columns having more than specified corr are
1. Age and ca
2. Sex and thal
3. cp and exang, cp and condition
4. exang and condition
5. old peak and slope, old peak and condition
6. ca and condition
7. thal and condition



# **BIVARIATE ANALYSIS OF COLUMNS WITH STRONG CORRELATION**

**AGE AND CA**

In [140]:
# Mean age for each `ca` value (seaborn's barplot aggregates y per x category).
sns.barplot(data=hrt, x='ca', y='age')

**OBSERVATIONS**
* From the above graph we understand that as age increases, people are more likely to have ca values of 2 and 3, an attribute which has a strong correlation with the condition.
* Higher ca values are mostly seen in the age group above 50.

**SEX AND THALASSEMIA**

In [141]:
# Count of each thal value, split by sex.
# Columns are referenced by keyword: a positional x vector combined with `hue`
# is rejected by newer seaborn releases, where the first positional argument is `data`.
sns.countplot(data=hrt, x='thal', hue='sex')

**OBSERVATIONS**
* In the data we have, thalassemia type 0 (normal) can be seen in both males and females.
* Type 1 (fixed defect) is seen in more males than females.
* Type 2 (reversible defect) is seen mostly in males and very rarely in females.


* **CHEST PAIN AND EXERCISE INDUCED ANGINA**
* **CHEST PAIN AND CONDITION**

In [142]:
# Draw both chest-pain count plots side by side on ONE figure. Calling
# plt.figure() a second time would open a new figure, leaving each plot alone
# in a half-empty 1x2 grid.
fig, (ax_exang, ax_condition) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: chest pain type split by exercise-induced angina.
# Columns are passed by keyword so the calls stay valid on seaborn versions
# where the first positional argument is `data`.
sns.countplot(data=hrt, x='cp', hue='exang', ax=ax_exang)
ax_exang.set_title('Exercise Induced Angina Graph')

# Right panel: chest pain type split by condition (the label).
sns.countplot(data=hrt, x='cp', hue='condition', ax=ax_condition)
ax_condition.set_title('Chest pain Graph')

**OBSERVATIONS (Graph 1)**
* People having chest pain type 0, 1 or 2 show a very low chance of having exercise-induced angina.
* But in the case of chest pain type 3, many do show exercise-induced angina.

**(Graph 2)**
* Chest pain type 0 and 1 show low chances of having the disease 
* Chest pain type 2 show a hike in chances of having disease as compared to type 0 and 1 but the chances of not having the disease is much more than having it, in the type 2.
* The rate of having the disease in chest pain type 3 is the maximum of all.

**EXERCISE INDUCED ANGINA AND CONDITION**

In [143]:
# Count of exercise-induced angina (0/1), split by condition.
# Columns are referenced by keyword: a positional x vector combined with `hue`
# is rejected by newer seaborn releases, where the first positional argument is `data`.
sns.countplot(data=hrt, x='exang', hue='condition')

**OBSERVATIONS**
* We can conclude that people with exercise-induced angina and people without it show a similar rate of having the disease.
* But most of the people who have no disease also have no exercise-induced angina.

* **OLD PEAK AND SLOPE**
* **OLD PEAK AND CONDITION**

In [144]:
# jointplot is a figure-level function that creates its own figure, so no
# plt.figure(...) is opened first (it would only leave an empty extra figure).
# NOTE(review): x and y are both 'oldpeak', so the points lie on the diagonal and
# only the hue colouring and the marginal densities carry information — confirm
# this is the intended view.
sns.jointplot(data=hrt, x='oldpeak', y='oldpeak', hue='slope')
plt.show()

**OBSERVATION**
* The above graph depicts that slope marking of 2 can be seen in the people having the old peak more than 2.5
* Slope means ST segment shift relative to exercise-induced increments in heart rate.

In [145]:
# jointplot is a figure-level function that creates its own figure, so no
# plt.figure(...) is opened first (it would only leave an empty extra figure).
# NOTE(review): x and y are both 'oldpeak', so the points lie on the diagonal and
# only the hue colouring and the marginal densities carry information — confirm
# this is the intended view.
sns.jointplot(data=hrt, x='oldpeak', y='oldpeak', hue='condition')
plt.show()

**OBSERVATIONS**
* Same as the slope the condition stating that the person has heart disease is prominent after 2.
* The people having old peak more than 2 are more likely to have the disease.


**CA AND CONDITION**


In [146]:
# Count of each ca value, split by condition.
# Columns are referenced by keyword: a positional x vector combined with `hue`
# is rejected by newer seaborn releases, where the first positional argument is `data`.
sns.countplot(data=hrt, x='ca', hue='condition')

**OBSERVATION**
* People having CA - 0 have more chances of not having the disease
* But for ca level 0 and 1, the rate of having disease is same.
* The ca with 3 has the lowest number of people not having the disease, but the rate of people having disease is more than the one not having which can conclude that ca 3 can be dangerous.

**THALESSEMIA AND CONDITION**

In [147]:
# Count of each thal value, split by condition.
# Columns are referenced by keyword: a positional x vector combined with `hue`
# is rejected by newer seaborn releases, where the first positional argument is `data`.
sns.countplot(data=hrt, x='thal', hue='condition')

**OBSERVATION**
* The highest rate of heart disease can be seen in thalessemia 2.
* The number of people having thal - 1,can be considered as people with no heart disease.
* People having thal-0, are more of not having the disease than having it.

DATA VISUALISING THE WHOLE DATASET

In [148]:
# Pairwise scatter/density grid over all columns, coloured by the label.
sns.pairplot(data=hrt, hue='condition')

# PREPARING THE MODEL

First split the data in train and test

In [149]:
# Target is the 'condition' column; features are every other column.
y = hrt['condition']
x = hrt.drop(columns=['condition'])

x.shape, y.shape

In [150]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=4,
)

# **K-NN model**

In [151]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)

In [152]:
# Fit a k-NN classifier on the standardised training features.
# NOTE(review): n_neighbors=22 is hard-coded here, before the neighbour search
# further down the notebook — confirm how this value was chosen.
knn_classifier = KNeighborsClassifier(n_neighbors=22)
knn_classifier.fit(x_train_std, y_train)

In [153]:
# Predict labels for the standardised test features.
y_pred_knn = knn_classifier.predict(x_test_std)

Calculating confusion matrix

In [154]:
# Evaluate the k-NN predictions against the held-out labels.
knn_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
knn_acc = accuracy_score(y_true=y_test, y_pred=y_pred_knn)

print(knn_cm)
print(knn_acc)

In [155]:
# Compare accuracy on the training and test splits (a large gap suggests overfitting).
knn_train_accuracy = knn_classifier.score(x_train_std, y_train)
knn_test_accuracy = knn_classifier.score(x_test_std, y_test)
print("Accuracy on training set: {:.3f}".format(knn_train_accuracy))
print("Accuracy on testing set: {:.3f}".format(knn_test_accuracy))

In [156]:
# Store the k-NN test accuracy; it is reused in the model-comparison bar plots below.
knn_test = knn_classifier.score(x_test_std, y_test)

Let's find the number of neighbors to be considered.

In [157]:
# Evaluate k = 1..29 by 5-fold cross-validated accuracy on the training split.
# Scoring a k-NN model on the very rows it was fitted on always favours k=1
# (every point is its own nearest neighbour), so training accuracy cannot be
# used to choose k. The test split is left untouched for the final comparison.
scores = []
for k in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_accuracy = cross_val_score(knn, x_train_std, y_train, cv=5, scoring='accuracy')
    # Plain float keeps the printed list readable regardless of numpy version.
    scores.append(float(fold_accuracy.mean()))

print(scores, end = " ")

In [158]:
# Accuracy for each candidate number of neighbours (k = 1..29).
k_values = np.arange(1, 30)
plt.plot(k_values, scores)

# **DECISION TREE**

Train the model.

In [159]:
# Decision tree using the entropy split criterion; fitted on the unscaled
# training features. random_state fixes the tree for repeatable runs.
dt_class = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_class.fit(x_train, y_train)

Predict the results

In [160]:
# Predict labels for the (unscaled) test features and show the raw predictions.
y_pred_dt = dt_class.predict(x_test)
print(y_pred_dt)

# **Confusion Matrix**

In [161]:
# Evaluate the decision-tree predictions against the held-out labels.
dt_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_dt)
dt_acc = accuracy_score(y_true=y_test, y_pred=y_pred_dt)

print(dt_cm)
print(dt_acc)

In [162]:
# Train vs. test accuracy for the decision tree; the test accuracy is kept in
# `dt_test` for the model-comparison bar plots.
dt_train = dt_class.score(x_train, y_train)
dt_test = dt_class.score(x_test, y_test)
print("Accuracy on Training set: {:.3f}".format(dt_train))
print("Accuracy on Test set: {:.3f}".format(dt_test))

In [163]:
# Bar chart comparing the test accuracy of the two models trained so far.
algorithms = ['KNN','Decision Tree']
scores = [knn_test, dt_test]
plt.figure(figsize=(10,8))
# x/y must be keywords: newer seaborn releases accept at most one positional
# argument (`data`), so barplot(algorithms, scores) raises a TypeError there.
sns.barplot(x=algorithms, y=scores)

# **SVM**

In [164]:
# Support vector classifier with regularisation strength C=0.5 (other settings left at defaults).
# NOTE(review): this is fitted on the unscaled x_train, whereas the k-NN model
# above uses the standardised features — confirm the missing scaling is intended,
# and if it is changed, change the predict/score cells below to match.
svc_classifier = SVC(C=0.5)
svc_classifier.fit(x_train, y_train)

In [165]:
# Predict labels for the (unscaled) test features and display them.
y_pred_svc = svc_classifier.predict(x_test)
y_pred_svc

# **CONFUSION MATRIX**

In [166]:
# Evaluate the SVC predictions against the held-out labels.
svc_cm = confusion_matrix(y_test,y_pred_svc)
print(svc_cm)
svc_acc = accuracy_score(y_test,y_pred_svc)
# Print the SVC accuracy computed above (not the decision tree's `dt_acc`).
print(svc_acc)

In [167]:
# Train vs. test accuracy for the SVC; the test accuracy is kept in `svc_test`
# for the final model-comparison bar plot.
svc_train = svc_classifier.score(x_train, y_train)
svc_test = svc_classifier.score(x_test, y_test)
print("Accuracy on Training set: {:.3f}".format(svc_train))
print("Accuracy on Test set: {:.3f}".format(svc_test))

In [169]:
# Bar chart comparing the test accuracy of all three models.
algorithms = ['KNN','Decision Tree','Support Vector Machine']
scores = [knn_test, dt_test, svc_test]
plt.figure(figsize=(10,8))
# x/y must be keywords: newer seaborn releases accept at most one positional
# argument (`data`), so barplot(algorithms, scores) raises a TypeError there.
sns.barplot(x=algorithms, y=scores)

# **OBSERVATION**
* FROM ABOVE BAR PLOT WE CAN CONCLUDE THAT KNN BEST FITS THE DATA.