In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
df = pd.read_csv('heart_attack_dataset.csv')

In [4]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
#Assiging the features and the target variable
features = df[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']]

y_hat = df[['target']]

In [6]:
#Splitting the dataset into training and testing subsets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features, y_hat, test_size=0.25, random_state=42)

In [7]:
#Base Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(random_state=42)

classifier.fit(x_train, y_train)

predictions = classifier.predict(x_test)

score = classifier.score(x_test, y_test)
print(score)

0.8026315789473685


In [8]:
#Calculating Mean Absolute Error
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, predictions)

print(mae)

0.19736842105263158


In [9]:
#Finding the optimal max depth for decision tree classifier
max_depth_range = list(range(1,20))

accuracy = []

for depth in max_depth_range:
    clf = DecisionTreeClassifier(max_depth = depth, random_state=42)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    accuracy.append((depth, score))

print(accuracy)

[(1, 0.6842105263157895), (2, 0.8289473684210527), (3, 0.7763157894736842), (4, 0.8157894736842105), (5, 0.8026315789473685), (6, 0.7894736842105263), (7, 0.8026315789473685), (8, 0.7631578947368421), (9, 0.8026315789473685), (10, 0.8026315789473685), (11, 0.8026315789473685), (12, 0.8026315789473685), (13, 0.8026315789473685), (14, 0.8026315789473685), (15, 0.8026315789473685), (16, 0.8026315789473685), (17, 0.8026315789473685), (18, 0.8026315789473685), (19, 0.8026315789473685)]


In [10]:
#Finding the optimal max leaf nodes for decision tree classifier
max_leaf_range = list(range(2,10))

accuracy_1 = []

for leaf in max_leaf_range:
    clf = DecisionTreeClassifier(max_depth = 2, max_leaf_nodes = leaf, random_state=42)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    accuracy_1.append((leaf, score))

print(accuracy_1)

[(2, 0.6842105263157895), (3, 0.7105263157894737), (4, 0.8289473684210527), (5, 0.8289473684210527), (6, 0.8289473684210527), (7, 0.8289473684210527), (8, 0.8289473684210527), (9, 0.8289473684210527)]


In [11]:
#Using sklearn feature importances to find out the importance of all features
importances = pd.DataFrame({'Feature':x_train.columns, 'Importance': np.round(clf.feature_importances_, 3)})
importances = importances.sort_values('Importance',ascending=False)
importances.head()

Unnamed: 0,Feature,Importance
11,ca,0.55
12,thal,0.265
2,cp,0.185
0,age,0.0
1,sex,0.0


In [12]:
#Hyperparameter tuned decision tree classifier
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(max_depth = 2, max_leaf_nodes = 4, random_state=42)

classifier.fit(x_train, y_train)

predictions = classifier.predict(x_test)

score = classifier.score(x_test, y_test)
print(score)

0.8289473684210527
