In [13]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [14]:
df = pd.read_csv("heart.csv")

In [15]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [16]:
df.shape

(303, 14)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [18]:
df.target.value_counts()

1    165
0    138
Name: target, dtype: int64

In [19]:
df.target.value_counts(normalize=True)

1    0.544554
0    0.455446
Name: target, dtype: float64

In [20]:
X = df.drop("target", axis=1)

y = df.pop("target")

X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, train_labels, test_labels = train_test_split(X, y, test_size=.30, random_state=0)

In [22]:
print('X_train',X_train.shape)
print('X_test',X_test.shape)
print('train_labels',train_labels.shape)
print('test_labels',test_labels.shape)

X_train (212, 13)
X_test (91, 13)
train_labels (212,)
test_labels (91,)


In [23]:
# Initialise a Decision Tree Classifier

dt_model = DecisionTreeClassifier(criterion = 'gini' )

In [24]:
# Fit the model

dt_model.fit(X_train,train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [31]:
from sklearn import tree

train_char_label = ['0','1']
target_Tree_File = open('target_Tree_File.dot','w')
dot_data = tree.export_graphviz(dt_model, 
                                out_file=target_Tree_File, 
                                feature_names = list(X_train), 
                                class_names = list(train_char_label))

target_Tree_File.close()

In [32]:
print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values('Imp',ascending=False))


               Imp
cp        0.258870
oldpeak   0.131152
ca        0.124819
age       0.102905
thal      0.074463
slope     0.064039
thalach   0.063324
sex       0.054394
fbs       0.028669
trestbps  0.027304
chol      0.026697
restecg   0.022588
exang     0.020774


In [33]:
y_predict = dt_model.predict(X_test)

In [41]:
reg_dt_model = DecisionTreeClassifier(criterion = 'gini', max_depth = 7,random_state=0)
reg_dt_model.fit(X_train, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [42]:
target_tree_regularized = open('target_tree_regularized.dot','w')
dot_data = tree.export_graphviz(reg_dt_model, out_file= target_tree_regularized , feature_names = list(X_train), class_names = list(train_char_label))

target_tree_regularized.close()
dot_data

In [43]:
import matplotlib.pyplot as plt

In [51]:
# predict probabilities
probs = reg_dt_model.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(train_labels, probs)
print('AUC: %.4f' % auc)

AUC: 0.9993


In [55]:
# predict probabilities
probs = reg_dt_model.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(test_labels, probs)
print('AUC: %.4f' % auc)

AUC: 0.7580


In [58]:
from sklearn.metrics import classification_report,confusion_matrix
ytrain_predict = reg_dt_model.predict(X_train)
ytest_predict = reg_dt_model.predict(X_test)

In [59]:
print(classification_report(train_labels, ytrain_predict))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        94
           1       0.98      1.00      0.99       118

    accuracy                           0.99       212
   macro avg       0.99      0.98      0.99       212
weighted avg       0.99      0.99      0.99       212



In [60]:
confusion_matrix(train_labels, ytrain_predict)

array([[ 91,   3],
       [  0, 118]], dtype=int64)

In [61]:
confusion_matrix(test_labels, ytest_predict)

array([[33, 11],
       [11, 36]], dtype=int64)