In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Read the raw data file. header=None makes pandas treat the first line as
# data rather than as column names.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/processed-cleveland-heart-disease/processed.cleveland.data',
    header=None,
)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Summary statistics for the numeric columns. The call parentheses are
# required: a bare `df.describe` only displays the bound method object.
df.describe()

In [None]:
# The file was read without a header row, so name the 14 positional columns.
# 'hd' is the column later used as the prediction target.
df.columns = [
    'age',
    'sex',
    'cp',
    'restbp',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'hd',
]
df.head()

**Missing Data**

In [None]:
# Column dtypes: any column parsed as `object` contains non-numeric text.
print(df.dtypes)
# Count missing (NaN) values per column. The check must run on the frame
# itself; `df.columns.isnull()` would only test the column labels.
# Placeholder strings such as '?' are not NaN and are not counted here.
print(df.isnull().sum())

In [None]:
# List the distinct values of every text (object) column to spot
# non-numeric placeholders.
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    print(col, df[col].unique())

In [None]:
 df['thal'].unique()   

In [None]:
# Show every row where 'ca' or 'thal' holds the '?' placeholder.
has_placeholder = df[['ca', 'thal']].eq('?').any(axis=1)
df[has_placeholder]

Since only about 2% of the rows contain missing values, we drop those rows.

In [None]:
# Turn the '?' placeholder into a real missing value (NaN) so that the
# pandas null-handling methods can see it.
df = df.replace(to_replace='?', value=np.nan)
# Rows where 'ca' is now missing.
df[df['ca'].isna()]

In [None]:
# Remove (in place) every row that is missing 'ca' or 'thal'.
df.dropna(axis=0, how='any', subset=['ca', 'thal'], inplace=True)

In [None]:
# Sanity check after dropping: remaining row count, then the values still
# present in the two columns that had placeholders.
print(df.shape[0])
for col in ('ca', 'thal'):
    print(df[col].unique())

In [None]:
# Convert 'ca' and 'thal' to float. `astype` returns a new object, so the
# result has to be assigned; calling it without assignment leaves the columns
# unchanged. Re-binding `df` keeps this cell safe to re-run.
df = df.astype({'ca': float, 'thal': float})
# Confirm the conversion took effect.
print(df['ca'].dtype)
print(df['thal'].dtype)

In [None]:
# Histogram of 'age' with a kernel density estimate overlaid.
sns.histplot(data=df, x='age', kde=True)
plt.title('Distribution of age groups')
plt.show()

In [None]:
# For each value of 'sex', count how often each age occurs.
(
    df
    .groupby('sex')['age']
    .value_counts()
    .to_frame()
)

In [None]:
# Separate the predictors (every column except 'hd') from the target ('hd').
# Copies are taken so later edits do not touch `df`.
x = df.drop(columns='hd').copy()
y = df['hd'].copy()
print(x.head())
print(y.head())

In [None]:
# Draw one histogram per float-typed column, each in its own figure.
float_columns = df.select_dtypes(include='float').columns
for col in float_columns:
    plt.figure(figsize=(8, 6))
    df[col].hist()
    plt.title('Histogram of ' + col)
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Count of each 'cp' value, split by 'sex'.
# The unused `fig = (1, 2, 1)` tuple was removed; it had no effect on the plot.
plt.figure(figsize=(10, 4))
sns.countplot(data=df, x='cp', hue='sex')
plt.title('Count of cp by sex')
plt.show()

In [None]:
# One-hot encode the listed columns. The predictors are rebuilt from `df`
# (all columns except 'hd') instead of from `x` itself, so the cell can be
# re-run: encoding an already-encoded `x` would raise a KeyError because the
# original columns no longer exist.
x = pd.get_dummies(
    df.drop('hd', axis=1),
    columns=['cp', 'restecg', 'slope', 'thal'],
)
x.head()

In [None]:
# Build the target as a one-column DataFrame straight from `df`.
# `y = y.to_frame()` fails on a second run because `y` is then already a
# DataFrame (which has no `to_frame`); selecting with a list is re-runnable.
y = df[['hd']].copy()
y.head()

In [None]:
# Collapse every 'hd' value greater than 1 to 1. The column is named
# explicitly in `.loc`; a row-only mask would overwrite every column of the
# matching rows and is only correct while `y` has a single column.
y.loc[y['hd'] > 1, 'hd'] = 1
y['hd'].unique()

Building the Tree

In [None]:
# Hold out a test set (default split size) with a fixed seed, then fit an
# unpruned decision tree on the training portion.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=42)
tree = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)

In [None]:
# Draw the fitted (unpruned) tree with filled nodes.
plt.figure(figsize=(10, 10))
plot_tree(decision_tree=tree, filled=True)
plt.show()

In [None]:
# Predict the target for the held-out rows with the unpruned tree.
Y_pred = tree.predict(X=X_test)

In [None]:
# Wrap the predictions in a DataFrame.
Y_pred = pd.DataFrame(data=Y_pred)

In [None]:
# Confusion matrix of true vs. predicted labels for the unpruned tree.
confusion_matrix(y_true=Y_test, y_pred=Y_pred)

Tune the Tree

Find Alpha

In [None]:
# Test-set accuracy of the unpruned tree (baseline for the pruning step).
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print('Accuracy', accuracy)

In [None]:
# Compute the cost-complexity pruning path on the training data and keep the
# candidate alpha values it returns.
path = tree.cost_complexity_pruning_path(X=X_train, y=Y_train)
tree_alpha = path['ccp_alphas']
print(tree_alpha)

In [None]:
# Fit one tree per candidate alpha from the pruning path.
# random_state is 42 here to match the tree the alphas were computed from and
# the final pruned tree; with a different seed the candidates can break split
# ties differently and would not be the trees those alphas describe.
ccp_trees = []
for alpha in tree_alpha:
    ccp_tree = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha)
    ccp_tree.fit(X_train, Y_train)
    ccp_trees.append(ccp_tree)
print(ccp_trees)

In [None]:
# Accuracy of every candidate tree on the training and on the test split,
# in the same order as `tree_alpha`.
train_scores = []
test_scores = []
for clf in ccp_trees:
    train_scores.append(clf.score(X_train, Y_train))
    test_scores.append(clf.score(X_test, Y_test))
print(train_scores)
print(test_scores)

In [None]:
# Train and test accuracy as a function of alpha, zoomed to alpha in [0, 0.05].
fig, ax = plt.subplots()
ax.set(xlabel='alpha', ylabel='Score', xlim=(0, 0.05))
for scores, series_label in ((train_scores, "train"), (test_scores, 'test')):
    ax.plot(tree_alpha, scores, marker='o', label=series_label)
ax.legend()
plt.show()

For this particular train/test split, the alpha that maximizes the test score is about 0.016. Does that still hold on other splits of the data?

Cross Validation

In [None]:
# 5-fold cross-validation on the training data for every candidate alpha.
# Each row collected is [alpha, mean accuracy, standard deviation].
alpha_loop_values = []

for alpha in tree_alpha:
    # Same seed (42) as the tree that produced the alphas and as the final
    # pruned tree, so the cross-validated models match the one that is kept.
    dt = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha)
    scores = cross_val_score(dt, X_train, Y_train, cv=5)
    alpha_loop_values.append([alpha, np.mean(scores), np.std(scores)])

alpha_df = pd.DataFrame(alpha_loop_values, columns=['alpha', 'mean_accuracy', 'std'])

# Mean cross-validated accuracy vs. alpha, zoomed to alpha in [0.01, 0.02].
fig, ax = plt.subplots()

ax.set_xlabel('alpha')
ax.set_ylabel('mean_accuracy')
ax.set_xlim(0.01, 0.02)
ax.plot(alpha_df['alpha'], alpha_df['mean_accuracy'], marker='o')
# Render the figure explicitly instead of echoing the Line2D repr.
plt.show()

In [None]:
# Cross-validation results for the alphas strictly between 0.012 and 0.018.
alpha_df.query('0.012 < alpha < 0.018')

Overall, 0.014225 is the best alpha according to the cross-validation results above.

In [None]:
# Alpha used for the final pruned tree. The value is hard-coded, presumably
# copied from the cross-validation table; re-check it if the data, the split
# or the seeds change.
best_alpha = 0.014225
best_alpha

Build Tree with best_alpha

In [None]:
# Fit the pruned tree with the chosen alpha, then predict on the test split
# and wrap the predictions in a DataFrame for display.
tree_pruned = DecisionTreeClassifier(ccp_alpha=best_alpha, random_state=42).fit(X_train, Y_train)
Y_pred_pruned = pd.DataFrame(tree_pruned.predict(X_test))
Y_pred_pruned

In [None]:
# Confusion matrix of true vs. predicted labels for the pruned tree.
confusion_matrix(y_true=Y_test, y_pred=Y_pred_pruned)

In [None]:
# Test-set accuracy of the pruned tree.
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred_pruned)
print('Accuracy', accuracy)

The pruned tree gives better accuracy on the test set, so we will use it and re-plot the tree.

In [None]:
# Draw the pruned tree with the encoded feature names on the split nodes.
# The names are passed as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
plt.figure(figsize=(10, 10))
plot_tree(tree_pruned, filled=True, feature_names=list(x.columns))
# Render the figure explicitly instead of echoing the list of annotations.
plt.show()