In [None]:
import pandas as pd 
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('heart_v2.csv')

In [None]:
df.head()

##### Building simple tree classifier without tuning hyperparameters

In [None]:
# Hold out 30% of the rows for testing. Both frames still contain the
# 'heart disease' target column at this point; it is separated out below.
X_train, X_test = train_test_split(
    df,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [None]:
X_test.shape

In [None]:
# Create X_train and y_train. Scaling is left disabled (kept below for reference).
# scaler = MinMaxScaler()
# num_col = ['age','cholestrol','BP']
# X_train[num_col] = scaler.fit_transform(X_train[num_col])
# pop() removes the target column from X_train in place and returns it, so
# re-running this cell without re-running the split cell raises KeyError.
y_train = X_train.pop('heart disease')


In [None]:
X_train.head()


In [None]:
y_train.head()

In [None]:
# Create X_test and y_test (scaling is left disabled, as for the training split).
# X_test[num_col] = scaler.transform(X_test[num_col])
# pop() removes the target column from X_test in place and returns it, so
# re-running this cell without re-running the split cell raises KeyError.
y_test = X_test.pop('heart disease')


In [None]:
X_test.head()

In [None]:
y_test.head()

In [None]:
# Baseline: an unconstrained tree (no depth or leaf-size limits).
# random_state is pinned so that ties between equally good splits are broken
# the same way on every run, matching the seeded classifiers used later in
# this notebook.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train,y_train)
y_train_pred = classifier.predict(X_train)
y_train_pred

In [None]:
# Accuracy and confusion matrix on the training split. These are measured on
# the same rows the tree was fitted on, so treat them as an optimistic bound.
print(accuracy_score(y_train,y_train_pred))
confusion_matrix(y_train,y_train_pred)

In [None]:
# Accuracy and confusion matrix on the held-out test split; compare against
# the training numbers above to gauge overfitting.
y_test_pred = classifier.predict(X_test)
print(accuracy_score(y_test,y_test_pred))
confusion_matrix(y_test,y_test_pred)

In [None]:
# Function to evaluate the model
def evalute_model(dt_classifier):
    """Print accuracy and confusion matrix for the train and test splits.

    Uses the notebook-level ``X_train``/``y_train`` and ``X_test``/``y_test``
    variables rather than taking the data as arguments.

    Parameters
    ----------
    dt_classifier
        A fitted estimator exposing ``predict``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Predict on both splits up front, so nothing is printed if either call fails.
    train_predictions = dt_classifier.predict(X_train)
    test_predictions = dt_classifier.predict(X_test)

    sections = (
        ('Train Set Performance', y_train, train_predictions),
        ('Test Set Performance', y_test, test_predictions),
    )
    for position, (heading, actual, predicted) in enumerate(sections):
        if position:
            # Separator between the train and test reports.
            print('-'*50)
        print(heading)
        print(accuracy_score(actual, predicted))
        print(confusion_matrix(actual, predicted))
     
def plotTree(classifier, figsize=(10, 10)):
    """Draw a fitted decision tree with matplotlib and show the figure.

    Parameters
    ----------
    classifier
        A fitted decision tree estimator accepted by ``sklearn.tree.plot_tree``.
    figsize : tuple of (width, height) in inches, default (10, 10)
        Size of the figure the tree is drawn on. Deep trees are unreadable at
        the default size, so callers can pass a larger canvas.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    plt.figure(figsize=figsize)
    sklearn.tree.plot_tree(classifier)
    plt.show()

In [None]:
evalute_model(classifier)
plotTree(classifier)

##### Controlling Depth of tree

In [323]:
# Cap the tree at depth 3 to limit overfitting.
# The model must be fitted before it can be evaluated: the original cell called
# predict() on the freshly constructed estimator, which raises NotFittedError.
# random_state is pinned for reproducible splits, matching the seeded
# classifiers used elsewhere in this notebook.
classifier = DecisionTreeClassifier(max_depth=3,random_state=42)
classifier.fit(X_train,y_train)
evalute_model(classifier)
plotTree(classifier)

##### Specifying the minimum samples required to split a node

In [None]:
# Only split a node when it holds at least 20 samples.
# The model must be fitted before it can be evaluated: the original cell called
# predict() on the freshly constructed estimator, which raises NotFittedError.
classifier = DecisionTreeClassifier(min_samples_split=20,random_state=42)
classifier.fit(X_train,y_train)
evalute_model(classifier)
plotTree(classifier)

##### Specifying the minimum samples per leaf

In [None]:
# Require every leaf to hold at least 20 samples.
# The model must be fitted before it can be evaluated: the original cell called
# predict() on the freshly constructed estimator, which raises NotFittedError.
# random_state is pinned for reproducible splits, matching the seeded
# classifiers used elsewhere in this notebook.
classifier = DecisionTreeClassifier(min_samples_leaf=20,random_state=42)
classifier.fit(X_train,y_train)
evalute_model(classifier)
plotTree(classifier)

#### Hyperparameter tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

classifier = DecisionTreeClassifier(random_state=42)


In [None]:
# This grid yields 10 * 11 * 2 = 220 parameter combinations
params = {
    'max_depth' : [2,3,5,6,7,8,10,15,20,30],
    'min_samples_leaf':[5,7,8,9,10,12,14,20,50,60,100],
    'criterion':['gini','entropy']
}

In [None]:
# cv: the cross-validation scheme. With cv=4 the training data is divided into 4 folds; each fold is used once for validation while the model is trained on the other 3.
gridsearch = GridSearchCV(estimator=classifier,
             param_grid=params,
             cv=4, n_jobs=-1,
             verbose=True,
             scoring='accuracy')



In [None]:
%%time
gridsearch.fit(X_train,y_train)

In [None]:
# Checking the result 
res = pd.DataFrame(gridsearch.cv_results_)
res

In [None]:
res.shape

In [None]:
# 5 best result 
res.nlargest(5,'mean_test_score')

In [None]:
gridsearch.best_score_

In [None]:
gridsearch.best_estimator_

In [None]:
# Fetch the best model found by the grid search, then evaluate and plot it
dt_best = gridsearch.best_estimator_
evalute_model(dt_best)
plotTree(dt_best)