In [6]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [7]:
dataset = pd.read_csv('diabetes-dataset.csv')
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [3]:
feature_names = dataset.columns
print(feature_names)

x = dataset.drop(["Outcome"],axis=1)
y=dataset["Outcome"]

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.3, random_state=6)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


## Depth of The Decision Tree : None means that all leaves will be pure or all leaves contains less than min_samples_splits sampels (default 2).

In [4]:
model_max_depth = DecisionTreeClassifier(max_depth=None)
model_max_depth.fit(X_train, Y_train)
prediction = model_max_depth.predict(X_test)
print("Accuracy (Max Depth = None) :", metrics.accuracy_score(prediction,Y_test))

Accuracy (Max Depth = None) : 0.955


## Max_Depth:The maximum depth of the tree. If N is specified, it will grow to that size and will stop. (1 <= N <= 20 )

In [8]:
#With max depth as 5 which is quite less so it will not reach to best analysis.
k=10
for i in range(20):
    model_max_depth_n = DecisionTreeClassifier(max_depth=i+1)
    model_max_depth_n.fit(X_train, Y_train)
    prediction = model_max_depth_n.predict(X_test)
    print("Accuracy (Max Depth = ",i+1,") :", metrics.accuracy_score(prediction,Y_test))

Accuracy (Max Depth =  1 ) : 0.72
Accuracy (Max Depth =  2 ) : 0.76
Accuracy (Max Depth =  3 ) : 0.7616666666666667
Accuracy (Max Depth =  4 ) : 0.77
Accuracy (Max Depth =  5 ) : 0.8466666666666667
Accuracy (Max Depth =  6 ) : 0.85
Accuracy (Max Depth =  7 ) : 0.845
Accuracy (Max Depth =  8 ) : 0.88
Accuracy (Max Depth =  9 ) : 0.9016666666666666
Accuracy (Max Depth =  10 ) : 0.93
Accuracy (Max Depth =  11 ) : 0.9383333333333334
Accuracy (Max Depth =  12 ) : 0.945
Accuracy (Max Depth =  13 ) : 0.9466666666666667
Accuracy (Max Depth =  14 ) : 0.9583333333333334
Accuracy (Max Depth =  15 ) : 0.9516666666666667
Accuracy (Max Depth =  16 ) : 0.955
Accuracy (Max Depth =  17 ) : 0.955
Accuracy (Max Depth =  18 ) : 0.9583333333333334
Accuracy (Max Depth =  19 ) : 0.955
Accuracy (Max Depth =  20 ) : 0.965


#### We can conclude by looking at the results of model_max_depth with max_depth as None and model_max_depth with max_depth from 1-20 that model_max_depth outforms model_max_depth with n from 1 to 29 but at max_depth = 20, out second model outperforms the first one with None as max_depth

### Using Decision Tree with criterion as gini  to measure the quality of our dataset split.

In [9]:
model_gini = DecisionTreeClassifier(criterion='gini')
model_gini.fit(X_train, Y_train)
prediction = model_gini.predict(X_test)

print("Accuracy (Criterion = gini):", metrics.accuracy_score(prediction,Y_test))

Accuracy (Criterion = gini): 0.955


### Using Decision Tree with criterion as entropy to measure the quality of our dataset split.

In [10]:
model_entropy = DecisionTreeClassifier(criterion='entropy')
model_entropy.fit(X_train, Y_train)
prediction = model_entropy.predict(X_test)
print("Accuracy (Criterion = entropy):", metrics.accuracy_score(prediction,Y_test))


Accuracy (Criterion = entropy): 0.9616666666666667


#### From model with criteria as entropy and model_gini with criteria as gini, we can conclude that entropy outperformed and improved the accuracy of our model.

### Using Decision Tree with splitter arguement as random for random feature selection at each node. 

In [11]:
model_random = DecisionTreeClassifier(splitter='random')
model_random.fit(X_train, Y_train)
prediction = model_random.predict(X_test)
print("Accuracy (Splitter =  random) :", metrics.accuracy_score(prediction,Y_test))

Accuracy (Splitter =  random) : 0.9466666666666667


### Using Decision Tree with splitter arguement as best for best features selection at each node.

In [12]:
model_best = DecisionTreeClassifier(splitter='best')
model_best.fit(X_train, Y_train)
prediction = model_best.predict(X_test)
print("Accuracy (Splitter = Best) :", metrics.accuracy_score(prediction,Y_test))

Accuracy (Splitter = Best) : 0.9583333333333334


#### By looking at the accuracy of model_random with splitter as random and model_best with splitter as best, we can conclude that Decision Tree with feature split at each node done randomly outperforms the "best" one. 

### Using The Decision Tree with best values of Splitter, Criterion and Max Depth

In [15]:
model = DecisionTreeClassifier(criterion = "gini",splitter='random',max_depth=20)
model.fit(X_train, Y_train)
prediction = model.predict(X_test)
print("Accuracy (Splitter = Best) :", metrics.accuracy_score(prediction,Y_test))

Accuracy (Splitter = Best) : 0.9683333333333334


#### The model combined with best criterion, splitter and max_depth argument did not outperform the previous models.