# Decision Tree

Objective: Find proper drug for a new patient  
Dataset: drug200.csv  
Ref: https://labs.cognitiveclass.ai/tools/jupyterlab/lab/tree/labs/coursera/ML0101EN/ML0101EN-Clas-Decision-Trees-drug-py-v1.ipynb

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv("drug200.csv", delimiter=",")
print(data.shape)
data.head()

(200, 6)


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


# Pre Processing

In [2]:
#split dataset from target value
X = data[['Age','Sex','BP','Cholesterol','Na_to_K']].values
X[0:2]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093]], dtype=object)

In [3]:
#categorical variable need to convert to numerical values (Age, Sex and Chol)
from sklearn import preprocessing

x_sex = preprocessing.LabelEncoder()
x_sex.fit(['F','M'])
X[:,1] = x_sex.transform(X[:,1])

x_bp = preprocessing.LabelEncoder()
x_bp.fit(['LOW','NORMAL','HIGH'])
X[:,2] = x_bp.transform(X[:,2])

x_chol = preprocessing.LabelEncoder()
x_chol.fit(['NORMAL','HIGH'])
X[:,3] = x_chol.transform(X[:,3])



#target variable array
y = data["Drug"]

print("Feature Variable: ")
print(X[0:5])
print("-----------------------------")
print("Target Variable: ")
print(y[0:5])

Feature Variable: 
[[23 0 0 0 25.355]
 [47 1 1 0 13.093]
 [47 1 1 0 10.113999999999999]
 [28 0 2 0 7.797999999999999]
 [61 0 1 0 18.043]]
-----------------------------
Target Variable: 
0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object


# Decision Tree Set Up

In [4]:
from sklearn.model_selection import train_test_split
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X,y,test_size=0.3,random_state=3)

print("X TRAIN Set: ", X_trainset.shape, y_trainset.shape)
print("X TEST Set : ", X_testset.shape, y_testset.shape)

X TRAIN Set:  (140, 5) (140,)
X TEST Set :  (60, 5) (60,)


# Tree Modelling

In [5]:
#specify criterion='entropy' see information gain of each node
drugTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)

#fit with data
drugTree.fit(X_trainset, y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# Prediction

In [6]:
#making prediction and store in predTree
predTree = drugTree.predict(X_testset)

#print and compare with actual set
print("Prediction:")
print(predTree[0:5])
print("----------------------------------------------")
print("TEST set :")
print(y_testset[0:5])

Prediction:
['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
----------------------------------------------
TEST set :
40     drugY
51     drugX
139    drugX
197    drugX
170    drugX
Name: Drug, dtype: object


# Evaluation

In [7]:
#calculate model accuracy
from sklearn import metrics
import matplotlib.pyplot as plt
print("Decision Tree's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

Decision Tree's Accuracy:  0.9833333333333333
