---

# 3. Machine Learning with Python 4 (Project 4, Decision Tree)

# Goal: 

## Finding the Proper Drug for a New Patient via Decision Tree Algorithm

# _00. Import Main Libraries_

# _01. Load the Data and Explore the Basics of It_

# _02. Preprocessing_

# _03. Train-Test Split_

# _04. Decision Tree Usage_

# _05. Evaluation of the Model_

---
---
---

# _00. Import Main Libraries_

In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt

# _01. Load the Data and Explore the Basics of It_

In [2]:
df = pd.read_csv("drug200.csv")

df.head()

Unnamed: 0,Age,Gender,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [3]:
df.shape

(200, 6)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Gender       200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [5]:
df[["Drug"]].value_counts()

Drug 
drugY    91
drugX    54
drugA    23
drugB    16
drugC    16
dtype: int64

# _02. Preprocessing_

In [6]:
# get the predictors

X = df.drop('Drug',axis=1)

X

X_num = df.drop('Drug',axis=1)

y = df[["Drug"]]

In [7]:
# get indicator vars for categorical features

from sklearn import preprocessing

iv_gender = preprocessing.LabelEncoder()
iv_gender.fit(['F','M'])
X_num.Gender = iv_gender.transform(X.Gender) 


iv_BP = preprocessing.LabelEncoder()
iv_BP.fit([ 'LOW', 'NORMAL', 'HIGH'])
X_num.BP = iv_BP.transform(X.BP)


iv_Chol = preprocessing.LabelEncoder()
iv_Chol.fit([ 'NORMAL', 'HIGH'])
X_num.Cholesterol = iv_Chol.transform(X.Cholesterol) 

X_num

Unnamed: 0,Age,Gender,BP,Cholesterol,Na_to_K
0,23,0,0,0,25.355
1,47,1,1,0,13.093
2,47,1,1,0,10.114
3,28,0,2,0,7.798
4,61,0,1,0,18.043
...,...,...,...,...,...
195,56,0,1,0,11.567
196,16,1,1,0,12.006
197,52,1,2,0,9.894
198,23,1,2,1,14.020


In [8]:
# to get categorical values back from dummies, inverse transform is used 

# iv_gender.inverse_transform(X_num.Gender)

# iv_BP.inverse_transform(X_num.BP)

# iv_Chol.inverse_transform(X_num.Cholesterol)

# _03. Train-Test Split_

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=0.3, random_state=1)

In [10]:
print("X_train, X_test, y_train, y_test shapes: ", X_train.shape," ", X_test.shape, " ", y_train.shape, " ", y_test.shape)

X_train, X_test, y_train, y_test shapes:  (140, 5)   (60, 5)   (140, 1)   (60, 1)


# _04. Decision Tree Usage_

In [11]:
# instance of decision tree

drugTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)

drugTree

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [12]:
# fit the tree with train data

drugTree.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [13]:
# predict one external data

externalData = {"Age":[23], "Gender": ["M"], "BP": ["HIGH"], "Cholestrol": ["HIGH"], "Na_to_K":[25.355]}

extDatDf = pd.DataFrame(externalData)

extDatDf.Gender = iv_gender.transform(extDatDf.Gender) 

extDatDf.BP = iv_BP.transform(extDatDf.BP) 

extDatDf.Cholestrol = iv_Chol.transform(extDatDf.Cholestrol) 

extDatDf

Unnamed: 0,Age,Gender,BP,Cholestrol,Na_to_K
0,23,1,0,0,25.355


In [14]:
pred0 = drugTree.predict(extDatDf)
     
pred0

array(['drugY'], dtype=object)

In [15]:
# predict the test data

predTree = drugTree.predict(X_test)

predTree = pd.DataFrame(predTree, columns = ["PredictedDrugs"])

# set the index as y_test index

predTree = predTree.set_index(y_test.index)

predTree.head()

Unnamed: 0,PredictedDrugs
58,drugX
40,drugY
34,drugX
102,drugC
184,drugY


In [22]:
yResult = y_test.copy(deep = True)

yResult.loc[:,"PredictedDrugs"]= predTree.PredictedDrugs

yResult.head()

Unnamed: 0,Drug,PredictedDrugs
58,drugX,drugX
40,drugY,drugY
34,drugX,drugX
102,drugC,drugC
184,drugY,drugY


# _05. Evaluation of the Model_

In [17]:
from sklearn import metrics

print("DrugTrees's Accuracy: ", metrics.accuracy_score(y_test.Drug, predTree))

DrugTrees's Accuracy:  0.9666666666666667


In [20]:
(y_test.Drug == predTree.PredictedDrugs).value_counts()

True     58
False     2
dtype: int64