# Decision Tree

Objective: Predicting patient attendance (Show/No Show) from appointment made  
Dataset: appointmentShowNo.csv  
Ref: NA

In [30]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv("appointmentShowNo.csv", delimiter=",")
print(data.shape)
data.head(10)

(110527, 14)


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No
5,95985130000000.0,5626772,F,2016-04-27T08:36:51Z,2016-04-29T00:00:00Z,76,REPÚBLICA,0,1,0,0,0,0,No
6,733688200000000.0,5630279,F,2016-04-27T15:05:12Z,2016-04-29T00:00:00Z,23,GOIABEIRAS,0,0,0,0,0,0,Yes
7,3449833000000.0,5630575,F,2016-04-27T15:39:58Z,2016-04-29T00:00:00Z,39,GOIABEIRAS,0,0,0,0,0,0,Yes
8,56394730000000.0,5638447,F,2016-04-29T08:02:16Z,2016-04-29T00:00:00Z,21,ANDORINHAS,0,0,0,0,0,0,No
9,78124560000000.0,5629123,F,2016-04-27T12:48:25Z,2016-04-29T00:00:00Z,19,CONQUISTA,0,0,0,0,0,0,No


## Data Cleaning

In [2]:
#rename column
data.rename(columns = {'Hipertension': 'Hypertension',
                         'Handcap': 'Handicap',
                         'No-show':'Attend'}, inplace = True)
print(data.columns)

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hypertension',
       'Diabetes', 'Alcoholism', 'Handicap', 'SMS_received', 'Attend'],
      dtype='object')


In [3]:
#convert the AppointmentRegistration and Appointment columns into datetime64 
data.ScheduledDay = data.ScheduledDay.apply(np.datetime64)
data.AppointmentDay = data.AppointmentDay.apply(np.datetime64)

#date only column
data.ScheduledDay = data.ScheduledDay.dt.normalize()

print(data.ScheduledDay.head(1))
print(data.AppointmentDay.head(1))

0   2016-04-29
Name: ScheduledDay, dtype: datetime64[ns]
0   2016-04-29
Name: AppointmentDay, dtype: datetime64[ns]


In [27]:
#calculate date different and append into dataframe
import datetime
from dateutil.relativedelta import relativedelta
from datetime import date

data['WaitDays'] = data['AppointmentDay'] - data['ScheduledDay']
data['WaitDays'] = round(data['WaitDays'] / np.timedelta64(1,'D'),0)
data['WaitDays'] = data['WaitDays'].astype('int64')
data.tail()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,Attend,WaitDays
110522,2572134000000.0,5651768,0,2016-05-03,2016-06-07,56,MARIA ORTIZ,0,0,0,0,0,1,0,35
110523,3596266000000.0,5650093,0,2016-05-03,2016-06-07,51,MARIA ORTIZ,0,0,0,0,0,1,0,35
110524,15576630000000.0,5630692,0,2016-04-27,2016-06-07,21,MARIA ORTIZ,0,0,0,0,0,1,0,41
110525,92134930000000.0,5630323,0,2016-04-27,2016-06-07,38,MARIA ORTIZ,0,0,0,0,0,1,0,41
110526,377511500000000.0,5629448,0,2016-04-27,2016-06-07,54,MARIA ORTIZ,0,0,0,0,0,1,0,41


In [5]:
data.Gender[data.Gender == 'M'] = 1
data.Gender[data.Gender == 'F'] = 0
data['Gender'] = data['Gender'].astype('int64')

data.Attend[data.Attend == 'No'] = 0
data.Attend[data.Attend == 'Yes'] = 1
data['Attend'] = data['Attend'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
data.tail()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,Attend,WaitDays
110522,2572134000000.0,5651768,0,2016-05-03,2016-06-07,56,MARIA ORTIZ,0,0,0,0,0,1,0,35
110523,3596266000000.0,5650093,0,2016-05-03,2016-06-07,51,MARIA ORTIZ,0,0,0,0,0,1,0,35
110524,15576630000000.0,5630692,0,2016-04-27,2016-06-07,21,MARIA ORTIZ,0,0,0,0,0,1,0,41
110525,92134930000000.0,5630323,0,2016-04-27,2016-06-07,38,MARIA ORTIZ,0,0,0,0,0,1,0,41
110526,377511500000000.0,5629448,0,2016-04-27,2016-06-07,54,MARIA ORTIZ,0,0,0,0,0,1,0,41


## Decision Tree Model

In [15]:
#split dataset from target value
X = data[['Gender','Age','Scholarship','Hypertension','Diabetes',
          'Alcoholism','Handicap','SMS_received','WaitDays']].values
print(X.shape)
X[110300:110302]

(110527, 9)


array([[ 1, 22,  0,  0,  0,  0,  0,  1,  7],
       [ 0,  2,  0,  0,  0,  0,  0,  1,  5]], dtype=int64)

In [16]:
#target variable array
y = data["Attend"]

print("Feature Variable: ")
print(X[110300:110302])
print("-----------------------------")
print("Target Variable: ")
print(y[110300:110302])

Feature Variable: 
[[ 1 22  0  0  0  0  0  1  7]
 [ 0  2  0  0  0  0  0  1  5]]
-----------------------------
Target Variable: 
110300    1
110301    0
Name: Attend, dtype: int64


In [17]:
from sklearn.model_selection import train_test_split
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X,y,test_size=0.3,random_state=3)

print("X TRAIN Set: ", X_trainset.shape, y_trainset.shape)
print("X TEST Set : ", X_testset.shape, y_testset.shape)

X TRAIN Set:  (77368, 9) (77368,)
X TEST Set :  (33159, 9) (33159,)


## Tree Modelling

In [18]:
#specify criterion='entropy' see information gain of each node
DTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)

#fit with data
DTree.fit(X_trainset, y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Accuracy Score

In [20]:
#calculate model accuracy
from sklearn import metrics
import matplotlib.pyplot as plt
predTree = DTree.predict(X_testset)
print("Decision Tree's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

Decision Tree's Accuracy:  0.799481287132


In [31]:
#print and compare with actual set
print("TEST set :")
print(y_testset[0:5])
print("----------------------------------------------")
print("Prediction:")
print(predTree[0:5])

TEST set :
54332    0
86406    0
11303    0
69839    0
20240    0
Name: Attend, dtype: int64
----------------------------------------------
Prediction:
[0 0 0 0 0]
