### Decision Tree

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn import tree

In [2]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
# y_train = pd.read_csv('y_train.csv')
# y_test = pd.read_csv('y_test.csv')
y_train = pd.read_csv('y_train_1d.csv')
y_test = pd.read_csv('y_test_1d.csv')

In [3]:
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [4]:
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test, average='micro') * 100
precision = precision_score(y_pred, y_test, average='micro') * 100
print("Recall of Decision Tree {:.2f} %".format(recall))
print("Precision of Decision Tree {:.2f} %".format(precision))

Recall of Decision Tree 95.01 %
Precision of Decision Tree 95.01 %


In [5]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.9500764136525726

In [6]:
from sklearn.tree import export_text
print(export_text(decision_tree = dt,  feature_names = list(X_train.columns), max_depth=4))

|--- age_group_90+ <= 0.50
|   |--- age_group_80s <= 0.50
|   |   |--- age_group_70s <= 0.50
|   |   |   |--- age_group_60s <= 0.50
|   |   |   |   |--- retail_and_recreation_mobility <= -24.50
|   |   |   |   |   |--- truncated branch of depth 13
|   |   |   |   |--- retail_and_recreation_mobility >  -24.50
|   |   |   |   |   |--- class: is_resolved
|   |   |   |--- age_group_60s >  0.50
|   |   |   |   |--- acquisition_group_OB <= 0.50
|   |   |   |   |   |--- truncated branch of depth 11
|   |   |   |   |--- acquisition_group_OB >  0.50
|   |   |   |   |   |--- truncated branch of depth 12
|   |   |--- age_group_70s >  0.50
|   |   |   |--- acquisition_group_OB <= 0.50
|   |   |   |   |--- gender_MALE <= 0.50
|   |   |   |   |   |--- truncated branch of depth 7
|   |   |   |   |--- gender_MALE >  0.50
|   |   |   |   |   |--- truncated branch of depth 11
|   |   |   |--- acquisition_group_OB >  0.50
|   |   |   |   |--- grocery_and_pharmacy_mobility <= -1.50
|   |   |   |   |   |--

In [7]:
dt.classes_

array(['is_fatal', 'is_resolved', 'is_unresolved'], dtype=object)

#### Attempt to visualize

In [8]:
import matplotlib.pyplot as plt    

plt.style.use('ggplot')

from sklearn import tree
from six import StringIO   
import pydotplus
import imageio

In [9]:
 # Create a dot file
dotfile = open("tree.dot", 'w')
tree.export_graphviz(dt, out_file = dotfile, feature_names = X_train.columns)
dotfile.close()        

# Create pdf and png from the dot data
dot_data = StringIO()
tree.export_graphviz(dt, out_file=dot_data,  filled=True, rounded=True, 
                     special_characters=True, feature_names =  X_train.columns, class_names=dt.classes_)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  

graph.write_png("tree.png")
graph.write_pdf("tree.pdf")    


# img = imageio.imread("tree.png")
# plt.figure(figsize=(15,15)) 
# plt.imshow(img)
# plt.show()

True

### Decision Tree - Binary Problem - Outbreak Related

In [10]:
X_train.head()

Unnamed: 0,retail_and_recreation_mobility,grocery_and_pharmacy_mobility,acquisition_group_CC,acquisition_group_CS,acquisition_group_OB,acquisition_group_TRAVEL,age_group_20s,age_group_30s,age_group_40s,age_group_50s,...,age_group_70s,age_group_80s,age_group_90+,age_group_<20,gender_FEMALE,gender_MALE,special_measure_Stage 2,special_measure_Stage 2 Modified,special_measure_Stage 3,special_measure_Stage 3 Modified
0,-33,-13,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
1,-15,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,-17,-5,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,-22,-2,0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
4,-32,-5,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0


In [11]:
y_train_3d = pd.read_csv('y_train.csv')
y_test_3d = pd.read_csv('y_test.csv')
new_df = pd.concat([X_train, y_train_3d], axis=1)


In [12]:
y_train_ob = new_df.iloc[:,4:5]
X_train_ob = new_df.drop(new_df.iloc[:,0:6], axis = 1)
new_df = pd.concat([X_test, y_test_3d], axis=1)
y_test_ob = new_df.iloc[:,4:5]
X_test_ob = new_df.drop(new_df.iloc[:,0:6], axis = 1)

In [14]:
y_train_ob.acquisition_group_OB.value_counts()

0    5982
1    1638
Name: acquisition_group_OB, dtype: int64

In [7]:
X_test_ob

Unnamed: 0,age_group_20s,age_group_30s,age_group_40s,age_group_50s,age_group_60s,age_group_70s,age_group_80s,age_group_90+,age_group_<20,gender_FEMALE,gender_MALE,special_measure_Stage 2,special_measure_Stage 2 Modified,special_measure_Stage 3,special_measure_Stage 3 Modified,is_unresolved,is_resolved,is_fatal
0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0
1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
3,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3921,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0
3922,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0
3923,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0
3924,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0


In [20]:
dt_ob = DecisionTreeClassifier(criterion='entropy')
dt_ob.fit(X_train_ob, y_train_ob)

DecisionTreeClassifier(criterion='entropy')

In [21]:
y_pred_ob = dt_ob.predict(X_test_ob)
recall_ob = recall_score(y_pred_ob, y_test_ob) * 100
precision_ob = precision_score(y_pred_ob, y_test_ob) * 100
print("Recall of Decision Tree {:.2f} %".format(recall_ob))
print("Precision of Decision Tree {:.2f} %".format(precision_ob))

Recall of Decision Tree 70.73 %
Precision of Decision Tree 13.73 %


In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred_ob,y_test_ob)

0.8020886398369842

In [23]:
from sklearn.tree import export_text
print(export_text(decision_tree = dt_ob,  feature_names = list(X_train_ob.columns), max_depth=4))

|--- age_group_90+ <= 0.50
|   |--- age_group_80s <= 0.50
|   |   |--- special_measure_Stage 2 Modified <= 0.50
|   |   |   |--- is_resolved <= 0.50
|   |   |   |   |--- gender_MALE <= 0.50
|   |   |   |   |   |--- truncated branch of depth 8
|   |   |   |   |--- gender_MALE >  0.50
|   |   |   |   |   |--- truncated branch of depth 7
|   |   |   |--- is_resolved >  0.50
|   |   |   |   |--- special_measure_Stage 3 Modified <= 0.50
|   |   |   |   |   |--- truncated branch of depth 9
|   |   |   |   |--- special_measure_Stage 3 Modified >  0.50
|   |   |   |   |   |--- truncated branch of depth 8
|   |   |--- special_measure_Stage 2 Modified >  0.50
|   |   |   |--- is_resolved <= 0.50
|   |   |   |   |--- gender_FEMALE <= 0.50
|   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |--- gender_FEMALE >  0.50
|   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- is_resolved >  0.50
|   |   |   |   |--- age_group_20s <= 0.50
|   |   |   |   |   |--- truncate

In [37]:
dt_ob.classes_

array([0, 1], dtype=int64)

In [41]:
 # Create a dot file
dotfile_ob = open("tree_ob.dot", 'w')
tree.export_graphviz(dt_ob, out_file = dotfile_ob, feature_names = X_train_ob.columns)
dotfile_ob.close()        

# Create pdf and png from the dot data
dot_data_ob = StringIO()
tree.export_graphviz(dt_ob, out_file=dot_data_ob,  filled=True, rounded=True, 
                     special_characters=True, feature_names =  X_train_ob.columns, class_names=True)
# graph = pydotplus.graph_from_dot_data(dot_data_ob.getvalue())  

# # graph.write_png("tree_ob.png")
# # graph.write_pdf("tree_ob.pdf")    


# # # img = imageio.imread("tree.png")
# # # plt.figure(figsize=(15,15)) 
# # # plt.imshow(img)
# # # plt.show()