### Decision Tree - Prediciting Outcome Class (is_resolved, is_unresolved, is_fatal)

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn import tree
import logging
from codetiming import Timer

In [2]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
# y_train = pd.read_csv('y_train.csv')
# y_test = pd.read_csv('y_test.csv')
y_train = pd.read_csv('y_train_1d.csv')
y_test = pd.read_csv('y_test_1d.csv')

In [3]:
t = Timer("example", text="Time spent: {:.2f}", logger=logging.warning)
t.start()

dt = DecisionTreeClassifier(criterion='gini')
dt.fit(X_train, y_train)

t.stop()



0.026257900000000056

In [4]:
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test, average = 'micro') * 100
precision = precision_score(y_pred, y_test,  average = 'micro') * 100
print("Recall of Decision Tree - Micro {:.2f} %".format(recall))
print("Precision of Decision Tree - Micro {:.2f} %".format(precision))

Recall of Decision Tree - Micro 94.70 %
Precision of Decision Tree - Micro 94.70 %


In [5]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.9470198675496688

In [6]:
from sklearn.tree import export_text
print(export_text(decision_tree = dt,  feature_names = list(X_train.columns), max_depth=2))

|--- age_group_90+ <= 0.50
|   |--- age_group_80s <= 0.50
|   |   |--- age_group_70s <= 0.50
|   |   |   |--- truncated branch of depth 15
|   |   |--- age_group_70s >  0.50
|   |   |   |--- truncated branch of depth 13
|   |--- age_group_80s >  0.50
|   |   |--- acquisition_group_CC <= 0.50
|   |   |   |--- truncated branch of depth 14
|   |   |--- acquisition_group_CC >  0.50
|   |   |   |--- truncated branch of depth 9
|--- age_group_90+ >  0.50
|   |--- acquisition_group_OB <= 0.50
|   |   |--- retail_and_recreation_mobility <= -10.50
|   |   |   |--- truncated branch of depth 8
|   |   |--- retail_and_recreation_mobility >  -10.50
|   |   |   |--- class: is_fatal
|   |--- acquisition_group_OB >  0.50
|   |   |--- retail_and_recreation_mobility <= -29.50
|   |   |   |--- truncated branch of depth 11
|   |   |--- retail_and_recreation_mobility >  -29.50
|   |   |   |--- truncated branch of depth 9



In [7]:
y_train.Outcome.value_counts()

is_resolved      7418
is_fatal          200
is_unresolved       2
Name: Outcome, dtype: int64

#### Attempt to visualize

In [8]:
import matplotlib.pyplot as plt    

plt.style.use('ggplot')

from sklearn import tree
from six import StringIO   
import pydotplus
import imageio

In [9]:
 # Create a dot file
# dotfile = open("tree.dot", 'w')
# tree.export_graphviz(dt, out_file = dotfile, feature_names = X_train.columns)
# dotfile.close()        

# # Create pdf and png from the dot data
# dot_data = StringIO()
# tree.export_graphviz(dt, out_file=dot_data,  filled=True, rounded=True, 
#                      special_characters=True, feature_names =  X_train.columns, class_names=dt.classes_)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  

# graph.write_png("tree.png")
# graph.write_pdf("tree.pdf")    


# img = imageio.imread("tree.png")
# plt.figure(figsize=(15,15)) 
# plt.imshow(img)
# plt.show()

### Predicting Outcome Class After Removal of Unresolved Columns - Binary Problem

In [10]:
X_train_2 = pd.read_csv('X_train_1d_2.csv')
X_test_2 = pd.read_csv('X_test_1d_2.csv')
y_train_2 = pd.read_csv('y_train_1d_2.csv')
y_test_2 = pd.read_csv('y_test_1d_2.csv')

In [11]:
t.start()

dt_2 = DecisionTreeClassifier(criterion='gini', max_depth = 4)
dt_2.fit(X_train_2, y_train_2)

t.stop()



0.02115440000000035

In [12]:
y_pred_2 = dt.predict(X_test_2)
recall_2 = recall_score(y_pred_2, y_test_2, pos_label = 'is_resolved') * 100
precision_2 = precision_score(y_pred_2, y_test_2,  pos_label = 'is_resolved') * 100
print("Recall of Decision Tree - Micro {:.2f} %".format(recall_2))
print("Precision of Decision Tree - Micro {:.2f} %".format(precision_2))

Recall of Decision Tree - Micro 99.25 %
Precision of Decision Tree - Micro 98.28 %


In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred_2,y_test_2)

0.9759528130671506

In [14]:
from sklearn.tree import export_text
print(export_text(decision_tree = dt_2,  feature_names = list(X_train_2.columns)))

|--- age_group_90+ <= 0.50
|   |--- age_group_80s <= 0.50
|   |   |--- age_group_70s <= 0.50
|   |   |   |--- age_group_60s <= 0.50
|   |   |   |   |--- class: is_resolved
|   |   |   |--- age_group_60s >  0.50
|   |   |   |   |--- class: is_resolved
|   |   |--- age_group_70s >  0.50
|   |   |   |--- acquisition_group_OB <= 0.50
|   |   |   |   |--- class: is_resolved
|   |   |   |--- acquisition_group_OB >  0.50
|   |   |   |   |--- class: is_resolved
|   |--- age_group_80s >  0.50
|   |   |--- acquisition_group_OB <= 0.50
|   |   |   |--- retail_and_recreation_mobility <= -27.50
|   |   |   |   |--- class: is_resolved
|   |   |   |--- retail_and_recreation_mobility >  -27.50
|   |   |   |   |--- class: is_resolved
|   |   |--- acquisition_group_OB >  0.50
|   |   |   |--- retail_and_recreation_mobility <= -15.00
|   |   |   |   |--- class: is_resolved
|   |   |   |--- retail_and_recreation_mobility >  -15.00
|   |   |   |   |--- class: is_resolved
|--- age_group_90+ >  0.50
|   |---

In [15]:
y_train_2.Outcome.value_counts()

is_resolved    8352
is_fatal        203
Name: Outcome, dtype: int64

#### Handling Class Imbalance Fatal vs. Resolved

In [16]:
from imblearn.under_sampling import NearMiss 
nm = NearMiss()

In [17]:
X_train_2_miss, y_train_2_miss = nm.fit_sample(X_train_2, y_train_2)
X_test_2_miss, y_test_2_miss = nm.fit_sample(X_test_2, y_test_2)

In [18]:
print("Before NearMiss: \n", y_train_2.Outcome.value_counts())
print("\nAfter NearMiss: \n",y_train_2_miss.Outcome.value_counts())


Before NearMiss: 
 is_resolved    8352
is_fatal        203
Name: Outcome, dtype: int64

After NearMiss: 
 is_resolved    203
is_fatal       203
Name: Outcome, dtype: int64


In [19]:
t.start()

dt_2_miss = DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=1)
dt_2_miss.fit(X_train_2_miss, y_train_2_miss)

t.stop()



0.004999000000000642

In [20]:
y_pred_2_miss = dt_2_miss.predict(X_test_2_miss)
recall_2_miss = recall_score(y_pred_2_miss, y_test_2_miss, pos_label='is_resolved') * 100
precision_2_miss = precision_score(y_pred_2_miss, y_test_2_miss, pos_label='is_resolved') * 100
print("Recall of Decision Tree {:.2f} %".format(recall_2_miss))
print("Precision of Decision Tree {:.2f} %".format(precision_2_miss))

Recall of Decision Tree 62.50 %
Precision of Decision Tree 66.67 %


In [21]:
accuracy_score(y_pred_2_miss,y_test_2_miss)

0.6333333333333333

In [22]:
from sklearn.tree import export_text
print(export_text(decision_tree = dt_2_miss,  feature_names = list(X_train_2_miss.columns), max_depth=4))

|--- retail_and_recreation_mobility <= -30.50
|   |--- acquisition_group_OB <= 0.50
|   |   |--- class: is_fatal
|   |--- acquisition_group_OB >  0.50
|   |   |--- class: is_resolved
|--- retail_and_recreation_mobility >  -30.50
|   |--- age_group_20s <= 0.50
|   |   |--- class: is_fatal
|   |--- age_group_20s >  0.50
|   |   |--- class: is_resolved



##### Attempt of hyperparameter tuning on Model with UnderSampling

In [23]:
param_dist={
    "criterion":["gini", "entropy"],
    "max_depth":[1,2,3,4,5,6,7,8,9, 10,None],
    "min_samples_leaf":[1,2,3,4,5]
}

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
grid = GridSearchCV(dt_2_miss, param_grid=param_dist, cv=10, n_jobs=-1)

In [26]:
grid.fit(X_train_2_miss, y_train_2_miss)

GridSearchCV(cv=10,
             estimator=DecisionTreeClassifier(criterion='entropy', max_depth=2),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None],
                         'min_samples_leaf': [1, 2, 3, 4, 5]})

In [27]:
grid.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=2)

In [28]:
grid.best_score_

0.6353048780487806

In [29]:
grid.best_params_

{'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 1}

### Decision Tree - Binary Problem - Prediciting Comunity Spread Cases

In [30]:
y_train_cs = pd.read_csv('y_train_cs.csv')
X_train_cs = pd.read_csv('X_train_cs.csv')

y_test_cs = pd.read_csv('y_test_cs.csv')
X_test_cs = pd.read_csv('X_test_cs.csv')

In [31]:
y_train_cs

Unnamed: 0,acquisition_group_CS
0,1
1,0
2,1
3,1
4,0
...,...
10367,1
10368,0
10369,0
10370,1


In [32]:
y_train_cs.acquisition_group_CS.value_counts()

0    7212
1    3160
Name: acquisition_group_CS, dtype: int64

In [33]:
X_train_cs

Unnamed: 0,is_unresolved,is_resolved,is_fatal,retail_and_recreation_mobility,grocery_and_pharmacy_mobility,month,parks_mobility,age_group_20s,age_group_30s,age_group_40s,...,age_group_90+,age_group_<20,gender_FEMALE,gender_MALE,special_measure_Stage 2,special_measure_Stage 2 Modified,special_measure_Stage 3,special_measure_Stage 3 Modified,city_Ottawa,city_Toronto
0,0,1,0,-2,15,9,92,0,0,1,...,0,0,1,0,0,0,1,0,0,1
1,0,1,0,-23,-10,9,85,0,0,0,...,0,1,1,0,0,0,1,0,0,1
2,0,1,0,-24,-6,10,28,0,0,0,...,0,0,0,1,0,1,0,0,0,1
3,0,1,0,-20,0,10,50,0,0,0,...,0,0,0,1,0,0,0,1,1,0
4,0,1,0,-23,-2,10,9,0,0,1,...,0,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10367,0,1,0,-21,-2,10,39,0,0,0,...,0,0,1,0,0,0,0,1,0,1
10368,0,1,0,-33,-3,10,14,1,0,0,...,0,0,0,1,0,0,0,1,0,1
10369,0,1,0,-19,1,10,39,0,0,0,...,0,0,0,1,0,1,0,0,0,1
10370,0,1,0,-14,-2,10,-4,0,0,0,...,0,0,1,0,0,0,0,1,0,1


In [34]:
X_train_cs.columns

Index(['is_unresolved', 'is_resolved', 'is_fatal',
       'retail_and_recreation_mobility', 'grocery_and_pharmacy_mobility',
       'month', 'parks_mobility', 'age_group_20s', 'age_group_30s',
       'age_group_40s', 'age_group_50s', 'age_group_60s', 'age_group_70s',
       'age_group_80s', 'age_group_90+', 'age_group_<20', 'gender_FEMALE',
       'gender_MALE', 'special_measure_Stage 2',
       'special_measure_Stage 2 Modified', 'special_measure_Stage 3',
       'special_measure_Stage 3 Modified', 'city_Ottawa', 'city_Toronto'],
      dtype='object')

In [35]:
X_train_cs.drop(['special_measure_Stage 2', 'special_measure_Stage 2 Modified','special_measure_Stage 3',
                 'special_measure_Stage 3 Modified', 'parks_mobility','retail_and_recreation_mobility', 
                 'grocery_and_pharmacy_mobility','city_Ottawa',
       'city_Toronto'], axis = 1, inplace=True)
X_test_cs.drop(['special_measure_Stage 2', 'special_measure_Stage 2 Modified','special_measure_Stage 3',
                'special_measure_Stage 3 Modified', 'parks_mobility','retail_and_recreation_mobility', 
                'grocery_and_pharmacy_mobility','city_Ottawa',
       'city_Toronto'], axis = 1, inplace=True)

In [36]:
X_test_cs.columns

Index(['is_unresolved', 'is_resolved', 'is_fatal', 'month', 'age_group_20s',
       'age_group_30s', 'age_group_40s', 'age_group_50s', 'age_group_60s',
       'age_group_70s', 'age_group_80s', 'age_group_90+', 'age_group_<20',
       'gender_FEMALE', 'gender_MALE'],
      dtype='object')

In [37]:
t.start()

dt_cs = DecisionTreeClassifier(criterion='entropy')
dt_cs.fit(X_train_cs, y_train_cs)

t.stop()



0.015345200000000503

In [38]:
y_pred_cs = dt_cs.predict(X_test_cs)
recall_cs = recall_score(y_pred_cs, y_test_cs) * 100
precision_cs = precision_score(y_pred_cs, y_test_cs) * 100
print("Recall of Decision Tree {:.2f} %".format(recall_cs))
print("Precision of Decision Tree {:.2f} %".format(precision_cs))

Recall of Decision Tree 50.00 %
Precision of Decision Tree 0.13 %


In [39]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred_cs,y_test_cs)

0.7054741711642252

In [40]:
from sklearn.tree import export_text
print(export_text(decision_tree = dt_cs,  feature_names = list(X_train_cs.columns), max_depth=2))

|--- age_group_90+ <= 0.50
|   |--- age_group_80s <= 0.50
|   |   |--- age_group_<20 <= 0.50
|   |   |   |--- truncated branch of depth 10
|   |   |--- age_group_<20 >  0.50
|   |   |   |--- truncated branch of depth 5
|   |--- age_group_80s >  0.50
|   |   |--- month <= 7.50
|   |   |   |--- truncated branch of depth 3
|   |   |--- month >  7.50
|   |   |   |--- truncated branch of depth 5
|--- age_group_90+ >  0.50
|   |--- is_resolved <= 0.50
|   |   |--- gender_MALE <= 0.50
|   |   |   |--- class: 0
|   |   |--- gender_MALE >  0.50
|   |   |   |--- truncated branch of depth 3
|   |--- is_resolved >  0.50
|   |   |--- month <= 7.50
|   |   |   |--- truncated branch of depth 2
|   |   |--- month >  7.50
|   |   |   |--- truncated branch of depth 4



In [41]:
dt_cs.classes_

array([0, 1], dtype=int64)

In [42]:
 # Create a dot file
# dotfile_cs = open("tree_cs.dot", 'w')
# tree.export_graphviz(dt_cs, out_file = dotfile_cs, feature_names = X_train_cs.columns)
# dotfile_cs.close()        

# # Create pdf and png from the dot data
# dot_data_cs = StringIO()
# tree.export_graphviz(dt_cs, out_file=dot_data_cs,  filled=True, rounded=True, 
#                      special_characters=True, feature_names =  X_train_cs.columns, class_names=True)
# # graph = pydotplus.graph_from_dot_data(dot_data_cs.getvalue())  

# # graph.write_png("tree_cs.png")
# # graph.write_pdf("tree_cs.pdf")    


# img = imageio.imread("tree.png")
# plt.figure(figsize=(15,15)) 
# plt.imshow(img)
# plt.show()

#### Handling Class Imbalance - Community Spread vs. Not Community Spread

In [43]:
from imblearn.under_sampling import NearMiss 
nm = NearMiss()

In [44]:
X_train_miss, y_train_miss = nm.fit_sample(X_train_cs, y_train_cs)
X_test_miss, y_test_miss = nm.fit_sample(X_test_cs, y_test_cs)

In [45]:
print("Before NearMiss: \n", y_train_cs.acquisition_group_CS.value_counts())
print("\nAfter NearMiss: \n",y_train_miss.acquisition_group_CS.value_counts())
print("Before NearMiss: \n", y_test_cs.acquisition_group_CS.value_counts())
print("\nAfter NearMiss: \n",y_test_miss.acquisition_group_CS.value_counts())

Before NearMiss: 
 0    7212
1    3160
Name: acquisition_group_CS, dtype: int64

After NearMiss: 
 1    3160
0    3160
Name: acquisition_group_CS, dtype: int64
Before NearMiss: 
 0    1830
1     764
Name: acquisition_group_CS, dtype: int64

After NearMiss: 
 1    764
0    764
Name: acquisition_group_CS, dtype: int64


In [46]:
t.start()

dt_miss = DecisionTreeClassifier()
dt_miss.fit(X_train_miss, y_train_miss)

t.stop()



0.011993000000000364

In [47]:
y_pred_miss = dt_miss.predict(X_test_miss)
recall_miss = recall_score(y_pred_miss, y_test_miss) * 100
precision_miss = precision_score(y_pred_miss, y_test_miss) * 100
print("Recall of Decision Tree {:.2f} %".format(recall_miss))
print("Precision of Decision Tree {:.2f} %".format(precision_miss))

Recall of Decision Tree 51.92 %
Precision of Decision Tree 56.68 %


In [48]:
accuracy_score(y_pred_miss,y_test_miss)

0.5209424083769634

In [49]:
from sklearn.tree import export_text
print(export_text(decision_tree = dt_miss,  feature_names = list(X_train_cs.columns), max_depth=2))

|--- age_group_<20 <= 0.50
|   |--- age_group_80s <= 0.50
|   |   |--- is_resolved <= 0.50
|   |   |   |--- truncated branch of depth 3
|   |   |--- is_resolved >  0.50
|   |   |   |--- truncated branch of depth 11
|   |--- age_group_80s >  0.50
|   |   |--- month <= 8.50
|   |   |   |--- truncated branch of depth 4
|   |   |--- month >  8.50
|   |   |   |--- truncated branch of depth 4
|--- age_group_<20 >  0.50
|   |--- month <= 8.50
|   |   |--- month <= 7.50
|   |   |   |--- truncated branch of depth 2
|   |   |--- month >  7.50
|   |   |   |--- truncated branch of depth 2
|   |--- month >  8.50
|   |   |--- gender_MALE <= 0.50
|   |   |   |--- truncated branch of depth 2
|   |   |--- gender_MALE >  0.50
|   |   |   |--- truncated branch of depth 2

