# Decision Tree Machine Learning Model

### Read in csv file

In [2]:
import csv
import numpy as np
import pandas as pd

In [3]:
filename = '../Training and Testing sets/train_tfidf_features.csv'
train_features = pd.read_csv (filename, header=0)

print(train_features.shape)
print(train_features.head(10))

(17184, 5002)
   id  label    0    1    2    3    4    5    6    7  ...  4990  4991  4992  \
0   1      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1   2      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2   3      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3   4      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4   5      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
5   6      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
6   7      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
7   8      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
8   9      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
9  10      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.

In [4]:
train_features.set_index('id', inplace=True, drop=True)
print(train_features.head(10))

    label    0    1    2    3    4    5    6    7    8  ...  4990  4991  4992  \
id                                                      ...                     
1       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
5       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
6       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
7       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
8       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
9       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
10      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

    4993  4994  4995  4996 

### Train-Test split

In [5]:
from df_helper.kernel_reduction_helper import KernelPCA_reduce

# Generate a 100 feature feature set using PCA reduction
# print(train_features.iloc[:, 1:].head(10))
X = KernelPCA_reduce(train_features.iloc[:, 1:],200)
# print(X.head(10))

# X = train_features.iloc[:, 10:20].values
Y = train_features.iloc[:, 0].values.reshape(-1,1)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.1, random_state=41)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=.1, random_state=41)

# X_train is to train data
# X_val is for validation of data
# X_test is for testing model

## Fit the model
Train a simple model with max_depth 3

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score


clf = DecisionTreeClassifier(max_depth =3, random_state = 42)
clf.fit(X_train, Y_train)
test_pred_decision_tree = clf.predict(X_val)
score = f1_score(Y_val, test_pred_decision_tree)
print(score)


0.17194570135746606


As shown above, the f1 score of the decision tree with max depth is extremely low. However, we can try and tweak the max_depth and see if there is a difference is the values.

In [7]:
from sklearn.metrics import f1_score

for i in range(1,6):
    clf = DecisionTreeClassifier(max_depth =i, random_state = 1)
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.0
2 : 0.17691154422788608
3 : 0.17194570135746606
4 : 0.5
5 : 0.4458464773922187


We observe that the optimal depth seems to be at 4 as any further increase in depth will decrease the f1 score on the validation test set.

In [8]:
for i in range(1,20):
    clf = DecisionTreeClassifier(max_depth =i, splitter= "random")
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

for i in range(40,47):
    clf = DecisionTreeClassifier(max_depth =i, splitter= "random")
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.0
2 : 0.26379542395693134
3 : 0.15727002967359052
4 : 0.4033970276008492
5 : 0.3295880149812734
6 : 0.29183400267737614
7 : 0.3649289099526067
8 : 0.42784032753326506
9 : 0.4191866527632951
10 : 0.3949771689497717
11 : 0.41401273885350315
12 : 0.4444444444444445
13 : 0.4339815762538383
14 : 0.47316103379721675
15 : 0.45004849660523766
16 : 0.4133858267716535
17 : 0.4409005628517824
18 : 0.4524975514201763
19 : 0.46389891696750907
40 : 0.47635135135135126
41 : 0.4877637130801688
42 : 0.4451996601529312
43 : 0.48444070647603027
44 : 0.44079515989628354
45 : 0.47675401521555366
46 : 0.46559048428207306


Alternatively, we used a best random split method instead. We see that the f1 score performance seem to stagnate at around 40-50%. Note that the number of splits were deliberately increased as random split might require more splits.

In [9]:
from sklearn.metrics import f1_score

for i in range(1,10):
    clf = DecisionTreeClassifier(max_depth =i,criterion="entropy", random_state = 1)
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print("Entropy "+ str(i)+" : "+str(score))

for i in range(1,10):
    clf = DecisionTreeClassifier(max_depth =i,criterion="log_loss", random_state = 1)
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print("log_loss "+ str(i)+" : "+str(score))

Entropy 1 : 0.0
Entropy 2 : 0.17691154422788608
Entropy 3 : 0.17194570135746606
Entropy 4 : 0.501323918799647
Entropy 5 : 0.456140350877193
Entropy 6 : 0.5136921624173749
Entropy 7 : 0.5211786372007368
Entropy 8 : 0.48762603116406966
Entropy 9 : 0.5
log_loss 1 : 0.0
log_loss 2 : 0.17691154422788608
log_loss 3 : 0.17194570135746606
log_loss 4 : 0.501323918799647
log_loss 5 : 0.456140350877193
log_loss 6 : 0.5136921624173749
log_loss 7 : 0.5211786372007368
log_loss 8 : 0.48762603116406966
log_loss 9 : 0.5


Using the entropy and log_loss also produced similar results as the default gini criterion.

Hence, we can conclude that neither the criterion nor the number of splits were able to increase the accuracy of the model significantly.

# Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, Y_train.ravel())
random_forest = clf.predict(X_val)

score = f1_score(Y_val, random_forest)
print(str(score))

0.0


Changing the max_depth of each tree in the random forest. 

In [11]:
for i in range(1,5):
    clf = RandomForestClassifier(max_depth =i)
    clf.fit(X_train, Y_train.ravel())
    random_forest = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.5
2 : 0.5
3 : 0.5
4 : 0.5


As seen, increasing the max_depth doesnot affect the accuracy. Thus, we conclude the max_depth of the tree in the forest is only 1. However, we may suggest changing the number of trees in said forest

In [12]:
for i in range(1,5):
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, Y_train.ravel())
    random_forest = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.5
2 : 0.5
3 : 0.5
4 : 0.5


In [13]:
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 50, num =5)]
# Number of features to consider at every split
max_features = ['log2','sqrt']
#Maximum number of levels in tree
max_depth = [2,4]
# Method of selecting samples for training each tree
bootstrap = [True,False]

In [14]:
# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth': max_depth,
            'bootstrap': bootstrap }

In [15]:
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(n_estimators=i)

rf_Grid = GridSearchCV(estimator = clf, param_grid = random_grid, cv=3, verbose=2, n_jobs = 4)
rf_Grid.fit(X_train,Y_train.ravel())

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=20; total time=   0.3s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=20; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=20; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=30; total time=   0.3s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=30; total time=   0.3s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=30; total time=   0.3s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=40; total time=   0.4s
[CV] END

In [16]:
grid_prediction = rf_Grid.predict(X_val)
score = f1_score(Y_val, grid_prediction)
print(score)

0.18429003021148035


With GridSearchCV, the parameters performed even worse on the validation dataset, suggesting overfitting to the training dataset.

# Ada Boost

In [17]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=50, random_state=0)
clf.fit(X_train, Y_train.ravel())
ada_boost_pred = clf.predict(X_val)
score = f1_score(Y_val, ada_boost_pred)
print(score)

0.5108481262327417


In [18]:
for i in range(1,10):
    clf = AdaBoostClassifier(n_estimators=i, random_state=0)
    clf.fit(X_train, Y_train.ravel())
    ada_boost_pred = clf.predict(X_val)
    score = f1_score(Y_val, ada_boost_pred)
    print(str(i)+" : "+str(score))
    

for i in range(30,35):
    clf = AdaBoostClassifier(n_estimators=i, random_state=0)
    clf.fit(X_train, Y_train.ravel())
    ada_boost_pred = clf.predict(X_val)
    score = f1_score(Y_val, ada_boost_pred)
    print(str(i)+" : "+str(score))

1 : 0.0
2 : 0.24327784891165172
3 : 0.29404617253948967
4 : 0.29404617253948967
5 : 0.33886255924170616
6 : 0.37602820211515864
7 : 0.41014799154334036
8 : 0.38513513513513514
9 : 0.41914893617021276
30 : 0.4808080808080808
31 : 0.4803229061553985
32 : 0.4924012158054712
33 : 0.48088531187122735
34 : 0.48040201005025124


# Gradient Boosting Classifier 

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=0)
clf.fit(X_train, Y_train.ravel())
gradient_boosting_clf = clf.predict(X_val)
score = f1_score(Y_val, gradient_boosting_clf)
print(score)

0.5286160249739854
