# Decision Tree Machine Learning Model

### Read in csv file

In [70]:
import csv
import numpy as np
import pandas as pd
from df_helper.reduction_helper import PCA_reduce
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier



In [80]:
filename = '../Training and Testing sets/train_tfidf_features.csv'
train_features = pd.read_csv (filename, header=0)

print(train_features.shape)
print(train_features.head(10))

(17184, 5002)
   id  label    0    1    2    3    4    5    6    7  ...  4990  4991  4992  \
0   1      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1   2      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2   3      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3   4      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4   5      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
5   6      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
6   7      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
7   8      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
8   9      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
9  10      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.

In [42]:
train_features.set_index('id', inplace=True, drop=True)
print(train_features.head(10))

    label    0    1    2    3    4    5    6    7    8  ...  4990  4991  4992  \
id                                                      ...                     
1       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
5       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
6       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
7       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
8       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
9       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
10      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

    4993  4994  4995  4996 

### Train-Test split

In [43]:
# Generate a 100 feature feature set using PCA reduction
# print(train_features.iloc[:, 1:].head(10))
X = PCA_reduce(train_features.iloc[:, 1:],200)
# print(X.head(10))

# X = train_features.iloc[:, 10:20].values
Y = train_features.iloc[:, 0].values.reshape(-1,1)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.1, random_state=41)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=.1, random_state=41)

# X_train is to train data
# X_val is for validation of data
# X_test is for testing model

In [81]:

Y = train_features.iloc[:, 0].values.reshape(-1,1)
X_train, X_test, Y_train, Y_test = train_test_split(train_features, Y, test_size=.1, random_state=41)


## Fit the model
Train a simple model with max_depth 3

In [44]:
clf = DecisionTreeClassifier(max_depth =3, random_state = 42)
clf.fit(X_train, Y_train)
test_pred_decision_tree = clf.predict(X_val)
score = f1_score(Y_val, test_pred_decision_tree)
print(score)


0.17194570135746606


As shown above, the f1 score of the decision tree with max depth is extremely low. However, we can try and tweak the max_depth and see if there is a difference is the values.

In [45]:

for i in range(1,6):
    clf = DecisionTreeClassifier(max_depth =i, random_state = 1)
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.0
2 : 0.17691154422788608
3 : 0.17194570135746606
4 : 0.4995571302037201
5 : 0.4430512016718913


We observe that the optimal depth seems to be at 4 as any further increase in depth will decrease the f1 score on the validation test set.

In [46]:
for i in range(1,20):
    clf = DecisionTreeClassifier(max_depth =i, splitter= "random")
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

for i in range(40,47):
    clf = DecisionTreeClassifier(max_depth =i, splitter= "random")
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.02368866328257191
2 : 0.11690363349131123
3 : 0.2103825136612022
4 : 0.308252427184466
5 : 0.33537331701346385
6 : 0.30341340075853346
7 : 0.3730684326710817
8 : 0.41530054644808745
9 : 0.45540398740818466
10 : 0.455078125
11 : 0.4270152505446623
12 : 0.44536940686784604
13 : 0.45813282001924927
14 : 0.4298874104401228
15 : 0.4194214876033058
16 : 0.4898710865561694
17 : 0.45209302325581396
18 : 0.4694656488549618
19 : 0.45107176141658906
40 : 0.4507512520868114
41 : 0.4542372881355933
42 : 0.46761984861227923
43 : 0.49626556016597506
44 : 0.48205128205128206
45 : 0.47952218430034127
46 : 0.48123980424143553


Alternatively, we used a best random split method instead. We see that the f1 score performance seem to stagnate at around 40-50%. Note that the number of splits were deliberately increased as random split might require more splits.

In [47]:
from sklearn.metrics import f1_score

for i in range(1,10):
    clf = DecisionTreeClassifier(max_depth =i,criterion="entropy", random_state = 1)
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print("Entropy "+ str(i)+" : "+str(score))

for i in range(1,10):
    clf = DecisionTreeClassifier(max_depth =i,criterion="log_loss", random_state = 1)
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print("log_loss "+ str(i)+" : "+str(score))

Entropy 1 : 0.0
Entropy 2 : 0.17691154422788608
Entropy 3 : 0.17691154422788608
Entropy 4 : 0.5035211267605635
Entropy 5 : 0.4569055036344756
Entropy 6 : 0.5229357798165138
Entropy 7 : 0.5122615803814714
Entropy 8 : 0.49363369245837413
Entropy 9 : 0.49195837275307475
log_loss 1 : 0.0
log_loss 2 : 0.17691154422788608
log_loss 3 : 0.17691154422788608
log_loss 4 : 0.5035211267605635
log_loss 5 : 0.4569055036344756
log_loss 6 : 0.5229357798165138
log_loss 7 : 0.5122615803814714
log_loss 8 : 0.49363369245837413
log_loss 9 : 0.49195837275307475


Using the entropy and log_loss also produced similar results as the default gini criterion.

Hence, we can conclude that neither the criterion nor the number of splits were able to increase the accuracy of the model significantly.

# Random Forest

In [48]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, Y_train.ravel())
random_forest = clf.predict(X_val)

score = f1_score(Y_val, random_forest)
print(str(score))

0.0


Changing the max_depth of each tree in the random forest. 

In [49]:
for i in range(1,5):
    clf = RandomForestClassifier(max_depth =i)
    clf.fit(X_train, Y_train.ravel())
    random_forest = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.49195837275307475
2 : 0.49195837275307475
3 : 0.49195837275307475
4 : 0.49195837275307475


In [83]:
print(X_train.head(10))

          id  label    0    1    2    3    4    5    6    7  ...  4990  4991  \
8130    8131      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
17077  17078      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
14105  14106      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
16427  16428      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
12464  12465      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2743    2744      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
4619    4620      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
16384  16385      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
14181  14182      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
12285  12286      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   

       4992  4993  4994  4995  4996  4997  4998  4999  
8130    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
17077  

In [82]:
clf = RandomForestClassifier(max_depth =3)
clf.fit(X_train, Y_train.ravel())
random_forest = clf.predict(X_val)
score = f1_score(Y_val, test_pred_decision_tree)
print(str(score))



ValueError: X has 1000 features, but RandomForestClassifier is expecting 5002 features as input.

As seen, increasing the max_depth doesnot affect the accuracy. Thus, we conclude the max_depth of the tree in the forest is only 1. However, we may suggest changing the number of trees in said forest

In [50]:
for i in range(1,5):
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, Y_train.ravel())
    random_forest = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.49195837275307475
2 : 0.49195837275307475
3 : 0.49195837275307475
4 : 0.49195837275307475


In [51]:
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 50, num =5)]
# Number of features to consider at every split
max_features = ['log2','sqrt']
#Maximum number of levels in tree
max_depth = [2,4]
# Method of selecting samples for training each tree
bootstrap = [True,False]

In [52]:
# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth': max_depth,
            'bootstrap': bootstrap }

In [53]:

clf = RandomForestClassifier(n_estimators=i)

rf_Grid = GridSearchCV(estimator = clf, param_grid = random_grid, cv=3, verbose=2, n_jobs = 4)
rf_Grid.fit(X_train,Y_train.ravel())

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=10; total time=   0.1s[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=10; total time=   0.1s

[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=20; total time=   0.3s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=20; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=20; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=30; total time=   0.4s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=30; total time=   0.3s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=30; total time=   0.3s
[CV] END bootstrap=True, max_depth=2, max_features=log2, n_estimators=40; total time=   0.4s
[CV] END

In [54]:
grid_prediction = rf_Grid.predict(X_val)
score = f1_score(Y_val, grid_prediction)
print(score)

0.1724659606656581


With GridSearchCV, the parameters performed even worse on the validation dataset, suggesting overfitting to the training dataset.

# Ada Boost

In [55]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=50, random_state=0)
clf.fit(X_train, Y_train.ravel())
ada_boost_pred = clf.predict(X_val)
score = f1_score(Y_val, ada_boost_pred)
print(score)

0.5020161290322581


In [56]:
for i in range(1,10):
    clf = AdaBoostClassifier(n_estimators=i, random_state=0)
    clf.fit(X_train, Y_train.ravel())
    ada_boost_pred = clf.predict(X_val)
    score = f1_score(Y_val, ada_boost_pred)
    print(str(i)+" : "+str(score))
    

for i in range(30,35):
    clf = AdaBoostClassifier(n_estimators=i, random_state=0)
    clf.fit(X_train, Y_train.ravel())
    ada_boost_pred = clf.predict(X_val)
    score = f1_score(Y_val, ada_boost_pred)
    print(str(i)+" : "+str(score))

1 : 0.0
2 : 0.24327784891165172
3 : 0.29404617253948967
4 : 0.29404617253948967
5 : 0.33886255924170616
6 : 0.37602820211515864
7 : 0.41014799154334036
8 : 0.38513513513513514
9 : 0.41914893617021276
30 : 0.472986748216106
31 : 0.47648261758691207
32 : 0.486322188449848
33 : 0.48521916411824667
34 : 0.4831804281345566


# Gradient Boosting Classifier 

In [57]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=0)
clf.fit(X_train, Y_train.ravel())
gradient_boosting_clf = clf.predict(X_val)
score = f1_score(Y_val, gradient_boosting_clf)
print(score)

0.5156576200417536


# Attempt 2 at PCA with higher dimensions

In [None]:
filename = '../Training and Testing sets/train_tfidf_features.csv'
train_features = pd.read_csv (filename, header=0)

# Remove index
train_features = train_features.iloc[:, 1:]
print(train_features.head(10))


   label    0    1    2    3    4    5    6    7    8  ...  4990  4991  4992  \
0      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
5      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
6      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
7      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
8      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
9      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0  

In [76]:
# Generate a 100 feature feature set using PCA reduction
# print(train_features.iloc[:, 1:].head(10))

X = PCA_reduce(train_features.iloc[:,1:],1000)
# print(X.head(10))

KeyboardInterrupt: 

In [77]:
Y = train_features.iloc[:, 0].values.reshape(-1,1)
print(X)
print(X.shape)
print(Y)
print(Y.shape)

            0         1         2         3         4         5         6    \
0     -0.083198 -0.016048 -0.010595 -0.001964 -0.013779 -0.010988 -0.009680   
1     -0.068421 -0.043649 -0.018443 -0.008228 -0.000051 -0.043176  0.127322   
2     -0.080171 -0.044642 -0.015342 -0.008697 -0.010481 -0.057562  0.075615   
3      0.028601 -0.040406  0.002784  0.007087  0.000462 -0.031219 -0.137443   
4      0.255054 -0.113418 -0.019237 -0.021468 -0.040033 -0.007000 -0.035961   
...         ...       ...       ...       ...       ...       ...       ...   
17179  0.075930  0.078879  0.167305 -0.099091  0.304845  0.037761 -0.022524   
17180 -0.072104 -0.019731 -0.014655 -0.005451 -0.005682 -0.012314  0.005987   
17181  0.002079 -0.041786 -0.016886 -0.008919 -0.018063 -0.020859  0.006685   
17182  0.091355 -0.055903 -0.008473 -0.017559 -0.002191 -0.013549 -0.012348   
17183 -0.040612  0.093708 -0.039986 -0.025745 -0.032479  0.004735  0.001654   

            7         8         9    ...       990 

In [61]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.1, random_state=41)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=.1, random_state=41)

In [62]:
clf = DecisionTreeClassifier(max_depth = 30, random_state = 42)
clf.fit(X_train, Y_train)
test_pred_decision_tree = clf.predict(X_val)
score = f1_score(Y_val, test_pred_decision_tree)
print(score)

0.4654731457800511


In [63]:
for i in range(30,60,5):
    clf = RandomForestClassifier(max_depth = i )
    clf.fit(X_train, Y_train.ravel())
    random_forest = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

30 : 0.4654731457800511
35 : 0.4654731457800511
40 : 0.4654731457800511
45 : 0.4654731457800511
50 : 0.4654731457800511
55 : 0.4654731457800511


In [79]:
clf = RandomForestClassifier(max_depth = 10 )
clf.fit(X_train, Y_train.ravel())
random_forest = clf.predict(X_val)
score = f1_score(Y_val, test_pred_decision_tree)
print(str(i)+" : "+str(score))

55 : 0.4654731457800511


In [72]:
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num =50)]
# Criterion for split
criterion = ['gini','entropy']
#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 5, stop = 1000, num =400)]
# Method of selecting samples for training each tree
bootstrap = [True,False]
# Out of Bag Score
oob_score = [True,False]
# Run in parallel
n_jobs = [-1]
# Reuse the solution of the previous call to fit and add more estimators to the ensemble
warm_start = [True,False]

In [73]:
# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'criterion':criterion,
            'max_depth': max_depth,
            'bootstrap': bootstrap,
            "oob_score": oob_score,
            "n_jobs": n_jobs,
            "warm_start": warm_start }

In [75]:
clf = RandomForestClassifier()
rf_Grid = GridSearchCV(estimator = clf, param_grid = random_grid,n_jobs= -1, cv = 3, scoring = 'f1')
rf_Grid.fit(X_train,Y_train.ravel())

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


KeyboardInterrupt: 

In [None]:

grid_prediction = rf_Grid.predict(X_val)
score = f1_score(Y_val, grid_prediction)
print(score)