# Decision Tree Machine Learning Model

### Read in csv file

In [1]:
import csv
import numpy as np
import pandas as pd

In [2]:
filename = '../Training and Testing sets/train_tfidf_features.csv'
train_features = pd.read_csv (filename, header=0)

print(train_features.shape)
print(train_features.head(10))

(17184, 5002)


In [4]:
train_features.set_index('id', inplace=True, drop=True)
print(train_features.head(10))

    label    0    1    2    3    4    5    6    7    8  ...  4990  4991  4992  \
id                                                      ...                     
1       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
5       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
6       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
7       0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
8       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
9       1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
10      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

    4993  4994  4995  4996 

### Train-Test split

In [64]:
from df_helper.reduction_helper import PCA_reduce

# Generate a 100 feature feature set using PCA reduction
# print(train_features.iloc[:, 1:].head(10))
X = PCA_reduce(train_features.iloc[:, 1:],50)
# print(X.head(10))

# X = train_features.iloc[:, 10:20].values
Y = train_features.iloc[:, 0].values.reshape(-1,1)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.1, random_state=41)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=.1, random_state=41)

# X_train is to train data
# X_val is for validation of data
# X_test is for testing model

## Fit the model
Train a simple model with max_depth 3

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score


clf = DecisionTreeClassifier(max_depth =3, random_state = 42)
clf.fit(X_train, Y_train)
test_pred_decision_tree = clf.predict(X_val)
score = f1_score(Y_val, test_pred_decision_tree)
print(score)


0.16338880484114976


As shown above, the f1 score of the decision tree with max depth is extremely low. However, we can try and tweak the max_depth and see if there is a difference is the values.

In [42]:
from sklearn.metrics import f1_score

for i in range(1,6):
    clf = DecisionTreeClassifier(max_depth =i, random_state = 1)
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.0
2 : 0.17691154422788608
3 : 0.16338880484114976
4 : 0.5133967156439068
5 : 0.4769539078156313


We observe that the optimal depth seems to be at 4 as any further increase in depth will decrease the f1 score on the validation test set.

In [68]:
for i in range(1,20):
    clf = DecisionTreeClassifier(max_depth =i, splitter= "random")
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

for i in range(40,47):
    clf = DecisionTreeClassifier(max_depth =i, splitter= "random")
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.0
2 : 0.13988095238095238
3 : 0.07085346215780998
4 : 0.3086574654956085
5 : 0.2629482071713148
6 : 0.2540983606557377
7 : 0.3704572098475968
8 : 0.37142857142857144
9 : 0.4013377926421405
10 : 0.46963562753036436
11 : 0.37606837606837606
12 : 0.40958605664488024
13 : 0.44263959390862945
14 : 0.46324269889224573
15 : 0.45808966861598444
16 : 0.44398766700924974
17 : 0.48880597014925375
18 : 0.4359233097880928
19 : 0.4579439252336449
40 : 0.4898989898989899
41 : 0.4547008547008547
42 : 0.47939444911690493
43 : 0.47826086956521746
44 : 0.451114922813036
45 : 0.45896147403685095
46 : 0.48315529991783074


Alternatively, we used a best random split method instead. We see that the f1 score performance seem to stagnate at around 40-50%. Note that the number of splits were deliberately increased as random split might require more splits.

In [58]:
from sklearn.metrics import f1_score

for i in range(1,10):
    clf = DecisionTreeClassifier(max_depth =i,criterion="entropy", random_state = 1)
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print("Entropy "+ str(i)+" : "+str(score))

for i in range(1,10):
    clf = DecisionTreeClassifier(max_depth =i,criterion="log_loss", random_state = 1)
    clf.fit(X_train, Y_train)
    test_pred_decision_tree = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print("log_loss "+ str(i)+" : "+str(score))

Entropy 1 : 0.0
Entropy 2 : 0.17691154422788608
Entropy 3 : 0.16338880484114976
Entropy 4 : 0.5116279069767442
Entropy 5 : 0.4654471544715448
Entropy 6 : 0.4782608695652174
Entropy 7 : 0.5199610516066212
Entropy 8 : 0.4799176107106075
Entropy 9 : 0.49478672985781985
log_loss 1 : 0.0
log_loss 2 : 0.17691154422788608
log_loss 3 : 0.16338880484114976
log_loss 4 : 0.5116279069767442
log_loss 5 : 0.4654471544715448
log_loss 6 : 0.4782608695652174
log_loss 7 : 0.5199610516066212
log_loss 8 : 0.4799176107106075
log_loss 9 : 0.49478672985781985


Using the entropy and log_loss also produced similar results as the default gini criterion.

Hence, we can conclude that neither the criterion nor the number of splits were able to increase the accuracy of the model significantly.

# Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, Y_train.ravel())
random_forest = clf.predict(X_val)

score = f1_score(Y_val, random_forest)
print(str(score))

0.08455284552845528


Changing the max_depth of each tree in the random forest. 

In [97]:
for i in range(1,5):
    clf = RandomForestClassifier(max_depth =i)
    clf.fit(X_train, Y_train.ravel())
    random_forest = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.48315529991783074
2 : 0.48315529991783074
3 : 0.48315529991783074
4 : 0.48315529991783074


As seen, increasing the max_depth doesnot affect the accuracy. Thus, we conclude the max_depth of the tree in the forest is only 1. However, we may suggest changing the number of trees in said forest

In [102]:
for i in range(1,5):
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, Y_train.ravel())
    random_forest = clf.predict(X_val)
    score = f1_score(Y_val, test_pred_decision_tree)
    print(str(i)+" : "+str(score))

1 : 0.48315529991783074
2 : 0.48315529991783074
3 : 0.48315529991783074
4 : 0.48315529991783074


In [117]:
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 50, num =5)]
# Number of features to consider at every split
max_features = ['auto','sqrt']
#Maximum number of levels in tree
max_depth = [2,4]
# Method of selecting samples for training each tree
bootstrap = [True,False]

In [118]:
# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth': max_depth,
            'bootstrap': bootstrap }

In [119]:
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(n_estimators=i)

rf_Grid = GridSearchCV(estimator = clf, param_grid = random_grid, cv=3, verbose=2, n_jobs = 4)
rf_Grid.fit(X_train,Y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=10; total time=   0.1s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=20; total time=   0.3s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=20; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=20; total time=   0.3s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=30; total time=   0.4s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=30; total time=   0.4s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=30; total time=   0.3s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=40; total time=   0.5s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=50; total time=   0.5s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=40; total time=   0.4s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=50; total time=   0.5s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=40; total time=   0.4s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=10; total time=   0.1s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=20; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=20; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=auto, n_estimators=50; total time=   0.5s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=20; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=30; total time=   0.4s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=30; total time=   0.4s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=40; total time=   0.5s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=30; total time=   0.4s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=40; total time=   0.5s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=10; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=50; total time=   0.6s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=40; total time=   0.4s
[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=10; total time=   0.2s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=10; total time=   0.2s
[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=50; total time=   0.5s
[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=20; total time=   0.3s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=2, max_features=sqrt, n_estimators=50; total time=   0.5s
[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=20; total time=   0.4s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=20; total time=   0.4s
[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=30; total time=   0.6s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=30; total time=   0.6s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=40; total time=   0.7s
[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=30; total time=   0.6s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=50; total time=   1.0s
[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=40; total time=   0.8s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=10; total time=   0.2s
[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=40; total time=   0.8s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=10; total time=   0.3s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=50; total time=   1.2s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=20; total time=   0.8s
[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=10; total time=   0.3s
[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=20; total time=   0.7s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=auto, n_estimators=50; total time=   1.4s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=20; total time=   0.5s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=30; total time=   0.8s
[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=30; total time=   0.7s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=40; total time=   0.9s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=40; total time=   0.9s
[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=30; total time=   0.6s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=10; total time=   0.2s
[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=10; total time=   0.2s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=50; total time=   1.1s
[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=40; total time=   0.9s
[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=10; total time=   0.2s
[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=20; total time=   0.4s
[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=20; total time=   0.4s
[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=50; total time=   1.1s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=20; total time=   0.4s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=30; total time=   0.6s
[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=50; total time=   1.1s
[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=30; total time=   0.6s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=30; total time=   0.6s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=40; total time=   0.7s
[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=40; total time=   0.7s
[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=40; total time=   0.7s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=10; total time=   0.2s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=10; total time=   0.2s
[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=50; total time=   0.9s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=10; total time=   0.2s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=50; total time=   0.9s
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=20; total time=   0.4s
[CV] END bootstrap=False, max_depth=2, max_features=auto, n_estimators=50; total time=   0.9s
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=20; total time=   0.4s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=20; total time=   0.4s
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=30; total time=   0.6s
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=30; total time=   0.6s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=30; total time=   0.6s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=40; total time=   0.6s
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=40; total time=   0.6s
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=40; total time=   0.6s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=50; total time=   0.7s
[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=10; total time=   0.3s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=10; total time=   0.3s
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=50; total time=   0.7s
[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=10; total time=   0.3s
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, n_estimators=50; total time=   0.7s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=20; total time=   0.5s
[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=20; total time=   0.5s
[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=20; total time=   0.5s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=30; total time=   0.7s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=30; total time=   0.8s
[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=30; total time=   0.8s


  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=40; total time=   1.0s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=40; total time=   1.0s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=40; total time=   1.0s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=50; total time=   1.3s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=10; total time=   0.3s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=10; total time=   0.3s
[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=50; total time=   1.3s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=10; total time=   0.3s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=4, max_features=auto, n_estimators=50; total time=   1.3s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=20; total time=   0.5s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=20; total time=   0.5s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=20; total time=   0.5s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=30; total time=   0.8s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=30; total time=   0.8s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=30; total time=   0.8s


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=40; total time=   1.0s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=40; total time=   1.1s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=40; total time=   1.1s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=50; total time=   1.3s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=50; total time=   1.3s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, n_estimators=50; total time=   1.2s


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


In [120]:
grid_prediction = rf_Grid.predict(X_val)
score = f1_score(Y_val, grid_prediction)
print(score)

0.2786885245901639


# Ada Boost

In [63]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

clf = AdaBoostClassifier(n_estimators=50, random_state=0)
clf.fit(X_train, Y_train.ravel())
ada_boost_pred = clf.predict(X_val)
score = f1_score(Y_val, ada_boost_pred)
print(score)

0.501953125


In [124]:
for i in range(1,10):
    clf = AdaBoostClassifier(n_estimators=i, random_state=0)
    clf.fit(X_train, Y_train.ravel())
    ada_boost_pred = clf.predict(X_val)
    score = f1_score(Y_val, ada_boost_pred)
    print(str(i)+" : "+str(score))
    

for i in range(30,35):
    clf = AdaBoostClassifier(n_estimators=i, random_state=0)
    clf.fit(X_train, Y_train.ravel())
    ada_boost_pred = clf.predict(X_val)
    score = f1_score(Y_val, ada_boost_pred)
    print(str(i)+" : "+str(score))

1 : 0.0
2 : 0.2894736842105263
3 : 0.3800813008130081
4 : 0.37091675447839834
5 : 0.3433179723502304
6 : 0.30900243309002434
7 : 0.39868565169769987
8 : 0.40874316939890715
9 : 0.4425711275026344
30 : 0.5040485829959515
31 : 0.4989939637826961
32 : 0.49798387096774194
33 : 0.4939024390243902
34 : 0.4908350305498982
