# Fit a Decision Tree and then make that Tree into a Forest

In [1]:
from sklearn.datasets import make_moons
# Generate the noisy two-moons dataset. The fixed seed makes the sample (and
# therefore every number computed from it below) reproducible on a fresh
# kernel; without it each run draws different data.
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for the final evaluation. The fixed seed keeps
# the train/test partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Two ways of limiting tree size are searched jointly: the number of leaves
# and the depth. Both are tried with the same candidate values.
param_grid = dict(
    max_leaf_nodes=[2, 3, 4, 5],
    max_depth=[2, 3, 4, 5],
)
tree = DecisionTreeClassifier()
grid = GridSearchCV(estimator=tree, param_grid=param_grid)

In [4]:
# Run the hyperparameter search over param_grid using only the training split.
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_leaf_nodes': [2, 3, 4, 5], 'max_depth': [2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [5]:
# Display the estimator configured with the winning hyperparameter combination.
grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=4, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [6]:
# Score the search's best estimator on the same data it was fitted on.
# NOTE(review): this is a training-set score, not a held-out estimate;
# X_test / y_test are not used here.
grid.score(X_train, y_train)

0.8572857142857143

In [7]:
from sklearn.model_selection import ShuffleSplit
# Draw 1000 random subsets of 100 training samples each; every subset will
# later train one tree of the forest. The seed makes the subsets reproducible.
ss = ShuffleSplit(n_splits=1000, train_size=100, random_state=42)
mini_sets = []
# Only the "train" side of each split is kept; the complementary indices
# (test_index) are not needed.
for train_index, test_index in ss.split(X_train):
    mini_X_train, mini_y_train = X_train[train_index], y_train[train_index]
    mini_sets.append((mini_X_train, mini_y_train))



In [8]:
from sklearn.base import clone
import numpy as np
# One unfitted copy of the tuned tree per mini training set.
forest = [clone(grid.best_estimator_) for _ in mini_sets]

# Fit each copy on its own subset and record its score on that same subset
# (i.e. a training score, not a held-out one), then average over all trees.
scores = []
for tree, (mini_x, mini_y) in zip(forest, mini_sets):
    scores.append(tree.fit(mini_x, mini_y).score(mini_x, mini_y))
np.mean(scores)

0.8828

In [23]:
from scipy.stats import mode
import numpy as np

# Collect every tree's predictions for the test set, one row per tree.
predictions = [tree.predict(X_test) for tree in forest]

# Majority vote across trees for each test sample; on a tie `mode` returns the
# smallest of the tied labels.
majority_vote, counts = mode(np.asarray(predictions), axis=0)

# Older SciPy keeps the reduced axis and returns shape (1, n_samples), while
# SciPy >= 1.11 drops it and returns (n_samples,), which would turn
# `majority_vote[0]` into a scalar. Normalise to 2-D so `majority_vote[0]` is
# always the vector of per-sample votes regardless of the SciPy version.
majority_vote = np.atleast_2d(majority_vote)
counts = np.atleast_2d(counts)
print(len(majority_vote[0]))

3000


In [24]:
from sklearn.metrics import accuracy_score
# accuracy_score is documented as (y_true, y_pred), so the ground truth goes
# first. Flattening the vote array makes this work whether `mode` returned
# shape (1, n_samples) or (n_samples,).
accuracy_score(y_test, majority_vote.ravel())

0.85