In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np 
from sklearn.metrics import accuracy_score

In [27]:
## Synthetic "moons" data set: 10,000 noisy samples
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

## Hold out 20% of the samples as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Hyper-parameter grid: 98 leaf-node caps x 3 split thresholds = 294 candidates
grid_search_params = {
    'max_leaf_nodes': list(range(2, 100)),  ## starts at 2; 0 is not a valid value
    'min_samples_split': [2, 3, 4],
}

## Exhaustive search over the grid with 3-fold cross-validation on the training split
grid_search_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=grid_search_params,
    cv=3,
    verbose=1,
)
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 882 out of 882 | elapsed:    6.5s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16

In [28]:
## Display the estimator the grid search selected as best
grid_search_cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=17,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [31]:
## Baseline: accuracy of the grid-searched tree on the held-out test set
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8695

In [33]:
from sklearn.model_selection import ShuffleSplit
from sklearn.base import clone
from scipy.stats import mode

## a. Generate 1000 subsets each of size 100 (randomly selected instances)
## b. Train one DT on each training set w/ best params above 
## c. For each test set instance, keep the most frequent prediction across all trees (majority vote)
## d. Evaluate predictions

In [34]:
## a. Sampling plan: 1000 random subsets of 100 training instances each
subsets, train_set_size = 1000, 100

## Holding out all but train_set_size instances on every split leaves
## exactly train_set_size instances on the "train" side of each split.
rs = ShuffleSplit(
    n_splits=subsets,
    random_state=42,
    test_size=len(X_train) - train_set_size,
)

In [35]:
## Echo the splitter to confirm its configuration
rs

ShuffleSplit(n_splits=1000, random_state=42, test_size=7900, train_size=None)

In [44]:
## Materialise each small training subset as an (X, y) pair; the held-out
## indices produced by the splitter are not needed here.
mini_sets = [
    (X_train[subset_idx], y_train[subset_idx])
    for subset_idx, _ in rs.split(X_train)
]

In [45]:
## b. Build the forest: one independent copy of the tuned tree per subset
forest = [clone(grid_search_cv.best_estimator_) for _ in range(subsets)]

## Fit every tree on its own subset and record its individual test accuracy
accuracy_scores = []
for tree, (mini_X_train, mini_y_train) in zip(forest, mini_sets):
    y_pred = tree.fit(mini_X_train, mini_y_train).predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

## Average accuracy of a single small tree
np.mean(accuracy_scores)

0.8054499999999999

In [49]:
## c. Majority vote: for every test instance keep the label most trees predict

## The buffer below is allocated uninitialised, so every row must be filled by
## a tree; fail loudly if the forest and the subset count ever disagree.
assert len(forest) == subsets, "forest size does not match `subsets`"

## One row of predictions per tree, one column per test instance
Y_pred = np.empty([subsets, len(X_test)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

## Column-wise mode = most frequent label per test instance; n_votes is the
## count of trees that voted for that label.
## NOTE: depending on the SciPy version the results either keep a leading axis
## of length 1 or are already 1-D, so flatten them before scoring.
y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

In [55]:
## d. Score the flattened majority-vote predictions against the test labels
accuracy_score(y_test, y_pred_majority_votes.ravel())

0.872