## Important Points

* Created a synthetic dataset using the `make_moons` function in sklearn
* Applied GridSearchCV with params max_leaf_nodes and max_depth
* Used the `best_estimator_` from GridSearchCV to train 1000 separate Decision Trees, each on a random subset of 100 training points
* On average, these individual trees gave a validation accuracy of 78.6%
* Converted the above into a random forest by predicting each validation instance with all 1000 trees and taking the mode (majority vote) of their predictions; this increased validation accuracy to 86.28%

|   Model       |  Validation |
|---------------|-------------|
| Decision Tree |   78.64%    |
| Random Forest |   86.28%    |

In [8]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so code that draws from it is repeatable
np.random.seed(42)
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [9]:
# Build a noisy two-class "moons" dataset and hold out 25% of it for validation
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# random_state expects an integer seed. The original passed the boolean True,
# which is coerced to the integer 1, so the seed is spelled out as 1 here to
# keep generating the same data while making the intent explicit.
x, y = make_moons(n_samples=10000, noise=0.4, random_state=1)
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.25, random_state=42)

In [10]:
# Sanity-check the split: feature and label shapes for the training and validation partitions
for features, labels in ((x_train, y_train), (x_valid, y_valid)):
    print(features.shape, labels.shape)

(7500, 2) (7500,)
(2500, 2) (2500,)


In [11]:
# Distinct class labels present in the training targets
np.unique(y_train)

array([0, 1])

In [12]:
from sklearn.metrics import accuracy_score

def score(y, y_pred, train=False):
    """Print the accuracy of ``y_pred`` against the true labels ``y``.

    Parameters
    ----------
    y : true labels, passed to ``accuracy_score`` as the reference.
    y_pred : predicted labels, compared against ``y``.
    train : when True the line is labelled as training accuracy,
        otherwise as validation accuracy.

    Returns
    -------
    None; the accuracy is only printed.
    """
    label = "Training accuracy: " if train else "Validation accuracy: "
    print(label, accuracy_score(y, y_pred))

In [13]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyperparameters.
# NOTE(review): `dt` is not referenced by any later cell visible here; confirm whether it is still needed.
dt =  DecisionTreeClassifier()

In [14]:
# Training best model for later tasks
from sklearn.model_selection import GridSearchCV

params = {"max_leaf_nodes": list(range(2, 50)), "max_depth": list(range(8, 12))}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), verbose=1, param_grid=params, cv=3)
%time
grid_search.fit(x_train, y_train)

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 15 µs
Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 576 out of 576 | elapsed:    6.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [8, 9, 10, 11],
                         '

In [16]:
from sklearn.model_selection import ShuffleSplit

subsets = 1000
n_instances = 100

# Draw `subsets` random training subsets of `n_instances` rows each.
# Only the train side of every split is kept; the complementary indices are ignored.
rs = ShuffleSplit(n_splits=subsets, train_size=n_instances, random_state=42)
mini_sets = [
    (x_train[picked], y_train[picked])
    for picked, _ in rs.split(x_train)
]

In [21]:
# Confirm how many subsets were built and the shape of the first subset's feature matrix
print(len(mini_sets), mini_sets[0][0].shape, sep="\n")

1000
(100, 2)


In [24]:
# Fit one copy of the tuned tree on each mini training set, score every copy
# on the validation set, and report the mean single-tree accuracy.
# Taken from solution as more logical way
from sklearn.base import clone

# One unfitted clone of the best grid-search estimator per subset
forest = [clone(grid_search.best_estimator_) for _ in range(subsets)]

# fit() returns the fitted estimator, so fitting and predicting can be chained
accuracy_ = [
    accuracy_score(y_valid, tree.fit(features, labels).predict(x_valid))
    for tree, (features, labels) in zip(forest, mini_sets)
]

np.mean(accuracy_)

0.7864703999999999

In [27]:
# Every tree in the forest was already fitted above, so each one can be used
# directly to predict the validation set. Rows are trees, columns are instances.
y_pred = np.asarray([tree.predict(x_valid) for tree in forest])

In [29]:
# Each row holds one tree's 2500 predictions; one prediction per column is needed, i.e. shape (1, 2500)
y_pred.shape

(1000, 2500)

In [30]:
from scipy.stats import mode

# Majority vote per validation instance: the most frequent label down each
# column of y_pred, together with how many trees voted for it.
# NOTE(review): the (1, 2500) shape recorded below matches older SciPy; newer
# releases appear to drop the reduced axis by default. Confirm against the installed version.
vote = mode(y_pred, axis=0)
y_pred_mode, votes = vote.mode, vote.count

In [32]:
# We got the required result
y_pred_mode.shape

(1, 2500)

In [37]:
# Majority voting over the trees behaves like a random forest; its validation accuracy
# is clearly higher than the mean accuracy of the individual decision trees.
# reshape(-1) flattens the voted labels to one dimension before scoring.
score(y_valid, y_pred_mode.reshape(-1))

Validation accuracy:  0.8628
