# Random Forest
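A random forest trains a collection of decision trees, each on a bootstrap sample of the training data, and predicts for every input the label chosen by the majority of the trees.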
<img src="pics/rf.pic1.png" width="700">

In [16]:
import numpy as np
from Tree import Tree
from collections import Counter

class Forest:
    """Random forest classifier built from bootstrap-trained ``Tree`` models.

    Each tree is fitted on a bootstrap sample (rows drawn with replacement)
    of the training data; predictions are the majority vote of all trees.

    Parameters
    ----------
    n_tree : int
        Number of trees to train in ``fit``.
    min_split : int
        Passed to every ``Tree`` as its first positional argument.
    max_depth : int
        Passed to every ``Tree`` as its second positional argument.
    n_feature : int or None
        Passed to every ``Tree`` as its third positional argument.
    """

    def __init__(self, n_tree=20, min_split=2, max_depth=50, n_feature=None):
        self.n_tree = n_tree
        # Honour the caller's value; it was previously hard-coded to 2.
        self.min_split = min_split
        self.max_depth = max_depth
        self.n_feature = n_feature
        # Fitted trees; populated by fit().
        self.jungle = []

    def bootstrap_sampling(self, X, y, portion=1):
        """Draw a bootstrap sample of the rows of ``X`` and matching ``y``.

        Row indices are sampled uniformly *with replacement* using NumPy's
        global random state, so the same row may appear several times.

        Parameters
        ----------
        X : numpy.ndarray
            2-D array; rows are samples.
        y : numpy.ndarray
            Labels indexed the same way as the rows of ``X``.
        portion : float
            Sample size as a fraction of the number of rows
            (truncated to an integer).

        Returns
        -------
        tuple
            ``(X_sample, y_sample)`` selected by the same drawn indices.
        """
        n_sample, _ = X.shape
        sub_sample_idx = np.random.choice(
            n_sample, size=int(portion * n_sample), replace=True
        )
        return X[sub_sample_idx], y[sub_sample_idx]

    def fit(self, X, y):
        """Train ``n_tree`` trees, each on its own bootstrap sample.

        Any previously fitted trees are discarded, so calling ``fit`` again
        retrains the forest instead of growing it past ``n_tree``.
        """
        # Build into a local list so a failure mid-training does not leave
        # a partially replaced forest behind.
        jungle = []
        for _ in range(self.n_tree):
            tree = Tree(self.min_split, self.max_depth, self.n_feature)
            X_sample, y_sample = self.bootstrap_sampling(X, y)
            tree.fit(X_sample, y_sample)
            jungle.append(tree)
        self.jungle = jungle

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Each tree votes via ``tree.traverse(x, tree.root)``. When several
        labels tie, ``Counter.most_common`` keeps the one seen first, i.e.
        the vote of the earliest tree in ``jungle``.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X``.
        """
        y_pred = []
        for x in X:
            votes = Counter(tree.traverse(x, tree.root) for tree in self.jungle)
            y_pred.append(votes.most_common(1)[0][0])
        return np.array(y_pred)

    def accuracy(self, X, y):
        """Return the fraction of correct predictions on ``(X, y)``.

        The score is returned as a string formatted to three decimals
        (e.g. ``'0.947'``), not as a float.
        """
        y_hat = self.predict(X)
        score = np.sum(y_hat == y) / len(y)
        return f'{score:.3f}'
    


In [19]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Demo: compare a single decision tree with a small random forest on
# scikit-learn's breast-cancer dataset.
dataset = datasets.load_breast_cancer()
X = dataset.data
y = dataset.target
# Hold out half of the data for testing; random_state fixes the split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1234)


# Forest of 3 trees, each limited to depth 10.
# NOTE: Forest.bootstrap_sampling draws from NumPy's unseeded global RNG, so
# the forest's predictions and score can differ from run to run.
forest = Forest(n_tree=3,max_depth=10)
forest.fit(X_train, y_train)

# Baseline: one tree with the same depth limit, trained on the full training set.
tree = Tree(max_depth=10)
tree.fit(X_train, y_train)

# Print the true test labels, then each model's predictions and accuracy.
print(f'y_test: \n\t{y_test}')
print(f'y_hat (Tree): \n\t{tree.predict(X_test)}')
print(f'Accuracy Score of Tree: {tree.accuracy(X_test, y_test)}')
print(f'y_hat (Forest): \n\t{forest.predict(X_test)}')
print(f'Accuracy Score of Forest: {forest.accuracy(X_test, y_test)}')

















y_test: 
	[1 1 1 1 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 0 0 0 0 1 0 1 1 1 1 1 0 1 1 1 1
 0 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 0 1 0
 1 1 1 0 1 0 1 1 1 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0
 1 0 0 1 1 1 0 1 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 1 0 1 0 1
 0 0 1 1 0 1 1 0 1 0 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 1 0 1 1 1 0 1
 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1
 0 0 1 0 1 0 1 1 1 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 0
 1 0 1 1 0 0 1 1 1 0 1 1 1 1 0 1 1 0 0 0 1 1 0 1 1 1]
y_hat (Tree): 
	[1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1 1 0 1 0 0 0 0 1 1 1 1 1 1 1 0 1 1 0 1
 0 1 1 0 1 0 1 1 1 1 0 0 1 0 1 1 0 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 1 1
 1 1 1 0 1 0 1 1 1 0 0 0 0 0 1 0 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0
 1 1 0 1 1 1 0 1 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 0 0 0 0 1 0 1
 0 0 1 1 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 0 1 1 1 0 1
 0 1 1 1 0 0 1 1 0 0