In [1]:
import numpy as np
from numpy.random import randint
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
import math as m
from scipy.stats import mode

array = np.array

In [2]:
class RandomForestClassifier:
    def __init__(self, B = 100):
        self.B = B
        self.trees = []
    def fit(self, X, y):
        for b in range(self.B):
            tree,random_subspace = train_tree_b(X,y)
            self.trees.append((tree, random_subspace))
    def predict(self, X):
        if len(X) == 1:
            predictions = predict_one(X)
        else:
            predictions = [clf.predict(X[:,subspace]) for (clf,subspace) in self.trees]
            predictions = mode(predictions).mode.flatten()
        return predictions 
    def predict_one(self, x):
        predictions = [clf.predict(x[subspace].reshape(1,-1))[0] for (clf,subspace) in self.trees]
        prediction = statistics.mode(predictions)
        return prediction

In [3]:
#Some helper functions

def train_tree_b(X,y):
    clf_b = DecisionTreeClassifier()
    X_b, y_b, random_feature_space = rand_sample_w_replacement(X,y)
    clf_b.fit(X_b,y_b)
    return clf_b, random_feature_space
    
def rand_sample_w_replacement(X,y):
    num_samples,num_features = X.shape
    random_sample_idx = random_idx_w_replacement(num_samples, num_samples)
    random_feature_space = random_subspace(num_features)
    resampledX = X[random_sample_idx][:,random_feature_space]
    resampled_y = y[random_sample_idx]
    return resampledX, resampled_y, random_feature_space

def random_subspace(num_features):
    if num_features > 10:
        num_features_per_split = m.floor(m.sqrt(num_features))
    else:
        
        num_features_per_split = m.floor(num_features*.75)
    random_feature_idx = random_idx_w_replacement(num_features, num_features_per_split)
    return random_feature_idx

def random_idx_w_replacement(num_choices, num_samples):
    sample = randint(0,num_choices,num_samples)
    return sample

In [4]:
# Can properly fit the data
data = datasets.load_iris()
X = data.data
y = data.target

myRFC = RandomForestClassifier()
myRFC.fit(X,y)
confusion_matrix(y,myRFC.predict(X))

array([[50,  0,  0],
       [ 0, 50,  0],
       [ 0,  0, 50]])

In [5]:
#Comparison of performance to sklearn RFC testing on a hold-out set

X_train, X_val, y_train, y_val = train_test_split(X,y)
myRFC = RandomForestClassifier()
myRFC.fit(X_train,y_train)

predictions = myRFC.predict(X_val)
print(confusion_matrix(y_val, predictions))

from sklearn import ensemble
skRFC = ensemble.RandomForestClassifier()
skRFC.fit(X_train, y_train)

predictions = skRFC.predict(X_val)
print(confusion_matrix(y_val, predictions))

[[16  0  0]
 [ 0 10  2]
 [ 0  0 10]]
[[16  0  0]
 [ 0 10  2]
 [ 0  0 10]]
