In [197]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import mode
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierSK
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.base import ClassifierMixin, BaseEstimator

# Implement Random Forest Classifier

In [216]:
class RandomForestClassifier(BaseEstimator, ClassifierMixin):
    """Random forest built from scikit-learn decision trees.

    Each tree is trained on a bootstrap resample (drawn with replacement, same
    size as the training set) and the forest predicts by majority vote.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees in the forest. Must be at least 1.
    criterion, max_depth, min_samples_split, min_samples_leaf, \
min_weight_fraction_leaf, max_features, max_leaf_nodes, \
min_impurity_decrease, min_impurity_split
        Forwarded to every ``DecisionTreeClassifier``. ``max_features='auto'``
        is forwarded as ``'sqrt'`` and ``min_impurity_split`` is only forwarded
        when it is not ``None`` (see ``_tree_kwargs``).
    random_state : None, int or np.random.RandomState, default=None
        Source of randomness for the bootstrap samples and the per-tree seeds.
        ``None`` keeps using NumPy's global RNG, so ``np.random.seed`` still
        controls the result.

    Attributes
    ----------
    trees : list of DecisionTreeClassifier
        The fitted trees (set by ``fit``).
    classes_ : np.ndarray
        Sorted unique labels seen in ``fit``.
    num_samples, num_features : int
        Shape of the training matrix passed to ``fit``.
    """

    # Constructor arguments that are forwarded to each tree, in the order the
    # ``tree_params`` dict exposes them.
    _TREE_PARAM_NAMES = (
        'criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf',
        'min_weight_fraction_leaf', 'max_leaf_nodes', 'min_impurity_decrease',
        'min_impurity_split', 'max_features',
    )

    def __init__(
        self, n_estimators=100, criterion='gini', max_depth=None,
        min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
        max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, random_state=None):

        # Every argument is stored under its own name: BaseEstimator's
        # get_params/set_params/clone/repr look the values up that way.
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.random_state = random_state

    @property
    def tree_params(self):
        """Dict of the tree hyper-parameters exactly as given to ``__init__``.

        Computed on access so it always reflects the current attribute values
        (e.g. after ``set_params``).
        """
        return {name: getattr(self, name) for name in self._TREE_PARAM_NAMES}

    def _tree_kwargs(self):
        """Return the keyword arguments actually passed to each tree."""
        kwargs = self.tree_params
        # Newer scikit-learn releases no longer accept this argument at all;
        # leaving it out when unset is equivalent where it still exists.
        if kwargs['min_impurity_split'] is None:
            del kwargs['min_impurity_split']
        # For classification trees 'auto' meant sqrt(n_features); newer
        # releases only accept the explicit spelling.
        if kwargs['max_features'] == 'auto':
            kwargs['max_features'] = 'sqrt'
        return kwargs

    def _get_bootstrap_indices(self, rng=None):
        """Draw ``self.num_samples`` row indices with replacement.

        Parameters
        ----------
        rng : np.random.RandomState or None
            Generator to draw from; ``None`` uses NumPy's global RNG.

        Returns
        -------
        np.ndarray of int, shape (num_samples,)
        """
        sampler = np.random if rng is None else rng
        return sampler.choice(
            self.num_samples, size=self.num_samples, replace=True)

    @staticmethod
    def _validate_data(X):
        """Return ``X`` as a 2-D numpy array.

        Raises
        ------
        ValueError
            If ``X`` is neither an ``np.ndarray`` nor a ``pd.DataFrame``, or
            is not two-dimensional.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        elif not isinstance(X, np.ndarray):
            raise ValueError('X must be an np.ndarray or pd.DataFrame')
        if X.ndim != 2:
            raise ValueError(
                'X must be 2-dimensional, got %d dimension(s)' % X.ndim)
        return X

    def fit(self, X, y):
        """Fit ``n_estimators`` trees, each on its own bootstrap sample.

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)
            Class labels.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is invalid, ``y`` has a different length than ``X``, or
            ``n_estimators`` is smaller than 1.
        """
        X = self._validate_data(X)
        # Positional indexing below must not be confused by a pandas index.
        y = np.asarray(y)
        if y.shape[:1] != (X.shape[0],):
            raise ValueError(
                'X and y must have the same number of samples')
        if self.n_estimators < 1:
            raise ValueError('n_estimators must be at least 1')

        self.num_samples, self.num_features = X.shape
        self.classes_ = np.unique(y)

        seed = self.random_state
        if seed is None or isinstance(seed, np.random.RandomState):
            rng = seed
        else:
            rng = np.random.RandomState(seed)

        tree_kwargs = self._tree_kwargs()
        max_seed = np.iinfo(np.int32).max
        trees = []
        for _ in range(self.n_estimators):
            # With an explicit generator each tree gets its own derived seed;
            # otherwise the tree falls back to the global RNG as well.
            tree_seed = None if rng is None else rng.randint(max_seed)
            tree = DecisionTreeClassifier(random_state=tree_seed, **tree_kwargs)
            indices = self._get_bootstrap_indices(rng)
            tree.fit(X[indices], y[indices])
            trees.append(tree)
        self.trees = trees
        return self

    def predict(self, X):
        """Predict the majority-vote class for every row of ``X``.

        Ties are resolved in favour of the smallest label.

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame, shape (n_samples, n_features)

        Returns
        -------
        np.ndarray, shape (n_samples,)
        """
        X = self._validate_data(X)

        n_rows = X.shape[0]
        row_ids = np.arange(n_rows)
        # votes[i, k] counts the trees that predicted classes_[k] for row i
        votes = np.zeros((n_rows, len(self.classes_)), dtype=np.intp)
        for tree in self.trees:
            # classes_ is sorted, so searchsorted maps labels to column ids
            class_ids = np.searchsorted(self.classes_, tree.predict(X))
            votes[row_ids, class_ids] += 1

        # argmax returns the first maximum, i.e. the smallest label on ties
        return self.classes_[votes.argmax(axis=1)]

# Import data and split

In [227]:
# Load the wine dataset: features as a labelled DataFrame, target as an array
wine = load_wine()

X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = wine.target

# Hold out 15% of the rows for validation, stratified on the class label.
# The fixed random_state makes the split identical on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.15, random_state=42)

# Test new implementation of Random Forest

In [228]:
# Seed NumPy's global RNG, which drives the bootstrap sampling, so that
# re-running this cell gives the same forest
np.random.seed(0)

# 'sqrt' is what 'auto' meant for classification trees; newer scikit-learn
# releases no longer accept 'auto'
rfc = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=2)
rfc.fit(X_train, y_train)
print('Train accuracy:', rfc.score(X_train, y_train))
print('Val accuracy:', rfc.score(X_val, y_val))

Train accuracy: 0.9867549668874173
Val accuracy: 0.9629629629629629


# Test sklearn implementation of Random Forest

In [230]:
# Reference model: scikit-learn's own forest with the same settings.
# 'sqrt' is what 'auto' meant for classifiers (newer releases reject 'auto');
# random_state pins the result across runs.
rfc_sk = RandomForestClassifierSK(
    n_estimators=1000, max_features='sqrt', max_depth=2, random_state=0)
rfc_sk.fit(X_train, y_train)
print('Train accuracy:', rfc_sk.score(X_train, y_train))
print('Val accuracy:', rfc_sk.score(X_val, y_val))

Train accuracy: 0.9867549668874173
Val accuracy: 0.9629629629629629
