In [15]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from mlcore.decision_tree import CustomDecisionTreeClassifier

In [29]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy
X, y = datasets.make_classification(n_samples=10000, n_features=20, random_state=4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [30]:
from collections import Counter

class CustomRandomForest():
    def __init__(
        self,
        n_estimators=100,
        max_depth=None,
        max_features="sqrt",
        min_samples_split=2,
        min_impurity_decrease=1e-7,
        criterion="gini",
    ):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.min_samples_split = min_samples_split
        self.min_impurity_decrease = min_impurity_decrease
        self.criterion = criterion
        self.trees = []

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        n_samples, n_features = X.shape
        for _ in range(self.n_estimators):
            # bootstrapped dataset
            X_sample, y_sample = self._bootstrap_dataset(X, y)
            tree = CustomDecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_impurity_decrease=self.min_impurity_decrease,
                criterion=self.criterion,
            )
            # random sampling of feature set
            feature_sample_size = self._max_features_sample(n_features)
            selected_features = self._bootstrap_featureset(n_features, feature_sample_size)
            print(f"Selected features: {selected_features}")
            tree.fit(X_sample[:, selected_features], y_sample)
            self.trees.append(tree)

    def predict(self, X):
        # X: (m,n) -> Predictions: (m,)
        # (n_estimators, m) -> Transpose -> (m, n_estimators)
        tree_predictions = np.array([tree.predict(X) for tree in self.trees]).T
        # perform majority voting
        predictions = []
        for preds in tree_predictions:
            prediction = Counter(preds).most_common(1)[0][0]
            predictions.append(prediction)
        return np.array(predictions)

    def _bootstrap_dataset(self, X, y):
        """ Create bootstrapped sampled dataset with replacement """
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True).T
        return X[indices], y[indices]
    
    def _bootstrap_featureset(self, n_features, feature_sample_size):
        """ Create bootstrapped sampled featureset without replacement """
        return np.random.choice(n_features, size=feature_sample_size, replace=False)
    
    def _max_features_sample(self, n_features):
        assert self.max_features in ("sqrt", "log2") or isinstance(self.max_features, int), \
            "max_features must be 'sqrt', 'log2', or an integer"
        if isinstance(self.max_features, int):
            assert self.max_features <= n_features, \
            "max_features must be less than or equal to the number of features in the dataset"
            return self.max_features
        elif self.max_features == "sqrt":
            return int(np.sqrt(n_features))
        elif self.max_features == "log2":
            return int(np.log2(n_features))
        return n_features



In [31]:
custom_rf = CustomRandomForest(n_estimators=10,
                                 max_depth=5,
                                 max_features="sqrt",
                                 min_samples_split=2,
                                 min_impurity_decrease=1e-7,
                                 criterion="gini")
custom_rf.fit(X_train, y_train)
y_pred = custom_rf.predict(X_test)  

print("Custom Random Forest Accuracy:", accuracy(y_test, y_pred))

Custom Random Forest Accuracy: 0.902


In [32]:
from sklearn.ensemble import RandomForestClassifier

sklearn_rf = RandomForestClassifier(n_estimators=10,
                                    max_depth=5,
                                    max_features="sqrt",
                                    min_samples_split=2,
                                    min_impurity_decrease=1e-7,
                                    criterion="gini")
sklearn_rf.fit(X_train, y_train)
y_pred_sklearn = sklearn_rf.predict(X_test)

print("Sklearn Random Forest Accuracy:", accuracy(y_test, y_pred_sklearn))

Sklearn Random Forest Accuracy: 0.8855
