In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sporgboost.preprocessing import onehot_encode, shuffle
from sporgboost.trees import AxisAlignedDecisionTree, SparseRandomDecisionTree as SPODT
from sklearn.tree import DecisionTreeClassifier
sktree = DecisionTreeClassifier()
model = SPODT(d = 2, s = 3)

In [2]:
# Dataset for testing
X, y = load_iris(return_X_y = True)

# Set seed to be able to reproduce
np.random.seed(1234)
    
# Preprocessing
y = onehot_encode(y)
X, y = shuffle(X, y)

In [13]:
import numpy as np
from sporgboost.common import best_split, gini_impurity
from sporgboost.common._predict import _predict
from sporgboost.utils import row_mean
# from numba import njit
from numba.types import uint32, float64, deferred_type, optional
from numba.experimental import jitclass

# np.random.choice is not supported in numba
def sparse_random(X, d, s):
    p = X.shape[1]
    out =  np.random.choice(
        [-1., 0., 1.],
        size=(p, d),
        p=[1 / (2 * s),
        1 - (1 / s),
        1 / (2 * s)
        ]
    )

    # If any choices are all zeroes, redraw
    while np.any(out.sum(axis=0) == 0.):
        out = sparse_random(X, d, s)

    return out

# ////////////////////////////////////////////////////////////////
# ////////////////////////////////////////////////////////////////
# ////////////////////////////////////////////////////////////////
node_type = deferred_type()

class SparseRandomDecisionTree():
    def __init__(self, d, s):
        self.tree = None
        self.n_classes = None
        self.d = d
        self.s = s
        
    def fit(self, X, y):
        self.tree = None
        self.n_classes = None
        self.tree = _grow_tree_sr(X, y, self.d, self.s)
        self.n_classes = y.shape[1]

    def predict(self, X):
        return _predict(self.tree, X)

# Node needs to be explicitly included in each tree type for numba
# to properly compile
@jitclass([
    ('value', optional(float64[:,:])),
    ('left', optional(node_type)),
    ('right', optional(node_type)),
    ('proj', optional(float64[:,:])),
    ('split', optional(float64)),
    ('n_classes', uint32)
])
class Node():
    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.value = None
        self.left = None
        self.right = None
        self.proj = None
        self.split = None

    def is_leaf(self):
        return self.value is not None
        
node_type.define(Node.class_type.instance_type)

def _grow_tree_sr(X, y, d, s):
    # Identity projection
    A = sparse_random(X, d, s)
    X_ = X @ A

    col, split = best_split(X_, y)
    A_ = np.ascontiguousarray(A[:, col]).reshape((-1, 1))
    out = Node(y.shape[1])

    out.proj = A_
    out.split = split
    le = (X_[:, col] <= split)

    # Compute new split predictions
    pred_left = row_mean(y[le, :]).reshape((1, -1))
    pred_right = row_mean(y[~le, :]).reshape((1, -1))

    if gini_impurity(pred_left) == 0:
        # Return leaf value
        left = Node(out.n_classes)
        left.value = pred_left
        out.left = left
    else:
        # Grow another decision stump
        out.left = _grow_tree_sr(X[le, :], y[le, :], d, s)

    if gini_impurity(pred_right) == 0:
        # Return leaf value
        right = Node(out.n_classes)
        right.value = pred_right
        out.right = right
    else:
        out.right = _grow_tree_sr(X[~le, :], y[~le, :], d, s)
    
    return(out)

model = SparseRandomDecisionTree(d = 2, s = 3)
model.fit(X, y)

In [11]:
%%time
sktree.fit(X, y)

CPU times: total: 0 ns
Wall time: 1 ms


DecisionTreeClassifier()

In [14]:
%timeit model.fit(X, y)

SystemError: CPUDispatcher(<function best_split at 0x000002496BB9E3A0>) returned a result with an error set

In [16]:
%timeit sktree.fit(X, y)

307 µs ± 5.26 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [15]:
%timeit model.predict(X)

  X_ = np.dot(X, tree.proj)


The slowest run took 4.33 times longer than the fastest. This could mean that an intermediate result is being cached.
37.7 µs ± 26.1 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%timeit sktree.predict(X)

71.3 µs ± 851 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [17]:
np.all(model.predict(X) == sktree.predict(X))

True