In [1]:
from sporgboost.forests import *
from sklearn.metrics import roc_auc_score
from get_data import load
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_iris
from sporgboost.preprocessing import onehot_encode
# X, y = load("data/sparse_parity.parquet")
X, y = load_iris(return_X_y=True)
y = onehot_encode(y)


In [21]:
# from sporgboost.trees import AxisAlignedDecisionTree

aa = AxisAlignedDecisionTree()
D = np.full(fill_value=1/X.shape[0], shape=(X.shape[0]))
aa.fit(X, y)

In [9]:
from sporgboost.common._split import best_split
t = best_split(X, y, D[:, np.newaxis])
t

(3, 0.8)

In [102]:
from numba.experimental import jitclass
from numba.types import uint32, int64, DictType
from sporgboost.trees import *
from sporgboost._forest_base import _predict_forest, _predict_proba_forest, _ada_alpha, _ada_eta, _ada_misclassified, _ada_weight_update
import numpy as np
from sporgboost._arrays import choice_replacement_weighted

class AdaBoost():
    def __init__(self, n_trees = 3, max_depth = 1, seed = 1234):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.seed = seed
        self.alpha = np.empty(shape=(self.n_trees))

    def fit(self, X, y):
        self.n_classes = y.shape[1]

        # Initalize trees
        forest = {}

        # Boosted trees must be fit sequentially
        # Give all samples equal weight initially
        D = np.full(shape=(X.shape[0]), fill_value=1/X.shape[0])

        for idx_forest in range(self.n_trees):
            invalid_tree = True
            while invalid_tree:
                # Draw a sample
                idx = np.random.choice(np.arange(X.shape[0]), size=X.shape[0], replace=True, p=D)
                X_, y_ = X[idx, :], y[idx, : ]
                # X_, y_ = choice_replacement_weighted(X, y, D)

                # Init and train a tree
                forest[idx_forest] = AxisAlignedDecisionTree(self.max_depth)
                forest[idx_forest].fit(X_, y_)

                # Update weights based on forest errors
                y_pred = forest[idx_forest].predict(X)

                # Perform a weight update
                miss = _ada_misclassified(y, y_pred)
                eta = _ada_eta(miss, D)

                # Discard rules
                # https://github.com/scikit-learn/scikit-learn/blob/37ac6788c95
                # 04ee409b75e5e24ff7d86c90c2ffb/sklearn/ensemble/_weight_
                # boosting.py#L637
                if (eta <= 0.) or (eta >= 1. - (1.0 / y.shape[1])):
                    # Tree is worse than random, or overfit, redraw
                    continue
                
                # Tree is valid, we can update weights and break the loop
                invalid_tree = False

                # Save new weights and the alpha value for prediction
                self.alpha[idx_forest] = _ada_alpha(eta)
                D = _ada_weight_update(y, y_pred, D, eta, miss)
                print(D[:5])
                print(D.sum())

        self.forest = forest

    def predict(self, X):
        return _predict_forest(X, self.forest, self.n_classes, weights=self.alpha)

    def predict_proba(self, X):
        return _predict_proba_forest(X, self.forest, self.n_classes, weights=self.alpha)

ada = AdaBoost()
ada.fit(X, y)
ada.predict(X).sum(axis=0)

[0.01 0.01 0.01 0.01 0.01]
1.0000000000000002
[0.01 0.01 0.01 0.01 0.01]
0.9999999999999999
[0.01470588 0.01470588 0.01470588 0.01470588 0.01470588]
1.0


array([ 50.,   0., 100.])

In [97]:
from sklearn.ensemble import AdaBoostClassifier

ada_sk = AdaBoostClassifier(algorithm='SAMME', n_estimators=3)
ada_sk.fit(X, y)
np.sum(ada_sk.predict(X) == y) / 150

0.96

144

In [74]:
np.array([tree.tree_proj[1] for tree in ada.forest.values()]).sum(axis=0)

array([[  0.],
       [  0.],
       [  4.],
       [996.]])

In [58]:
ada.forest[0].tree_proj

{1: array([[0.],
        [0.],
        [0.],
        [1.]])}

In [54]:
ada.forest[1].tree_split

{1: 0.5}

In [53]:
ada.forest[2].tree_split

{1: 0.5}

In [57]:
ada.forest[3].tree_proj

{1: array([[0.],
        [0.],
        [0.],
        [1.]])}

In [31]:
ada.predict_proba(X).sum(axis=0)

array([77.90623028,  6.46907305, 65.62469667])

In [18]:
ada.predict(X).sum(axis=0)

array([23., 91., 36.])

In [19]:
np.all(ada.predict(X) == y, axis=1).sum()

30

In [2]:
%%time
ada.fit(X, y)

KeyboardInterrupt: 

In [2]:
from sporgboost.trees import RotationalDecisionTree
rot = RotationalDecisionTree(K=2)
rot.fit(X, y)

In [15]:
%%time
from sporgboost.forests import *
rf = RandomForest()
rf.fit(X, y)

CPU times: total: 1min
Wall time: 5.09 s


In [9]:
sporf = SPORF(d=2, s=3)
sporf.fit(X, y)

In [5]:
rrf = RotationalForest(K=2)
rrf.fit(X, y)

In [19]:
# models = {
#     # 'rf' : RandomForest(),
#     # 'ab' : AdaBoost(),
#     'sporf' : SPORF(d=2, s=3),
#     'sporgboost' : SPORGBoost(d=2, s=3, max_depth=2),
#     # 'rrf' : RotationalRandomForest(K=2),
#     # 'rotboost' : RotBoost(K=2)
# }

# for k, m in models.items():
#     print(f"Fitting {k}")
#     m.fit(X_train, y_train)


Fitting sporf
Fitting sporgboost


In [6]:
roc_auc_score(y, rrf.predict(X), multi_class='ovo')

0.6357403228749623

In [10]:
roc_auc_score(y, sporf.predict_proba(X), multi_class='ovo')

0.976297372398488

In [8]:
roc_auc_score(y, rf.predict_proba(X), multi_class='ovo')

0.5819648490844376

In [24]:
from sklearn.metrics import accuracy_score
# accuracy_score(y_test, rf.predict(X_test))

In [26]:
# Get AUC scores
auc = {key : accuracy_score(y_test, model.predict(X_test)) for key, model in models.items() if key in {'sporf','sporgboost'}}
auc

{'sporf': 0.7630681818181818, 'sporgboost': 0.6098863636363636}