In [3]:
import numpy as np
from statistics import mode 
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from dtree import *
from sklearn.utils import resample

In [None]:
a = range(10)

In [None]:
X = np.array([[1., 0.], [2., 1.], [0., 0.]])
y = np.array([0, 1, 2])

In [None]:
from scipy.sparse import coo_matrix

In [None]:
X_sparse = coo_matrix(X)

In [None]:
X_sparse

In [None]:
X, X_sparse, y = resample(X, X_sparse, y, random_state=0)

In [None]:
a

In [None]:
bag_index = [2, 0, 1, 4, 2, 5, 6, 0, 4, 0]

In [None]:
[x for x in a if x not in set(bag_index)]

In [None]:
sum([1,2,3])

## Decision Tree

In [16]:
import random
class DecisionNode:
    def __init__(self, col, split, lchild, rchild):
        self.col = col
        self.split = split
        self.lchild = lchild
        self.rchild = rchild

    def predict(self, x_test):
        # Make decision based upon x_test[col] and split
        if x_test[self.col] > self.split:
            return self.rchild
        return self.lchild

    def leaf(self, x_test):
        # pass in a single x observation
        branch = self.predict(x_test)
        if isinstance(branch, LeafNode):
            return branch
        return branch.leaf(x_test)


class LeafNode:
    def __init__(self, y, prediction):
        "Create leaf node from y values and prediction; prediction is mean(y) or mode(y)"
        self.n = len(y)
        self.prediction = prediction
        self.y = y

    def predict(self, x_test):
        # return prediction
        return self.prediction

def gini(y):
    "Return the gini impurity score for values in y"
    _, counts = np.unique(y, return_counts=True)
    p = counts / len(y)
    return 1 - np.sum( p**2 )


class DecisionTree621():
    def __init__(self, min_samples_leaf=1, loss=None, max_features=1):
        self.min_samples_leaf = min_samples_leaf
        self.loss = loss # loss function; either np.std or gini
        self.max_features = max_features

    def fit(self, X, y):
        """
        Create a decision tree fit to (X,y) and save as self.root, the root of
        our decision tree, for either a classifier or regressor.  Leaf nodes for classifiers
        predict the most common class (the mode) and regressors predict the average y
        for samples in that leaf.  
              
        This function is a wrapper around fit_() that just stores the tree in self.root.
        """
        self.root = self.fit_(X, y)


    def fit_(self, X, y):
        """
        Recursively create and return a decision tree fit to (X,y) for
        either a classifier or regressor.  This function should call self.create_leaf(X,y)
        to create the appropriate leaf node, which will invoke either
        RegressionTree621.create_leaf() or ClassifierTree621. create_leaf() depending
        on the type of self.
        
        This function is not part of the class "interface" and is for internal use, but it
        embodies the decision tree fitting algorithm.

        (Make sure to call fit_() not fit() recursively.)
        """
        if len(y) <= self.min_samples_leaf:
            return self.create_leaf(y)
        
        col, split = self.find_best_split(X, y, self.loss, self.max_features, self.min_samples_leaf)

        if col == -1:
            return self.create_leaf(y)

        lchild = self.fit_(X[X[:,col]<=split],y[X[:,col]<=split])
        rchild = self.fit_(X[X[:,col]>split],y[X[:,col]>split])

        return DecisionNode(col, split, lchild, rchild)


    def predict(self, X_test):
        """
        Make a prediction for each record in X_test and return as array.
        This method is inherited by RegressionTree621 and ClassifierTree621 and
        works for both without modification!
        """
        X_pred = np.zeros(len(X_test))
        for i, val in enumerate(X_test):
            pred = self.dfs(self.root, val)
            X_pred[i] = pred
        return X_pred


    def dfs(self, tree, x):
        # pass in a single x observation
        if isinstance(tree, LeafNode):
            return tree.prediction
        branch = tree.predict(x)
        return self.dfs(branch, x)


    def find_best_split(self, X, y, loss, max_features, min_samples_leaf):
        best = (-1, -1, loss(y))
        ncol = len(X[0,:])
        selected_feature = int(max_features*ncol)
        vars = [random.choice(range(ncol)) for i in range(selected_feature)]
        for col in vars:
            candidates = [random.choice(X[:,col]) for i in range(11)]
            for split in candidates:
                yl = y[X[:,col]<=split]
                yr = y[X[:,col]>split]
                if len(yl) < min_samples_leaf or len(yr) < min_samples_leaf:
                    continue
                l = (len(yl)*loss(yl) + len(yr)*loss(yr))/len(y)
                if l == 0:
                    return (col, split)
                if l < best[2]:
                    best = (col, split, l)
        return (best[0], best[1])



class RegressionTree621(DecisionTree621):
    def __init__(self, min_samples_leaf=1, max_features=1):
        super().__init__(min_samples_leaf, loss=np.std, max_features=max_features)
    def score(self, X_test, y_test):
        "Return the R^2 of y_test vs predictions for each record in X_test"
        return r2_score(y_test, self.predict(X_test))
    def create_leaf(self, y):
        """
        Return a new LeafNode for regression, passing y and mean(y) to
        the LeafNode constructor.
        """
        return LeafNode(y, prediction=np.mean(y))


class ClassifierTree621(DecisionTree621):
    def __init__(self, min_samples_leaf=1, max_features=1):
        super().__init__(min_samples_leaf, loss=gini, max_features=max_features)
    def score(self, X_test, y_test):
        "Return the accuracy_score() of y_test vs predictions for each record in X_test"
        return accuracy_score(y_test, self.predict(X_test))
    def create_leaf(self, y):
        """
        Return a new LeafNode for classification, passing y and mode(y) to
        the LeafNode constructor.
        """
        return LeafNode(y, prediction=mode(y))


if __name__ == "__main__":
    from test_rf import *
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
    dt = RegressionTree621(min_samples_leaf=1, max_features=10)
    dt.fit(X,y)
    print(dt.max_features)

10


# Forest

In [48]:
import numpy as np
from statistics import mode 
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from dtree import *
from sklearn.utils import resample
from collections import Counter


class RandomForest621:
    def __init__(self, n_estimators=10, oob_score=False):
        self.n_estimators = n_estimators
        self.oob_score = oob_score
        self.oob_score_ = np.nan
        self.trees = None

    def fit(self, X, y):
        """
        Given an (X, y) training set, fit all n_estimators trees to different,
        bootstrapped versions of the training data.  Keep track of the indexes of
        the OOB records for each tree.  After fitting all of the trees in the forest,
        compute the OOB validation score estimate and store as self.oob_score_, to
        mimic sklearn.
        """
        self.forest = []
        self.oob_index = []
        for i in range(self.n_estimators):
            tree = self.trees(self.min_samples_leaf,self.max_features)
            bag_idx = resample(range(len(X)))
            oob_idx = [x for x in range(len(X)) if x not in set(bag_idx)]
            bag_X = X[bag_idx]
            bag_y = y[bag_idx]
            tree.fit(bag_X,bag_y)
            self.forest.append(tree)
            self.oob_index.append(oob_idx)


        if self.oob_score:
            self.oob_score_ = None #... compute OOB score ...

class RandomForestRegressor621(RandomForest621):
    def __init__(self, n_estimators=10, min_samples_leaf=3, max_features=0.3, oob_score=False):
        super().__init__(n_estimators, oob_score=oob_score)
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.trees = RegressionTree621

    def predict(self, X_test) -> np.ndarray:
        """
        Given a 2D nxp array with one or more records, compute the weighted average
        prediction from all trees in this forest. Weight each trees prediction by
        the number of samples in the leaf making that prediction.  Return a 1D vector
        with the predictions for each input record of X_test.
        """
        result = []
        for row in X_test:
            leaves = [tree_.root.leaf(row) for tree_ in self.forest]
            nobs = sum([leaf.n for leaf in leaves])
            y_sum = sum([leaf.n *leaf.prediction for leaf in leaves]) #np.sum(leaf.y)
            result.append(y_sum/nobs) 
        return np.array(result)
        
    def score(self, X_test, y_test) -> float:
        """
        Given a 2D nxp X_test array and 1D nx1 y_test array with one or more records,
        collect the prediction for each record and then compute R^2 on that and y_test.
        """
        return r2_score(y_test, self.predict(X_test))
class RandomForestClassifier621:
    def __init__(self, n_estimators=10, min_samples_leaf=3, max_features=0.3, oob_score=False):
        super().__init__(n_estimators, oob_score=oob_score)
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.trees = ClassifierTree621

    def predict(self, X_test) -> np.ndarray:
        result = []
        for row in X_test:
            counter = Counter()
            for sub_tree in self.forest:
                counter += Counter(sub_tree.root.leaf(row).y)
            result.append(counter.most_common(1)[0][0])
        return np.array(result)        
        
    def score(self, X_test, y_test) -> float:
        """
        Given a 2D nxp X_test array and 1D nx1 y_test array with one or more records,
        collect the predicted class for each record and then compute accuracy between
        that and y_test.
        """
        return accuracy_score(y_test, self.predict(X_test)) 

In [4]:
from sklearn.datasets import \
    load_boston, load_iris, load_diabetes, load_wine, \
    load_breast_cancer, fetch_california_housing

In [43]:
X, y = load_iris(return_X_y=True)

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.20)

In [49]:
rf = RandomForestClassifier621(min_samples_leaf=5, max_features=0.6, oob_score=False, n_estimators=18)

In [19]:
rf.trees.max_features

0.6

In [25]:
rf.fit(X_train, y_train)

In [26]:
rf.predict(X_train)

array([28.37142857, 17.4875    , 18.35714286, 10.08333333, 21.08888889,
       23.94      , 13.82857143, 13.71428571, 27.52857143, 27.64      ,
       18.35714286, 13.8       , 22.44      , 27.52857143, 23.66      ,
       45.74      , 19.2       ,  8.43333333, 24.42      , 23.66      ,
       17.33333333, 24.86      , 45.74      , 17.4875    , 10.08333333,
       24.42      , 27.64      , 45.74      , 36.8       , 14.38      ,
       17.33333333, 18.54      , 22.73333333, 25.3625    , 13.675     ,
       22.73333333, 22.28333333, 28.38      , 13.675     , 27.64      ,
       22.44      , 28.37142857, 21.24285714, 18.35714286, 17.03333333,
       19.2       , 19.7       , 14.4       , 14.38      , 13.675     ,
       20.9       , 11.55      , 24.42      , 22.28333333, 21.11666667,
       27.775     , 22.28333333, 27.64      , 20.92      ,  8.43333333,
       36.8       , 23.04      , 28.37142857, 23.94      , 14.4       ,
       20.9       , 34.54      , 19.2       , 27.52857143, 23.72

In [27]:
y_train

array([24.4, 14.4, 23.1, 12. , 21. , 24.5, 14.9, 11.8, 28.2, 31. , 18. ,
        8.3, 24.5, 29. , 23.5, 43.8, 20. ,  7.5, 23. , 22.5, 16.8, 15. ,
       48.5, 16.6,  7. , 24.1, 33.8, 41.7, 27. , 15.6, 18.2, 16.5, 23.4,
       24.1, 21.4, 22.9, 23.3, 37.2, 16.7, 22.8, 25. , 29.8, 17. , 19.2,
       13.4, 17.2, 18.8, 14.1, 13.5, 21.4, 21.7, 11.9, 24.8, 20.6, 20.4,
       32. , 22. , 36. , 23.1,  8.4, 21.9, 22.6, 28.6, 24. , 12.6, 24.3,
       33.3, 20.7, 27.5, 23.7, 31.5, 24.8, 28.7, 20.8,  8.8, 33. , 22.2,
       26.7, 19.3, 10.5, 14.9, 18.9, 20.6, 20.5, 34.9, 14.4, 24.2, 13.4,
       25.3, 48.3, 50. , 29. , 15.4, 39.8, 16.4, 19.1, 17.7, 18.7, 17.2,
       23.1, 23. , 17.8, 17.3, 27.1, 23.2, 14.1, 19.4,  7.4, 20.6, 18.3,
       44.8, 23.3, 17.5, 13.2, 28.4, 27.9, 35.4, 13.9, 13.3, 20.1, 34.9,
       21.5, 35.4,  8.3, 21.2, 43.5, 25. , 25. , 17.2, 18.2, 27.5, 28. ,
       50. , 19.8, 22.6, 14.3, 22. , 50. , 43.1, 29.6, 19.4, 13.4, 19.9,
       18.5, 17.4, 20.9, 13.6, 23.9, 50. , 21.9, 20

In [28]:
from collections import Counter

In [29]:
a = Counter([1,2,2,3,3,3,3])
b = Counter([2,2,2,2,2])

In [30]:
a+b

Counter({1: 1, 2: 7, 3: 4})

In [38]:
c = Counter(np.array([2,2,2]))

In [39]:
c

Counter({2: 3})

In [37]:
c+a+b

TypeError: unsupported operand type(s) for +: 'type' and 'Counter'

In [33]:
a += b

In [34]:
a

Counter({1: 1, 2: 7, 3: 4})

In [40]:
a.most_common(1)[0][0]

2

In [41]:
a.most_common(1)

[(2, 7)]

## Classification Loss Function: gini v.s. entropy

$$Gini = 1 - \sum{p^2}$$

$$Entropy=–\sum{p}⋅log_2{p}$$

In [4]:
#Gini
y = [0,1,1,0,0,0]
def gini(y):
    "Return the gini impurity score for values in y"
    _, counts = np.unique(y, return_counts=True)
    p = counts / len(y)
    return 1 - np.sum( p**2 )

In [5]:
gini(y)

0.4444444444444444

In [7]:
np.unique(y, return_counts=True)

(array([0, 1]), array([4, 2]))

In [8]:
# entropy
from scipy.stats import entropy
entropy(y)

0.6931471805599453

In [25]:
def entropy(y):
    "Return the gini impurity score for values in y"
    _, counts = np.unique(y, return_counts=True)
    p = counts / len(y)
    return - np.sum( p*np.log(p) )

In [26]:
entropy(y)

0.6365141682948128

In [27]:
import pandas as pd
import math
def entropy3(labels, base=None):
    vc = pd.Series(labels).value_counts(normalize=True, sort=False)
    base = math.e if base is None else base
    return -(vc * np.log(vc)/np.log(base)).sum()

In [28]:
entropy3(y)

0.6365141682948128