In [236]:
import numpy as np
import anytree as at
import pandas as pd
import torch


class DTreeRegressor(object):
    """Regression-tree prototype that currently builds only the root split.

    The last column of the training data is treated as the target; every
    other column is a candidate feature. Growing the tree below the root is
    not implemented yet (see the TODO in ``fit``).

    Attributes:
        msp: ``min_sample_split`` as passed to the constructor (stored only;
            not consulted by any method yet).
        md: ``max_depth`` as passed to the constructor (stored only).
        sm: ``split_metric`` as passed to the constructor (stored only).
        root: anytree node built by the most recent ``fit`` call, or ``None``
            before ``fit`` has been called.
    """

    def __init__(self, min_sample_split=1, max_depth=None, split_metric='mse'):
        self.msp = min_sample_split
        self.md = max_depth
        self.sm = split_metric
        self.root = None

    def fit(self, data):
        """Find the best root split of ``data``, build its node and print it.

        Args:
            data: pandas DataFrame or 2-D numpy array whose last column is the
                target.

        Returns:
            None. The node is kept on ``self.root``.
        """
        root_info = self.get_root(data)
        print(root_info)
        root = self.create_node(root_info)
        # Keep the node so the result of fitting is reachable after the call.
        self.root = root

        # TODO: self.grow(root) -- recursive growth is not implemented yet.
        print(at.RenderTree(root))

    def get_root(self, data, debug=False):
        """Return the lowest-scoring split over all feature columns.

        Args:
            data: pandas DataFrame or 2-D numpy array; last column is the
                target, all preceding columns are features.
            debug: when true, print the selected split.

        Returns:
            The dict produced by ``choose_split`` for the winning feature,
            with an extra ``'feature'`` key: the column label when a DataFrame
            was passed, otherwise the integer column index.

        Raises:
            ValueError: if there is no feature column, or no feature yields a
                comparable split score.
        """
        print('DF check')
        is_frame = isinstance(data, pd.DataFrame)
        if is_frame:
            print('DataFrame received')
            features = data.columns
            data = data.to_numpy()

        n_features = data.shape[1] - 1
        if n_features < 1:
            raise ValueError(
                'data must contain at least one feature column plus the target column')

        best_score = np.inf
        best_split = None
        best_feature = None
        for i in range(n_features):
            split = self.choose_split(data, data[:, i])
            if split['score'] < best_score:
                # Track the running minimum; without this update every feature
                # beats the initial infinity and the last column always wins.
                best_score = split['score']
                best_split = split
                best_feature = i

        if best_split is None:
            raise ValueError('no feature produced a valid split')

        best_split['feature'] = features[best_feature] if is_frame else best_feature

        if debug:
            print('get_root debug')
            print(best_split)
            print('-------------------')

        return best_split

    def choose_split(self, data, feature, debug=False):
        """Find the best cut point for one feature.

        Rows are ordered by ``feature.argsort()`` and every position between
        two consecutive rows is scored with ``score_split``.

        NOTE(review): cut points are positions in sort order, so rows with
        equal feature values can land on both sides of the cut; in that case
        the rule ``feature <= split_val`` does not reproduce ``groups``.

        Args:
            data: 2-D numpy array; last column is the target.
            feature: 1-D array holding one column of ``data`` (same row order).
            debug: when true, print the winning score and threshold.

        Returns:
            dict with keys ``score`` (weighted error of the best cut),
            ``samples`` (row count), ``target_avg`` and ``mse`` (of the whole
            target column), ``split_val`` (largest feature value in the left
            group) and ``groups`` (``(left_rows, right_rows)``).

        Raises:
            ValueError: if fewer than two rows are given, or no cut has a
                comparable (non-NaN) score.
        """
        if len(feature) < 2:
            raise ValueError('at least two samples are required to choose a split')

        sort_idx = feature.argsort()
        feature = feature[sort_idx]
        data = data[sort_idx]
        target = data[:, -1]

        best_score = np.inf
        best_index = None
        for i in range(len(feature) - 1):
            score = self.score_split(i, target)
            if score < best_score:
                best_score = score
                best_index = i

        if best_index is None:
            raise ValueError('no valid split: every candidate score was NaN or infinite')

        # Slice the groups once, for the winner only.
        best_groups = self.get_groups(best_index, data)

        if debug:
            print(debug)
            print('Choose Split Debug:')
            print(f'best score is {best_score}')
            print(f'split at feature value {feature[best_index]}')
            print('End Debug: ----------------')

        target_avg = target.mean()
        return {'score': best_score,
                'samples': len(data),
                'target_avg': target_avg,
                'mse': self.mse(target_avg, target),
                'split_val': feature[best_index],
                'groups': best_groups}

    def create_node(self, split_info):
        """Build an anytree ``AnyNode`` describing a split.

        Args:
            split_info: dict as returned by ``get_root`` (needs the keys
                ``feature``, ``split_val``, ``samples``, ``target_avg``, ``mse``).

        Returns:
            ``at.AnyNode`` with attributes ``id``, ``feature``, ``samples``,
            ``value`` and ``mse``.
        """
        feature = split_info['feature']
        thresh = split_info['split_val']
        return at.AnyNode(
            id=f'feature {feature} <= {thresh}',
            feature=feature,
            samples=split_info['samples'],
            value=split_info['target_avg'],
            mse=split_info['mse'],
        )

    # def grow(self, root):

    def get_means(self, y1, y2):
        """Return the means of the left and right target arrays as a pair."""
        return y1.mean(), y2.mean()

    def mse(self, mean, y):
        """Return the mean squared deviation of ``y`` from ``mean``."""
        return (1 / len(y)) * np.sum((y - mean) ** 2)

    def get_groups(self, index, data):
        """Split ``data`` after position ``index``.

        Returns:
            ``(left, right)`` where ``left`` is ``data[:index + 1]`` (so
            ``index`` is the last position of the left group) and ``right``
            is the remainder.
        """
        return data[:(index + 1)], data[(index + 1):]

    def score_split(self, index, target):
        """Score a cut as the size-weighted sum of the two groups' MSEs.

        Lower is better. ``index`` is the last position of the left group
        within the (already sorted) ``target`` array.
        """
        ly, ry = self.get_groups(index, target)
        lm, rm = ly.mean(), ry.mean()
        return len(ly) * self.mse(lm, ly) + len(ry) * self.mse(rm, ry)




def test():
    """Smoke test: fit a DTreeRegressor on a four-row, mixed-type DataFrame.

    The frame has a numeric column, a string column and the target ('Cost')
    in the last position. ``fit`` prints its findings; nothing is returned.
    """
    columns = {
        'Size': [7, 3, 4, 19],
        'Color': ['red', 'blue', 'green', 'orange'],
        'Cost': [22.4, 73.2, 40.1, 55.6],
    }
    sample_frame = pd.DataFrame(columns)
    DTreeRegressor().fit(sample_frame)


test()

DF check
DataFrame received
{'score': 548.5400000000001, 'samples': 4, 'target_avg': 47.825, 'mse': 352.61187500000005, 'split_val': 'orange', 'groups': (array([[3, 'blue', 73.2],
       [4, 'green', 40.1],
       [19, 'orange', 55.6]], dtype=object), array([[7, 'red', 22.4]], dtype=object)), 'feature': 'Color'}
AnyNode(feature='Color', id='feature Color <= orange', mse=352.61187500000005, samples=4, value=47.825)


In [237]:
# Four-row sample frame: two feature columns followed by the target ('Cost').
# NOTE: this rebinds the notebook-level name `test`, which an earlier cell
# defined as a function.
test = pd.DataFrame(
    {
        'Size': [7, 3, 4, 19],
        'Color': ['red', 'blue', 'green', 'orange'],
        'Cost': [22.4, 73.2, 40.1, 55.6],
    }
)
test

Unnamed: 0,Size,Color,Cost
0,7,red,22.4
1,3,blue,73.2
2,4,green,40.1
3,19,orange,55.6


In [238]:
# Convert the sample frame to a plain ndarray (dtype=object here, because the
# columns mix numbers and strings).
data = test.to_numpy()
# Column 0 ('Size') is the single feature examined in the next cell.
feature = data[:, 0]

In [239]:
# Fresh regressor with default settings; find the best cut for the one feature
# column picked above (choose_split reads the target from data's last column).
dtree = DTreeRegressor()
dtree.choose_split(data, feature)

{'score': 551.9266666666667,
 'samples': 4,
 'target_avg': 47.825,
 'mse': 352.61187500000005,
 'split_val': 3,
 'groups': (array([[3, 'blue', 73.2]], dtype=object),
  array([[4, 'green', 40.1],
         [7, 'red', 22.4],
         [19, 'orange', 55.6]], dtype=object))}

In [240]:
# Fit on the raw ndarray rather than the DataFrame: with no column labels
# available, get_root reports the chosen feature by its column index.
dtree.fit(test.to_numpy())

DF check
{'score': 548.5400000000001, 'samples': 4, 'target_avg': 47.825, 'mse': 352.61187500000005, 'split_val': 'orange', 'groups': (array([[3, 'blue', 73.2],
       [4, 'green', 40.1],
       [19, 'orange', 55.6]], dtype=object), array([[7, 'red', 22.4]], dtype=object)), 'feature': 1}
AnyNode(feature=1, id='feature 1 <= orange', mse=352.61187500000005, samples=4, value=47.825)
