In [5]:
import numpy as np
import pandas as pd
import torch

In [42]:
class DTreeRegressor(object):
    """Regression-tree skeleton that currently searches only for the root split.

    For every feature column the targets are ordered by that feature and each
    cut position is scored with the summed squared error of the two halves
    around their own means. The feature/cut pair with the lowest score wins.

    The constructor arguments are stored but are not yet consulted by the
    split search.

    Attributes:
        msp: Value passed as ``min_sample_split``.
        md: Value passed as ``max_depth``.
        sm: Value passed as ``split_metric``.
        root_: Column index chosen by the last ``fit`` call (``None`` before
            fitting or when no split was possible).
        split_: Number of samples on the left side of the chosen cut, counted
            in feature-sorted order (``None`` when no split was possible).
    """

    def __init__(self, min_sample_split=0, max_depth=None, split_metric='mse'):
        self.msp = min_sample_split
        self.md = max_depth
        self.sm = split_metric
        # Filled in by fit(); kept on the instance so the result is not lost.
        self.root_ = None
        self.split_ = None

    def fit(self, x, y):
        """Find the best root split for ``x``/``y`` and remember it.

        Args:
            x: 2-D array, one row per sample and one column per feature.
            y: 1-D array of targets, one per row of ``x``.
        """
        self.root_, self.split_ = self.get_root(x, y)

    def get_root(self, x, y, debug=True):
        """Return ``(feature_index, split_size)`` of the lowest-scoring split.

        Args:
            x: 2-D array of features (indexed as ``x[:, i]``).
            y: 1-D array of targets.
            debug: When true, print the chosen feature and split.

        Returns:
            Tuple ``(root, best_split)``. Both are ``None`` when no feature
            offers a valid split (e.g. fewer than two samples).
        """
        x = np.asarray(x)
        y = np.asarray(y)

        best_score = np.inf  # np.infty was removed in NumPy 2.0
        best_split = None
        root = None
        for i in np.arange(x.shape[1]):
            v = x[:, i]
            idx = np.argsort(v)
            # Pass the feature in the same (sorted) order as the targets so
            # both arguments describe the same samples position by position.
            score, split = self.choose_split(v[idx], y[idx])
            if score < best_score:
                best_score = score
                best_split = split
                root = i
        if debug:
            print('Get Root Debug:')
            # Report the winning feature, not the last loop index.
            print(f'root is feature {root}')
            print(f'split root at index {best_split}')
            print('End Debug: ----------------')
        return root, best_split

    def choose_split(self, x, y, debug=True):
        """Scan every cut position of one feature and return the best one.

        The samples are expected to be ordered already; position ``k`` puts
        the first ``k`` targets on the left and the rest on the right.

        Args:
            x: Feature values; only their count is used here.
            y: Targets in the same order as ``x``.
            debug: When true, print the best score and cut position.

        Returns:
            Tuple ``(best_score, left_size)``. When there are fewer than two
            samples, or no score compares below infinity, the result is
            ``(np.inf, None)``.
        """
        y = np.asarray(y)
        n_samples = len(x)

        best_score = np.inf
        # Initialised up front so an empty scan cannot leave it unbound.
        bl = None

        for lss in range(1, n_samples):
            rss = n_samples - lss
            # Slicing yields the same partitions the incremental np.append
            # version built, without re-copying the arrays each step.
            lefty = y[:lss]
            righty = y[lss:]
            lmean, rmean = self.get_means(lefty, righty)
            score = self.score_split(lmean, rmean, lefty, righty, lss, rss)
            if score < best_score:
                best_score = score
                bl = lss
        if debug:
            print('Choose Split Debug:')
            print(f'best score is {best_score}')
            print(f'split at sample {bl}')
            print('End Debug: ----------------')
        return best_score, bl

    def get_means(self, y1, y2):
        """Return the means of the two target arrays as a tuple."""
        return (y1.mean(), y2.mean())

    def error(self, mean, y):
        """Return the mean squared deviation of ``y`` from ``mean``.

        ``y`` must be non-empty; the length is used as a divisor.
        """
        return (1/len(y))*np.sum((y - mean)**2)

    def score_split(self, lm, rm, ly, ry, ls, rs):
        """Return the size-weighted sum of both sides' mean squared errors.

        Args:
            lm, rm: Means of the left and right targets.
            ly, ry: Left and right target arrays.
            ls, rs: Weights applied to each side (the callers pass the
                sample counts).
        """
        return ls*self.error(lm, ly) + rs*self.error(rm, ry)

In [43]:
# Tiny four-row fixture: one numeric feature, one categorical feature and the
# numeric target column ('Cost').
test = pd.DataFrame(
    dict(
        Size=[7, 3, 4, 19],
        Color=['red', 'blue', 'green', 'orange'],
        Cost=[22.4, 73.2, 40.1, 55.6],
    )
)
test

Unnamed: 0,Size,Color,Cost
0,7,red,22.4
1,3,blue,73.2
2,4,green,40.1
3,19,orange,55.6


In [44]:
# Split the fixture into a feature matrix (every column except 'Cost') and the
# target vector, then run the root-split search on it.
dtree = DTreeRegressor()
x = test.drop(columns=['Cost']).to_numpy()
y = test['Cost'].to_numpy()
dtree.fit(x, y)


Choose Split Debug:
best score is 551.9266666666667
split at sample 1
End Debug: ----------------
Choose Split Debug:
best score is 548.5400000000001
split at sample 3
End Debug: ----------------
Get Root Debug:
root is feature 1
split root at index 3
End Debug: ----------------
