In [1]:
import numpy as np

def binSplitDataSet(dataSet, feature, value):
    """Split the data set into two data sets based on the feature and values of the data set"""
    mat0 = dataSet[np.nonzero(dataSet[:,feature] > value)[0],:]
    mat1 = dataSet[np.nonzero(dataSet[:,feature] <= value)[0],:]
    return mat0,mat1

def chooseBestSplit(dataSet, ops=(1,4,10)):
    tolS = ops[0]   #min_impurity_decrease
    tolN = ops[1]   #min_samples_leaf 
    
    #if all the target variables are the same value: quit and return value
    if len(set(dataSet[:,-1].T.tolist())) == 1 or len(dataSet)<=ops[2]: #exit cond 1
        return None, np.mean(dataSet[:,-1])

    m,n = dataSet.shape
    #the choice of the best feature is driven by Reduction in RSS error from mean
    S = np.var(dataSet[:,-1]) * dataSet.shape[0]  
    bestS = np.inf  
    bestIndex = 0   
    bestValue = 0   
    for featIndex in range(n-1):  
        for splitVal in set(dataSet[:,featIndex]):     
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (mat0.shape[0] < tolN) or (mat1.shape[0] < tolN):  
                continue  
            newS = np.var(mat0[:,-1]) * mat0.shape[0] + np.var(mat1[:,-1]) * mat1.shape[0]   
            if newS < bestS:    #if the cut ssr is less than the previously uncut ssr, the optimal segmentation is updated 
                bestIndex = featIndex   
                bestValue = splitVal
                bestS = newS            
    if (S - bestS) < tolS:     #if the decrease (S-bestS) is less than a threshold don't do the split
        return None, np.mean(dataSet[:,-1])
    
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    
    if (mat0.shape[0] < tolN) or (mat1.shape[0] < tolN):  
        return None, np.mean(dataSet[:,-1])
    return bestIndex,bestValue    #returns the best feature to split onand the value used for that split
              

def createTree(dataSet,  ops=(1,4)):
    #assume dataSet is NumPy Mat so we can array filtering
    feat, val = chooseBestSplit(dataSet, ops)   #choose the best split
    if feat == None: 
        return val 
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet,  ops)  #left,value>split_value
    retTree['right'] = createTree(rSet,  ops)  #right,value<=split_value
    return retTree  

def isTree(obj):
    return isinstance(obj,dict)

def getMean(tree):
    if isTree(tree['right']): 
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): 
        tree['left'] = getMean(tree['left'])
    return (tree['left']+tree['right'])/2.0
    
def prune(tree, testData):
    if testData.shape[0] == 0: #If the test set already has no data, the model no longer splits and returns the mean directly 
        return getMean(tree)        
    if (isTree(tree['right']) or isTree(tree['left'])):#If one of the left and right branches is a tree, split it 
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])        
    if isTree(tree['left']):        #If the left branch is a tree, after pruning the left branch 
        tree['left'] = prune(tree['left'], lSet)       
    if isTree(tree['right']):    #If the right branch is a tree, split the right branch 
        tree['right'] =  prune(tree['right'], rSet)       
    if not isTree(tree['left']) and not isTree(tree['right']): #If both left and right branches are leaf nodes, try merging        
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal']) #Split the dataset according to the left and right branches    
        errorNoMerge = sum(np.power(lSet[:,-1] - tree['left'],2))+sum(np.power(rSet[:,-1] - tree['right'],2))    #Calculate an unmerged ssr      
        treeMean = (tree['left']+tree['right'])/2.0      #Calculate the value of the left and right leaves after the merge       
        errorMerge = sum(np.power(testData[:,-1] - treeMean,2))  #Calculate the combined ssr 
        if errorMerge < errorNoMerge:   
            return treeMean
        else: 
            return tree
    else: 
        return tree
        

    
def regTreeEval(model, inDat):
    return model.astype(float)

def treeForeCast(tree, inData,modelEval=regTreeEval):
    if not isTree(tree):             
        return modelEval(tree, inData) 
    if inData[tree['spInd']] > tree['spVal']:  
        if isTree(tree['left']):                 
            return treeForeCast(tree['left'], inData,modelEval)    
        else:                  #f left branch is not a tree,return the value
            return modelEval(tree['left'], inData)
    else:              #right path
        if isTree(tree['right']):     #if it is a tree,growths
            return treeForeCast(tree['right'], inData, modelEval)
        else:                     #if it is a leaf node,return mean label value
            return modelEval(tree['right'], inData)
        
def createForeCast(tree, testData, modelEval=regTreeEval):
    m=len(testData)
    y_pred= np.zeros(m)
    for i in range(m):
        y_pred[i] = treeForeCast(tree, testData[i], modelEval)
    return y_pred

class RegressionTree:
    def __init__(self,min_impurity_decrease=1,
                 min_samples_leaf=4,
                 min_samples_split=10
                ):
        self.criterion='mse'
        self.splitter='best'
        self.max_depth=None
        self.min_samples_split=min_samples_split
        self.min_samples_leaf=min_samples_leaf
        self.min_impurity_decrease =min_impurity_decrease 
    
    def fit(self,X,y):     
        train_data=np.concatenate([X,y.reshape(-1,1)],axis=-1)
        self_tree=createTree(train_data,ops=(self.min_impurity_decrease ,self.min_samples_leaf,self.min_samples_split))
        pure_data=np.concatenate([X_test,y_test.reshape(-1,1)],axis=1)
        self.tree=prune(self_tree,pure_data)
        
    def predict(self,X):
        return createForeCast(self.tree,X)

In [2]:
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
X,y=load_boston(return_X_y=True)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
sklearn_regressiontree=DecisionTreeRegressor().fit(X_train,y_train)
pre=sklearn_regressiontree.predict(X_test)
mean_squared_error(pre,y_test)

15.730196078431366

In [3]:
my_tree=RegressionTree()
my_tree.fit(X_train,y_train)
y_pred=my_tree.predict(X_test)
y_pred

array([19.625     , 13.21142857, 38.64      , 28.725     , 24.07055556,
       14.76666667, 26.8875    , 18.98571429, 20.18      , 22.625     ,
       28.725     , 20.76690476, 33.85714286, 22.34      , 21.72      ,
       19.3       , 14.76666667, 24.89375   , 22.5       , 17.55      ,
       12.1       , 20.18      , 21.72      , 20.76690476, 20.18      ,
       21.72      , 14.9       , 13.575     , 16.61428571, 24.07055556,
       19.625     , 12.1       ,  9.66291667, 33.85714286, 16.61428571,
       19.625     , 46.09583333, 20.76690476, 20.76690476, 18.06      ,
       18.775     , 16.62857143, 33.25      , 36.35      , 13.21142857,
       17.175     , 23.84      , 14.76666667,  9.66291667, 26.8875    ,
       12.1       ,  9.66291667, 11.5       , 13.21142857, 21.72      ,
       16.1       , 20.92539683, 20.76690476, 26.8875    , 16.62857143,
       24.07055556, 24.89375   , 17.72      , 12.1       ,  9.66291667,
       25.6       ,  9.66291667, 24.89375   , 33.11666667, 13.21

In [4]:
mean_squared_error(y_test,y_pred)

16.039199961284666