In [1]:
import numpy as np

In [2]:
MIN_SPLIT = 10
MIN_SAMPLES_LEAF = 5
MAX_HEIGHT = 5
CONTINUOUS = "CO"
CATEGORICAL = "CA"

In [3]:
#Sorts a matrix by a given column
def sort_by_column(matrix,sort_coordinate):
    new_order = matrix[:,sort_coordinate].argsort()
    out = matrix.copy()[new_order]
    return out

In [4]:
class Node:
    def __init__(self,val,column,function,prediction=None):
        self.val = val
        self.function = function #recieves two numbers and returns either True or False, True go left, False go right
        self.column = column
        self.left = None
        self.right = None
        self.prediction = prediction #Mean of all samples that pass trough node

In [5]:
class DecisionTreeRegressor:
    def __init__(self,error = "se",max_height = MAX_HEIGHT,min_samples_leaf = MIN_SAMPLES_LEAF,min_split = MIN_SPLIT):
        self.head = None
        self.error_metric = self.__return_error_metric(error)
        self.max_height = max_height
        self.min_samples_leaf = min_samples_leaf
        self.min_split = min_split

    def __is_smaller(self,a,b):
        return a<b

    def __is_equal(self,a,b):
        return a==b

    def __measure_array_se(self,arr):
        return sum((arr-np.mean(arr))**2)

    def __measure_array_ae(self,arr):
        return sum(np.abs(arr-np.mean(arr)))

    def __return_error_metric(self,error):
        if error == 'ae':
            return self.__measure_array_ae
        else:
            return self.__measure_array_se
        
    def __find_continuous_column_best_split(self,data,column):
        data = sort_by_column(data,column)
        
        column_data = data[:,column]
        
        best_error = np.inf

        best_split_val = None
        
        best_left,best_right = None,None

        last_val = column_data[0]
            
        for i in range(self.min_samples_leaf,len(column_data)-self.min_samples_leaf):
            if column_data[i] == last_val:
                continue

            last_val = column_data[i]

            split_val = (column_data[i] + column_data[i-1])/2
            
            left_arr = data[:i , :]
            right_arr = data[i: , :]
            
            split_error = self.error_metric(left_arr[:,-1]) + self.error_metric(right_arr[:,-1]) 

            
            if split_error < best_error:
                
                best_error =  split_error
                best_split_val = split_val
                best_left = left_arr
                best_right = right_arr
                
        
        return best_split_val,best_error,best_left,best_right
    
    def __find_categorical_column_best_split(self,data,column):    
        column_data = data[:,column]
    
        classes = np.unique(column_data)
        
        best_error = np.inf
        
        best_split_val = None
        
        best_left,best_right = None,None
            
        for c in classes:
            left_arr = data[column_data == c]
            right_arr = data[column_data != c]
    
            if len(left_arr) < self.min_samples_leaf or len(right_arr) < self.min_samples_leaf:
                continue
            
            split_error = self.error_metric(left_arr[:,-1]) + self.error_metric(right_arr[:,-1]) 
            if split_error < best_error:
                
                best_error =  split_error
                best_split_val = c
                best_left = left_arr
                best_right = right_arr
                
        
        return best_split_val,best_error,best_left,best_right
    
    def __find_column_best_split(self,data,column,data_type):
        if data_type == CATEGORICAL:
            return self.__find_categorical_column_best_split(data,column)
        if data_type == CONTINUOUS:
            return self.__find_continuous_column_best_split(data,column)

    def __find_best_split(self,data,data_types):
        best_column = -1
        best_val = data[0][0]
        best_mse = np.inf
        best_left = None
        best_right = None
        for col in range(len(data[0])-1): #Iterate over all columns but the last one which is the label
            col_best_split_val,col_best_mse,col_best_left,col_best_right = self.__find_column_best_split(data,col,data_types[col])
            if col_best_mse < best_mse:
                best_val,best_mse,best_left,best_right = col_best_split_val,col_best_mse,col_best_left,col_best_right
                best_column = col
        return best_column,best_val,best_left,best_right
    
    def __build_tree(self,data,data_types,height=0):
        if (height >= self.max_height) or (len(data) <= self.min_split):
            return None
        best_column,best_val,left_data,right_data = self.__find_best_split(data,data_types)
        if left_data is None:
            return None
        function = self.__is_smaller if data_types[best_column] == CONTINUOUS else self.__is_equal
        node = Node(val=best_val,column=best_column,function=function,prediction=np.mean(data[:,-1]))
        node.left = self.__build_tree(left_data,data_types,height+1)
        node.right = self.__build_tree(right_data,data_types,height+1)
        return node
    
    def fit(self,X,y,data_types):
        H = np.hstack([X,y.reshape(len(y),1)]).copy()
        self.head = self.__build_tree(H,data_types)

    def __predict_point(self,x):
        prev = None
        curr = self.head
        while curr is not None:
            col_to_check = curr.column
            val = curr.val
            function = curr.function
            prev = curr
            if function(x[col_to_check] , val):
                curr = curr.left
            else:
                curr = curr.right
        return prev.prediction

    def predict(self,X):
        out = []
        for x in X:
            out.append(self.__predict_point(x))
        return np.array(out)

<h3>Using our model and comparing it to Sklearn's DecisionTreeRegressor</h3>

In [6]:
import pandas as pd
#Using only a few columns for testing
train_data = pd.read_csv('train.csv')[['LotFrontage','Street','LotShape','Utilities','YrSold','1stFlrSF','SalePrice']].dropna() 

<h5>Some sample data for the sale price of a house (from kaggle getting started datasets)</h5>

In [7]:
train_data

Unnamed: 0,LotFrontage,Street,LotShape,Utilities,YrSold,1stFlrSF,SalePrice
0,65.0,Pave,Reg,AllPub,2008,856,208500
1,80.0,Pave,Reg,AllPub,2007,1262,181500
2,68.0,Pave,IR1,AllPub,2008,920,223500
3,60.0,Pave,IR1,AllPub,2006,961,140000
4,84.0,Pave,IR1,AllPub,2008,1145,250000
...,...,...,...,...,...,...,...
1455,62.0,Pave,Reg,AllPub,2007,953,175000
1456,85.0,Pave,Reg,AllPub,2010,2073,210000
1457,66.0,Pave,Reg,AllPub,2010,1188,266500
1458,68.0,Pave,Reg,AllPub,2010,1078,142125


In [8]:
data_types = [CONTINUOUS,CATEGORICAL,CATEGORICAL,CATEGORICAL,CONTINUOUS,CONTINUOUS]

In [9]:
X = train_data.values[:,:-1]
y = train_data.values[:,-1]
test_size = int(len(X)/5)
X_train, X_test, y_train, y_test = X[:-test_size],X[-test_size:],y[:-test_size],y[-test_size:]

In [10]:
DT = DecisionTreeRegressor()
DT.fit(X_train,y_train,data_types)
my_pred = DT.predict(X_test) 

In [11]:
# def printTree(node, level=0):
#     if node != None:
#         printTree(node.left, level + 1)
#         print(' ' * 4 * level + '-> ' + str(node.prediction))
#         printTree(node.right, level + 1)
# printTree(DT.head)

<h3>Now lets try with Sklearn</h3>

<h3>Turning categorical columns to one-hot encoding so that Sklearn's decision tree can accept it</h3>

In [12]:
pd.set_option('future.no_silent_downcasting', True)
sklearn_train_data = train_data.copy()
categorical_columns = ['Street','LotShape','Utilities']
for col in categorical_columns:
    train_temp = pd.get_dummies(sklearn_train_data[col]).replace({True:1,False:0})
    train_temp.columns = [col + "_" +str(i) for i in range(len(train_temp.columns))]
    sklearn_train_data = pd.concat([sklearn_train_data,train_temp],axis=1)
    sklearn_train_data.drop(col,axis=1,inplace=True)

In [13]:
sk_X = sklearn_train_data.drop('SalePrice',axis = 1).values
sk_X_train, sk_X_test, y_train, y_test = sk_X[:-test_size],sk_X[-test_size:],y[:-test_size],y[-test_size:]

In [14]:
from sklearn.tree import DecisionTreeRegressor as DTR
DTR = DTR(max_depth=MAX_HEIGHT,min_samples_leaf=MIN_SAMPLES_LEAF,min_samples_split=MIN_SPLIT)
DTR.fit(sk_X_train,y_train)
sklearn_pred = DTR.predict(sk_X_test)

<h3>Now lets compare the performance</h3>

In [15]:
def mse(a,b):
    return np.sqrt(sum((a-b)**2))

In [16]:
print("Sklearn's tree performance : " , mse(sklearn_pred,y_test)/10000)
print("Our tree's performance : " , mse(my_pred,y_test)/10000)

Sklearn's tree performance :  100.21191991740572
Our tree's performance :  98.4245019138269


<h3>We WIN!......but I guess that with some finetunning of the parameters we can get Sklearn's tree to match our's or even surpass it </h3>