In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections
%matplotlib inline

In [143]:
def is_numeric(value):
    """Return True when ``value`` is an integer or floating-point scalar.

    Besides the built-in ``int`` and ``float`` this also accepts NumPy scalar
    types (``np.int64``, ``np.float32``, ...). ``np.int64`` is not a subclass
    of ``int``, so without the NumPy check an integer split point taken
    straight from an array would be treated as categorical.

    Note: ``bool`` is a subclass of ``int`` and therefore counts as numeric.
    """
    return isinstance(value, (int, float, np.integer, np.floating))

def unique_vals(dataset, feature):
    """Return the distinct values of ``dataset[feature]`` in ascending order."""
    distinct = list(set(dataset[feature]))
    distinct.sort()
    return distinct
    
def average(labels):
    """Return the arithmetic mean of ``labels`` (a sized iterable of numbers)."""
    total = sum(labels)
    count = len(labels)
    return total / count

In [144]:
class Split:
    """A candidate split of a dataset on one feature at one split point.

    A numeric split point acts as a threshold: a sample matches when its
    feature value is greater than or equal to the split point. Any other
    split point is treated as a category: a sample matches when its feature
    value equals the split point.
    """

    def __init__(self, dataset, feature, splitPoint):
        # The dataset reference is kept so existing code that reads
        # ``split.dataset`` keeps working; matching and printing do not use it.
        self.dataset = dataset
        self.feature = feature
        self.splitPoint = splitPoint

    def match(self, comparision_sample):
        """Test ``comparision_sample[self.feature]`` against the split point.

        Numeric split point:     True where value >= splitPoint.
        Categorical split point: True where value == splitPoint.

        ``comparision_sample`` only needs to support ``[self.feature]``; the
        result is whatever the comparison of that value yields (a single
        bool for a scalar value, an element-wise result for a column).
        """
        val = comparision_sample[self.feature]

        if is_numeric(self.splitPoint):
            return val >= self.splitPoint
        return val == self.splitPoint

    def __repr__(self):
        """Render the split as a readable question, e.g. ``Is size >= 35?``."""
        condition = ">=" if is_numeric(self.splitPoint) else "=="
        # Print the feature this split actually tests. Looking it up via
        # ``header[header == self.feature]`` compared a list with a label,
        # which is always False, so every split showed the first column name.
        return "Is %s %s %s?" % (
            self.feature, condition, str(self.splitPoint))

class Leaf:
    """A leaf node

    A leaf node stores a single number, ``prediction``: the average of the
    last column of the dataset that reached this leaf.
    """
    def __init__(self, dataset):
        target_column = dataset.iloc[:, -1]
        self.prediction = average(target_column)


class Decision_Node:
    """An internal node of the tree

    Stores the split tested at this node together with the two child
    subtrees that were passed in as ``left_tree`` and ``right_tree``.
    """
    def __init__(self, split, left_tree, right_tree):
        self.split, self.left_tree, self.right_tree = split, left_tree, right_tree

In [145]:
class DecisionTreeRegressor:
    """Binary regression tree grown by minimising the sum of squared errors.

    The last column of every dataset passed in is treated as the target; all
    other columns are candidate split features. ``fit`` returns the root node
    of the tree (nothing is stored on the instance), and that node has to be
    passed back to ``predict`` / ``print_tree``.
    """

    def __init__(self):
        pass

    def partition(self, dataset, split):
        """Partition the dataset into left and right.

        Parameters
        ------------------
        dataset: DataFrame to divide
        split: object whose ``match(dataset)`` yields a boolean mask over rows

        Return
        ------------------
        left_set: rows for which the split matches
        right_set: all remaining rows

        """
        # Evaluate the split once and reuse the mask for both halves.
        mask = split.match(dataset)
        return dataset[mask], dataset[~mask]

    def sum_square_error(self, dataset):
        """Return the sum of squared deviations of the target column from its mean."""
        labels = dataset.iloc[:, -1]
        return sum((labels - average(labels)) ** 2)

    def find_best_split(self, dataset):
        """Search every feature value for the split with the lowest total error.

        Parameters
        ------------------
        dataset: DataFrame whose last column is the target; every other
        column is tried as a split feature

        Return
        ------------------
        best_error: the smallest summed squared error of the two partitions,
        or ``np.inf`` when no value divides the dataset into two non-empty parts
        best_split: the ``Split`` that achieves ``best_error``, or ``None``

        """
        # Exclude the target column.
        features = list(dataset.columns[:-1])

        best_error = np.inf
        best_split = None

        # Outer loop: features. Inner loop: unique values of that feature.
        for feature in features:
            for val in unique_vals(dataset, feature):
                split = Split(dataset, feature, val)
                left_set, right_set = self.partition(dataset, split)

                # Skip this split if it doesn't divide the dataset.
                if len(left_set) == 0 or len(right_set) == 0:
                    continue

                error = self.sum_square_error(left_set) + self.sum_square_error(right_set)

                # "<=" means that on ties the candidate examined last wins.
                if error <= best_error:
                    best_error = error
                    best_split = split

        return best_error, best_split

    def fit(self, dataset):
        """Recursively grow a tree on ``dataset`` and return its root node.

        Parameters
        ------------------
        dataset: non-empty DataFrame whose last column is the target

        Return
        ------------------
        A ``Leaf`` or a ``Decision_Node``.

        Raises
        ------------------
        ValueError: if ``dataset`` has no rows

        """
        if len(dataset) == 0:
            raise ValueError("cannot fit a regression tree on an empty dataset")

        # Step 1: find the best split feature and point.
        error, split = self.find_best_split(dataset)

        # Stop and emit a leaf when
        #  * no value divides the rows into two non-empty parts (split is
        #    None, e.g. a single row or constant features) -- previously this
        #    fell through and crashed inside partition(); or
        #  * the best split leaves zero residual error (existing stop rule;
        #    the node itself becomes the leaf).
        if split is None or error == 0:
            return Leaf(dataset)

        # Partition the dataset and build the sub-trees, left first.
        left_set, right_set = self.partition(dataset, split)
        left_tree = self.fit(left_set)
        right_tree = self.fit(right_set)

        return Decision_Node(split, left_tree, right_tree)

    def print_tree(self, node, spacing=""):
        """Print the tree rooted at ``node``, indenting each level by two spaces."""
        # Base case: we've reached a leaf.
        if isinstance(node, Leaf):
            print (spacing + "Predict", self.print_leaf(node.prediction))
            return

        # Print the question at this node.
        print (spacing + str(node.split))

        # Rows matching the split go to the left tree.
        print (spacing + '--> True:')
        self.print_tree(node.left_tree, spacing + "  ")

        # Rows not matching the split go to the right tree.
        print (spacing + '--> False:')
        self.print_tree(node.right_tree, spacing + "  ")

    def regression(self, dataset, node):
        """Route one sample down the tree and return the leaf's prediction.

        ``dataset`` here is a single sample (``predict`` passes one row);
        at each decision node it follows the left tree when the node's split
        matches the sample and the right tree otherwise.
        """
        while not isinstance(node, Leaf):
            node = node.left_tree if node.split.match(dataset) else node.right_tree
        return node.prediction

    def print_leaf(self, values):
        """Return the mean of ``values`` (a scalar is returned as its own mean)."""
        return np.mean(values)

    def save_tree(self, inputTree, filename):
        """Pickle ``inputTree`` to ``filename``."""
        import pickle
        # The context manager closes the file even if pickling fails.
        with open(filename, 'wb') as fw:
            pickle.dump(inputTree, fw)

    def read_tree(self, filename):
        """Load and return a tree previously written by ``save_tree``.

        SECURITY: unpickling can execute arbitrary code; only load files
        from a trusted source.
        """
        import pickle
        # Previously the handle was never closed.
        with open(filename, 'rb') as tr:
            return pickle.load(tr)

    def predict(self, dataset, node):
        """Predict the target for every row of ``dataset``.

        Parameters
        ------------------
        dataset: DataFrame of samples (rows are accessed with ``iloc``)
        node: root of a fitted tree, as returned by ``fit``

        Return
        ------------------
        DataFrame with one prediction per row, in row order

        """
        predictions = [
            self.print_leaf(self.regression(dataset.iloc[i], node))
            for i in range(len(dataset))
        ]
        return pd.DataFrame(predictions)

In [146]:
# Toy dataset: `size` is the only feature and `price` (the last column) is
# the regression target.
data = pd.DataFrame({
    'size': [20, 21, 35, 36],
    'price': [40.1, 40.3, 70.4, 70.2],
})

In [147]:
# Show the dataset that the tree is fitted on below.
data

Unnamed: 0,size,price
0,20,40.1
1,21,40.3
2,35,70.4
3,36,70.2


In [148]:
# The constructor takes no arguments; the fitted tree is returned by fit()
# and handed back to the other methods explicitly.
rt = DecisionTreeRegressor()

In [149]:
# Example split on the 'size' column with threshold 22. The feature must be a
# column label of `data`: Split.match indexes the sample with it, and `0` is
# not one of the columns.
split = Split(data, 'size', 22)

In [139]:
# Grow the tree; fit() returns the root node (a Decision_Node or a Leaf).
tree = rt.fit(data)

In [140]:
# Print each node's question followed by its True and False branches.
rt.print_tree(tree)

Is size >= 35?
--> True:
  Predict 70.30000000000001
--> False:
  Predict 40.2


In [141]:
# Predict the target for every row of the training data with the fitted tree.
rt.predict(data,tree)

Unnamed: 0,0
0,40.2
1,40.2
2,70.3
3,70.3
