# DecisionTree
## Core Idea
Separate the data into subsets that are as pure as possible.
## Pipeline 
Choose a feature -- split the data on that feature -- calculate the impurity measure -- keep the best split and repeat recursively on each subset.
## Pros and Cons

In [None]:
import numpy as np
from sklearn.base import BaseEstimator

In [None]:
class DecisionTree(BaseEstimator):
    """Binary decision tree grown greedily by minimising a pluggable loss.

    Every node is itself a ``DecisionTree``; ``fit`` builds the subtree rooted
    at the node it is called on.

    Parameters
    ----------
    loss_function : callable
        Called as ``loss_function(y)`` on a 1-D array of targets and expected
        to return a number (e.g. Gini impurity, entropy, variance). Lower is
        better.
    leaf_value_estimator : callable
        Called as ``leaf_value_estimator(y)`` on the targets that reach a leaf
        to produce the value stored in ``self.value``.
        NOTE(review): the call signature is assumed from the parameter name
        (e.g. mean or majority vote) -- confirm against callers.
    max_depth : int, default=5
        A node whose ``current_depth`` has reached this depth becomes a leaf.
    current_depth : int, default=0
        Depth of this node; children are created with ``current_depth + 1``.
    min_sample : int, default=5
        A node holding this many samples or fewer becomes a leaf.
    """

    def __init__(self, loss_function, leaf_value_estimator, max_depth=5, current_depth=0, min_sample=5) -> None:
        self.loss_function = loss_function
        self.leaf_value_estimator = leaf_value_estimator
        self.max_depth = max_depth
        self.current_depth = current_depth
        self.min_sample = min_sample
        # Tree structure, filled in by fit().
        self.isleaf = None       # True for leaves, False for internal nodes
        self.left = None         # subtree for samples with feature <= split_value
        self.right = None        # subtree for samples with feature > split_value
        self.value = None        # leaf value (leaves only)
        self.split_id = None     # column index used by an internal node
        self.split_value = None  # threshold used by an internal node

    def split(self, X, y):
        """Search every feature and cut point for the lowest weighted loss.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        tuple
            ``(best_split_id, best_split_position, best_split_value,
            best_loss, best_X_left, best_X_right, best_y_left, best_y_right)``.
            ``best_split_position`` is the index, in the order sorted by the
            chosen feature, of the last sample sent left, and
            ``best_split_value`` is that sample's feature value, so the left
            part is exactly the samples with ``feature <= best_split_value``.
            If no split is strictly better than ``loss_function(y)``, every
            entry except ``best_loss`` is ``None``.
        """
        X = np.asarray(X)
        # Flatten so both 1-D targets and (n, 1) columns are accepted.
        y = np.ravel(y)
        num_sample = len(y)

        best_loss = self.loss_function(y)
        best_split_id = None
        best_split_position = None
        best_split_value = None
        best_X_left = None
        best_X_right = None
        best_y_left = None
        best_y_right = None

        for feature_id in range(X.shape[1]):
            # Sort X and y with one shared index instead of stacking them into
            # a single array, which would force them to a common dtype.
            # A stable sort keeps tied rows in their input order.
            order = np.argsort(X[:, feature_id], kind="stable")
            X_sorted = X[order]
            y_sorted = y[order]
            feature_sorted = X_sorted[:, feature_id]

            for split_position in range(num_sample - 1):
                # A cut between two equal feature values cannot be expressed
                # as a threshold on that feature, so it is not a usable split.
                if feature_sorted[split_position] == feature_sorted[split_position + 1]:
                    continue

                cut = split_position + 1
                y_left = y_sorted[:cut]
                y_right = y_sorted[cut:]
                # Loss of each side, weighted by its share of the samples.
                loss_left = len(y_left) / num_sample * self.loss_function(y_left)
                loss_right = len(y_right) / num_sample * self.loss_function(y_right)

                if loss_left + loss_right < best_loss:
                    best_loss = loss_left + loss_right
                    best_split_id = feature_id
                    best_split_position = split_position
                    best_split_value = feature_sorted[split_position]
                    best_X_left = X_sorted[:cut]
                    best_X_right = X_sorted[cut:]
                    best_y_left = y_left
                    best_y_right = y_right

        return best_split_id, best_split_position, best_split_value, best_loss, best_X_left, best_X_right, best_y_left, best_y_right

    def _make_leaf(self, y):
        """Turn this node into a leaf holding the estimate for ``y``."""
        self.isleaf = True
        self.left = None
        self.right = None
        self.split_id = None
        self.split_value = None
        # With no samples there is nothing to estimate from.
        self.value = self.leaf_value_estimator(y) if len(y) > 0 else None
        return self

    def _make_child(self):
        """Create an unfitted node one level deeper with the same settings."""
        return DecisionTree(
            loss_function=self.loss_function,
            leaf_value_estimator=self.leaf_value_estimator,
            max_depth=self.max_depth,
            current_depth=self.current_depth + 1,
            min_sample=self.min_sample,
        )

    def fit(self, X, y):
        """Grow the subtree rooted at this node from ``(X, y)``.

        The node becomes a leaf when the depth limit is reached, when it holds
        ``min_sample`` samples or fewer, when all targets are identical, or
        when no split lowers the loss. Otherwise the best split is stored in
        ``split_id`` / ``split_value`` and both children are fitted
        recursively.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        DecisionTree
            ``self``.
        """
        X = np.asarray(X)
        y = np.ravel(y)
        num_sample = X.shape[0]

        # Pure node: at most one distinct target value remains.
        is_pure = np.unique(y).size <= 1
        # ">=" also stops a node that was constructed beyond max_depth.
        if self.current_depth >= self.max_depth or num_sample <= self.min_sample or is_pure:
            return self._make_leaf(y)

        split_id, _, split_value, _, X_left, X_right, y_left, y_right = self.split(X, y)
        if split_id is None:
            # No split beats the unsplit loss; stop here.
            return self._make_leaf(y)

        self.isleaf = False
        self.value = None
        self.split_id = split_id
        self.split_value = split_value
        self.left = self._make_child().fit(X_left, y_left)
        self.right = self._make_child().fit(X_right, y_right)
        return self



            
