# Regression Tree

1. [Regression Tree](#regression-tree)
    - [intuition](#intuition)
    - [recursion](#recursion)
    - [stopping criterion](#stopping-criterion)
    - [finding the best split](#finding-the-best-split)
    - [implement decision tree regressor from scratch](#implement-decision-tree-regressor-from-scratch)
1. [Resources](#resources)

## Regression Tree

### Intuition 

At each node of the tree, find the best split (a feature and a threshold) in terms of variance reduction, and grow the children recursively in the same way. If no such split exists or a stopping criterion is met, use the mean target of the node as the prediction.


### Recursion

The easiest way to implement the decision tree is by recursion.

At every node, starting from the root,
- return this Node(value=mean_y) if a stopping criterion is met
- greedily search for the best split in terms of variance reduction among the candidate splits
  - if no such split exists, return this Node(value=mean_y)
  - otherwise, grow 2 child nodes
- return this Node(left_child, right_child, feature, threshold)

### Stopping Criterion

Common stopping criteria are
- maximum depth
- minimum number of samples in a leaf node
- minimum information gain

### Finding the Best Split

At every node, we sample a subset of features $F$ and greedily find the feature $f \in F$ and threshold $t$ that maximize the information gain.

Information gain (IG) is defined as 

\begin{align}
IG = H - \frac{n_l}{n}H_l - \frac{n_r}{n}H_r
\end{align}

- $H$ is the homogeneity measure of the current node; for a regression tree, it is the variance of the targets.
- $n$ is the number of observations in this node
- $n_l$ is the number of observations in the left child of the proposed split
- $n_r$ is the number of observations in the right child of the proposed split
- $H_l$ is the homogeneity measure of the left child of the proposed split
- $H_r$ is the homogeneity measure of the right child of the proposed split

### Implement Decision Tree Regressor from Scratch

In [1]:
from dataclasses import dataclass

import numpy as np


@dataclass
class Node:
    """One node of the regression tree.

    An internal node is described by its two children plus the feature index
    and threshold it splits on. A leaf node only carries ``value``, the
    prediction returned for rows that reach it.
    """

    # Split description -- populated on internal (non-leaf) nodes only.
    left_child: "Node" = None
    right_child: "Node" = None
    feature: int = None
    threshold: float = None

    # Prediction -- populated on leaf nodes only.
    value: float = None

    def is_leaf(self):
        """Return True when this node stores a prediction (``value`` is set)."""
        has_prediction = self.value is not None
        return has_prediction
    
    
def quality(x: np.ndarray) -> float:
    """Score a set of target values by their variance, as computed by ``np.var``.

    Lower is better: a value of 0 means every element of ``x`` is identical.
    """
    variance = np.var(x)
    return variance


class DTRegressor:
    """Regression tree grown recursively by greedy variance reduction.

    Parameters:
        max_depth: number of split levels allowed. The root is at depth 1 and
            any node deeper than ``max_depth`` becomes a leaf.
        min_samples_split: a node holding fewer samples than this is not split.
        min_samples_leaf: the chosen split is rejected (and the node becomes a
            leaf) when either child would hold fewer samples than this.
        max_features: number of features sampled, without replacement, as split
            candidates at each node. Values larger than the number of columns
            of ``X`` are clamped to it; ``None`` means "use every feature".
        min_information_gain: a split is accepted only when its information
            gain is strictly greater than this.
    """

    def __init__(self, max_depth=3, min_samples_split=2, min_samples_leaf=1, max_features=3, min_information_gain=0):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.min_information_gain = min_information_gain

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return one prediction per row of ``X``; ``fit`` must have been called first."""
        return np.array([self._predict_row(row_x, self.root) for row_x in X])

    def _predict_row(self, x: np.ndarray, node: Node) -> float:
        """Walk down from ``node`` following the splits until a leaf is reached."""
        if node.is_leaf():
            return node.value
        elif x[node.feature] < node.threshold:
            return self._predict_row(x, node.left_child)
        else:
            return self._predict_row(x, node.right_child)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Grow the tree on ``X`` (2-D, rows are samples) and ``y``; store it in ``self.root``."""
        self.root = self._grow_tree(X, y)

    def _split(self, X, y, feature, threshold, return_X=False) -> tuple:
        """Partition the samples on ``X[:, feature] < threshold``.

        Returns ``(y_left, y_right)``, or ``(X_left, X_right, y_left, y_right)``
        when ``return_X`` is true. Rows strictly below the threshold go left,
        rows at or above it go right.
        """
        left_mask = X[:, feature] < threshold
        right_mask = X[:, feature] >= threshold

        if return_X:
            return X[left_mask], X[right_mask], y[left_mask], y[right_mask]
        else:
            return y[left_mask], y[right_mask]

    def _get_best_split(self, X: np.ndarray, y: np.ndarray):
        """Search the sampled features for the split with the highest information gain.

        Returns ``(feature, threshold)``, or ``(None, None)`` when no candidate
        split has a gain strictly greater than ``min_information_gain``.
        """
        best_ig = - np.inf
        best_feature = None
        best_threshold = None

        q_parent = quality(y)
        m, n = X.shape

        # Never request more features than X has: np.random.choice with
        # replace=False raises ValueError when the sample exceeds the population
        # (e.g. the default max_features=3 on a 2-column X).
        if self.max_features is None:
            n_candidates = n
        else:
            n_candidates = min(self.max_features, n)

        for feature in np.random.choice(n, n_candidates, replace=False):
            # Sorted unique values; the smallest is skipped because splitting
            # on it would leave the left child (values < threshold) empty.
            thresholds = np.unique(X[:, feature])[1:]
            for threshold in thresholds:
                y_left, y_right = self._split(X, y, feature, threshold)
                q_left = quality(y_left)
                q_right = quality(y_right)

                # Parent variance minus the size-weighted variance of the children.
                ig = q_parent - len(y_left) / m * q_left - len(y_right) / m * q_right

                if ig > best_ig:
                    best_ig = ig
                    best_feature = feature
                    best_threshold = threshold

        if best_ig <= self.min_information_gain:
            return None, None

        return best_feature, best_threshold

    def _grow_tree(self, X: np.ndarray, y: np.ndarray, depth: int=1) -> Node:
        """Recursively build the subtree for the samples ``(X, y)`` at ``depth``.

        A leaf predicting ``mean(y)`` is returned when the depth limit is
        exceeded, the targets are all equal, there are too few samples to
        split, no split has enough gain, or the best split breaks
        ``min_samples_leaf``.
        """
        # check if stopping criterion is met
        if depth > self.max_depth or len(np.unique(y)) == 1 or len(y) < self.min_samples_split:
            return Node(value=np.mean(y))

        best_feature, best_threshold = self._get_best_split(X, y)

        if best_feature is None:
            return Node(value=np.mean(y))

        X_left, X_right, y_left, y_right = self._split(X, y, best_feature, best_threshold, return_X=True)

        # Check if the proposed split is legal.
        # NOTE: the leaf-size constraint is only checked on the single best
        # split; if it fails, the node becomes a leaf even when a lower-gain
        # split satisfying the constraint exists.
        if len(y_left) < self.min_samples_leaf or len(y_right) < self.min_samples_leaf:
            return Node(value=np.mean(y))

        left_node = self._grow_tree(X_left, y_left, depth=depth+1)
        right_node = self._grow_tree(X_right, y_right, depth=depth+1)
        return Node(left_child=left_node, right_child=right_node, feature=best_feature, threshold=best_threshold)

    def _print_node(self, node, indent):
        """Print ``node`` and its subtree, indenting one space per tree level."""
        if node.is_leaf():
            print(f"{' ' * indent}value: {node.value:.2f}")
        else:
            print(f"{' ' * indent}{node.feature}: {node.threshold}")
            self._print_node(node.left_child, indent+1)
            self._print_node(node.right_child, indent+1)

    def print_tree(self):
        """Print the fitted tree: ``feature: threshold`` for splits, ``value: v`` for leaves."""
        self._print_node(self.root, 0)

In [2]:
from sklearn.datasets import load_diabetes


# Load the diabetes regression dataset from scikit-learn and list what it contains.
data = load_diabetes()
print(data.keys())
# Feature matrix and regression target used by the cells below.
X = data.data
y = data.target
print(X.shape)

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])
(442, 10)


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
import pandas as pd

# Put the training features and the target side by side for a quick look.
# y_train gets a trailing axis so it can be stacked as one extra column named "y".
df_train = pd.DataFrame(np.hstack([X_train, y_train[:, np.newaxis]]), columns=data.feature_names+["y"])
df_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,0.070769,0.05068,0.012117,0.056301,0.034206,0.049416,-0.039719,0.034309,0.027364,-0.001078,144.0
1,-0.009147,0.05068,-0.018062,-0.033213,-0.020832,0.012152,-0.072854,0.07121,0.000272,0.019633,150.0
2,0.005383,-0.044642,0.04984,0.097615,-0.015328,-0.016345,-0.006584,-0.002592,0.017036,-0.013504,280.0
3,-0.02731,-0.044642,-0.035307,-0.02977,-0.056607,-0.05862,0.030232,-0.039493,-0.049872,-0.129483,125.0
4,-0.023677,-0.044642,-0.065486,-0.081413,-0.03872,-0.05361,0.059685,-0.076395,-0.037129,-0.042499,59.0


In [5]:
# Fit a shallow tree that considers all 10 features at every node, print its
# structure, then predict on the held-out rows.
dtr = DTRegressor(max_depth=2, max_features=10, min_samples_leaf=30)
dtr.fit(X_train, y_train)
dtr.print_tree()
y_pred = dtr.predict(X_test)

2: 0.005649978676881689
 8: 0.007027139682585861
  value: 100.56
  value: 164.67
 value: 205.54


In [6]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the tree's predictions on the held-out test set.
mean_squared_error(y_test, y_pred)

3804.099594494935

In [7]:
import plotly.express as px


# Pair each true target with its prediction for plotting.
df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})

# Scatter of predicted vs. true values, with the y = x line (0 to 300) as a reference.
fig = px.scatter(df, x="y_test", y="y_pred")
fig.add_scatter(x=np.linspace(0, 300), y=np.linspace(0, 300))

# Resources

- [scikit-learn tree mathematical formulation](https://scikit-learn.org/stable/modules/tree.html#tree-mathematical-formulation)