# Build A Tree

Pseudo code:

    Function BuildTree(Data, Attribution):

        if Attribution is empty or all Data labels are the same:

            set node status as leaf
 
            set class in this situation as most common class

        else:

            set node status as internal

            use function bestAttribute(Data, Attribution) to gain a, the decision attribute

            LeftNode = BuildTree(Data(satisfy a = 1), Attribution exclude a)

            RightNode = BuildTree(Data(satisfy a = 0), Attribution exclude a)

        end

    end     

**Entropy** quantifies the amount of uncertainty associated with a specific probability distribution. The higher the entropy, the less confident we are in the outcome.
    
$$H(X) = \sum_c -\mathbb{P}(X = c)\log_2\mathbb{P}(X = c)$$

**Conditional Entropy** quantifies the amount of uncertainty that remains given some information.
    
$$H(Y\mid X) = \sum_i \mathbb{P}(X = i)H(Y\mid X = i)$$

**Information Gain** is how much we gain from knowing one of the attributes; in other words, the reduction in entropy.
    
$$IG(Y\mid X) = H(Y) - H(Y\mid X)\geq 0$$
    
    

\begin{align*}
IG(Y\mid X) &= H(Y) - H(Y\mid X)\\
                &= \sum_c -\mathbb{P}(Y = c)\log_2\mathbb{P}(Y = c) - \sum_i \mathbb{P}(X = i)H(Y\mid X = i)\\
                &= \sum_c -\mathbb{P}(Y = c)\log_2\mathbb{P}(Y = c) - \sum_i \mathbb{P}(X = i)\sum_c -\mathbb{P}(Y = c\mid X = i)\log_2\mathbb{P}(Y = c\mid X = i)\\
                &= -\mathbb{E}_{X,Y}[\log_2p(y)] + \mathbb{E}_{X,Y}[\log_2p(y\mid x)]\\
                &= -\mathbb{E}_{X,Y}\left[\log_2\frac{p(y)}{p(y\mid x)}\right]\\
                &\geq -\log_2\mathbb{E}_{X,Y}\left[\frac{p(y)}{p(y\mid x)}\right] && \text{(Jensen's inequality, since $\log_2$ is concave)}\\
                &= -\log_2\sum_{i,c\,:\,\mathbb{P}(X = i, Y = c) > 0}\mathbb{P}(X = i)\,\mathbb{P}(Y = c)\\
                &\geq -\log_2 1\\
                &= 0\\
\end{align*}

In [1]:
import numpy as np

In [2]:
class DecisionTree:
    """ID3-style decision tree classifier for categorical features.

    The fitted tree is stored in ``self.tree`` as nested dictionaries of the
    form ``{feature: {value: subtree_or_label}}``; a leaf is a bare class
    label.  ``feature`` is the column identifier taken from the header row
    that ``fit`` prepends to the training matrix (``0 .. n_features - 1``,
    coerced to the dtype of the data, so string data yields keys such as
    ``'2'``).

    Attributes:
        tree: The fitted tree (``{}`` until ``fit`` is called).
        order: Features in the order they were chosen as split attributes.
        default_label: Most common training label, used by ``predict`` when a
            sample carries a feature value that was never seen in training.
    """

    def __init__(self):
        self.tree = {}
        self.order = []
        self.default_label = None

    @staticmethod
    def _to_python(value):
        """Return the built-in Python scalar equivalent of a NumPy scalar.

        Keeps the tree made of plain ``str``/``int``/``float`` objects so it
        prints cleanly; equality and hashing are unchanged by the conversion.
        """
        return value.item() if isinstance(value, np.generic) else value

    def fit(self, X, y):
        """Build the decision tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: If ``X`` is not 2-D, the training set is empty, or
                ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D: (n_samples, n_features)")
        if len(y) == 0 or X.shape[0] != len(y):
            raise ValueError("X and y must be non-empty and of equal length")

        num_features = X.shape[-1]
        label_count = self.class_count(y)
        self.default_label = self._to_python(
            max(label_count, key=label_count.get))
        # Start from a clean slate so refitting does not accumulate history.
        self.order = []
        # Prepend a header row of column identifiers; build_tree drops columns
        # as it recurses and uses this row to remember which is which.
        X = np.vstack((np.arange(num_features), X))
        self.tree = self.build_tree(X, y)

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Args:
            X: 2-D array-like with the same column layout used in ``fit``.

        Returns:
            A NumPy array with one predicted label per row.

        Raises:
            ValueError: If the tree has not been fitted yet.
        """
        if isinstance(self.tree, dict) and not self.tree:
            raise ValueError("DecisionTree must be fitted before predict")
        X = np.asarray(X)
        return np.array([self._predict_one(row) for row in X])

    def _predict_one(self, row):
        """Walk the tree for a single sample and return its label."""
        node = self.tree
        while isinstance(node, dict):
            feature, branches = next(iter(node.items()))
            # Feature keys may be strings/floats after dtype coercion of the
            # header row; int() recovers the original column position.
            value = row[int(feature)]
            if value not in branches:
                # Value never seen at this node during training.
                return self.default_label
            node = branches[value]
        return node

    def build_tree(self, X, y):
        """Recursively build a (sub)tree.

        Args:
            X: 2-D array whose first row holds the column identifiers and
                whose remaining rows are the observations.
            y: Labels aligned with ``X[1:]``.

        Returns:
            A class label (leaf) or ``{feature: {value: subtree}}``.
        """
        y = np.asarray(y)
        if len(np.unique(y)) == 1:
            return self._to_python(y[0])
        # Stop only when *no* attribute is left; stopping with one column
        # remaining would mean the last attribute is never used.
        if X.shape[-1] == 0:
            count = self.class_count(y)
            return self._to_python(max(count, key=count.get))

        optimal_feature = self.feature_selection(X, y)
        optimal_feature_index = np.where(X[0] == optimal_feature)[0][0]
        feature_key = self._to_python(optimal_feature)
        self.order.append(feature_key)

        column = X[1:, optimal_feature_index]
        # The chosen attribute is removed from every child (header included).
        remaining = np.delete(X, optimal_feature_index, axis=1)
        tree = {feature_key: {}}
        for val in np.unique(column):
            mask = column == val
            X_tilde = np.vstack((remaining[:1], remaining[1:][mask]))
            tree[feature_key][self._to_python(val)] = self.build_tree(
                X_tilde, y[mask])
        return tree

    def feature_selection(self, X, y):
        """Return the identifier of the feature with the highest information gain.

        Args:
            X: 2-D array with a header row of column identifiers followed by
                the observations (at least one column).
            y: Labels aligned with ``X[1:]``.

        Returns:
            The header entry ``X[0, j]`` of the best column ``j``; ties go to
            the left-most column.
        """
        y = np.asarray(y)
        num_features = X.shape[-1]
        # The header row is not an observation, so exclude it from the count.
        num_observations = float(X.shape[0] - 1)
        # H[Y] does not depend on the feature, so compute it once.
        label_entropy = self.shannon_entropy(self.class_distribution(y))
        information_gain_collection = np.zeros(num_features)
        for j in range(num_features):
            column = X[1:, j]
            # H[Y | X(j)] = sum_val P[X(j) = val] * H[Y | X(j) = val]
            conditional_entropy = 0.0
            for val in np.unique(column):
                mask = column == val
                margin_probability = mask.sum() / num_observations
                conditional_entropy += margin_probability * self.shannon_entropy(
                    self.class_distribution(y[mask]))
            information_gain_collection[j] = label_entropy - conditional_entropy
        return X[0, np.argmax(information_gain_collection)]

    def shannon_entropy(self, distribution):
        """Return the base-2 Shannon entropy of a probability vector.

        Zero-probability entries contribute nothing (``0 * log 0 := 0``)
        instead of producing NaN.
        """
        p = np.asarray(distribution, dtype=float)
        p = p[p > 0]
        return -np.dot(p, np.log2(p))

    def class_count(self, labels):
        """Return ``{label: number of occurrences}`` for ``labels``.

        Keys appear in order of first occurrence.
        """
        label_count = {}
        for label in labels:
            label_count[label] = label_count.get(label, 0) + 1
        return label_count

    def class_distribution(self, labels):
        """Return the empirical class probabilities of ``labels``.

        The entries follow the order in which each label first appears.
        """
        label_count = self.class_count(labels)
        # dict.values() is a view on Python 3; materialise it before handing
        # it to NumPy, otherwise a 0-d object array is produced.
        v = np.array(list(label_count.values()), dtype=float)
        return v / v.sum()

In [3]:
# Toy training set: each row is one record of string-valued attributes.
# The first four columns are used as features and the last as the label.
dataset = np.array(
    [
        ["Comedy", "Short", "Adamson", "No", "Yes"],
        ["Animated", "Short", "Lasseter", "No", "No"],
        ["Drama", "Medium", "Adamson", "No", "Yes"],
        ["Animated", "Long", "Lasseter", "Yes", "No"],
        ["Comedy", "Long", "Lasseter", "Yes", "No"],
        ["Drama", "Medium", "Singer", "Yes", "Yes"],
        ["Animated", "Short", "Singer", "No", "Yes"],
        ["Comedy", "Long", "Adamson", "Yes", "Yes"],
        ["Drama", "Medium", "Lasseter", "No", "Yes"],
    ]
)
# Split into the feature matrix (all but the last column) and label vector.
X, y = dataset[:, :-1], dataset[:, -1]

In [4]:
# Create an untrained classifier; its `tree` attribute starts as an empty dict.
dt = DecisionTree()

In [5]:
# Build the tree from the feature matrix X and labels y; the result is stored
# on the instance as `dt.tree`.
dt.fit(X, y)

In [6]:
# Display the learned tree: nested dicts keyed by the chosen feature's column
# identifier, then by feature value, ending in class labels.
dt.tree

{'2': {'Adamson': 'Yes',
  'Lasseter': {'0': {'Animated': 'No', 'Comedy': 'No', 'Drama': 'Yes'}},
  'Singer': 'Yes'}}