# I. Algorithm

## 1. Mathematics
### Boostrapping:
Bootstrapping is a statistical resampling technique that involves random sampling of a dataset with replacement. To create bootstrapped dataset, we need to randomly choose rows from orginal data and duplication is allowed. In random forest, we only consider subset of features in bootstrapped dataset. So I use __bootstrap_size__ to indicate (__number of rows__, __number of features__).

## 2. Code
- Step 1: Create random bootstrapped datasets based on the number of trees.
- Step 2: Put bootstrapped datasets into Decision Tree.
- Step 3: Predict new dataset based on the average result of all trees.

In [4]:
import pandas as pd
import numpy as np

class Question(object):
    def __init__(self, column, condition):
        self.column = column
        self.condition = condition
        
    def match(self,row):
        if isinstance(self.condition, str):
            return row[self.column] == self.condition
        else:
            return row[self.column] >= self.condition
         
class LeafNode(object):
    def __init__(self, label, samples, depth):
        self.label = label
        self.samples = samples
        self.depth = depth

class DecisionNode(object):
    def __init__(self, question, true, false, samples, depth):
        self.question = question
        self.true = true
        self.false = false
        self.samples = samples
        self.depth = depth
        
class DecisionTree(object):
    def __init__(self, max_depth= 10, criterion = "gini", min_samples_split = 2, min_samples_leaf = 1):
        self.train = pd.DataFrame()
        self.test = pd.DataFrame()
        self.label = []
        self.criterion = criterion
        self.max_depth = max_depth  
        self.min_samples_split = min_samples_split 
        self.min_samples_leaf = min_samples_leaf 
        self.tree = None
        
    def fit(self,train):
        self.train = train
        self.tree = self.build_tree(self.train, 0)
        
    # Step 1: Create a function to calculate the information gain of each question based on gini index or entropy
    def impurity(self, label):
        if self.criterion == "gini":
            return 1 - ((label.value_counts()/label.value_counts().sum())**2).sum()
        if self.criterion == "entropy":
            p = label.value_counts()/label.value_counts().sum()
            return - (p*np.log(p)).sum()      
    
    def info_gain(self, true_label, false_label, current_uncertainty):
        p = float(len(true_label)) / (len(true_label) + len(false_label))
        return current_uncertainty - p * self.impurity(true_label) - (1 - p) * self.impurity(false_label)
    
    # Step 2: Ask questions and split dataset into subnote
    def split(self, data, question):  
        if isinstance(question.condition, str):
            true = data[data[question.column] == question.condition]
            false = data[data[question.column] != question.condition]
        else:
            true = data[data[question.column] >= question.condition]
            false = data[data[question.column] < question.condition]
        return true,false
    
    # Step 3: Run a for loop through each value to find a best question - a question have highest information gain
    def find_best_split(self, data):
        best_gain = 0  
        best_question = Question(None, None)
        current_uncertainty = self.impurity(data["label"])
        for column in data.columns[:-1]:
            for condition in data[column].unique():
                true, false = self.split(data, Question(column, condition))
                if len(true) == 0 or len(false) == 0:
                    continue
                gain = self.info_gain(true["label"], false["label"], current_uncertainty)
                if gain >= best_gain:
                    best_gain, best_question = gain, Question(column, condition)
        return best_gain, best_question
    
    # Step 4: Use a recursive algorithm to build a tree
    def build_tree(self, data, depth):
        # Find best question         
        gain, question = self.find_best_split(data)
        samples = data["label"].value_counts().sum()
        # Can not find question or the samples is smaller than min samples split          
        if gain == 0 or samples < self.min_samples_split or depth == self.max_depth:
            label = (data["label"].value_counts()/data["label"].value_counts().sum()).apply(lambda x: int(x*100)).to_dict()
            return LeafNode(label, samples, depth)
        # Split based on best question         
        true, false = self.split(data, question)
        true_samples = true["label"].value_counts().sum() 
        false_samples = false["label"].value_counts().sum() 
        # Check if leaf node is smaller than min samples leaf or not
        if true_samples < self.min_samples_leaf or false_samples < self.min_samples_leaf:
            label = (data["label"].value_counts()/data["label"].value_counts().sum()).apply(lambda x: int(x*100)).to_dict()
            return LeafNode(label, samples, depth)
        true = self.build_tree(true, depth + 1)
        false = self.build_tree(false, depth + 1)
        return DecisionNode(question, true, false, samples, depth)
    # Print tree
    def print_tree(self, node, spacing=""):
        # Base case: we've reached a leaf
        if isinstance(node, LeafNode):
            print (spacing + "Predict", node.label, ", Samples: ", node.samples, ", Depth: ", node.depth)
            return

        # Print the question at this node
        print (spacing + str(node.question.column) + " " + str(node.question.condition), "Samples: ", node.samples, ", Depth: ", node.depth)

        # Call this function recursively on the true branch
        print (spacing + '--> True:')
        self.print_tree(node.true, spacing + "  ")

        # Call this function recursively on the false branch
        print (spacing + '--> False:')
        self.print_tree(node.false, spacing + "  ")
    # Step 5: Predict new data based on the tree already built
    def classify(self, index, node):
        row = self.test.loc[index]
        # Base case: we've reached a leaf
        if isinstance(node, LeafNode):
            self.label.append(node.label)
        # Decide whether to follow the true-branch or the false-branch.
        # Compare the feature / value stored in the node,
        # to the example we're considering.
        if isinstance(node, DecisionNode):
            if node.question.match(row):
                return self.classify(index, node.true)
            else:
                return self.classify(index, node.false)
    def predict(self, test):
        self.test = test
        for i in range(test.shape[0]):
            self.classify(i, self.tree)
        return self.label
class RandomForest(object):
    def __init__(self, boostrap_size, n_trees, max_depth= 10, criterion = "gini", min_samples_split = 2, min_samples_leaf = 1):
            self.train = pd.DataFrame()
            self.test = pd.DataFrame()
            self.boostrap_size = boostrap_size
            self.n_trees = n_trees
            self.criterion = criterion
            self.max_depth = max_depth  
            self.min_samples_split = min_samples_split 
            self.min_samples_leaf = min_samples_leaf 
            self.forest = []
    def fit(self,train):
        self.train = train
        self.create_forest()
    def bootstrapping(self):
        sample_indices = np.random.randint(low=0, high=self.train.shape[0], size = self.boostrap_size[0])
        feature_indices = np.random.choice(self.train.shape[1]-1, self.boostrap_size[1], replace=False)
        boostrap = self.train.iloc[sample_indices,np.append(feature_indices,-1)]
        return boostrap
    def create_forest(self):
        for i in range(self.n_trees):
            tree = DecisionTree(max_depth = self.max_depth, criterion = self.criterion, min_samples_split = self.min_samples_split, min_samples_leaf = self.min_samples_leaf)
            tree.fit(self.bootstrapping())
            self.forest.append(tree)
    def predict(self, test):
        self.test = test
        labels = pd.DataFrame(np.zeros((self.test.shape[0], len(self.train["label"].unique()))), columns = self.train["label"].unique(), dtype = np.int32)
        for tree in self.forest:
            label = tree.predict(test)
            for i in range(len(label)):
                labels.loc[i][list(label[i].keys())[0]] += list(label[i].values())[0]
        labels = labels/self.n_trees
        labels = labels.astype("str") + "%"
        return pd.concat([self.test,labels], axis = 1)

# II. Practice

In [5]:
# Create data
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 3, 'Apple'],
    ['Yellow', 3, 'Lemon'],
    ['Blue', 1, 'Berry'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Blue', 2, 'Berry'],
    ['Red', 1, 'Grape'],
    ['Yellow', 5, 'Banana'],
    ['Red', 1, 'Grape'],
    ['Green', 4, 'Banana'],
    ['Blue', 2, 'Berry'],
]
header = ["color", "diameter", "label"]
data = pd.DataFrame(data = training_data, columns = header)

In [10]:
# Create model
model = RandomForest((10,1), 2)
model.fit(data)
# Plot model
for i in range(len(model.forest)):
    print("Tree", i)
    model.forest[i].print_tree(model.forest[i].tree)

Tree 0
diameter 2 Samples:  10 , Depth:  0
--> True:
  diameter 3 Samples:  7 , Depth:  1
  --> True:
    diameter 4 Samples:  5 , Depth:  2
    --> True:
      Predict {'Banana': 100} , Samples:  1 , Depth:  3
    --> False:
      Predict {'Lemon': 50, 'Apple': 50} , Samples:  4 , Depth:  3
  --> False:
    Predict {'Berry': 100} , Samples:  2 , Depth:  2
--> False:
  Predict {'Grape': 100} , Samples:  3 , Depth:  1
Tree 1
diameter 3 Samples:  10 , Depth:  0
--> True:
  diameter 4 Samples:  5 , Depth:  1
  --> True:
    Predict {'Banana': 100} , Samples:  2 , Depth:  2
  --> False:
    Predict {'Apple': 100} , Samples:  3 , Depth:  2
--> False:
  diameter 2 Samples:  5 , Depth:  1
  --> True:
    Predict {'Berry': 100} , Samples:  1 , Depth:  2
  --> False:
    Predict {'Grape': 75, 'Berry': 25} , Samples:  4 , Depth:  2


In [12]:
# Predict
test = data.loc[0:3][["color","diameter"]]
model.predict(test)

Unnamed: 0,color,diameter,Apple,Grape,Lemon,Berry,Banana
0,Green,3,50.0%,0.0%,25.0%,0.0%,0.0%
1,Yellow,3,50.0%,0.0%,25.0%,0.0%,0.0%
2,Red,1,0.0%,87.5%,0.0%,0.0%,0.0%
3,Red,3,50.0%,0.0%,25.0%,0.0%,0.0%


# III. References
QuantStart - Bootstrap Aggregation, Random Forests and Boosted Trees [https://www.quantstart.com/articles/bootstrap-aggregation-random-forests-and-boosted-trees]

Youtube - StatQuest: Random Forests Part 1 - Building, Using and Evaluating[https://www.youtube.com/watch?v=J4Wdy0Wc_xQ&t=313s]