In [1]:
# upload the data
# IRIS dataset is chosen for training, testing and predictions

import matplotlib.pyplot as plt
from sklearn import datasets
%matplotlib inline
import numpy as np

# import Iris Dataset 
iris = datasets.load_iris()
data = iris.data  # we only take the first two features.
labels = iris.target

# take a look at the data
print(len(data))
print(data[0:5])
print(labels[0:5])

# shuffle both the data and its labels 
random_indices = np.random.permutation(len(data))
print(random_indices[:5])
data = data[random_indices]
labels = labels[random_indices]

# fix the randomly chosen samples, so that they will not keep shuffling everytime the cell is executed

# take a look at the shuffled data
print(data[:10])
print(labels[:10])

# split the data for training and testing. 
# Build the model with training set and then check the accuracy with testing set
# use 80 percent of the data for training and 20 percent for testing
X_train = data[:130] 
y_train = labels[:130]
X_test = data[130:]
y_test = labels[130:]
#prediction_data = data[140:]
#prediction_labels = labels[140:]
#len(prediction_data) == len(prediction_labels)




150
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[0 0 0 0 0]
[117  26  71  17 108]
[[7.7 3.8 6.7 2.2]
 [5.  3.4 1.6 0.4]
 [6.1 2.8 4.  1.3]
 [5.1 3.5 1.4 0.3]
 [6.7 2.5 5.8 1.8]
 [6.4 2.8 5.6 2.1]
 [5.  2.  3.5 1. ]
 [5.9 3.2 4.8 1.8]
 [6.1 3.  4.6 1.4]
 [5.7 3.  4.2 1.2]]
[2 0 1 0 2 2 1 1 1 1]


In [2]:
# To make a Random Forest there needs to be n decision trees which are the building blocks of a Random Forest.

import numpy as np

class Decision_Tree():
    def __init__(self, training_data, training_labels, test_data, test_labels, min_training_gini = 0.05): 
        # initiate the decision tree with all the training data & labels, set an optional min_training_gini value
        self.training_data = training_data
        self.training_labels = training_labels
        self.test_data = test_data
        self.test_labels = test_labels
        self.min_training_gini = min_training_gini

                
    def train(self):
        # create the root node
        self.root = Node(data = self.training_data, labels = self.training_labels)
        # make this part below recallable
        key_node = self.root
        gini_of_key_node = self.gini(key_node.labels)
        if gini_of_key_node < self.min_training_gini: 
            print('the requested min gini is too high, decrease the min required gini for splits')
            return
        if len(key_node.labels) <= 1: return
        assert(key_node.data != None).all() 
        if gini_of_key_node > self.min_training_gini:
            self.split(key_node)
    
    
    def split(self, key_node):
        # if there is not enough data to split, return
        # if gini is reaching the min standard, assign the label of the node 
        if len(key_node.labels) <= 1 or self.gini(key_node.labels) <= self.min_training_gini:
            key_node.node_id = self.most_common_label(key_node.labels)
            return 
        else:
            # do the split
            split_loc, split_value, True_list, True_label, False_list, False_label = self.best_split(key_node.data, key_node.labels)
            # assign the attributes to the key_node that is split into two parts
            key_node.split_loc = split_loc
            key_node.split_value = split_value
            
            # if split does not yield two seperate nodes, give an ID to the node and return
            if True_list == None or False_list == None: # No need to branch out
                key_node.node_id = self.most_common_label(key_node.labels)
                return
            
            else:
                # create a node, assign the attributes to the leftchild, connect with the parent node
                new_node = Node(data = True_list, labels = True_label, parent_T_F = True)
                new_node.parent = key_node
                key_node.leftchild = new_node

                # create a node, assign the attributes to the rightchild, connect with the parent node
                new_node = Node(data = False_list, labels = False_label, parent_T_F = False)
                new_node.parent = key_node
                key_node.rightchild = new_node
            
        # create branches recursively
        self.split(key_node.leftchild)
        self.split(key_node.rightchild)
    
 
    
    def test(self, test_data, test_labels):
        self.test_results = []
        for each_instance in test_data: # test each instance 
            current_node = self.root
            if self.root == None: return
            while current_node.leftchild != None and current_node.rightchild != None:
                # split_loc, split_value, split_T_F will be imported
                if current_node.split_value < each_instance[current_node.split_loc]: # go to the leftchild
                    current_node = current_node.leftchild
                else: # go to the rightchild
                    current_node = current_node.rightchild
            self.test_results.append(current_node.node_id)
        accuracy_of_test_data = self.accuracy(self.test_results, self.test_labels)     
        return accuracy_of_test_data
        
        
    def predict(self, data_to_be_labeled): # takes a 2d array with multiple instances and outputs predicted labels
        pred_results = []
        for each_instance in data_to_be_labeled: # test each instance 
            current_node = self.root
            while current_node.leftchild != None and current_node.rightchild != None:
                # split_loc, split_value, split_T_F will be imported
                if current_node.split_value < each_instance[current_node.split_loc]: # go to the leftchild
                    current_node = current_node.leftchild
                else: # go to the rightchild
                    current_node = current_node.rightchild
            if current_node.node_id != None:
                pred_results.append(current_node.node_id)   
        return pred_results
        
        
    def class_count(self, labels): # counts the number of labels in the target data (label list)
        a_dict = {}
        for label in labels:
            if label in a_dict.keys():
                a_dict[label] += 1
            else:
                a_dict[label] = 1
        return a_dict


    def most_common_label(self, labels): # returns the most commonly seen item in an array
        a_dict = self.class_count(labels)
        most_common = 0
        for k,v in a_dict.items():
            if v > most_common:
                most_common = v
                set_the_most_freq_item = k
        return set_the_most_freq_item

    
    def gini(self, labels): # calculates the impurity of the target list
        count = self.class_count(labels)
        gini_impurity = 1
        for k,v in count.items():
            prob = v / len(labels)
            gini_impurity -= prob**2
        return gini_impurity


    def information_gain(self, labels, T_labels, F_labels): 
        # after the split is done, compare the differences in gini and output the information gain
        T_len = len(T_labels)
        F_len = len(F_labels)
        total_len = len(labels)
        info_gain = self.gini(labels) - (T_len/total_len * self.gini(T_labels)) - (F_len/total_len * self.gini(F_labels))
        return info_gain


    def partition(self, data, labels, column, value): # does partition based on the specified column's value
        assert(len(data) == len(labels))
        if len(data) <= 1: return
        T_data, T_label, F_data, F_label = [], [], [], [] # both the data and labels will split into two lists
        for i in range(len(data)):
            if data[i][column] >= value:
                T_data.append(data[i])
                T_label.append(labels[i])
            else:
                F_data.append(data[i])
                F_label.append(labels[i])
        return T_data, T_label, F_data, F_label


    def best_split(self, data, labels): # decides which attribute is the best for the split that yields highest info gain
        if len(data) <= 1: return # not enough data, terminate the function
        if len(set(labels)) == 1: return None # there is nothing to split, return None
        max_info_gain = 0
        # loop over all columns of data
        for att_loc in range(len(data[0])):
            # find all the atttributes in each column
            set_of_att = set([row[att_loc] for row in data])
            # for each instance
            for each_att in set_of_att:
                # do the partition for column and attributes
                T_data, T_label, F_data, F_label = self.partition(data, labels, att_loc, each_att)
                info_gain = self.information_gain(labels, T_label, F_label)
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    best_split_loc = att_loc
                    best_split_att = each_att
                    True_list = T_data
                    False_list = F_data
                    True_label = T_label
                    False_label = F_label
        # return the best split's attributes and the T and F lists based on the best info gain achieved
        return (best_split_loc, best_split_att, True_list, True_label, False_list, False_label)



    # accuracy will be calculated by comparing obtained targets with the actual labels 
    # Ex: accuracy(our_obtained_target_list, actual_labels) for the accuracy  
    def accuracy(self, testing_labels, actual_labels):
        assert (len(testing_labels) == len(actual_labels))
        testing_labels = np.array(testing_labels)
        actual_labels = np.array(actual_labels)
        matching_labels = (testing_labels == actual_labels) 
        # this will output an array with False and True results for each labels 
        accuracy_of_model = sum(matching_labels) / len(testing_labels)
        return accuracy_of_model * 100

    
    

class Node():
    def __init__(self, data = None, labels = None, parent = None, node_id = None,
                 leftchild = None, rightchild = None, split_loc = None, split_value = None, parent_T_F = None):
    
        self.data = data
        self.labels = labels
        self.parent = parent
        self.node_id = node_id
        self.leftchild = leftchild
        self.rightchild = rightchild
        self.split_loc = split_loc
        self.split_value = split_value
        self.parent_T_F = parent_T_F
        


In [3]:
# test the decision tree with data
testing_results = []
# output the test and prediction scores based on the different min accepted gini level for a split 
test_D_T = Decision_Tree(X_train, y_train, X_test, y_test, min_training_gini = 0.4)     
test_D_T.train()
test_accuracy = test_D_T.test(X_test, y_test)
print(test_accuracy)

80.0


In [4]:
# Sample with replacement, n training dataset chosen randomly
# Train n decision trees with the randomly chosen portion of the training dataset
# Take the majority vote of these trees to make a decision
# This technique is called bagging, or bootstrap aggregating: Training the same classifier on different subsets 
# The sampling with replacement is known as a bootstrap sampling


class Random_Forest():
    # load all the training and test data and decide for 'n trees' and sample percentage, 'm', for each tree 
    def __init__(self, n, m, X_train, y_train, X_test, y_test, min_training_gini = 0.3): 
        self.n = n # number of trees to be created
        self.m = m # percentage of the data allocated for each tree to train (ex: 60 means 60 percent)
        self.min_training_gini = min_training_gini # set a min training gini for each tree
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        
    def fit_and_test(self): # create and train multiple trees 
        # keep track of the accuracy of the voted averages of all trees with the test set  
        # create a table to store all the test set predictions of n trees
        self.votes = np.zeros([self.n, len(self.X_test)])
        # with a portion of training samples, sample with replacement (BAGGING)
        for i in range(self.n):
            decision_tree = None
            train_random_indices = np.random.permutation(len(self.X_train)) # shuffle the training set indices
            portion = len(train_random_indices) * self.m // 100
            m_random_ind_picked = train_random_indices[:portion] # take a portion of the indices
            training_set = self.X_train[m_random_ind_picked] # prepare the training set 
            training_labels = self.y_train[m_random_ind_picked] # prepare the training labels
            # create decision trees based on the training set and labels for each decision tree 
            decision_tree = Decision_Tree(training_set, training_labels, self.X_test, self.y_test, self.min_training_gini)
            # train the decision tree
            decision_tree.train()
            # test the decsion tree with the test set
            results = decision_tree.predict(self.X_test) # outputs the predicted labels of the test set
            # make all predictions and append the prediction into a voting 2d array
            # print('results', results)
            self.votes[i] = results
        # when the voting table is completed take the average of votes with axis = 1 and output the predictions
        most_common_label = self.voted_average()
        # compare the accuracy of results with the voted predictions
        accuracy_of_test = self.accuracy(most_common_label, self.y_test)
        return accuracy_of_test
        
        
    def voted_average(self): # gets the most common item seen on axis = 1
        N, D = self.votes.shape
        voted_predictions = np.zeros(D)
        # for each instance results, pick the most common element on axis = 1
        for i in range(D):
            labels = self.votes[:,i]
            a_dict = {}
            for label in labels:
                if label in a_dict.keys():
                    a_dict[label] += 1
                else:
                    a_dict[label] = 1
            # now a_dict has the number of each labels
            most_common = 0
            set_the_most_freq_item = ''
            for k,v in a_dict.items():
                if v > most_common:
                    most_common = v
                    set_the_most_freq_item = k
            voted_predictions[i] = set_the_most_freq_item
        return voted_predictions
        
    
    def accuracy(self, testing_labels, actual_labels):
        return np.mean(testing_labels == actual_labels)*100
    
    
    

In [5]:
# test the model
rf = Random_Forest(5, 30, X_train, y_train, X_test, y_test, min_training_gini = 0.4)
rf.fit_and_test()



80.0

In [6]:
# computationally too costly for large data, good for learning purposes
# choose the best model for n trees with m portion of the dataset that were trained with some min gini k
def best_model():
    # find the best model for the random forest for this dataset and save this model for predictions
    # grid search for the best hyperparameters
    best_accuracy = 0
    for i in range(3,7): # 2 trees minimum and 10 trees maximum (n)
        for j in np.arange(40,100,20): # 10 percent to underfit and 90 percent to overfit (m)
            for k in np.linspace(0,30,5)*0.01: # gini values set
                # instantiate the object with different i,j,k values and find the best accuracy
                random_forest = Random_Forest(i, j, X_train, y_train, X_test, y_test, min_training_gini = k)
                current_accuracy = random_forest.fit_and_test()
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    best_n = i
                    best_m = j
                    best_gini = k
    return best_gini, best_n, best_m, best_accuracy
best_gini, best_n, best_m, best_accuracy = best_model()
print('best_gini: {}, best_n: {}, best_m: {}, best_accuracy:{}'.format(best_gini, best_n, best_m, best_accuracy))


best_gini: 0.0, best_n: 3, best_m: 40, best_accuracy:95.0
