#### Applied Machine Learning - Mini Project 2 (Tasnim Ahmed, ta1743)

# Decision Trees

In [None]:
# node - feature attribute (does the item meet the specific category)
# branches - decision (rule - yes or no)
# leaf - an outcome (the category - leaf name or spam)

In [6]:
import numpy as np 
import sklearn 
import pandas as pd
import math

Given the fact both of the datasets have continuous features we will implement decision trees that have binary splits.

In [352]:
class Node: 
    def __init__(self, left = None, right = None, feature = None, threshold = None, label = None):
        self.left = left
        self.right = right
        self.feature = feature        # on which feature was the node divided upon 
        self.threshold = threshold    # on what threshold of the feature was it divided upon
        
        self.label = label            # place the label if it is a tree node, else none
        
    def check_leaf_node(self):
        return self.label != None
        
    def label_value(self):        # return the label value if it is a leaf node
        return self.label
            
        
class DecisionTreeClassifier:
    def __init__(self, n_min):
        self.root = None                # the root of the node
        self.n_min = n_min              # the stopping criteria
    
        
    def fit(self, X, y):
        #set the stopping hyperparameter
        min_sample_split = X.shape[0]*(self.n_min/100)
        self.root = self.grow_tree(X, y, min_sample_split)
        

    def grow_tree(self, X, y, min_sample_split):    #recursive function
        n_samples, n_features = X.shape
        labels = np.unique(y)
        n_labels = len(labels)
        
        # checking the stopping criteria 
        if (n_samples <= min_sample_split or n_labels == 1):
            #find the max label of the dataset and set that as the leaf node value
            leaf_node = Node(label = self.max_label(y.to_numpy()))
            return leaf_node   
           
        # get the best feature split and threshold for the given dataset
        split_feature, split_threshold = self.find_best_split(X, y, n_features)
        
        # create child nodes
        X_col = X.iloc[:, split_feature]
        left_ind = np.argwhere(X_col.to_numpy() < split_threshold).flatten() 
        right_ind = np.argwhere(X_col.to_numpy() >= split_threshold).flatten() 
        
        X_left, y_left = X.iloc[left_ind], y.iloc[left_ind]
        X_right, y_right = X.iloc[right_ind], y.iloc[right_ind]
        
        # call the grow tree function again on the subset of the data
        left_node = self.grow_tree(X_left, y_left, min_sample_split)
        right_node = self.grow_tree(X_right, y_right, min_sample_split)
             
        return Node(left_node, right_node,  split_feature, split_threshold)
      
        
    def find_best_split(self, X, y, n_features):
        max_ig = 0
        best_feature = None
        best_threshold = None
        
        for i in range(n_features):              # iterate over the columnns 
            X_col = X.iloc[:, i]                 # extract the column with index i
            max_col_ig, max_col_thr = self.col_information_gain(X_col, y) # get the max ig and thr of column[i]
            
            if max_col_ig > max_ig:
                max_ig = max_col_ig
                best_feature = i
                best_threshold = max_col_thr
                
        return best_feature, best_threshold
        
    
    def max_label(self, y):
        labels = np.unique(y)
        n_labels = len(labels)
        label_dict = {}
        
        for i in range(n_labels):
            label_dict[labels[i]] = 0
        
        for i in range(n_labels):
            for j in range(len(y)):
                if y[j] == labels[i]:
                    label_dict[labels[i]] += 1
                    
        #find the max label and return that    
        return max(label_dict, key = label_dict.get)
    
    
    def entropy(self, column):
        counts = np.bincount(column)
        probs = counts/len(column)
        entropy = np.sum([p*math.log(p, 2) for p in probs if p > 0])
        return -entropy
    
    
    def information_gain(self, X_col, y, threshold):
        # calculate the original entropy of the target before split
        main_entropy = self.entropy(y)
        
        # creating the branch splits based on the threshold
        splits =  self.split_dataset(X_col, y, threshold)
        branch_splits = [splits[0], splits[1]]
        target_splits = [splits[2], splits[3]]
        
        #do I need to check the lenght of the splits to return 0? because there will be math error right?
        
        
        # sum the entropies*prob of the child branches
        sub = 0
        for i in range(len(branch_splits)):
            prob = len(branch_splits[i])/len(X_col)
            sub += prob*self.entropy(target_splits[i])
                                     
        return main_entropy - sub
    
    
    def split_dataset(self, X_col, y, threshold):    #there should be changes here
        
        left_split, left_target, right_split, right_target = [], [], [], []
        
        #Split the data based on the threshold 
        for i in range(len(X_col)):
            # get the rows less than the threshold
            if X_col[i] < threshold:
                left_split.append(X_col[i]) 
                left_target.append(y[i])
            # get the rows more than or equal to the threshold
            else:
                right_split.append(X_col[i])
                right_target.append(y[i])
    
        return [left_split, right_split, left_target, right_target]
    
        
    def col_information_gain(self, X_col, y):
        # dictionary that stores the theshold values with the information gain {threshold: ig}
        X_col = X_col.to_numpy()
        y = y.to_numpy()
        col_igs = {}
        max_ig, max_threshold = None, None
        
        for i in range(len(X_col) - 1):
            # calculate the average of adjacent row values
            avg_thr = (X_col[i] + X_col[i+1])/2

            # get the information gain and add it to the dictionary 
            col_igs[avg_thr] = self.information_gain(X_col, y, avg_thr)

            max_threshold = max(col_igs, key=col_igs.get)
            max_ig = col_igs.get(max_threshold)

        return max_ig, max_threshold
    
    
    def predict_y(self, X):
        y_pred = []
        for i in range(len(X)):
            X_row = X.iloc[i]
            y_pred.append(self.traverse_tree(X_row, self.root))
            
        return y_pred
           
    
    def traverse_tree(self, X, node):   #recursive function
        if node.check_leaf_node():
            return node.label
        
        if X.iloc[node.feature] >= node.threshold:
            return self.traverse_tree(X, node.right)
        return self.traverse_tree(X, node.left)
        

## Iris 

In [167]:
# Loading the data
iris_data  = pd.read_csv("iris.csv", names = ["f0", "f1", "f2", "f3", "target"])
# Verifying if the data has been loaded properly
iris_data.head()

Unnamed: 0,f1,f2,f3,f4,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [168]:
# Checking the shape of the data
iris_data.shape

(150, 5)

In [169]:
# Separating the features and the target variable columns
X = iris_data.drop('target', axis = 'columns') #independent variables
y = iris_data['target']  #dependent variable

In [170]:
# set the species values to 0, 1 and 2
y_labelled = []
for i in y:
    if i == "Iris-setosa": 
        y_labelled.append(0)
        
    elif i == "Iris-versicolor":
        y_labelled.append(1)
        
    else:
        y_labelled.append(2)
              
y_labelled = (pd.DataFrame(y_labelled)).iloc[:, 0]

In [363]:
from sklearn.metrics import accuracy_score

In [364]:
# K-Fold Cross Validation
from sklearn.model_selection import KFold
kfold_value = 10
kfold = KFold(n_splits = kfold_value)

In [366]:
n_mins = [5, 10, 15, 20]

n_min_accuracy = []
for n in n_mins:
    
    fold_accuracy = []
    for train, test in kfold.split(X):
        # Split the dataset into training and test set
        X_train, X_test = X.iloc[train], X.iloc[test]  
        y_train, y_test = y_labelled.iloc[train], y_labelled.iloc[test]
        
        dt_iris = DecisionTreeClassifier(n)
        dt_iris.fit(X_train, y_train)
        y_pred = dt_iris.predict_y(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        fold_accuracy.append(acc)
        
    n_min_accuracy.append(np.average(fold_accuracy))
     

print(n_min_accuracy)

[0.9266666666666667, 0.9200000000000002, 0.9200000000000002, 0.9200000000000002]


## Spambase

In [None]:
## Load the data

In [None]:
# run through thr decision tree with kfold and n_min array

In [None]:
# check the accuracy