# <center>Introduction to ML - Decision Tree Coursework</center>
### <center>COMP70050</center>

### Installing dependencies

### Step 1: Loading data

In [4]:
import numpy as np

# Clean dataset: parsed as tab-delimited text.
# `x` is bound as an alias of each freshly loaded array (kept because other
# cells may still read it).
x = np.loadtxt("wifi_db/clean_dataset.txt", delimiter='\t')
clean_data = x
print(clean_data.shape)

# Noisy dataset: parsed with np.loadtxt's default delimiter.
# `x` is rebound here, so after this cell it refers to the noisy data.
x = np.loadtxt("wifi_db/noisy_dataset.txt")
noisy_data = x
print(noisy_data.shape)

(2000, 8)
(2000, 8)


In [6]:
def entropy(labels):
    """ Compute the entropy (in bits) of a collection of labels.

    Args:
        labels (np.ndarray): the class labels

    Returns:
        float : -sum(p * log2(p)) over the distinct labels, where p is the
                relative frequency of each label
    """

    # np.unique only reports labels that actually occur, so every
    # probability below is strictly positive and log2 stays finite
    occurrences = np.unique(labels, return_counts=True)[1]
    probabilities = occurrences / np.sum(occurrences)
    weighted_logs = probabilities * np.log2(probabilities)

    return - np.sum(weighted_logs)

In [7]:
class Node:
    """ A node of the binary decision tree.

    Attributes:
        attribute: index of the feature tested at this node (None on the
                   leaves built in this file)
        value: split threshold for an internal node, or the predicted label
               for a leaf
        left: child followed when the tested feature is <= value
        right: child followed when the tested feature is > value
        is_leaf (bool): True when the node is a leaf
    """

    def __init__(self, attribute, value, left, right, is_leaf):
        # what this node tests (or, for a leaf, the label it predicts)
        self.attribute, self.value = attribute, value
        # sub-trees on either side of the threshold
        self.left, self.right = left, right
        self.is_leaf = is_leaf

In [29]:
class DecisionTree:
    """ Binary decision tree classifier for continuous attributes.

    The tree is grown by recursively choosing the (attribute, threshold) pair
    with the highest information gain. Samples whose attribute is <= the
    threshold go to the left child, the others to the right child.

    Attributes:
        root (Node): root of the learned tree, None until training is done
        depth (int): depth of the learned tree (a single leaf has depth 1
                     with the default starting depth), 0 until trained
    """

    def __init__(self):
        self.root: Node = None
        self.depth = 0

    def find_split(self, dataset):
        """ Find the split with the highest information gain.

        Only thresholds that leave at least one sample on each side are
        considered, so the returned split always shrinks the dataset.

        Args:
            dataset (np.ndarray): shape (N, K + 1); the last column holds the
                                  labels, the others the attributes

        Returns:
            tuple: (attribute, split) where samples with
                   dataset[:, attribute] <= split go left. (-1, -1) is
                   returned when no attribute has two distinct values, i.e.
                   the dataset cannot be split at all.
        """
        # x is the array of features and y is the labels
        x = dataset[:, :-1]
        y = dataset[:, -1]

        H_total = entropy(y)
        # Start below any reachable gain so that a zero-gain split is still
        # accepted when nothing better exists (e.g. XOR-like data); among
        # positive gains the first maximum wins, as before.
        max_gain = -np.inf
        max_gain_attribute = -1
        max_gain_split = -1

        for attribute in range(x.shape[1]):
            arr = x[:, attribute]

            # The largest unique value would send every sample to the left,
            # which is not a real split, so it is skipped.
            for split in np.unique(arr)[:-1]:
                left_ds = y[arr <= split]
                right_ds = y[arr > split]

                remainder = ((len(left_ds)/len(arr)) * entropy(left_ds)) + ((len(right_ds)/len(arr)) * entropy(right_ds))
                gain = H_total - remainder
                if max_gain < gain:
                    max_gain = gain
                    max_gain_attribute = attribute
                    max_gain_split = split

        return max_gain_attribute, max_gain_split

    def decision_tree_learning(self, dataset, depth=1):
        """ Learn a decision tree from `dataset` and store it on the instance.

        Args:
            dataset (np.ndarray): shape (N, K + 1); the last column holds the
                                  labels
            depth (int): depth assigned to the root of the learned tree

        Returns:
            tuple: (root node, maximal depth reached by the tree)

        Raises:
            ValueError: if `dataset` contains no samples
        """
        if len(dataset) == 0:
            raise ValueError("cannot learn a decision tree from an empty dataset")

        node, max_depth = self._grow(dataset, depth)

        # Stored once, after the recursion, so that a tree consisting of a
        # single leaf is also usable by predict().
        self.root = node
        self.depth = max_depth
        return (node, max_depth)

    def _grow(self, dataset, depth):
        """ Recursively build the subtree for `dataset` rooted at `depth`. """
        y = dataset[:, -1]
        labels, counts = np.unique(y, return_counts=True)
        if len(labels) == 1:
            return (Node(None, y[0], None, None, True), depth)

        attribute, value = self.find_split(dataset)
        if attribute < 0:
            # Samples with identical attributes but different labels cannot
            # be separated: fall back to a majority-label leaf.
            return (Node(None, labels[np.argmax(counts)], None, None, True), depth)

        column = dataset[:, attribute]
        l_dataset = dataset[column <= value]
        r_dataset = dataset[column > value]
        if len(l_dataset) == 0 or len(r_dataset) == 0:
            # Defensive: a one-sided split would never terminate.
            return (Node(None, labels[np.argmax(counts)], None, None, True), depth)

        l_branch, l_depth = self._grow(l_dataset, depth + 1)
        r_branch, r_depth = self._grow(r_dataset, depth + 1)

        node = Node(attribute, value, l_branch, r_branch, False)
        return (node, max(l_depth, r_depth))

    def predict(self, x):
        """ Predict a label for every row of `x`.

        Args:
            x (np.ndarray): shape (N, K), the attributes of the samples

        Returns:
            np.ndarray: shape (N,), float array of predicted labels

        Raises:
            RuntimeError: if the tree has not been trained yet
        """
        if self.root is None:
            raise RuntimeError("decision_tree_learning must be called before predict")

        y = np.zeros(len(x))
        for i, sample in enumerate(x):
            # Walk from the root down to a leaf following the thresholds
            current_node = self.root
            while not current_node.is_leaf:
                if sample[current_node.attribute] <= current_node.value:
                    current_node = current_node.left
                else:
                    current_node = current_node.right

            y[i] = current_node.value

        return y



In [27]:
def find_split(dataset):
    """ Find the split with the highest information gain.

    Only thresholds that leave at least one sample on each side are
    considered, so the returned split always shrinks the dataset.

    Args:
        dataset (np.ndarray): shape (N, K + 1); the last column holds the
                              labels, the others the attributes

    Returns:
        tuple: (attribute, split) where samples with
               dataset[:, attribute] <= split go left. (-1, -1) is returned
               when no attribute has two distinct values, i.e. the dataset
               cannot be split at all.
    """
    # x is the array of features and y is the labels
    x = dataset[:, :-1]
    y = dataset[:, -1]

    H_total = entropy(y)
    # Start below any reachable gain so that a zero-gain split is still
    # accepted when nothing better exists (e.g. XOR-like data); among
    # positive gains the first maximum wins, as before.
    max_gain = -np.inf
    max_gain_attribute = -1
    max_gain_split = -1

    for attribute in range(x.shape[1]):
        arr = x[:, attribute]

        # The largest unique value would send every sample to the left,
        # which is not a real split, so it is skipped.
        for split in np.unique(arr)[:-1]:
            left_ds = y[arr <= split]
            right_ds = y[arr > split]

            remainder = ((len(left_ds)/len(arr)) * entropy(left_ds)) + ((len(right_ds)/len(arr)) * entropy(right_ds))
            gain = H_total - remainder
            if max_gain < gain:
                max_gain = gain
                max_gain_attribute = attribute
                max_gain_split = split

    return max_gain_attribute, max_gain_split

In [28]:
def decision_tree_learning(dataset, depth=1):
    """ Recursively learn a decision tree from `dataset`.

    Args:
        dataset (np.ndarray): shape (N, K + 1); the last column holds the
                              labels
        depth (int): depth assigned to the node built by this call

    Returns:
        tuple: (node, depth) where node is the root of the learned (sub)tree
               and depth is the maximal depth reached below it

    Raises:
        ValueError: if `dataset` contains no samples
    """
    if len(dataset) == 0:
        raise ValueError("cannot learn a decision tree from an empty dataset")

    y = dataset[:, -1]
    labels, counts = np.unique(y, return_counts=True)
    if len(labels) == 1:
        return (Node(None, y[0], None, None, True), depth)

    attribute, value = find_split(dataset)
    if attribute < 0:
        # No usable split was found: fall back to a majority-label leaf
        # instead of indexing a column with the -1 sentinel.
        return (Node(None, labels[np.argmax(counts)], None, None, True), depth)

    # Select ROWS by comparing the chosen attribute COLUMN to the threshold
    column = dataset[:, attribute]
    l_dataset = dataset[column <= value]
    r_dataset = dataset[column > value]
    if len(l_dataset) == 0 or len(r_dataset) == 0:
        # Defensive: a one-sided split would recurse forever.
        return (Node(None, labels[np.argmax(counts)], None, None, True), depth)

    l_branch, l_depth = decision_tree_learning(l_dataset, depth + 1)
    r_branch, r_depth = decision_tree_learning(r_dataset, depth + 1)

    node = Node(attribute, value, l_branch, r_branch, False)

    return (node, max(l_depth, r_depth))

### Step 3 : Evaluation

In [42]:
def confusion_matrix(y_gold, y_prediction, class_labels=None):
    """ Compute the confusion matrix.

    Args:
        y_gold (np.ndarray): the correct ground truth/gold standard labels
        y_prediction (np.ndarray): the predicted labels
        class_labels (np.ndarray): a list of unique class labels.
                               Defaults to the union of y_gold and y_prediction.

    Returns:
        np.array : shape (C, C), where C is the number of classes.
                   Rows are ground truth per class, columns are predictions
    """

    y_gold = np.asarray(y_gold)
    y_prediction = np.asarray(y_prediction)

    # if no class_labels are given, we obtain the set of unique class labels from
    # the union of the ground truth annotation and the prediction.
    # (an explicit None/length test is used because the truth value of a
    # multi-element numpy array is ambiguous and raises ValueError)
    if class_labels is None or len(class_labels) == 0:
        class_labels = np.unique(np.concatenate((y_gold, y_prediction)))

    # 64-bit counters: an 8-bit integer cannot hold counts above 127
    confusion = np.zeros((len(class_labels), len(class_labels)), dtype=np.int64)

    # for each correct class (row),
    # compute how many instances are predicted for each class (columns)
    for i, correct_class in enumerate(class_labels):
        is_correct_class = y_gold == correct_class
        for j, predicted_class in enumerate(class_labels):
            confusion[i, j] = np.count_nonzero(
                np.logical_and(is_correct_class, y_prediction == predicted_class)
            )

    return confusion

def accuracy(confusion):
    """ Compute the accuracy given the confusion matrix

    Args:
        confusion (np.ndarray): shape (C, C), where C is the number of classes.
                    Rows are ground truth per class, columns are predictions

    Returns:
        float : the accuracy (0. when the matrix holds no instances)
    """

    n_instances = np.sum(confusion)

    # nothing was classified: avoid dividing by zero
    if not n_instances > 0:
        return 0.

    # correctly classified instances sit on the diagonal
    return np.trace(confusion) / n_instances

def precision(confusion):
    """ Compute the precision score per class given the confusion matrix

    Also return the macro-averaged precision across classes.

    Args:
        confusion (np.ndarray): shape (C, C), where C is the number of classes.
                    Rows are ground truth per class, columns are predictions

    Returns:
        tuple: returns a tuple (precisions, macro_precision) where
            - precisions is a np.ndarray of shape (C,), where each element is the
              precision for class c (0 for a class that is never predicted)
            - macro-precision is macro-averaged precision (a float)
    """

    confusion = np.asarray(confusion)

    # Compute the precision per class: correct predictions for the class
    # divided by everything PREDICTED as that class. Columns are predictions,
    # so the denominator is the column sum (axis=0).
    true_positives = np.diag(confusion).astype(float)
    predicted_totals = np.sum(confusion, axis=0)
    p = np.divide(
        true_positives,
        predicted_totals,
        out=np.zeros_like(true_positives),
        where=predicted_totals > 0,  # never-predicted classes keep precision 0
    )

    # Compute the macro-averaged precision
    macro_p = np.mean(p) if len(p) > 0 else 0.

    return (p, macro_p)

def recall(confusion):
    """ Compute the recall score per class given the confusion matrix

    Also return the macro-averaged recall across classes.

    Args:
        confusion (np.ndarray): shape (C, C), where C is the number of classes.
                    Rows are ground truth per class, columns are predictions

    Returns:
        tuple: returns a tuple (recalls, macro_recall) where
            - recalls is a np.ndarray of shape (C,), where each element is the
                recall for class c (0 for a class with no gold instances)
            - macro-recall is macro-averaged recall (a float)
    """

    confusion = np.asarray(confusion)

    # Compute the recall per class: correct predictions for the class divided
    # by everything that ACTUALLY belongs to that class. Rows are ground
    # truth, so the denominator is the row sum (axis=1).
    true_positives = np.diag(confusion).astype(float)
    gold_totals = np.sum(confusion, axis=1)
    r = np.divide(
        true_positives,
        gold_totals,
        out=np.zeros_like(true_positives),
        where=gold_totals > 0,  # classes without gold instances keep recall 0
    )

    # Compute the macro-averaged recall
    macro_r = np.mean(r) if len(r) > 0 else 0.

    return (r, macro_r)

def f1_score(confusion):
    """ Compute the F1-score per class given the confusion matrix

    Also return the macro-averaged F1-score across classes.

    Args:
        confusion (np.ndarray): shape (C, C), where C is the number of classes.
                    Rows are ground truth per class, columns are predictions

    Returns:
        tuple: returns a tuple (f1s, macro_f1) where
            - f1s is a np.ndarray of shape (C,), where each element is the
              f1-score for class c (0 when precision + recall is not positive)
            - macro-f1 is macro-averaged f1-score (a float)
    """

    (precisions, macro_p) = precision(confusion)
    (recalls, macro_r) = recall(confusion)

    # just to make sure they are of the same length
    assert len(precisions) == len(recalls)

    precisions = np.asarray(precisions, dtype=float)
    recalls = np.asarray(recalls, dtype=float)

    # Per-class F1 is the harmonic mean of precision and recall. Classes
    # whose precision + recall is not strictly positive keep an F1 of 0
    # instead of producing 0/0 = NaN.
    denominator = precisions + recalls
    f = np.divide(
        2 * precisions * recalls,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator > 0,
    )

    # Compute the macro-averaged F1
    macro_f = np.mean(f) if len(f) > 0 else 0.

    return (f, macro_f)

In [39]:
# From Lab 3
def k_fold_split(n_splits, n_instances, random_generator=np.random.default_rng()):
    """ Randomly partition the indices 0..n_instances-1 into n_splits groups.

    Args:
        n_splits (int): Number of splits
        n_instances (int): Number of instances to split
        random_generator (np.random.Generator): A random generator

    Returns:
        list: a list (length n_splits). Each element is a numpy array with the
            indices of the instances in that split; the splits are mutually
            exclusive and of almost equal size.
    """

    # shuffle the indices, then cut the shuffled sequence into n_splits chunks
    return np.array_split(random_generator.permutation(n_instances), n_splits)

def train_test_k_fold(n_folds, n_instances, random_generator=np.random.default_rng()):
    """ Build the train/test index pairs for k-fold cross-validation.

    Args:
        n_folds (int): Number of folds
        n_instances (int): Total number of instances
        random_generator (np.random.Generator): A random generator

    Returns:
        list: a list of length n_folds. Each element is a two-element list:
            a numpy array containing the train indices, and another numpy
            array containing the test indices.
    """

    # one random partition of the indices, shared by all folds
    partitions = k_fold_split(n_folds, n_instances, random_generator)

    # each partition is held out once as the test set; the remaining
    # partitions are glued together to form the matching training set
    return [
        [
            np.concatenate(partitions[:held_out] + partitions[held_out + 1:]),
            partitions[held_out],
        ]
        for held_out in range(n_folds)
    ]

In [50]:
# 10-fold cross-validation of the decision tree on the selected dataset
dataset = clean_data
n_folds = 10
n_instances = len(dataset)

# Fix the label set up front (as a plain list) so every fold produces a
# confusion matrix of the same shape, even if a class is absent from a fold.
class_labels = np.unique(dataset[:, -1]).astype(int).tolist()
n_classes = len(class_labels)

confusion_matrices = np.zeros((n_folds, n_classes, n_classes))
accuracies = np.zeros(n_folds)
precisions = np.zeros(n_folds)
recalls = np.zeros(n_folds)
f1_scores = np.zeros(n_folds)

# The folds are drawn over the instances of `dataset` itself, not over the
# unrelated global `x` left behind by the loading cell.
for i, (train_indices, test_indices) in enumerate(train_test_k_fold(n_folds, n_instances)):
    # Splitting the train and test
    x_train = dataset[train_indices, :-1]
    y_train = dataset[train_indices, -1]
    x_test = dataset[test_indices, :-1]
    y_test = dataset[test_indices, -1]

    # A fresh tree is trained for every fold
    model = DecisionTree()
    model.decision_tree_learning(dataset[train_indices])
    y_pred = model.predict(x_test)

    confusion_matrices[i] = confusion_matrix(y_test.astype(int), y_pred.astype(int), class_labels)
    # [1] selects the macro-averaged value of each metric
    accuracies[i] = accuracy(confusion_matrices[i])
    precisions[i] = precision(confusion_matrices[i])[1]
    recalls[i] = recall(confusion_matrices[i])[1]
    f1_scores[i] = f1_score(confusion_matrices[i])[1]

# Metrics averaged over the folds
print(accuracies.mean())
print(precisions.mean())
print(recalls.mean())
print(f1_scores.mean())

0.9710000000000001
0.9712722451857989
0.9710197756661246
0.970741418018506
[[47.  0.  0.  0.]
 [ 0. 59.  5.  0.]
 [ 1.  2. 38.  0.]
 [ 0.  0.  0. 48.]]
