In [18]:
# Author: 
# Date:
# Project: 
# Acknowledgements: 
#


In [10]:
from typing import Union
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree

from tools import load_iris, split_train_test

In [19]:
def prior(targets: np.ndarray, classes: list) -> np.ndarray:
    '''
    Calculate the prior probability of each class type
    given a list of all targets and all class types
    '''
    n_size = len(targets)
    if n_size == 0:
        return 0
    probabilities = np.zeros(len(classes))
    for i, a_class in enumerate(classes):
        probabilities[i] = sum(targets == a_class) / n_size

    return probabilities

In [12]:
def split_data(
    features: np.ndarray,
    targets: np.ndarray,
    split_feature_index: int,
    theta: float
) -> Union[tuple, tuple]:
    '''
    Split a dataset and targets into two seperate datasets
    where data with split_feature < theta goes to 1 otherwise 2
    '''
    ft = features[:, split_feature_index] < theta
    
    features_1 = features[ft]
    targets_1 = targets[ft]

    features_2 = features[~ft]
    targets_2 = targets[~ft]

    return (features_1, targets_1), (features_2, targets_2)


In [13]:
def gini_impurity(targets: np.ndarray, classes: list) -> float:
    '''
    Calculate:
        i(S_k) = 1/2 * (1 - sum_i P{C_i}**2)
    '''
    i = (1/2) * (1 - np.sum(np.power((prior(targets, classes)),2)))
    return i

In [14]:
def weighted_impurity(
    t1: np.ndarray,
    t2: np.ndarray,
    classes: list
) -> float:
    '''
    Given targets of two branches, return the weighted
    sum of gini branch impurities
    '''
    g1 = gini_impurity(t1, classes)
    g2 = gini_impurity(t2, classes)
    n1 = t1.shape[0]
    n2 = t2.shape[0]

    return (n1*g1 + n2*g2) / (n1 + n2)

In [15]:
def total_gini_impurity(
    features: np.ndarray,
    targets: np.ndarray,
    classes: list,
    split_feature_index: int,
    theta: float
) -> float:
    '''
    Calculate the gini impurity for a split on split_feature_index
    for a given dataset of features and targets.
    '''
    (, t1), (, t2) = split_data(features, targets, split_feature_index, theta)
    return weighted_impurity(t1, t2, classes)

In [16]:
def brute_best_split(
    features: np.ndarray,
    targets: np.ndarray,
    classes: list,
    num_tries: int
) -> Union[float, int, float]:
    '''
    Find the best split for the given data. Test splitting
    on each feature dimension num_tries times.

    Return the lowest gini impurity, the feature dimension and
    the threshold
    '''
    best_gini, best_dim, best_theta = float("inf"), None, None
    # iterate feature dimensions
    for i in range(features.shape[1]):
        # create the thresholds
        row = features[:,i]
        thetas = np.linspace(np.min(row), np.max(row), num_tries+2)[1:-1]
        # iterate thresholds
        for theta in thetas:
            gini = total_gini_impurity(features, targets, classes, i, theta)
            if gini < best_gini:
                best_gini = gini
                best_dim = i
                best_theta = theta
            
    return best_gini, best_dim, best_theta


In [21]:
class IrisTreeTrainer:
    def __init__(
        self,
        features: np.ndarray,
        targets: np.ndarray,
        classes: list = [0, 1, 2],
        train_ratio: float = 0.8
    ):
        '''
        train_ratio: The ratio of the Iris dataset that will
        be dedicated to training.
        '''
        self.features = features
        self.targets = targets

        (self.train_features, self.train_targets),\
            (self.test_features, self.test_targets) 
            split_train_test(features, targets, train_ratio)
        
        self.classes = classes
        self.tree = DecisionTreeClassifier()

    def train(self):
        self.tree.fit(self.train_features, self.train_targets)

    def accuracy(self):
        return self.tree.score(self.test_features, self.test_targets)

    def plot(self):
        plot_tree(self.tree)
        plt.show()

    def guess(self):
        return self.tree.predict(self.test_features)

    def confusion_matrix(self):
        self._confusion_matrix = np.zeros((len(self.classes), len(self.classes)))
        guesses = self.guess()
        for i in range(len(guesses)):
            true_value = self.test_targets[i]
            guess_value = guesses[i]
            self._confusion_matrix[true_value, guess_value] += 1
        return self._confusion_matrix

    def plot_progress(self, start=0.01, end=0.99, count=99):
        ratios = np.linspace(start, end, count)
        scores = np.zeros(count)

        for i, ratio in enumerate(ratios):
            temp_tree = IrisTreeTrainer(self.features, self.targets, classes=self.classes, train_ratio=ratio)
            temp_tree.train()
            scores[i] = temp_tree.accuracy()

        plt.plot(scores)


IndentationError: unexpected indent (1908596807.py, line 18)