In [1]:
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Iterable of groups (e.g. the left/right lists of a split).
            Each group is a sequence of rows, and the last element of every
            row (``row[-1]``) is taken as that row's class label.
        classes: Iterable of the class labels to score against.

    Returns:
        The sum over all non-empty groups of ``(1 - sum(p_c ** 2))`` weighted
        by the group's share of the total row count. ``0.0`` means every
        non-empty group is pure; ``0.0`` is also returned when all groups
        are empty.
    """
    # Total number of samples across all groups.
    n_instances = sum(len(group) for group in groups)

    # Weighted Gini index accumulated over all groups.
    gini = 0.0
    for group in groups:
        group_size = len(group)
        # An empty group contributes nothing; skipping it also avoids a
        # division by zero below.
        if group_size == 0:
            continue
        # Extract the labels once per group rather than once per class.
        labels = [row[-1] for row in group]
        # Sum of squared class proportions within this group.
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / group_size
            score += p ** 2
        # Weight the group's impurity by its relative size.
        gini += (1 - score) * (group_size / n_instances)

    return gini

In [2]:
def test_split(index, value, dataset):
    """Split the dataset based on an attribute and attribute value."""
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def get_best_split(dataset):
    """Search every attribute/value pair for the lowest-Gini split.

    Each value found in each non-label column is tried as a threshold via
    ``test_split``; the resulting groups are scored with ``gini_index``.
    The first candidate reaching the lowest score is kept (later ties do
    not replace it).

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        A dict with keys ``'index'`` (column of the chosen attribute),
        ``'value'`` (threshold used) and ``'groups'`` (the split produced by
        ``test_split``). All three are ``None`` if no candidate was scored.
    """
    labels = list({row[-1] for row in dataset})
    # The last column holds the class label, so it is not a split candidate.
    feature_count = len(dataset[0]) - 1

    lowest_gini = float('inf')
    chosen = {'index': None, 'value': None, 'groups': None}
    for column in range(feature_count):
        for candidate in dataset:
            threshold = candidate[column]
            partition = test_split(column, threshold, dataset)
            impurity = gini_index(partition, labels)
            # Strict comparison: only a genuinely better split replaces the
            # current best.
            if impurity < lowest_gini:
                lowest_gini = impurity
                chosen = {
                    'index': column,
                    'value': threshold,
                    'groups': partition,
                }
    return chosen