In [1]:
import numpy as np
import pandas as pd
import random
import monkdata as m
import dtree as d
from drawtree_qt5 import drawTree

In [16]:
def compute_entropy(datasets_names, datasets):
    for dataset_name, dataset in zip(datasets_names, datasets):
        print(dataset_name, ':' , round(d.entropy(dataset),3))

In [2]:
def maximum_information_gain(datasets):
    attributes_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6']
    information_gain_matrix = information_gain(datasets)
    maximum_information_gain_matrix = np.zeros((len(datasets),2),dtype=object)
    for i in range(len(datasets)):
        inf_gain_maximum = max(information_gain_matrix[i])
        if inf_gain_maximum == 0:
            e = 'NA',0
        if inf_gain_maximum != 0:
            x, = np.where(information_gain_matrix[i] == inf_gain_maximum)
            e = attributes_names[int(x)], inf_gain_maximum
        maximum_information_gain_matrix[i] = e
    return maximum_information_gain_matrix

In [3]:
def information_gain(datasets):
    attributes_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6']
    information_gain_matrix = np.zeros((len(datasets), len(m.attributes)))
    for idx, dataset in enumerate(datasets):
        for i in range(len(attributes_names)):
            average_gain = round(d.averageGain(dataset, m.attributes[i]),4)
            information_gain_matrix[idx, i] = average_gain
    return information_gain_matrix


In [4]:
def split_tree_by_attribute_and_value(dataset, attribute_idx):
    attribute_values = m.attributes[attribute_idx].values
    attribute_values_list = [[i] for i in list(attribute_values)]
    dataset_by_attribute_and_value = []
    for value in attribute_values:
        dataset_by_attribute_and_value.append(d.select(dataset, m.attributes[attribute_idx], value))
    return dataset_by_attribute_and_value, attribute_values_list

In [32]:
def perform_buildTree(datasets):
    datasets_trees = []
    for dataset in datasets:
        datasets_trees.append(d.buildTree(dataset, m.attributes))
    return datasets_trees

In [63]:
len(datasets)

3

In [68]:
def check_correct_incorrect_classification(datasets, test_datasets, datasets_names):
    datasets_trees = perform_buildTree(datasets)
    check = {}
    check_e = np.zeros((len(datasets), 2))
    for dataset, dataset_name, dataset_tree, test_dataset in zip(datasets, datasets_names, datasets_trees, test_datasets):   
        correct_classification = round(d.check(dataset_tree, test_dataset),3)
        check[dataset_name] = correct_classification
       
        check_e[0] = round(1 - d.check(dataset_tree, dataset),3)
        check_e[1] = round((1 - correct_classification),3)
    return check, check_e

In [5]:

def partition(data, fraction):
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]

#monk1train, monk1val = partition(m.monk1, 0.6)

## Assignment 1:

In [11]:
datasets_names = ['monk1', 'monk2', 'monk3']
test_datasets_names = ['monk1test', 'monk2test', 'monk3test']
datasets = [m.monk1, m.monk2, m.monk3]
test_datasets = [m.monk1test, m.monk2test, m.monk3test]

In [18]:
compute_entropy(datasets_names, datasets)

monk1 : 1.0
monk2 : 0.957
monk3 : 1.0


In [19]:
compute_entropy(test_datasets_names, test_datasets)

monk1test : 1.0
monk2test : 0.914
monk3test : 0.998


## Assignment 3:

* Information gain of the three main data sets by attribute.

In [55]:
#pd.DataFrame(information_gain(datasets), index=datasets_names, columns=attributes_names)
print(information_gain(datasets))

[[0.0753 0.0058 0.0047 0.0263 0.287  0.0008]
 [0.0038 0.0025 0.0011 0.0157 0.0173 0.0062]
 [0.0071 0.2937 0.0008 0.0029 0.2559 0.0071]]


* Maximum values by attribute per dataset.

In [54]:
print(np.hstack(([[i] for i in list(datasets_names)], maximum_information_gain(datasets))))

[['monk1' 'A5' 0.287]
 ['monk2' 'A5' 0.0173]
 ['monk3' 'A2' 0.2937]]


## Assignment 5: First split:

### *Data set monk1*

Split dataset `monk1` according to the `maximum_information_gain` per `attribute_value`. Attribute `A5`.

In [57]:
split_monk1, monk1_values = split_tree_by_attribute_and_value(m.monk1, 4)

In [58]:
print(information_gain(split_monk1))

[[0.     0.     0.     0.     0.     0.    ]
 [0.0402 0.0151 0.0373 0.0489 0.     0.0258]
 [0.0331 0.0022 0.018  0.0191 0.     0.0451]
 [0.2063 0.0339 0.0259 0.0759 0.     0.0033]]


In [59]:
print(np.hstack((monk1_values, maximum_information_gain(split_monk1))))

[[1 'NA' 0]
 [2 'A4' 0.0489]
 [3 'A6' 0.0451]
 [4 'A1' 0.2063]]


### Full Decision Trees:

In [69]:
correct_class, incorrect_class = check_correct_incorrect_classification(datasets, test_datasets, datasets_names)

In [70]:
correct_class

{'monk1': 0.829, 'monk2': 0.692, 'monk3': 0.944}

In [71]:
incorrect_class

array([[0.   , 0.   ],
       [0.056, 0.056],
       [0.   , 0.   ]])