In [1]:
import numpy as np
import pandas as pd
import random
import monkdata as m
import dtree as d
from drawtree_qt5 import drawTree

In [2]:
def compute_entropy(datasets_names, datasets):
    for dataset_name, dataset in zip(datasets_names, datasets):
        print(dataset_name, ':' , round(d.entropy(dataset),3))

In [66]:
def entropy_matrix(datasets, attribute_index, max_att_list):
    entropy_matrix = np.zeros((len(datasets), len(m.attributes[attribute_index].values)))
    for idx, dataset in enumerate(datasets):
        att = m.attributes[max_att_list[idx]]
        for j, v in enumerate(att.values):
            entropy_matrix[idx,j] = d.entropy(d.select(dataset, att, v))
    print(entropy_matrix)

In [4]:
def information_gain(datasets):
    attributes_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6']
    information_gain_matrix = np.zeros((len(datasets), len(m.attributes)))
    for idx, dataset in enumerate(datasets):
        for i in range(len(attributes_names)):
            average_gain = round(d.averageGain(dataset, m.attributes[i]),4)
            information_gain_matrix[idx, i] = average_gain
    return information_gain_matrix


In [51]:
def maximum_information_gain(datasets):
    attributes_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6']
    information_gain_matrix = information_gain(datasets)
    maximum_information_gain_matrix = np.zeros((len(datasets),2),dtype=object)
    max_att_list = []
    for i in range(len(datasets)):
        inf_gain_maximum = max(information_gain_matrix[i])
        if inf_gain_maximum == 0:
            e = 0,0
            max_att_list.append(0)
        if inf_gain_maximum != 0:
            x, = np.where(information_gain_matrix[i] == inf_gain_maximum)
            e = attributes_names[int(x)], inf_gain_maximum
            maximum_information_gain_matrix[i] = e
            max_att_list.append(int(x))
        #flat_list = [item for sublist in max_att_list for item in sublist]
    return maximum_information_gain_matrix, max_att_list

In [5]:
def split_tree_by_attribute_and_value(dataset, attribute_idx):
    attribute_values = m.attributes[attribute_idx].values
    attribute_values_list = [[i] for i in list(attribute_values)]
    dataset_by_attribute_and_value = []
    for value in attribute_values:
        dataset_by_attribute_and_value.append(d.select(dataset, m.attributes[attribute_idx], value))
    return dataset_by_attribute_and_value, attribute_values_list

In [6]:
def perform_buildTree(datasets):
    datasets_trees = []
    for dataset in datasets:
        datasets_trees.append(d.buildTree(dataset, m.attributes))
    return datasets_trees

In [7]:
def check_correct_incorrect_classification(datasets, test_datasets, datasets_names):
    datasets_trees = perform_buildTree(datasets)
    check = {}
    check_e = np.zeros((len(datasets), 2))
    for i, dataset, dataset_name, dataset_tree, test_dataset in zip(range(len(datasets)),datasets, datasets_names, datasets_trees, test_datasets):   
        correct_classification = round(d.check(dataset_tree, test_dataset),3)
        check[dataset_name] = correct_classification
        err = round(1 - d.check(dataset_tree, dataset),3), round((1 - correct_classification),3)
        check_e[i] = err
    return check, check_e

In [8]:
def partition(data, fraction):
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]

#monk1train, monk1val = partition(m.monk1, 0.6)

## Assignment 1:

In [9]:
datasets_names = ['MONK-1', 'MONK-2', 'MONK-3']
test_datasets_names = ['monk1test', 'monk2test', 'monk3test']
datasets = [m.monk1, m.monk2, m.monk3]
test_datasets = [m.monk1test, m.monk2test, m.monk3test]

In [10]:
compute_entropy(datasets_names, datasets)

MONK-1 : 1.0
MONK-2 : 0.957
MONK-3 : 1.0


In [11]:
compute_entropy(test_datasets_names, test_datasets)

monk1test : 1.0
monk2test : 0.914
monk3test : 0.998


## Assignment 3:

* Information gain of the three main data sets by attribute.

In [12]:
#pd.DataFrame(information_gain(datasets), index=datasets_names, columns=attributes_names)
print(information_gain(datasets))

[[0.0753 0.0058 0.0047 0.0263 0.287  0.0008]
 [0.0038 0.0025 0.0011 0.0157 0.0173 0.0062]
 [0.0071 0.2937 0.0008 0.0029 0.2559 0.0071]]


* Maximum values by attribute per dataset.

In [47]:
maximum_information_gain, max_att = maximum_information_gain(datasets)

In [48]:
print(np.hstack(([[i] for i in list(datasets_names)], maximum_information_gain)))

[['MONK-1' 'A5' 0.287]
 ['MONK-2' 'A5' 0.0173]
 ['MONK-3' 'A2' 0.2937]]


In [68]:
entropy_matrix(datasets, 4, max_att)

[[0.         0.93831535 0.94807824 0.90817835]
 [0.91034806 1.         0.96333555 0.877962  ]
 [0.91829583 0.8296071  0.37764632 0.        ]]


## Assignment 5: First split:

### *Data set monk1*

Split dataset `monk1` according to the `maximum_information_gain` per `attribute_value`. Attribute `A5`.

In [69]:
split_monk1, monk1_values = split_tree_by_attribute_and_value(m.monk1, 4)
split_monk2, monk2_values = split_tree_by_attribute_and_value(m.monk2, 4)
split_monk3, monk3_values = split_tree_by_attribute_and_value(m.monk3, 1)

In [73]:
maximum_information_gain_monk1_a5, max_att_monk1_a5 = maximum_information_gain(split_monk1)
maximum_information_gain_monk2_a5, max_att_monk2_a5 = maximum_information_gain(split_monk2)
maximum_information_gain_monk3_a2, max_att_monk3_a2 = maximum_information_gain(split_monk3)

In [80]:
print(information_gain(split_monk1))
print('   ')
print(np.hstack((monk1_values, maximum_information_gain_monk1_a5)))

[[0.     0.     0.     0.     0.     0.    ]
 [0.0402 0.0151 0.0373 0.0489 0.     0.0258]
 [0.0331 0.0022 0.018  0.0191 0.     0.0451]
 [0.2063 0.0339 0.0259 0.0759 0.     0.0033]]
   
[[1 0 0]
 [2 'A4' 0.0489]
 [3 'A6' 0.0451]
 [4 'A1' 0.2063]]


In [81]:
print(information_gain(split_monk2))
print('   ')
print(np.hstack((monk2_values, maximum_information_gain_monk2_a5)))

[[0.0457 0.0785 0.1802 0.1401 0.     0.0048]
 [0.0026 0.0325 0.0457 0.0258 0.     0.0073]
 [0.0009 0.0102 0.0333 0.002  0.     0.0043]
 [0.0022 0.0496 0.0158 0.003  0.     0.0043]]
   
[[1 'A3' 0.1802]
 [2 'A3' 0.0457]
 [3 'A3' 0.0333]
 [4 'A2' 0.0496]]


In [82]:
print(information_gain(split_monk3))
print('   ')
print(np.hstack((monk3_values, maximum_information_gain_monk3_a2)))

[[ 0.0015 -0.      0.001   0.0501  0.8183  0.001 ]
 [ 0.0507  0.      0.0345  0.021   0.4767  0.0265]
 [ 0.0488  0.      0.086   0.1217  0.0804  0.004 ]]
   
[[1 'A5' 0.8183]
 [2 'A5' 0.4767]
 [3 'A4' 0.1217]]


In [76]:
print(max_att_monk1_a5)
print(max_att_monk2_a5)
print(max_att_monk3_a2)

[0, 3, 5, 0]
[2, 2, 2, 1]
[4, 4, 3]


### Full Decision Trees:

In [17]:
correct_class, incorrect_class = check_correct_incorrect_classification(datasets, test_datasets, datasets_names)

In [18]:
correct_class

{'MONK-1': 0.829, 'MONK-2': 0.692, 'MONK-3': 0.944}

In [19]:
print(np.hstack(([[i] for i in list(datasets_names)], incorrect_class)))

[['MONK-1' '0.0' '0.171']
 ['MONK-2' '0.0' '0.308']
 ['MONK-3' '0.0' '0.056']]
