In [1]:
import numpy as np
import pandas as pd

In [4]:
def populate_data(csv_name, target_column):
    df = pd.read_csv(csv_name)
    train_x = df.drop(columns=target_column)
    train_y = df[target_column]

    #return train_x, train_y
    return df

In [5]:
df = populate_data('car_evaluation.csv', 'class')


In [None]:
door_dict = {2:0, 3:0, 4:0, 5:0}
for index, row in df.iterrows():
    door_dict[row['doors']] += 1

print(door_dict)

In [6]:
def system_entropy(df, target_column):
    count_dict = dict(df[target_column].value_counts())
    unique_vals = (list(set(count_dict.keys())))
    total_len = len(df[target_column])

    sys_entropy = 0
    for value in unique_vals:
        sys_entropy = sys_entropy + (-(count_dict[value] / total_len) * np.log2((count_dict[value] / total_len)))
    return sys_entropy

In [68]:
def entropy(df, target_column, query_column):
    tiny = 0.0000000000001

    list1 = df.groupby([query_column, target_column]).size().reset_index(name="combo_count")
    list2 = df.groupby([query_column]).size().reset_index(name="var_count")

    list1['var_count'] = list1[query_column].map(dict(list2[[query_column, 'var_count']].values))

    list1['entropy'] = -(list1['combo_count'] / (list1['var_count'] + tiny)) * \
                   np.log2((list1['combo_count'] / (list1['var_count'] + tiny)))

    return np.abs(np.sum(list1['entropy']))

In [81]:
def max_information_gain(df, target_column):
    col_list = list(df.columns)
    col_list.remove(target_column)

    column_entropy = {}
    for col in col_list:
        column_entropy[col] = entropy(df, target_column, col)
    max_entropy_key = max(column_entropy, key=column_entropy.get)
    return max_entropy_key, column_entropy[max_entropy_key]

In [108]:
def build_tree(df, target_column, tree_struct=None):
    max_ent_key, max_ent_val = max_information_gain(df, target_column)
    max_ent_vals = np.unique(df[max_ent_key])

    if not tree_struct:
        tree_struct = {}
        tree_struct[max_ent_key] = {}

    for value in max_ent_vals:
        only_val = df[df[max_ent_key] == value].reset_index(drop=True)
        only_val = only_val.drop(max_ent_key, axis=1)
        # print("onlyvals")
        # print("dropping", max_ent_key)
        # print(only_val)
        # break
        target_vals, target_counts = np.unique(only_val[target_column],return_counts=True)
        if len(target_counts) == 1:
            tree_struct[max_ent_key][value] = target_vals
        else:
            tree_struct[max_ent_key][value] = build_tree(only_val, target_column)

    return tree_struct

In [106]:
target_column = 'class'
# sys_ent = system_entropy(df, 'class')
#
# ent_key, ent_val = max_information_gain(df, target_column)
# print(ent_key)
# print(ent_val)

tree = build_tree(df, target_column)


In [110]:
import pprint
pp = pprint.PrettyPrinter(indent=1)
pp.pprint(tree)

{'doors': {'2': {'maint': {'high': {'buying': {'high': {'lug_boot': {'big': {'persons': {'2': array(['unacc'], dtype=object),
                                                                                         '4': {'safety': {'high': array(['acc'], dtype=object),
                                                                                                          'low': array(['unacc'], dtype=object),
                                                                                                          'med': array(['acc'], dtype=object)}},
                                                                                         '5': {'safety': {'high': array(['acc'], dtype=object),
                                                                                                          'low': array(['unacc'], dtype=object),
                                                                                                          'med': array(['acc'], dtype=object)}}}},
    

In [137]:
test_dict = {'door':{'2':{'maint':{'high':'nahFam'}}, '3':{'maint':{'med':'yeahFam'}},
                     '4':{'maint':{'med':'waycool'}}, '5':{'maint':{'low':'OMFG'}}}}
def print_layer(layer_dict, tab_string=""):
    if not isinstance(layer_dict, dict):
        print(layer_dict)
        return
    tab_string = tab_string + "\t"
    for layer in layer_dict.keys():
        if isinstance(layer_dict[layer], dict):
            print(tab_string, layer, ":", list(layer_dict[layer].keys()))
            print_layer(layer_dict[layer], tab_string)
        else:
            print(tab_string, layer, ":", layer_dict[layer])


print_layer(tree)



	 doors : ['2', '3', '4', '5']
		 2 : ['maint']
			 maint : ['high', 'low', 'med', 'vhigh']
				 high : ['buying']
					 buying : ['high', 'low', 'med', 'vhigh']
						 high : ['lug_boot']
							 lug_boot : ['big', 'med', 'small']
								 big : ['persons']
									 persons : ['2', '4', '5']
										 2 : ['unacc']
										 4 : ['safety']
											 safety : ['high', 'low', 'med']
												 high : ['acc']
												 low : ['unacc']
												 med : ['acc']
										 5 : ['safety']
											 safety : ['high', 'low', 'med']
												 high : ['acc']
												 low : ['unacc']
												 med : ['acc']
								 med : ['persons']
									 persons : ['2', '4', '5']
										 2 : ['unacc']
										 4 : ['safety']
											 safety : ['high', 'low', 'med']
												 high : ['acc']
												 low : ['unacc']
												 med : ['unacc']
										 5 : ['safety']
											 safety : ['high', 'low', 'med']
												 high : ['acc']
												 low : ['unacc']
	