### Data analysis and a simple decision tree example using data from https://www.kaggle.com/uciml/zoo-animal-classification

In [1]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE
import math as math

Read the data file

In [2]:
# Load the Kaggle zoo dataset. Forward slashes are used because they resolve on
# every OS (Windows included) and avoid the invalid "\D" / "\z" escape sequences
# that a backslash-separated, non-raw string literal contains.
# animal_name is dropped because it is not used as a feature in the analysis below.
zoo_df = (
    pd.read_csv("data/Decision-Tree_Zoo-Data/zoo-animal-classification/zoo.csv")
    .drop(columns="animal_name")
)

In [3]:
# Both helpers assume their inputs (the feature column input_x and the labels y_actual) hold categorical, i.e. discrete, values.
def calc_entropy(y_actual):
    """Return the Shannon entropy (in bits) of the labels in ``y_actual``.

    Each distinct label contributes ``-p * log2(p)``, where ``p`` is the
    fraction of entries carrying that label. The computed value is also
    printed before it is returned.

    Parameters
    ----------
    y_actual : sized iterable of hashable labels
        Must support both iteration (for counting) and ``len()``.

    Returns
    -------
    float
        The entropy in bits; the integer ``0`` when ``y_actual`` is empty,
        because no label contributes a term.
    """
    label_counts = Counter(y_actual)
    n_samples = float(len(y_actual))
    entropy = 0
    for occurrences in label_counts.values():
        p = occurrences / n_samples
        entropy -= p * math.log2(p)
    print("Entropy Value: ", entropy)
    return entropy

def calc_information_gain(input_x, y_actual):
    """Return the information gain obtained by splitting ``y_actual`` on ``input_x``.

    The labels are partitioned by each distinct value of ``input_x``; the gain
    is the entropy of all labels minus the size-weighted sum of the partition
    entropies. The partition sizes are printed, and every ``calc_entropy``
    call prints its own result.

    Parameters
    ----------
    input_x : array-like feature column
        ``input_x == value`` must yield an element-wise boolean mask, so a
        numpy array or pandas Series is expected rather than a plain list.
    y_actual : array-like labels
        Must accept that boolean mask as an index and support ``len()``.

    Returns
    -------
    float
        ``calc_entropy(y_actual)`` minus the weighted partition entropies.
    """
    n_samples = float(len(y_actual))
    # One partition per distinct feature value, with its row count.
    value_counts = Counter(input_x)
    print(value_counts.items())

    weighted_entropy = 0
    for value, n_in_partition in value_counts.items():
        in_partition = input_x == value
        weight = n_in_partition / n_samples
        weighted_entropy += weight * calc_entropy(y_actual[in_partition])

    # The overall entropy is evaluated last so its printout follows the partitions'.
    return calc_entropy(y_actual) - weighted_entropy

In [4]:
# Entropy of the class label over the dataset as loaded into zoo_df.
class_type_entropy = calc_entropy(zoo_df['class_type'])
print(class_type_entropy)

Total Items for entropy =  101.0
Entropy Value:  2.390559682294039
2.390559682294039


In [5]:
# Information gain obtained by splitting the class label on the 'hair' column.
hair_info_gain = calc_information_gain(zoo_df['hair'], zoo_df['class_type'])
print(hair_info_gain)

Total Items =  101.0
dict_items([(1, 43), (0, 58)])
Total Items for entropy =  43.0
Entropy Value:  0.446481346896803
Total Items for entropy =  58.0
Entropy Value:  2.4549947941466774
Total Items for entropy =  101.0
Entropy Value:  2.390559682294039
0.7906745736101797


In [6]:
# copy X columns
# Feature matrix: every column except the label; label vector: class_type.
X = zoo_df.loc[:, zoo_df.columns != 'class_type']
y = zoo_df['class_type']

# SMOTE generates its synthetic samples randomly, so pin random_state: without
# it every run produces different resampled data and the information-gain
# figures computed from it are not reproducible.
# NOTE(review): k_neighbors=1 is presumably needed because the rarest class has
# very few rows - confirm against the class distribution.
oversampled = SMOTE(k_neighbors=1, random_state=42)
X, y = oversampled.fit_resample(X, y)

In [7]:
# Rank every feature by the information gain it yields on the resampled data.

# fit_resample hands back numpy arrays on old imbalanced-learn releases and a
# DataFrame/Series on newer ones. Normalising both to numpy keeps the
# positional slice X[:, i] valid in either case.
X_values = pd.DataFrame(X).to_numpy()
y_values = pd.Series(y).to_numpy()

# Use the same column selection that built X instead of indexing
# zoo_df.columns by position, which is only right if class_type is last.
feature_names = zoo_df.columns[zoo_df.columns != 'class_type']
assert X_values.shape[1] == len(feature_names), "feature matrix / column name mismatch"

# Despite its name, this list holds (feature name, information gain) pairs;
# the name is kept because a later cell prints `entropies`.
entropies = list()
for column_index, feature_name in enumerate(feature_names):
    print(feature_name)
    info_gain = calc_information_gain(X_values[:, column_index], y_values)
    entropies.append((feature_name, info_gain))

def index(tup):
    """Sort key: the second element (the information gain) of a result tuple."""
    return tup[1]

# Highest information gain first.
entropies.sort(key=index, reverse=True)

hair
Total Items =  287.0
dict_items([(1, 54), (0, 233)])
Total Items for entropy =  54.0
Entropy Value:  0.8524051786494786
Total Items for entropy =  233.0
Entropy Value:  2.61736385444666
Total Items for entropy =  287.0
Entropy Value:  2.807354922057604
feathers
Total Items =  287.0
dict_items([(0, 246), (1, 41)])
Total Items for entropy =  246.0
Entropy Value:  2.584962500721156
Total Items for entropy =  41.0
Entropy Value:  0.0
Total Items for entropy =  287.0
Entropy Value:  2.807354922057604
eggs
Total Items =  287.0
dict_items([(0, 57), (1, 230)])
Total Items for entropy =  57.0
Entropy Value:  1.1706463440108583
Total Items for entropy =  230.0
Entropy Value:  2.605523307544938
Total Items for entropy =  287.0
Entropy Value:  2.807354922057604
milk
Total Items =  287.0
dict_items([(1, 41), (0, 246)])
Total Items for entropy =  41.0
Entropy Value:  0.0
Total Items for entropy =  246.0
Entropy Value:  2.584962500721156
Total Items for entropy =  287.0
Entropy Value:  2.8073549

In [8]:
# Show the (feature name, value) pairs, which the preceding cell sorted in
# descending order of the value stored in each tuple's second slot.
print(entropies)

[('legs', 1.5281380230759691), ('backbone', 0.863120568566631), ('tail', 0.8573758507813407), ('toothed', 0.84031021518076), ('breathes', 0.6559426529091592), ('aquatic', 0.6010742315121358), ('feathers', 0.5916727785823275), ('milk', 0.5916727785823275), ('fins', 0.5606973433434317), ('hair', 0.5220738846598909), ('eggs', 0.4868071786988768), ('airborne', 0.4349011238822529), ('catsize', 0.3186898417840003), ('predator', 0.20044773153387885), ('venomous', 0.09365352002781746), ('domestic', 0.08120959679550266)]
