### Data analysis and a simple decision tree example using data from https://www.kaggle.com/uciml/zoo-animal-classification

In [1]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE

Read the data file

In [2]:
# Load the zoo dataset. Forward slashes keep the relative path portable across
# operating systems and avoid the invalid "\D" / "\z" escape sequences that a
# backslash-separated (non-raw) string literal would contain.
zoo_df = pd.read_csv("data/Decision-Tree_Zoo-Data/zoo-animal-classification/zoo.csv")
# Drop the animal_name column; it is not used in the analysis below.
zoo_df = zoo_df.drop('animal_name', axis=1)

In [3]:
# assumes x and y are categorical data
def calc_gini_value(y_actual):
    """Compute the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    labels that belong to class ``k``. The computed value is also printed
    (prefixed with ``"Gini Value: "``) before it is returned.

    Args:
        y_actual: Sized iterable of hashable class labels (e.g. a list, a
            numpy array or a pandas Series).

    Returns:
        The Gini impurity as a float. An empty collection yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    total_count = float(len(y_actual))
    if total_count == 0:
        # No labels means there is nothing to be impure about; guarding here
        # avoids dividing by zero below.
        gini_value = 0.0
    else:
        class_counter = Counter(y_actual)
        # Accumulate with an explicit loop (rather than sum()) so the
        # floating-point additions happen in a fixed, predictable order.
        gini_sum = 0
        for count in class_counter.values():
            gini_sum = gini_sum + (count / total_count) ** 2
        gini_value = 1 - gini_sum
    print("Gini Value: ", gini_value)
    return gini_value

def calc_gini_index(input_x, y_actual):
    """Compute the weighted Gini impurity of splitting on a categorical feature.

    The rows are partitioned by the distinct values found in ``input_x``. For
    each partition the Gini impurity of the matching labels is computed with
    ``calc_gini_value`` and weighted by the partition's share of all rows.
    The partition sizes are printed before the per-partition impurities.

    Values and labels are paired by position, so this works for plain Python
    sequences as well as numpy arrays and pandas Series (any pandas index
    labels are ignored).

    Args:
        input_x: Sized iterable of hashable feature values, one per row.
        y_actual: Sized iterable of class labels, the same length as
            ``input_x``.

    Returns:
        The weighted sum of the partition impurities (``0`` when there are
        no rows).

    Raises:
        ValueError: If ``input_x`` and ``y_actual`` differ in length.
    """
    if len(input_x) != len(y_actual):
        raise ValueError(
            "input_x and y_actual must have the same length, got "
            "{} and {}".format(len(input_x), len(y_actual))
        )
    total_count = float(len(y_actual))

    # divide the input_x column into k partitions, collecting the labels that
    # fall into each one in a single positional pass (first-seen key order).
    labels_by_partition = {}
    for feature_value, label in zip(input_x, y_actual):
        labels_by_partition.setdefault(feature_value, []).append(label)

    partitions = {key: len(labels) for key, labels in labels_by_partition.items()}
    print(partitions.items())

    gini_split_quality = 0
    for partition_key, count in partitions.items():
        partition_gini = calc_gini_value(labels_by_partition[partition_key])
        gini_split_quality = gini_split_quality + (count / total_count) * partition_gini
    return gini_split_quality

In [4]:
# Gini impurity of the target column before any split is applied.
baseline_gini = calc_gini_value(zoo_df['class_type'])
print(baseline_gini)

Gini Value:  0.7593373198706009
0.7593373198706009


In [5]:
# Weighted Gini impurity obtained when the data is split on the 'hair' column.
hair_split_gini = calc_gini_index(zoo_df['hair'], zoo_df['class_type'])
print(hair_split_gini)

dict_items([(1, 43), (0, 58)])
Gini Value:  0.1687398593834506
Gini Value:  0.7829964328180736
0.5214812579894718


In [6]:
# Separate the feature columns (everything except 'class_type') from the target.
X = zoo_df.loc[:, zoo_df.columns != 'class_type']
y = zoo_df['class_type']

# Oversample with SMOTE. SMOTE draws random synthetic samples, so random_state
# is fixed to make this cell (and every number derived from X and y below)
# reproducible under Restart & Run All.
# NOTE(review): k_neighbors=1 is presumably required because the smallest class
# has very few rows - confirm against the class counts.
oversampled = SMOTE(k_neighbors=1, random_state=42)
X, y = oversampled.fit_resample(X, y)

In [7]:
# Feature names are taken by name rather than by position, so they stay correct
# even if 'class_type' is not the last column of zoo_df.
feature_names = [name for name in zoo_df.columns if name != 'class_type']

# fit_resample may hand back numpy arrays or pandas objects depending on the
# imbalanced-learn version; normalise both so column access works either way.
feature_frame = pd.DataFrame(X, columns=feature_names)
y_values = pd.Series(y).to_numpy()

# Compute the weighted Gini impurity of a split on each feature.
gini_indexes = []
for feature_name in feature_names:
    print(feature_name)
    gini_index = calc_gini_index(feature_frame[feature_name].to_numpy(), y_values)
    gini_indexes.append((feature_name, gini_index))

def index(tup):
    """Sort key: return the gini index stored at position 1 of a (name, gini) tuple."""
    return tup[1]

# Lowest gini index (purest split) first.
gini_indexes.sort(key=index)

hair
dict_items([(1, 57), (0, 230)])
Gini Value:  0.43213296398891965
Gini Value:  0.831039697542533
feathers
dict_items([(0, 246), (1, 41)])
Gini Value:  0.8333333333333333
Gini Value:  0.0
eggs
dict_items([(0, 54), (1, 233)])
Gini Value:  0.40672153635116604
Gini Value:  0.832949584630404
milk
dict_items([(1, 41), (0, 246)])
Gini Value:  0.0
Gini Value:  0.8333333333333333
airborne
dict_items([(0, 227), (1, 60)])
Gini Value:  0.8337829183566536
Gini Value:  0.5227777777777777
aquatic
dict_items([(0, 164), (1, 123)])
Gini Value:  0.7869571683521713
Gini Value:  0.7323682992927489
predator
dict_items([(1, 161), (0, 126)])
Gini Value:  0.829366150997261
Gini Value:  0.8117913832199547
toothed
dict_items([(1, 149), (0, 138)])
Gini Value:  0.7436601954866898
Gini Value:  0.7248477210670028
backbone
dict_items([(1, 205), (0, 82)])
Gini Value:  0.7999999999999999
Gini Value:  0.5
breathes
dict_items([(1, 204), (0, 83)])
Gini Value:  0.814398308342945
Gini Value:  0.5989258237770358
venomous

In [8]:
# Show the (feature name, gini index) tuples, sorted ascending by gini index.
print(gini_indexes)

[('legs', 0.5188428777154763), ('feathers', 0.7142857142857142), ('milk', 0.7142857142857142), ('backbone', 0.7142857142857142), ('fins', 0.7278761642398006), ('tail', 0.7285573678290214), ('toothed', 0.7346144760793141), ('hair', 0.7518143184047074), ('breathes', 0.7520839661165671), ('eggs', 0.7527533664872722), ('aquatic', 0.7635619387552759), ('airborne', 0.7687644220683869), ('catsize', 0.7915976381632046), ('predator', 0.8216503992901509), ('venomous', 0.8362755748687307), ('domestic', 0.8410830999066294)]


In [9]:
# Preview the first 20 rows of the 'legs' feature next to the class label.
zoo_df.loc[:, ['legs', 'class_type']].head(20)

Unnamed: 0,legs,class_type
0,4,1
1,4,1
2,0,4
3,4,1
4,4,1
5,4,1
6,4,1
7,0,4
8,0,4
9,4,1
