### Data analysis and a simple decision tree example using data from https://www.kaggle.com/uciml/zoo-animal-classification

In [1]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE

Read the data file

In [2]:
# Load the zoo data set. Forward slashes are used because the previous
# backslash-separated literal contained invalid escape sequences ("\D", "\z")
# and only resolved on Windows; this form works on Windows, macOS and Linux.
# The animal_name column is dropped straight away so that only the remaining
# columns are kept for the analysis below.
zoo_df = (
    pd.read_csv("data/Decision-Tree_Zoo-Data/zoo-animal-classification/zoo.csv")
    .drop(columns=['animal_name'])
)

In [3]:
# Assumes both `feature` and `y_actual` hold categorical (hashable) values.
def calc_gini_split_quality(feature, y_actual, verbose=True):
    """Return the weighted Gini impurity of splitting ``y_actual`` on ``feature``.

    For every distinct value of ``feature`` the Gini index of the class labels
    that occur with that value is computed as ``1 - sum(p_class ** 2)``. The
    result is the sum of those Gini indices, each weighted by the fraction of
    rows that carry the feature value. Lower values mean purer child nodes.

    Parameters
    ----------
    feature : iterable of hashable
        Feature value for each row (list, tuple, 1-D array or Series).
    y_actual : iterable of hashable
        Class label for each row, in the same order as ``feature``.
    verbose : bool, default True
        When true, print the per-class counts, the Gini index of every
        feature value and the final split quality (the historical behaviour).
        Pass ``False`` to compute silently.

    Returns
    -------
    float
        The weighted Gini split quality; ``0.0`` for empty input.

    Raises
    ------
    ValueError
        If ``feature`` and ``y_actual`` do not have the same length.
    """
    # Materialise both inputs so rows are paired strictly by position. This
    # avoids label-based alignment between two Series and makes plain lists
    # work (a list compared with a scalar does not produce a boolean mask).
    feature_values = list(feature)
    class_labels = list(y_actual)
    if len(feature_values) != len(class_labels):
        raise ValueError(
            "feature and y_actual must have the same length (got %d and %d)"
            % (len(feature_values), len(class_labels))
        )
    total_count = len(feature_values)

    # Map each feature value to a Counter of the class labels seen with it.
    # Dict/Counter insertion order gives a deterministic first-seen ordering.
    class_counts_by_value = {}
    for fea, clas in zip(feature_values, class_labels):
        class_counts_by_value.setdefault(fea, Counter())[clas] += 1

    gini_split_quality = 0.0
    for fea, class_counter in class_counts_by_value.items():
        num_items_at_node = sum(class_counter.values())
        sum_squared_proportions = 0.0
        for clas, count in class_counter.items():
            if verbose:
                print("Feature Value = ", fea, "Class = ", clas, "Count ", count, " of ", num_items_at_node)
            sum_squared_proportions += (count / num_items_at_node) ** 2
        gini_index = 1 - sum_squared_proportions
        if verbose:
            print("Gini Index: ", gini_index)
        # Weight the node impurity by the share of rows that reach the node.
        gini_split_quality += (num_items_at_node / total_count) * gini_index

    if verbose:
        print("gini split quality ", gini_split_quality)
    return gini_split_quality

In [10]:
# Sanity check on the raw (not yet oversampled) data: split quality of the
# 'feathers' column with respect to the class label.
print(
    calc_gini_split_quality(
        feature=zoo_df['feathers'],
        y_actual=zoo_df['class_type'],
    )
)

Feature Value =  0 Class =  1 Count  41  of  81
Feature Value =  0 Class =  4 Count  13  of  81
Feature Value =  0 Class =  7 Count  10  of  81
Feature Value =  0 Class =  6 Count  8  of  81
Feature Value =  0 Class =  5 Count  4  of  81
Feature Value =  0 Class =  3 Count  5  of  81
Gini Index:  0.6867855509830818
Feature Value =  1 Class =  2 Count  20  of  20
Gini Index:  0.0
gini split quality  0.5507884121745508
0.5507884121745508


In [5]:
# Split the frame into the feature columns (everything except the target)
# and the target column.
X = zoo_df.loc[:, zoo_df.columns != 'class_type']
y = zoo_df['class_type']

# Oversample the minority classes with SMOTE so every class is equally
# represented. random_state is fixed so that a Restart & Run All reproduces
# the same synthetic rows (unseeded runs gave different split qualities).
# k_neighbors=1 is kept from the original; presumably it is needed because the
# smallest class has only a handful of rows -- TODO confirm.
oversampled = SMOTE(k_neighbors=1, random_state=42)
X, y = oversampled.fit_resample(X, y)

In [6]:
# Names of the feature columns, taken by excluding the target explicitly
# instead of relying on 'class_type' being the last column of zoo_df.
feature_names = [name for name in zoo_df.columns if name != 'class_type']

# Depending on the imbalanced-learn version, fit_resample returns either a
# NumPy array or a DataFrame. Wrapping in a DataFrame gives uniform positional
# column access for both (X[:, i] fails on a DataFrame).
X_frame = pd.DataFrame(X)

# Compute the Gini split quality of every feature against the class label.
gini_qualities = list()
for column_index in range(X_frame.shape[1]):
    feature_name = feature_names[column_index]
    print(feature_name)
    gini_split_quality = calc_gini_split_quality(X_frame.iloc[:, column_index].values, y)
    gini_qualities.append((feature_name, gini_split_quality))

def index(tup):
    """Sort key: the split quality stored at position 1 of a (name, quality) tuple."""
    return tup[1]

# Ascending order: the lowest (best) split quality comes first.
gini_qualities.sort(key=index, reverse=False)

hair
Feature Value =  0 Class =  4 Count  41  of  225
Feature Value =  0 Class =  2 Count  41  of  225
Feature Value =  0 Class =  7 Count  41  of  225
Feature Value =  0 Class =  1 Count  2  of  225
Feature Value =  0 Class =  6 Count  18  of  225
Feature Value =  0 Class =  5 Count  41  of  225
Feature Value =  0 Class =  3 Count  41  of  225
Gini Index:  0.8274962962962963
Feature Value =  1 Class =  1 Count  39  of  62
Feature Value =  1 Class =  6 Count  23  of  62
Gini Index:  0.4667013527575442
gini split quality  0.7495545314900154
feathers
Feature Value =  0 Class =  1 Count  41  of  246
Feature Value =  0 Class =  4 Count  41  of  246
Feature Value =  0 Class =  7 Count  41  of  246
Feature Value =  0 Class =  6 Count  41  of  246
Feature Value =  0 Class =  5 Count  41  of  246
Feature Value =  0 Class =  3 Count  41  of  246
Gini Index:  0.8333333333333333
Feature Value =  1 Class =  2 Count  41  of  41
Gini Index:  0.0
gini split quality  0.7142857142857142
eggs
Feature Va

Feature Value =  1 Class =  1 Count  32  of  59
Feature Value =  1 Class =  4 Count  18  of  59
Feature Value =  1 Class =  2 Count  7  of  59
Feature Value =  1 Class =  7 Count  1  of  59
Feature Value =  1 Class =  3 Count  1  of  59
Gini Index:  0.5981039931054295
gini split quality  0.7901108703963298


In [7]:
print(gini_qualities)

[('legs', 0.5000286990297369), ('feathers', 0.7142857142857142), ('milk', 0.7142857142857142), ('backbone', 0.7142857142857142), ('fins', 0.7278761642398006), ('tail', 0.7285573678290214), ('toothed', 0.7323169033695349), ('eggs', 0.7434000796971508), ('breathes', 0.7460891565406407), ('hair', 0.7495545314900154), ('aquatic', 0.7639647266313933), ('airborne', 0.7643140589569162), ('catsize', 0.7901108703963298), ('predator', 0.8173306830294127), ('venomous', 0.8412520064205458), ('domestic', 0.8427170868347339)]


In [8]:
# Output of print(gini_qualities) kept from an earlier run for comparison; the values differ slightly from the run above, presumably because the SMOTE resampling was not seeded.
#[('legs', 0.5188428777154763), ('feathers', 0.7142857142857142), ('milk', 0.7142857142857142), ('backbone', 0.7142857142857142), ('fins', 0.7278761642398006), ('tail', 0.7285573678290214), ('toothed', 0.7346144760793141), ('hair', 0.7518143184047074), ('breathes', 0.7520839661165671), ('eggs', 0.7527533664872722), ('aquatic', 0.7635619387552759), ('airborne', 0.7687644220683869), ('catsize', 0.7915976381632046), ('predator', 0.8216503992901509), ('venomous', 0.8362755748687307), ('domestic', 0.8410830999066294)]

In [9]:
# Per-feature scores kept from another experiment; the calculation that produced them is not shown in this notebook (TODO: document the metric or remove this cell).
#[('eggs', 0.08924803963874132), ('backbone', 0.07288629737609328), ('breathes', 0.0675097947736926), ('tail', 0.04243672235034128), ('toothed', 0.035869817232329584), ('predator', 0.02867257957868947), ('aquatic', 0.020560490455481136), ('airborne', 0.005002177885416873), ('hair', 0.004979715864316363), ('fins', 0.003230343204030913), ('feathers', 0.0029154518950437313), ('milk', 0.0029154518950437313), ('catsize', 0.0027329638366113433), ('venomous', 0.00021116837915959296), ('domestic', 8.269915489923965e-05), ('legs', 5.287669750590769e-06)]