### Data analysis and a simple decision tree example using data from https://www.kaggle.com/uciml/zoo-animal-classification

### This example shows classificaiton using Random Forest method.


In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from collections import Counter
import math as math
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

Using TensorFlow backend.


Read the data file

In [2]:
zoo_df = pd.read_csv("data\Decision-Tree_Zoo-Data\zoo-animal-classification\zoo.csv")

Check if we are missing any data

In [3]:
zoo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
animal_name    101 non-null object
hair           101 non-null int64
feathers       101 non-null int64
eggs           101 non-null int64
milk           101 non-null int64
airborne       101 non-null int64
aquatic        101 non-null int64
predator       101 non-null int64
toothed        101 non-null int64
backbone       101 non-null int64
breathes       101 non-null int64
venomous       101 non-null int64
fins           101 non-null int64
legs           101 non-null int64
tail           101 non-null int64
domestic       101 non-null int64
catsize        101 non-null int64
class_type     101 non-null int64
dtypes: int64(17), object(1)
memory usage: 14.3+ KB


animal_name is not useful in classification. remove it from data set

In [4]:
animal_names = zoo_df['animal_name'].to_list()
zoo_df = zoo_df.drop('animal_name', axis=1)

In [5]:
#One-of-K Encode the "legs" column data
onehot_encoder = OneHotEncoder(sparse=False, categories = "auto", dtype=np.int)
onehot_encoded = onehot_encoder.fit_transform(zoo_df["legs"].values.reshape(-1,1))

# prepare column names
legs_clm_names = list()
col_index = 0
for name in onehot_encoder.categories_[0]:
    zoo_df["legs" + str(name)] = onehot_encoded[:, col_index]
    col_index = col_index + 1
zoo_df = zoo_df.drop("legs", axis=1)

In [6]:
for column in zoo_df.columns:
    zoo_df[column] = zoo_df[column].astype("category")

In [7]:
zoo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 22 columns):
hair          101 non-null category
feathers      101 non-null category
eggs          101 non-null category
milk          101 non-null category
airborne      101 non-null category
aquatic       101 non-null category
predator      101 non-null category
toothed       101 non-null category
backbone      101 non-null category
breathes      101 non-null category
venomous      101 non-null category
fins          101 non-null category
tail          101 non-null category
domestic      101 non-null category
catsize       101 non-null category
class_type    101 non-null category
legs0         101 non-null category
legs2         101 non-null category
legs4         101 non-null category
legs5         101 non-null category
legs6         101 non-null category
legs8         101 non-null category
dtypes: category(22)
memory usage: 4.6 KB


A glance over the data

In [8]:
zoo_df

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,...,tail,domestic,catsize,class_type,legs0,legs2,legs4,legs5,legs6,legs8
0,1,0,0,1,0,0,1,1,1,1,...,0,0,1,1,0,0,1,0,0,0
1,1,0,0,1,0,0,0,1,1,1,...,1,0,1,1,0,0,1,0,0,0
2,0,0,1,0,0,1,1,1,1,0,...,1,0,0,4,1,0,0,0,0,0
3,1,0,0,1,0,0,1,1,1,1,...,0,0,1,1,0,0,1,0,0,0
4,1,0,0,1,0,0,1,1,1,1,...,1,0,1,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1,0,0,1,0,0,0,1,1,1,...,1,0,1,1,0,1,0,0,0,0
97,1,0,1,0,1,0,0,0,0,1,...,0,0,0,6,0,0,0,0,1,0
98,1,0,0,1,0,0,1,1,1,1,...,1,0,1,1,0,0,1,0,0,0
99,0,0,1,0,0,0,0,0,0,1,...,0,0,0,7,1,0,0,0,0,0


In [9]:
# copy X columns
X = zoo_df.loc[:, zoo_df.columns != 'class_type']
y = zoo_df['class_type']

## Split the data into training and testing sets

In [10]:
#oversampler = SMOTE(k_neighbors=1)
oversampler = RandomOverSampler()
X, y = oversampler.fit_resample(X, y)

In [11]:
# split the data (70-30 split)
train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size = 0.30, stratify = y)

Build the machine learning model

In [12]:
model = RandomForestClassifier(n_estimators=9, 
                               bootstrap = True,
                               max_features = 'sqrt')
model.fit(train_X, train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=9,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Predict the class of test data

In [13]:
y_predicted = model.predict(test_X)

How good is the model?
check accuracy

In [14]:
from sklearn.metrics import accuracy_score

In [15]:
print(accuracy_score(test_Y, y_predicted))

1.0


Build confusion matrix

In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
c_matrix = confusion_matrix(test_Y, y_predicted)
print(c_matrix)

[[13  0  0  0  0  0  0]
 [ 0 12  0  0  0  0  0]
 [ 0  0 13  0  0  0  0]
 [ 0  0  0 12  0  0  0]
 [ 0  0  0  0 13  0  0]
 [ 0  0  0  0  0 12  0]
 [ 0  0  0  0  0  0 12]]
