#### Importing Python Machine Learning Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

#### Import Data

In [79]:
# Load the zoo dataset from the CSV file in the current working directory.
data = pd.read_csv(filepath_or_buffer='zoo.csv')
# Display (n_rows, n_columns) as the cell output.
data.shape

(101, 18)

In [80]:
# Preview the first five rows of the dataset.
data.head(n=5)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [62]:
# Preview the last five rows of the dataset.
data.tail(n=5)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
96,wallaby,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,1,1
97,wasp,1,0,1,0,1,0,0,0,0,1,1,0,6,0,0,0,6
98,wolf,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
99,worm,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,7
100,wren,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,2


In [63]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
animal_name    101 non-null object
hair           101 non-null int64
feathers       101 non-null int64
eggs           101 non-null int64
milk           101 non-null int64
airborne       101 non-null int64
aquatic        101 non-null int64
predator       101 non-null int64
toothed        101 non-null int64
backbone       101 non-null int64
breathes       101 non-null int64
venomous       101 non-null int64
fins           101 non-null int64
legs           101 non-null int64
tail           101 non-null int64
domestic       101 non-null int64
catsize        101 non-null int64
class_type     101 non-null int64
dtypes: int64(17), object(1)
memory usage: 14.3+ KB


In [81]:
# Show the column labels as a NumPy array.
data.columns.values

array(['animal_name', 'hair', 'feathers', 'eggs', 'milk', 'airborne',
       'aquatic', 'predator', 'toothed', 'backbone', 'breathes',
       'venomous', 'fins', 'legs', 'tail', 'domestic', 'catsize',
       'class_type'], dtype=object)

In [82]:
# Drop the animal name, the only non-numeric (object) column, so that the
# remaining columns can be used as a purely numeric array.
# errors='ignore' keeps this cell idempotent: because `data` is reassigned in
# place, re-running the cell would otherwise raise a KeyError once the column
# has already been removed.
data = data.drop('animal_name', axis=1, errors='ignore')

In [83]:
# Separate the dataset into the feature matrix X and the target vector y.
# NOTE(review): LabelEncoder is imported here but is not used in the visible cells.
from sklearn.preprocessing import LabelEncoder

# The last column (class_type) is the label; every column before it is a feature.
data_array = data.values
X, y = data_array[:, :-1], data_array[:, -1]

In [84]:
# Hold out 30% of the rows as a test set using sklearn's train_test_split().
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=100,
)

In [85]:
# Report the shape of each split, in the order X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(70, 16)
(31, 16)
(70,)
(31,)


In [86]:
# Decision tree that chooses splits by Gini impurity.
# Depth and leaf size are capped to keep the tree small.
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [87]:
# Decision tree that chooses splits by information gain (entropy).
# Same depth and leaf-size caps as the Gini tree so the two are comparable.
clf_entropy = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
clf_entropy.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [88]:
# Predict class labels for the held-out test rows with the Gini tree,
# then display the predictions as the cell output.
y_pred_gini = clf_gini.predict(X_test)
y_pred_gini

array([2, 4, 7, 1, 1, 7, 1, 2, 2, 2, 1, 7, 1, 7, 7, 1, 4, 1, 2, 2, 1, 1,
       2, 2, 1, 1, 2, 2, 7, 2, 1])

In [89]:
# Predict class labels for the held-out test rows with the entropy tree,
# then display the predictions as the cell output.
y_pred_entr = clf_entropy.predict(X_test)
y_pred_entr

array([2, 4, 2, 1, 1, 2, 1, 2, 2, 2, 1, 6, 1, 2, 6, 1, 4, 1, 2, 2, 1, 1,
       2, 2, 1, 1, 2, 2, 6, 2, 1])

In [45]:
#Calculating Accuracy Score

In [90]:
# Accuracy of the Gini-criterion tree on the test set
# (fraction of test labels predicted correctly).
gini_accuracy = accuracy_score(y_test, y_pred_gini)
print("Accuracy is ", gini_accuracy)

Accuracy is  0.8064516129032258


In [92]:
# Accuracy of the information-gain (entropy) tree on the test set
# (fraction of test labels predicted correctly).
entropy_accuracy = accuracy_score(y_test, y_pred_entr)
print("Accuracy is ", entropy_accuracy)

Accuracy is  0.9032258064516129


In [94]:
# Put the true test labels next to the entropy tree's predictions and show the
# first five rows for a quick visual comparison.
pd.DataFrame({"Actual": y_test, "Predict": y_pred_entr}).head()

Unnamed: 0,Acutal,Predict
0,2,2
1,4,4
2,5,2
3,1,1
4,1,1
