In [1]:
#Run this cell
#Importing necessary libraries 
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt
import json
ans=[0]*5

In [2]:
#Import the dataset and define the feature as well as the target datasets / columns 
df = pd.read_csv('zoo.csv')

#We drop the animal names since this is not a good feature to split the data on. 
features = df.iloc[:, 1:17]
target = df.iloc[:, 17:18]
df.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [3]:
#Write a function to find the entropy on a split "target_col"

def entropy(target_col, n):
    if isinstance(target_col, pd.Series):
        p_i = target_col.value_counts()/target_col.shape[0]
        entropy = np.sum(-p_i*(np.log10(p_i)/np.log10(n)))
        return(entropy)

    else:
        raise('Object must be a Pandas Series.')
  

In [4]:
#Find the entropy of all the features in the dataset
#Save all the feature names in an array "feature names"
feature_names=['hair','feathers','eggs','milk','airborne','aquatic','predator','toothed','backbone', 
               'breathes','venomous','fins','legs','tail','domestic','catsize']

entropy_array = []
for i in feature_names:
    entropy_array.append(entropy(features[i], len(features[i].unique())))

print(entropy_array)

[0.9840304711717018, 0.7179499765002912, 0.9794662187017298, 0.9743197211096903, 0.7910662980902585, 0.9396846718728562, 0.9914266810680206, 0.9685867165455516, 0.6761627418829198, 0.7374895672137456, 0.3993820824245974, 0.653839880626333, 0.7867856278366623, 0.8228368841492258, 0.5538976334852962, 0.9880162151534646]


In [5]:
#Find the entropy of the feature "toothed"
ans[0]= entropy(features['toothed'], len(features['toothed'].unique()))
ans[0]

0.9685867165455516

In [6]:
#Write a function to calculate Information Gain on a split attribute and a target column
def InfoGain(data,split_attribute_name,target_name):       
    #Calculate the entropy of the total dataset  
    original_entropy = entropy(data[target_name], len(data[target_name].unique()))
    #Calculate the values and the corresponding counts for the split attribute   
    counts1 = []
    entropies1 = []
    for j in data[split_attribute_name].unique():
        target_col = []
        #print(j)
        for k in data[split_attribute_name].index:
            #print(features[split_attribue_name].iloc[k])
            if data[split_attribute_name].iloc[k] == j:
                #print(features[split_attribue_name].iloc[k])
                target_col.append(data[target_name].iloc[k])
        target_col_DF = pd.DataFrame(target_col, columns = ['class'])
        #print(j)
        counts1.append(len(target_col))
        entropies1.append(entropy(target_col_DF['class'], len(data[target_name].unique())))
    #Calculate the weighted entropy  
    counts = np.array(counts1)
    entropies = np.array(entropies1)
    weighted_entropy = np.sum(np.multiply(counts, entropies))/np.sum(counts)

    #Calculate the information gain  
    info_gain = original_entropy - weighted_entropy
    return info_gain
    

In [7]:
#Find the information gain having split attribute "hair" and the target feature name "milk"

ans[1]= InfoGain(df, 'hair', 'milk')
ans[1]

0.6599660577558697

In [8]:
#Find the Info gain having "milk" as the split attribute and all the other features as target features one at a time
split_attribute = 'milk'
for i in feature_names:
    if i != split_attribute:
        print("Information Gain with target feature '" + i + "' = " + str(InfoGain(df, split_attribute, i)))

Information Gain with target feature 'hair' = 0.6599660577558699
Information Gain with target feature 'feathers' = 0.17242769884415887
Information Gain with target feature 'eggs' = 0.7870598185734242
Information Gain with target feature 'airborne' = 0.11370352314621812
Information Gain with target feature 'aquatic' = 0.10181386403185944
Information Gain with target feature 'predator' = 0.0006367772440212249
Information Gain with target feature 'toothed' = 0.3465412540071714
Information Gain with target feature 'backbone' = 0.15262359382508262
Information Gain with target feature 'breathes' = 0.18259765312929555
Information Gain with target feature 'venomous' = 0.06284178150207931
Information Gain with target feature 'fins' = 0.018672543666944486
Information Gain with target feature 'legs' = 0.14201031256361096
Information Gain with target feature 'tail' = 0.03350320459269085
Information Gain with target feature 'domestic' = 0.019010135775093362
Information Gain with target feature 'cat

In [9]:
#Import Decision Tree Classifier from sklearn 
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
#Split the given data into 80 percent training data and 20 percent testing data
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state = 42)

In [10]:
#Fit the given data
tree = DecisionTreeClassifier(criterion = 'entropy')
tree = tree.fit(x_train, y_train)

In [11]:
#Make a prediction on the test data and return the percentage of accuracy
from sklearn import metrics

ypred_entropy = tree.predict(x_test)
accuracy = metrics.accuracy_score(y_test, ypred_entropy)
ans[2] = accuracy*100
ans[2]

95.23809523809523

In [12]:
#Run this cell to visualize the decision tree
from six import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
export_graphviz(tree, out_file=dot_data, feature_names=feature_names,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

InvocationException: GraphViz's executables not found

In [13]:
#Use sklearn to make a classification report and a confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
print("Classification Report:-")
print(classification_report(y_test, ypred_entropy, zero_division = 0))
print("Confusion Matrix:-")
matrix = metrics.confusion_matrix(y_test, ypred_entropy)
print(print(metrics.confusion_matrix(y_test, ypred_entropy)))

Classification Report:-
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         2
           3       0.00      0.00      0.00         1
           4       1.00      1.00      1.00         2
           5       0.00      0.00      0.00         0
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         1

    accuracy                           0.95        21
   macro avg       0.71      0.71      0.71        21
weighted avg       0.95      0.95      0.95        21

Confusion Matrix:-
[[12  0  0  0  0  0  0]
 [ 0  2  0  0  0  0  0]
 [ 0  0  0  0  1  0  0]
 [ 0  0  0  2  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0]
 [ 0  0  0  0  0  0  1]]
None


In [14]:
#Find the recall,f1-score for class type '3'
TP_FP = 0
for i in range(len(matrix)):
    TP_FP += matrix[i, 2]
if TP_FP == 0:
    precision = 0
    recall = 0
    f1 = 0
else:
    TP_FN = sum(matrix[2])
    if TP_FN != 0:
        recall = matrix[2,2]/TP_FN
        precision = matrix[2,2]/TP_FP
        f1 = 2*recall*precision/(recall + precision)
    else:
        precision = 0
        recall = 0
        f1 = 0
ans[3]= [recall, f1]
ans[3]

[0, 0]

In [15]:
#Calculate Mean Absolute Error,Mean Squared Error and Root Mean Squared Error
import math
ypred = np.array(ypred_entropy)
ytest1 = np.array(y_test['class_type'])
ytest = np.multiply(-1, ytest1)
mae = np.sum(np.abs(np.add(ytest, ypred)))/len(ytest)
print(mae)
mse = np.sum(np.square(np.add(ytest, ypred)))/len(ytest)
print(mse)
rmse = math.sqrt(mse)
print(rmse)

0.09523809523809523
0.19047619047619047
0.4364357804719847


In [16]:
#Find the mean absolute error and root mean square error, save then in a list [mae,rmse]
ans[4]= [mae, rmse]
ans[4]

[0.09523809523809523, 0.4364357804719847]

In [17]:
##do not change this code
import json
ans = [str(item) for item in ans]

filename = "rushali.c1710@gmail.com_Rushali_Chakraborty_DecisionTrees"

# Eg if your name is Saurav Joshi and email id is sauravjoshi123@gmail.com, filename becomes
# filename = sauravjoshi123@gmail.com_Saurav_Joshi_LinearRegression

## Do not change anything below!!
- Make sure you have changed the above variable "filename" with the correct value. Do not change anything below!!

In [18]:
from importlib import import_module
import os
from pprint import pprint

findScore = import_module('findScore')
response = findScore.main(ans)
response['details'] = filename
with open(f'evaluation_{filename}.json', 'w') as outfile:
    json.dump(response, outfile)
pprint(response)

{'Comments': 'Please make sure the Python script you are submitting is not '
             'having errors. Try running before submitting',
 'Message': 'Successfully submitted! Thanks for taking the exercise',
 'details': 'rushali.c1710@gmail.com_Rushali_Chakraborty_DecisionTrees',
 'status': 'Success'}
