In [189]:
#Run this cell
#Importing necessary libraries 
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt
import json
ans=[0]*5

In [190]:
#Import the dataset and define the feature as well as the target datasets / columns 
df=pd.read_csv('zoo.csv')
#We drop the animal names since this is not a good feature to split the data on. 
df=df.drop('animal_name',axis=1)
df

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,1,1
97,1,0,1,0,1,0,0,0,0,1,1,0,6,0,0,0,6
98,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
99,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,7


In [191]:
#Write a function to find the entropy on a split "target_col"
def entropy(target_col):
    ans=0
    for i in target_col.unique():
        p=sum(target_col==i)/len(target_col)
        if(p>0):
            ans-=p*(np.log(p)/np.log(2))
    return ans

In [192]:
#Find the entropy of all the features in the dataset
#Save all the feature names in an array "feature names"
feature_names=['hair','feathers','eggs','milk','airborne','aquatic','predator','toothed','backbone', 
               'breathes','venomous','fins','legs','tail','domestic','catsize']
for i in feature_names:
    print(entropy(df[i]))

0.9840304711717017
0.7179499765002912
0.9794662187017299
0.9743197211096903
0.7910662980902585
0.9396846718728563
0.9914266810680207
0.9685867165455516
0.6761627418829198
0.7374895672137456
0.3993820824245975
0.653839880626333
2.0338113440641234
0.8228368841492257
0.5538976334852962
0.9880162151534646


In [193]:
#Find the entropy of the feature "toothed"
ans[0]=entropy(df['toothed'])
ans[0]

0.9685867165455516

In [194]:
#Write a function to calculate Information Gain on a split attribute and a target column
def InfoGain(data,split_attribute_name,target_name="class"):       
    #Calculate the entropy of the total dataset  
    original_entropy=entropy(data[target_name])
    #Calculate the values and the corresponding counts for the split attribute   
    values=data[split_attribute_name].unique()
    sub=0
    for i in values:
        split=data[data[split_attribute_name]==i]
        #Calculate the weighted entropy  
        sub+=split.shape[0]/data.shape[0]*entropy(split[target_name])
    #Calculate the information gain  
    return original_entropy-sub

In [195]:
#Find the information gain having split attribute "hair" and the target feature name "milk"
ans[1]=InfoGain(df,"hair","milk")
ans[1]

0.6599660577558697

In [196]:
#Find the Info gain having "milk" as the split attribute and all the other features as target features one at a time
for i in feature_names:
    if i!="milk":
        print(i+" - "+str(InfoGain(df,"milk",i)))

hair - 0.6599660577558697
feathers - 0.17242769884415887
eggs - 0.7870598185734243
airborne - 0.11370352314621823
aquatic - 0.10181386403185955
predator - 0.000636777244021336
toothed - 0.3465412540071715
backbone - 0.15262359382508262
breathes - 0.18259765312929555
venomous - 0.06284178150207947
fins - 0.018672543666944375
legs - 0.36709133269262484
tail - 0.03350320459269063
domestic - 0.01901013577509325
catsize - 0.2509271768405701


In [197]:
#Import Decision Tree Classifier from sklearn 
from sklearn.tree import DecisionTreeClassifier
#Split the given data into 80 percent training data and 20 percent testing data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df[feature_names],df['class_type'],test_size=0.2,random_state=16)

In [198]:
#Fit the given data
tree=DecisionTreeClassifier(random_state=16)
tree.fit(X_train,y_train)

DecisionTreeClassifier(random_state=16)

In [199]:
#Make a prediction on the test data and return the percentage of accuracy
y_pred=tree.predict(X_test)
ans[2]=tree.score(X_test,y_test)*100
ans[2]

95.23809523809523

In [202]:
#Run this cell to visualize the decision tree
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
export_graphviz(tree, out_file=dot_data, feature_names=feature_names,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

ModuleNotFoundError: No module named 'sklearn.externals.six'

In [203]:
#Use sklearn to make a classification report and a confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
clf_report=classification_report(y_test,y_pred,output_dict=True)
cnf_matrix=confusion_matrix(y_test,y_pred)
print(classification_report(y_test,y_pred))
print(cnf_matrix)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00         2
           3       0.50      1.00      0.67         1
           4       1.00      1.00      1.00         5
           5       0.00      0.00      0.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         2

    accuracy                           0.95        21
   macro avg       0.79      0.86      0.81        21
weighted avg       0.93      0.95      0.94        21

[[9 0 0 0 0 0 0]
 [0 2 0 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 0 5 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 0 0 0 1 0]
 [0 0 0 0 0 0 2]]


  _warn_prf(average, modifier, msg_start, len(result))


In [204]:
#Find the recall,f1-score for class type '3'
ans[3]=[clf_report['3']['recall'],clf_report['3']['f1-score']]
ans[3]

[1.0, 0.6666666666666666]

In [205]:
#Calculate Mean Absolute Error,Mean Squared Error and Root Mean Squared Error
from sklearn.metrics import mean_absolute_error,mean_squared_error
mae=mean_absolute_error(y_test, y_pred)
mse=mean_squared_error(y_test, y_pred)
rmse=np.sqrt(mse)

In [206]:
#Find the mean absolute error and root mean square error, save then in a list [mae,rmse]
ans[4]=[mae,rmse]
ans[4]

[0.09523809523809523, 0.4364357804719847]

In [207]:
##do not change this code
import json
ans = [str(item) for item in ans]

filename = ""

# Eg if your name is Saurav Joshi and email id is sauravjoshi123@gmail.com, filename becomes
# filename = sauravjoshi123@gmail.com_Saurav_Joshi_LinearRegression

## Do not change anything below!!
- Make sure you have changed the above variable "filename" with the correct value. Do not change anything below!!

In [208]:
from importlib import import_module
import os
from pprint import pprint

findScore = import_module('findScore')
response = findScore.main(ans)
response['details'] = filename
with open(f'evaluation_{filename}.json', 'w') as outfile:
    json.dump(response, outfile)
pprint(response)

ModuleNotFoundError: No module named 'findScore'