In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

In [2]:
# Load the mushroom dataset; the relative path means mushroom.csv must sit in
# the notebook's working directory.
data = pd.read_csv('mushroom.csv')

In [3]:
# Sanity-check how many rows and columns were loaded.
data.shape

(8124, 23)

In [4]:
# Preview the first rows: every attribute is stored as a single-letter code.
data.head()

Unnamed: 0,edible,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
# Convert object-dtype (string) columns to integer category codes

def convertCategoricalData(dataframe):
    """Return a copy of ``dataframe`` with text columns encoded as integer codes.

    Every column whose dtype is ``object`` or pandas' dedicated string dtype is
    converted to the ``category`` dtype and then replaced by its integer
    category codes. Columns of any other dtype are passed through unchanged.

    The input frame is left untouched, so the cell that calls this function can
    be re-run safely and earlier references to the raw data stay valid.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index; text columns hold integer
        codes (pandas assigns -1 to missing values).
    """
    # Work on a copy so the caller's frame is never modified in place.
    encoded = dataframe.copy()
    for column in encoded.columns:
        dtype = encoded[column].dtype
        # Text can arrive either as generic `object` columns or as pandas'
        # StringDtype; both need encoding before being fed to a model.
        if dtype == "object" or isinstance(dtype, pd.StringDtype):
            encoded[column] = encoded[column].astype("category").cat.codes
    return encoded

In [6]:
# Encode every string column as integer category codes; from here on `data`
# refers to the encoded frame.
data = convertCategoricalData(data)

In [7]:
# Preview again to confirm the columns now hold integer codes.
data.head()

Unnamed: 0,edible,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [8]:
# Separate the target column ("edible") from the predictor columns.
classLabels = data["edible"]
features = data.drop(columns=["edible"])

In [9]:
# Shuffle all rows (frac=1 keeps every row, just in a new order).
# Features and labels are shuffled separately, so both calls must receive the
# same fixed seed: that keeps the two permutations identical and makes the
# shuffle reproducible the next time the notebook is run.
shuffle_kwargs = {"frac": 1, "random_state": 0}
features = features.sample(**shuffle_kwargs)
classLabels = classLabels.sample(**shuffle_kwargs)

In [15]:
# Features and labels were shuffled independently, so make sure their row
# indices still line up before slicing them in parallel.
assert (features.index == classLabels.index).all()

# The first 80% of the shuffled rows train the model; the rest is held out.
x = round(0.8 * len(data))
trainFeatures, testFeatures = features.iloc[:x], features.iloc[x:len(data)]
trainClassLabels, testClassLabels = classLabels.iloc[:x], classLabels.iloc[x:len(data)]

In [16]:
# Peek at the shuffled labels; the index shows each row's original position.
classLabels.head()

380     1
3641    0
273     0
1029    0
684     0
Name: edible, dtype: int8

In [17]:
# Report the (rows, columns) size of the training and test partitions.
for split in (trainFeatures, testFeatures):
    print(split.shape)

(6499, 22)
(1625, 22)


In [18]:
# A fixed random_state keeps the fitted tree reproducible between runs.
treeLearner = DecisionTreeClassifier(random_state=0)

# Train on the training split; the value returned by fit() is kept as `classifier`.
classifier = treeLearner.fit(X=trainFeatures, y=trainClassLabels)

# Predict labels for the held-out test split.
predictions = classifier.predict(X=testFeatures)

In [19]:
from sklearn.tree import export_graphviz
import graphviz

# export_graphviz matches class_names to the classes in ascending order, i.e.
# the order of treeLearner.classes_. Build the names from the fitted model
# instead of from the order in which labels happen to appear in the training
# data; otherwise the class shown on each node can be swapped.
class_labels = [str(label) for label in treeLearner.classes_]

# Export the decision tree in DOT format (out_file=None returns it as a string).
# feature_names is passed as a plain list of column names.
dot_data = export_graphviz(treeLearner, out_file=None,
                           feature_names=list(trainFeatures.columns),
                           class_names=class_labels,
                           filled=True, rounded=True,
                           special_characters=True)

graph = graphviz.Source(dot_data)
graph.render("decision_tree")  # Save the tree as a PDF file
graph.view()  # Display the tree in a GUI window

'decision_tree.pdf'

# Measure accuracy

In [21]:
def computeAccuracy (target, predicted):
    """Return the fraction of entries in ``predicted`` that equal ``target``.

    The two inputs are compared element by element in positional order, so a
    pandas Series (whatever its index) can be scored against a NumPy array or a
    plain list.

    Parameters
    ----------
    target : array-like
        The true labels.
    predicted : array-like
        The predicted labels, in the same order as ``target``.

    Returns
    -------
    float
        Number of matching positions divided by the number of targets.

    Raises
    ------
    ValueError
        If ``target`` and ``predicted`` do not have the same shape.
    """
    # Use the arguments that were passed in. The earlier version compared the
    # notebook-global `predictions` and silently ignored `predicted`.
    target_values = np.asarray(target)
    predicted_values = np.asarray(predicted)
    if target_values.shape != predicted_values.shape:
        raise ValueError(
            f"target and predicted must have the same shape, got "
            f"{target_values.shape} and {predicted_values.shape}"
        )
    accuracy = (predicted_values == target_values).sum()/len(target_values)
    return accuracy

In [22]:
# Score the held-out predictions against the true test labels.
accuracy = computeAccuracy(target=testClassLabels, predicted=predictions)
print("Accuracy of the model= ", accuracy)

Accuracy of the model=  1.0


#### Evaluate with cross-validation

In [27]:
from sklearn.model_selection import cross_validate, cross_val_score, KFold

In [28]:
# 10-fold cross-validation over the full shuffled dataset, scored on accuracy.
evalResults = cross_validate(
    estimator=treeLearner,
    X=features,
    y=classLabels,
    scoring=["accuracy"],
    cv=10,
)

In [29]:
# Dump the raw result dict: per-fold fit times, score times and test accuracy.
print(evalResults)

{'fit_time': array([0.01665401, 0.01079059, 0.01200151, 0.01390719, 0.01299858,
       0.01299858, 0.0109992 , 0.01099825, 0.00799799, 0.00799823]), 'score_time': array([0.00300026, 0.00100183, 0.00299883, 0.00200152, 0.00200152,
       0.00200105, 0.00200105, 0.00200152, 0.00200176, 0.00200176]), 'test_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}


In [30]:
# Repeat the evaluation with an explicit 10-way KFold splitter. No `scoring`
# argument is given here, so the per-fold results are read from the
# "test_score" key (cross_validate also accepts a list of metrics to report).
cv = KFold(n_splits=10)
foldScores = cross_validate(treeLearner, X=features, y=classLabels, cv=cv)["test_score"]
for i, score in enumerate(foldScores):
    print(f"Accuracy for the fold no. {i} on the test set: {score}")

Accuracy for the fold no. 0 on the test set: 1.0
Accuracy for the fold no. 1 on the test set: 1.0
Accuracy for the fold no. 2 on the test set: 1.0
Accuracy for the fold no. 3 on the test set: 1.0
Accuracy for the fold no. 4 on the test set: 1.0
Accuracy for the fold no. 5 on the test set: 1.0
Accuracy for the fold no. 6 on the test set: 1.0
Accuracy for the fold no. 7 on the test set: 1.0
Accuracy for the fold no. 8 on the test set: 1.0
Accuracy for the fold no. 9 on the test set: 1.0
