<a href="https://colab.research.google.com/github/wooihaw/mushroom_classification/blob/main/mushrooms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load libraries
import numpy as np
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load dataset and add column names
header = ['classes', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
          'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
          'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring',
          'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
df = read_csv("agaricus-lepiota.data", names=header)

# Print 5 random data samples
df.sample(5)

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
8000,e,b,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,n,c,l
669,e,x,y,w,t,a,f,c,b,w,...,s,w,w,p,w,o,p,n,n,g
6756,p,f,s,n,f,f,f,c,n,b,...,k,p,w,p,w,o,e,w,v,l
6080,p,f,y,e,f,y,f,c,n,b,...,s,p,w,p,w,o,e,w,v,l
3262,p,x,s,p,f,c,f,w,n,p,...,s,w,w,p,w,o,p,n,v,d


In [6]:
# Print the unique values of the classes column
print(f"Output classes: {df['classes'].unique()}")

# Check for missing values
print(f"Missing values: {df.isna().sum()}")

# Show information about the dataframe
df.info()

Output classes: ['p' 'e']
Missing values: classes                     0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   classes                   8124 non-

In [7]:
# Replace missing values with NaN
df = df.replace({"?": np.nan})

In [8]:
# Check for missing values
print(df.isna().sum())
df.info()

classes                        0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   classes 

In [9]:
# Drop the stalk-root column (too many missing values)
X = df.drop(columns=['classes', 'stalk-root'])
y = df['classes']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = split(X, y, stratify=y, random_state=42)

In [10]:
# One-hot encode the categorical columns
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
Xe_train = ohe.fit_transform(X_train)
Xe_test = ohe.transform(X_test)

In [11]:
# Train a decision tree classifier
dtc = DecisionTreeClassifier().fit(Xe_train, y_train)

# Evaluate the classifier
dtc.score(Xe_test, y_test)

1.0

In [12]:
# Train a k-nearest neighbors classifier
knn = KNeighborsClassifier().fit(Xe_train, y_train)

# Evaluate the classifier
knn.score(Xe_test, y_test)

1.0