In [1]:
# Load dataset and add column names
from pandas import read_csv

# Column labels for the data file. They are supplied through `names=` below,
# so pandas does not treat any line of the file as a header row.
# The target column ('classes') comes first, followed by the 22 feature columns.
header = [
    'classes',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]
df = read_csv("agaricus-lepiota.data", names=header)

# Show 5 randomly chosen rows. The seed is fixed so that re-running the
# notebook displays the same rows every time.
df.sample(5, random_state=42)

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
5112,p,f,y,y,f,f,f,c,b,p,...,k,b,p,p,w,o,l,h,y,p
832,e,x,y,n,t,l,f,c,b,p,...,y,w,w,p,w,o,p,k,y,g
7787,p,f,y,n,f,f,f,c,n,b,...,k,p,w,p,w,o,e,w,v,d
248,e,f,y,y,t,l,f,c,b,w,...,y,w,w,p,w,o,p,k,y,p
2626,e,x,y,n,t,n,f,c,b,p,...,s,w,w,p,w,o,p,n,v,d


In [2]:
# Print the unique values of the classes column.
# The inner quotes differ from the outer ones so the f-string also parses on
# Python versions older than 3.12 (reusing the same quote type is a SyntaxError there).
print(f"Output classes: {df['classes'].unique()}")

# Check for missing values.
# isna() only counts real NaN entries; at this point missing entries are still
# stored as the '?' placeholder, so those are counted separately.
print(f"Missing values: {df.isna().sum()}")
print(f"'?' placeholders: {(df == '?').sum()}")

# Show information about the dataframe
df.info()

Output classes: ['p' 'e']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   classes                   8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14

In [3]:
# Missing entries are marked with "?" in the data; turn them into NaN so that
# pandas' missing-value tooling (isna, info, ...) recognises them.
import numpy as np
df = df.replace(to_replace="?", value=np.nan)

In [9]:
# Number of NaN entries in each column, followed by a summary of the frame
print(df.isnull().sum(axis=0))
df.info()

classes                        0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   classes 

In [5]:
from sklearn.model_selection import train_test_split as split

# Separate the target from the features. 'stalk-root' is left out of the
# features because too many of its values are missing.
y = df['classes']
X = df.drop(['classes', 'stalk-root'], axis='columns')

# Hold out a test set, stratified on the class label, with a fixed seed
X_train, X_test, y_train, y_test = split(X, y, random_state=42, stratify=y)

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Turn every categorical column into one indicator column per category.
# The encoder learns its categories from the training split only; a category
# seen only in the test split is ignored rather than raising an error.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train)
Xe_train = ohe.transform(X_train)
Xe_test = ohe.transform(X_test)

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree classifier.
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, and therefore the reported score, is reproducible across notebook runs.
dtc = DecisionTreeClassifier(random_state=42).fit(Xe_train, y_train)

# Evaluate the classifier: mean accuracy on the held-out test set
dtc.score(Xe_test, y_test)

1.0

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest neighbors classifier with scikit-learn's default settings
knn = KNeighborsClassifier()
knn.fit(Xe_train, y_train)

# Mean accuracy on the held-out test set
knn.score(Xe_test, y_test)

1.0