In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [3]:
# Column labels for the mushroom data; the raw file is read without a header
# row, so the labels are supplied here. The first column is the class label.
column_name = [
    'classes',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat'
]

# Read the headerless CSV and attach the labels in one expression.
# set_axis raises if the number of labels does not match the number of
# columns, just like assigning to `df.columns` would.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'mushroom/agaricus-lepiota.data',
    header=None,
    engine='python'
).set_axis(column_name, axis=1)

# Show the first 5 rows to confirm the data was read successfully.
df.head()


Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [6]:
# Target: the `classes` column (e = edible, p = poisonous).
y = df['classes']

# Features: the remaining 22 columns, all of them categorical.
X = df.drop(columns='classes')

# Count the '?' placeholders per column (sanity check only, not required).
print("Missing '?' count per column:")
print(X.eq('?').sum())

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

print("Train shape:", X_train.shape)
print("Test  shape:", X_test.shape)


Missing '?' count per column:
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises?                       0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64
Train shape: (6499, 22)
Test  shape: (1625, 22)


In [20]:
def make_preprocess():
    """Build a fresh, unfitted preprocessing pipeline for the categorical features.

    Steps, in order:
      1. impute: replace the '?' placeholder with the column's most frequent value;
      2. one-hot encode: categories not seen during fit are ignored at transform
         time (``handle_unknown='ignore'``), output is a dense array;
      3. standardize the one-hot columns.

    A new object is returned on every call. ``Pipeline`` keeps the step
    instances it is given and fits them in place, so putting one preprocessing
    object into several pipelines would make them share fitted state: fitting
    one pipeline would silently overwrite the transformers of the other.
    """
    return Pipeline([
        ('imputer', SimpleImputer(missing_values='?', strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ('scl', StandardScaler())
    ])


def fit_and_score(label, pipeline, X_fit, y_fit, X_eval, y_eval):
    """Fit ``pipeline``, predict on the evaluation split and print its accuracy.

    Parameters
    ----------
    label : str
        Short model name shown in the printed line, e.g. ``'KNN'``.
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_fit, y_fit
        Features and labels used for fitting.
    X_eval, y_eval
        Features and labels used for scoring.

    Returns
    -------
    tuple
        ``(y_pred, acc)``: the predictions for ``X_eval`` and their accuracy
        against ``y_eval``.
    """
    pipeline.fit(X_fit, y_fit)
    y_pred = pipeline.predict(X_eval)
    acc = accuracy_score(y_eval, y_pred)
    print('[%s] Accuracy: %.4f' % (label, acc))
    return y_pred, acc


# `preprocess` stays available as a module-level name; it is the preprocessing
# instance owned by the KNN pipeline. The SVM pipeline gets its own instance.
preprocess = make_preprocess()

# KNN model
pipe_knn = Pipeline([
    ('preprocess', preprocess),
    ('clf', KNeighborsClassifier(n_neighbors=4, p=2, metric='minkowski'))
])

# SVM model (RBF kernel)
pipe_svm = Pipeline([
    ('preprocess', make_preprocess()),
    ('clf', SVC(kernel='rbf', gamma=0.1, C=10.0, random_state=0))
])

# Train and evaluate KNN
y_pred_knn, acc_knn = fit_and_score(
    'KNN', pipe_knn, X_train, y_train, X_test, y_test
)

# Train and evaluate SVM
y_pred_svm, acc_svm = fit_and_score(
    'SVM', pipe_svm, X_train, y_train, X_test, y_test
)



[KNN] Accuracy: 0.9982
[SVM] Accuracy: 0.9969


In [21]:
# Keep the model with the higher test accuracy; on a tie the SVM is chosen.
best_name, best_acc = (
    ('SVM (RBF kernel)', acc_svm) if acc_svm >= acc_knn else ('KNN', acc_knn)
)

# Report the winner: a blank line, then the name and its accuracy.
print(
    '\nBest model: %s' % best_name,
    'Best test accuracy: %.4f' % best_acc,
    sep='\n'
)



Best model: KNN
Best test accuracy: 0.9982
