In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

# Column names for the mushroom data, in file order: the class label first,
# followed by the 22 attribute columns.
column_name = [
    'classes',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises?',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]

# The file is read without a header row (header=None); the names above are
# attached right after loading.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'mushroom/agaricus-lepiota.data',
    header=None,
    engine='python',
)
df.columns = column_name

### Data preprocessing  

In [2]:
# Missing-value check: '?' entries are turned into NaN so that pandas can
# count them.
df = df.replace('?', np.nan)
na_counts = df.isna().sum()
na_counts = na_counts[na_counts > 0]  # columns that contain missing values

# Treat the missing values of stalk-root as a category of their own ('k').
stalk_root_counts = df['stalk-root'].value_counts()  # counts before filling
df['stalk-root'] = df['stalk-root'].fillna('k')

# Report how many distinct values every column takes.
for col in df.columns:
    print('variable:', col, 'unique:', len(df[col].unique()))

# Drop features that take a single value (veil-type in the printed summary):
# they carry no information for the classifier. Computing the list instead of
# hard-coding `del df['veil-type']` keeps the cell safe to re-run.
constant_cols = [col for col in df.columns
                 if col != 'classes' and df[col].nunique(dropna=False) <= 1]
df = df.drop(columns=constant_cols)

# Encode the target label first.
label_le = LabelEncoder()
df['classes'] = label_le.fit_transform(df['classes'].values)

# Every remaining column except the target is a categorical feature.
catego_features = [f for f in df.columns if f != 'classes']

# Map the categorical values of each feature to integer codes and remember
# how many distinct values each feature has.
# NOTE: the former "replace '?' with NaN" branch inside this loop was removed;
# it could never run because every '?' was already replaced (and filled) above.
catego_le = LabelEncoder()
num_values = []
for fea in catego_features:
    df[fea] = catego_le.fit_transform(df[fea].values)
    num_values.append(len(catego_le.classes_))

# All features are nominal (unordered) categories, so they are one-hot
# encoded directly.
catego_features_idx = [df.columns.get_loc(fea) for fea in catego_features]

# Encode ONLY the feature columns. The previous call passed the whole frame
# with `categorical_features=...`; the legacy encoder appended the untouched
# 'classes' column to its output, which leaked the target into the feature
# matrix. That keyword (and `sparse=False`) no longer exists in current
# scikit-learn, so the default sparse result is densified explicitly instead.
ohe = OneHotEncoder()
impute_onehot_data = ohe.fit_transform(df.iloc[:, catego_features_idx]).toarray()

# Sanity check: one output column per (feature, category) pair, no label column.
assert impute_onehot_data.shape[1] == sum(num_values)

variable: classes unique: 2
variable: cap-shape unique: 6
variable: cap-surface unique: 4
variable: cap-color unique: 10
variable: bruises? unique: 2
variable: odor unique: 9
variable: gill-attachment unique: 2
variable: gill-spacing unique: 2
variable: gill-size unique: 2
variable: gill-color unique: 12
variable: stalk-shape unique: 2
variable: stalk-root unique: 5
variable: stalk-surface-above-ring unique: 4
variable: stalk-surface-below-ring unique: 4
variable: stalk-color-above-ring unique: 9
variable: stalk-color-below-ring unique: 9
variable: veil-type unique: 1
variable: veil-color unique: 4
variable: ring-number unique: 3
variable: ring-type unique: 5
variable: spore-print-color unique: 9
variable: population unique: 6
variable: habitat unique: 7


In [3]:
# Target vector and feature matrix.
y = df['classes'].values
X = impute_onehot_data

# Hold out 20% of the rows as the test set. No standardisation is applied
# because every feature is a nominal categorical variable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

### Grid search

In [4]:
# Candidate values for the SVM penalty C and the RBF kernel width gamma,
# one value per decade.
param_C = [1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
param_gamma = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0]

# GridSearchCV takes a list of dictionaries; a single RBF-kernel grid is used.
param_grid = [dict(C=param_C, gamma=param_gamma, kernel=['rbf'])]

svm = SVC(random_state=0)
gs = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='accuracy')
gs = gs.fit(X_train, y_train)

# Best cross-validated accuracy and the parameter combination that reached it.
for best in (gs.best_score_, gs.best_params_):
    print(best)

1.0
{'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}


In [5]:
# Fit the final model with the hyper-parameters selected by the grid search.
# They are read from `gs.best_params_` (kernel, C and gamma) instead of being
# copied by hand from the printed output, so this cell cannot drift out of
# sync when the data, the preprocessing or the grid changes.
svm = SVC(random_state=0, **gs.best_params_)
svm.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred = svm.predict(X_test)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

Misclassified samples: 0
Accuracy: 1.00


### Report

1.數據清洗:

  (1)將?轉NA,只有一個特徵有遺失值且因為NA數過多,故歸納成一類。
  
  (2)將單一種類類別的特徵刪除,對模型預測沒幫助。
  
  (3)先將資料作label-encode,閱讀資料的網站敘述發現沒有有序的類別變數,直接作one-hot-encode。
  
  (4)資料無連續型特徵,故不做標準化的動作。
  
  
  
2.使用SVM訓練模型,調參數用grid search的方法對Cost跟gamma做調整。

3.使用第二步的參數配適最後的模型,在測試資料(test data)上所得到的正確率為100%。