In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#選擇圖形樣式
%matplotlib inline
plt.style.use('fivethirtyeight')

import warnings
#忽略警告，為了排版整齊
warnings.filterwarnings('ignore')

#資料集切割
from sklearn.model_selection import train_test_split

#自訂方法
from sklearn.base import TransformerMixin

#變數標籤轉換
from sklearn.preprocessing import LabelEncoder

#機器學習管線
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#機器學習模型
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#特徵選擇
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

### 基本資料

自變數:22個，皆為類別變數；因變數:1個，為類別變數。__是為類別變數的分類目的__

資料筆數: 8416 筆

In [2]:
data_path = "./data/mushroom_dataset.csv"
mushroom_df = pd.read_csv(data_path)

mushroom_df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,mushroom
0,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
1,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,EDIBLE
2,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
3,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,EDIBLE
4,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,BROWN,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES,EDIBLE
8412,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES,EDIBLE
8413,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES,EDIBLE
8414,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES,EDIBLE


### 切分資料集

將訓練與測試資料切割，訓練80%、測試20%

In [3]:
train_df, test_df = train_test_split(mushroom_df, test_size=0.2)

print(train_df.shape, test_df.shape)

(6732, 23) (1684, 23)


### 遺漏值觀察

訓練資料與測試資料皆無遺漏值

In [4]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
mushroom                    0
dtype: int64
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                

### 變數轉換

In [5]:
#自訂轉換方法
class LabelEncoderAll(TransformerMixin):
    def __init__(self, cols=None):
        self.cols = cols
        
    def transform(self, df):
        labelencoder = LabelEncoder()
        X = df.copy()
        
        for column_name in X.columns:
            X[column_name] = labelencoder.fit_transform(X[column_name])
            
        return X
    
    def fit(self, *_):
        return self

In [6]:
label_encoder_all = LabelEncoderAll()

In [7]:
#觀察欄位與標籤
labelencoder = LabelEncoder()
train_lea = train_df.copy()

for column_name in train_lea.columns:
    print(column_name)
    train_lea[column_name] = labelencoder.fit_transform(train_lea[column_name])
    print(list(labelencoder.classes_))

cap-shape
['BELL', 'CONICAL', 'CONVEX', 'FLAT', 'KNOBBED', 'SUNKEN']
cap-surface
['FIBROUS', 'GROOVES', 'SCALY', 'SMOOTH']
cap-color
['BROWN', 'BUFF', 'CINNAMON', 'GRAY', 'GREEN', 'PINK', 'PURPLE', 'RED', 'WHITE', 'YELLOW']
bruises
['BRUISES', 'NO']
odor
['ALMOND', 'ANISE', 'CREOSOTE', 'FISHY', 'FOUL', 'MUSTY', 'NONE', 'PUNGENT', 'SPICY']
gill-attachment
['ATTACHED', 'FREE']
gill-spacing
['CLOSE', 'CROWDED']
gill-size
['BROAD', 'NARROW']
gill-color
['BLACK', 'BROWN', 'BUFF', 'CHOCOLATE', 'GRAY', 'GREEN', 'ORANGE', 'PINK', 'PURPLE', 'RED', 'WHITE', 'YELLOW']
stalk-shape
['ENLARGING', 'TAPERING']
stalk-root
['?', 'BULBOUS', 'CLUB', 'EQUAL', 'ROOTED']
stalk-surface-above-ring
['FIBROUS', 'SCALY', 'SILKY', 'SMOOTH']
stalk-surface-below-ring
['FIBROUS', 'SCALY', 'SILKY', 'SMOOTH']
stalk-color-above-ring
['BROWN', 'BUFF', 'CINNAMON', 'GRAY', 'ORANGE', 'PINK', 'RED', 'WHITE', 'YELLOW']
stalk-color-below-ring
['BROWN', 'BUFF', 'CINNAMON', 'GRAY', 'ORANGE', 'PINK', 'RED', 'WHITE', 'YELLOW']
vei

## 分類方法選擇

In [8]:
#空準確率，約為53%
train_df['mushroom'].value_counts(normalize=True)

EDIBLE       0.531788
POISONOUS    0.468212
Name: mushroom, dtype: float64

In [9]:
#最佳模型選擇
def get_best_model_and_accuracy(model, params, X, y):
    
    grid = GridSearchCV(model,
                       params,
                       error_score=0.)
    grid.fit(X, y)
    
    print("Best Accuracy: {}".format(grid.best_score_))
    print("Best Parameters: {}".format(grid.best_params_))

In [10]:
#切分X,y，分成train data與test data
train_X = train_df.drop('mushroom', axis=1)
train_y = train_df['mushroom']

test_X = test_df.drop('mushroom', axis=1)
test_y = test_df['mushroom']

train_X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
1978,FLAT,FIBROUS,WHITE,NO,NONE,FREE,CROWDED,BROAD,BROWN,TAPERING,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,EVANESCENT,BROWN,SCATTERED,GRASSES
2890,CONVEX,SCALY,RED,BRUISES,NONE,FREE,CLOSE,BROAD,BROWN,TAPERING,...,SMOOTH,GRAY,PINK,PARTIAL,WHITE,ONE,PENDANT,BROWN,SOLITARY,WOODS
5056,FLAT,FIBROUS,YELLOW,NO,FOUL,FREE,CLOSE,BROAD,CHOCOLATE,ENLARGING,...,SILKY,BROWN,PINK,PARTIAL,WHITE,ONE,LARGE,CHOCOLATE,SOLITARY,GRASSES
7306,KNOBBED,SMOOTH,RED,NO,FISHY,FREE,CLOSE,NARROW,BUFF,TAPERING,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,EVANESCENT,WHITE,SEVERAL,WOODS
7633,KNOBBED,SCALY,RED,NO,FISHY,FREE,CLOSE,NARROW,BUFF,TAPERING,...,SILKY,WHITE,PINK,PARTIAL,WHITE,ONE,EVANESCENT,WHITE,SEVERAL,WOODS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629,CONVEX,FIBROUS,BROWN,NO,NONE,FREE,CROWDED,BROAD,CHOCOLATE,TAPERING,...,FIBROUS,WHITE,WHITE,PARTIAL,WHITE,ONE,EVANESCENT,BLACK,ABUNDANT,GRASSES
5686,FLAT,SMOOTH,BUFF,BRUISES,FOUL,FREE,CLOSE,BROAD,WHITE,TAPERING,...,FIBROUS,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,CHOCOLATE,SEVERAL,URBAN
505,CONVEX,SCALY,WHITE,BRUISES,ALMOND,FREE,CLOSE,BROAD,BROWN,ENLARGING,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BLACK,SCATTERED,GRASSES
983,CONVEX,SCALY,WHITE,BRUISES,PUNGENT,FREE,CLOSE,NARROW,BLACK,ENLARGING,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,GRASSES


### 分類模型

In [11]:
#邏輯斯迴歸
print("Logstic Regression")
lr = LogisticRegression()
lr_params = {'clf__C':[1e-1, 1e0, 1e1, 1e2], 'clf__penalty':['l1','l2']}

pipe_lr = Pipeline([('label_encoder',label_encoder_all),('clf', lr)])

get_best_model_and_accuracy(pipe_lr, lr_params, train_X, train_y)

Logstic Regression
Best Accuracy: 0.959594431960959
Best Parameters: {'clf__C': 100.0, 'clf__penalty': 'l2'}


In [12]:
#決策樹
print("dscision tree")
d_tree = DecisionTreeClassifier()
tree_params = {'clf__max_depth':[None, 1, 3, 5, 7]}

pipe_tree = Pipeline([('label_encoder',label_encoder_all),('clf', d_tree)])
get_best_model_and_accuracy(pipe_tree, tree_params, train_X, train_y)

dscision tree
Best Accuracy: 0.9903417533432393
Best Parameters: {'clf__max_depth': 7}


In [13]:
#隨機森林
print("random forest")
forest = RandomForestClassifier()
forest_params = {'clf__n_estimators':[10, 50, 100], 'clf__max_depth':[None, 1, 3, 5, 7]}

pipe_forest = Pipeline([('label_encoder',label_encoder_all),('clf', forest)])
get_best_model_and_accuracy(pipe_forest, forest_params, train_X, train_y)

random forest
Best Accuracy: 1.0
Best Parameters: {'clf__max_depth': None, 'clf__n_estimators': 50}


### 模型訓練及應用到測試資料

In [14]:
pipe = Pipeline([('label_encoder',label_encoder_all),('clf', forest)])

pipe.set_params(clf__max_depth=None, clf__n_estimators=10)
pipe.fit(train_X, train_y)
pipe.score(test_X)

ValueError: Expected array-like (array or non-string sequence), got None