In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
#選擇圖形樣式
%matplotlib inline
plt.style.use('fivethirtyeight')

import warnings
#忽略警告，為了排版整齊
warnings.filterwarnings('ignore')

#資料集切割
from sklearn.model_selection import train_test_split

#自訂方法
from sklearn.base import TransformerMixin, BaseEstimator

#變數標籤轉換
from sklearn.preprocessing import LabelEncoder

#機器學習管線
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#機器學習模型
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#特徵選擇
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

#特徵轉換
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

### 基本資料

自變數:22個，皆為類別變數；因變數:1個，為類別變數。__是為類別變數的分類目的__

資料筆數: 8416 筆

In [2]:
data_path = "./data/mushroom_dataset.csv"
mushroom_df = pd.read_csv(data_path)

mushroom_df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,mushroom
0,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
1,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,EDIBLE
2,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
3,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,EDIBLE
4,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,BROWN,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES,EDIBLE
8412,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES,EDIBLE
8413,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES,EDIBLE
8414,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES,EDIBLE


### 切分資料集

將訓練與測試資料切割，訓練80%、測試20%

In [3]:
train_df, test_df = train_test_split(mushroom_df, test_size=0.2)

print(train_df.shape, test_df.shape)

(6732, 23) (1684, 23)


### 遺漏值觀察

訓練資料與測試資料皆無遺漏值

In [4]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
mushroom                    0
dtype: int64
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                

### 變數轉換

In [5]:
#自訂轉換方法
class LabelEncoderAll(TransformerMixin):
    def __init__(self, cols=None):
        self.cols = cols
        
    def transform(self, df):
        labelencoder = LabelEncoder()
        X = df.copy()
        
        for column_name in X.columns:
            X[column_name] = labelencoder.fit_transform(X[column_name])
            
        return X
    
    def fit(self, *_):
        return self

In [6]:
#觀察欄位與標籤
labelencoder = LabelEncoder()
train_lea = train_df.copy()

for column_name in train_lea.columns:
    print(column_name)
    train_lea[column_name] = labelencoder.fit_transform(train_lea[column_name])
    print(list(labelencoder.classes_))

cap-shape
['BELL', 'CONICAL', 'CONVEX', 'FLAT', 'KNOBBED', 'SUNKEN']
cap-surface
['FIBROUS', 'GROOVES', 'SCALY', 'SMOOTH']
cap-color
['BROWN', 'BUFF', 'CINNAMON', 'GRAY', 'GREEN', 'PINK', 'PURPLE', 'RED', 'WHITE', 'YELLOW']
bruises
['BRUISES', 'NO']
odor
['ALMOND', 'ANISE', 'CREOSOTE', 'FISHY', 'FOUL', 'MUSTY', 'NONE', 'PUNGENT', 'SPICY']
gill-attachment
['ATTACHED', 'FREE']
gill-spacing
['CLOSE', 'CROWDED']
gill-size
['BROAD', 'NARROW']
gill-color
['BLACK', 'BROWN', 'BUFF', 'CHOCOLATE', 'GRAY', 'GREEN', 'ORANGE', 'PINK', 'PURPLE', 'RED', 'WHITE', 'YELLOW']
stalk-shape
['ENLARGING', 'TAPERING']
stalk-root
['?', 'BULBOUS', 'CLUB', 'EQUAL', 'ROOTED']
stalk-surface-above-ring
['FIBROUS', 'SCALY', 'SILKY', 'SMOOTH']
stalk-surface-below-ring
['FIBROUS', 'SCALY', 'SILKY', 'SMOOTH']
stalk-color-above-ring
['BROWN', 'BUFF', 'CINNAMON', 'GRAY', 'ORANGE', 'PINK', 'RED', 'WHITE', 'YELLOW']
stalk-color-below-ring
['BROWN', 'BUFF', 'CINNAMON', 'GRAY', 'ORANGE', 'PINK', 'RED', 'WHITE', 'YELLOW']
vei

In [7]:
#類別變數數值轉換
label_encoder_all = LabelEncoderAll()

### 特徵選擇

In [8]:
selected_feature = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-color', 'ring-number', 'ring-type',
       'spore-print-color', 'population', 'habitat']

train_X = train_df[selected_feature]
train_y = train_df['mushroom']

test_X = test_df[selected_feature]
test_y = test_df['mushroom']

### 分類模型

In [9]:
def get_best_model_and_accuracy(model, params, X, y):
    
    grid = GridSearchCV(model,
                       params,
                       error_score=0.)
    grid.fit(X, y)
    
    print("Best Accuracy: {}".format(grid.best_score_))
    print("Best Parameters: {}".format(grid.best_params_))

In [10]:
#分類模型
lr = LogisticRegression()
lr_params = {'clf__C':[1e-1, 1e0, 1e1, 1e2], 'clf__penalty':['l1','l2']}

pipe_lr = Pipeline([('label_encoder',label_encoder_all),('clf', lr)])

get_best_model_and_accuracy(pipe_lr, lr_params, train_X, train_y)

Best Accuracy: 0.9587051077128086
Best Parameters: {'clf__C': 100.0, 'clf__penalty': 'l2'}


In [12]:
#設定參數
transform_pipe_params = {
    'preprocessing__pca__n_components':[i for i in range(1,len(selected_feature)+1)],
    'preprocessing__lda__n_components':[j for j in range(len(selected_feature)//2)]+[None],
    'clf__C':[1e-1, 1e0, 1e1, 1e2],
    'clf__penalty':['l1','l2']
}

print(transform_pipe_params)


preprocessing_pipe = Pipeline([
    ('pca', PCA()),
    ('lda', LinearDiscriminantAnalysis())
])

#分類模型
lr = LogisticRegression()

#建立完整管線
transform_pipe = Pipeline(steps=[
    ('label_encoder',label_encoder_all),
    ('preprocessing', preprocessing_pipe),
    ('clf', lr)
])

get_best_model_and_accuracy(transform_pipe, transform_pipe_params, train_X, train_y)

{'preprocessing__pca__n_components': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 'preprocessing__lda__n_components': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, None], 'clf__C': [0.1, 1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2']}
Best Accuracy: 0.933006152023483
Best Parameters: {'clf__C': 1.0, 'clf__penalty': 'l2', 'preprocessing__lda__n_components': 1, 'preprocessing__pca__n_components': 18}


In [13]:
pipe = Pipeline([('label_encoder',label_encoder_all), ('pca', PCA()), ('lda', LinearDiscriminantAnalysis()), ('clf', lr)])

pipe.set_params(clf__C = 1.0,
                clf__penalty = 'l2',
                lda__n_components = 1,
                pca__n_components = 21)

pipe.fit(train_X, train_y)

pipe.score(test_X, test_y)

0.9311163895486936