In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
#選擇圖形樣式
%matplotlib inline
plt.style.use('fivethirtyeight')

import warnings
#忽略警告，為了排版整齊
warnings.filterwarnings('ignore')

#資料集切割
from sklearn.model_selection import train_test_split

#自訂方法
from sklearn.base import TransformerMixin, BaseEstimator

#變數標籤轉換
from sklearn.preprocessing import LabelEncoder

#機器學習管線
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#特徵選擇
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

#機器學習模型
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### 基本資料

自變數:22個，皆為類別變數；因變數:1個，為類別變數。__是為類別變數的分類目的__

資料筆數: 8416 筆

In [2]:
data_path = "./data/mushroom_dataset.csv"
mushroom_df = pd.read_csv(data_path)

mushroom_df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,mushroom
0,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
1,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,EDIBLE
2,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
3,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,EDIBLE
4,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,BROWN,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,EDIBLE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES,EDIBLE
8412,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES,EDIBLE
8413,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES,EDIBLE
8414,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES,EDIBLE


### 切分資料集

將訓練與測試資料切割，訓練80%、測試20%

In [3]:
train_df, test_df = train_test_split(mushroom_df, test_size=0.2)

print(train_df.shape, test_df.shape)

(6732, 23) (1684, 23)


### 遺漏值觀察

訓練資料與測試資料皆無遺漏值

In [4]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
mushroom                    0
dtype: int64
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                

### 變數轉換

In [5]:
#自訂轉換方法
class LabelEncoderAll(TransformerMixin):
    def __init__(self, cols=None):
        self.cols = cols
        
    def transform(self, df):
        labelencoder = LabelEncoder()
        X = df.copy()
        
        for column_name in X.columns:
            X[column_name] = labelencoder.fit_transform(X[column_name])
            
        return X
    
    def fit(self, *_):
        return self

In [6]:
label_encoder_all = LabelEncoderAll()

In [7]:
#觀察欄位與標籤
labelencoder = LabelEncoder()
train_lea = train_df.copy()

for column_name in train_lea.columns:
    print(column_name)
    train_lea[column_name] = labelencoder.fit_transform(train_lea[column_name])
    print(list(labelencoder.classes_))

cap-shape
['BELL', 'CONICAL', 'CONVEX', 'FLAT', 'KNOBBED', 'SUNKEN']
cap-surface
['FIBROUS', 'GROOVES', 'SCALY', 'SMOOTH']
cap-color
['BROWN', 'BUFF', 'CINNAMON', 'GRAY', 'GREEN', 'PINK', 'PURPLE', 'RED', 'WHITE', 'YELLOW']
bruises
['BRUISES', 'NO']
odor
['ALMOND', 'ANISE', 'CREOSOTE', 'FISHY', 'FOUL', 'MUSTY', 'NONE', 'PUNGENT', 'SPICY']
gill-attachment
['ATTACHED', 'FREE']
gill-spacing
['CLOSE', 'CROWDED']
gill-size
['BROAD', 'NARROW']
gill-color
['BLACK', 'BROWN', 'BUFF', 'CHOCOLATE', 'GRAY', 'GREEN', 'ORANGE', 'PINK', 'PURPLE', 'RED', 'WHITE', 'YELLOW']
stalk-shape
['ENLARGING', 'TAPERING']
stalk-root
['?', 'BULBOUS', 'CLUB', 'EQUAL', 'ROOTED']
stalk-surface-above-ring
['FIBROUS', 'SCALY', 'SILKY', 'SMOOTH']
stalk-surface-below-ring
['FIBROUS', 'SCALY', 'SILKY', 'SMOOTH']
stalk-color-above-ring
['BROWN', 'BUFF', 'CINNAMON', 'GRAY', 'ORANGE', 'PINK', 'RED', 'WHITE', 'YELLOW']
stalk-color-below-ring
['BROWN', 'BUFF', 'CINNAMON', 'GRAY', 'ORANGE', 'PINK', 'RED', 'WHITE', 'YELLOW']
vei

## 特徵選擇

In [8]:
#轉換資料
train_df = label_encoder_all.fit_transform(train_df)

train_X = train_df.drop('mushroom', axis=1)
train_y = train_df['mushroom']

In [9]:
#搭配機器學習，觀察選擇成效
def get_best_model_and_accuracy(model, params, X, y):
    
    grid = GridSearchCV(model,
                       params,
                       error_score=0.)
    grid.fit(X, y)
    
    print("Best Accuracy: {}".format(grid.best_score_))
    print("Best Parameters: {}".format(grid.best_params_))

In [10]:
lr = LogisticRegression()
lr_params = {'clf__C':[1e-1, 1e0, 1e1, 1e2], 'clf__penalty':['l1','l2']}

pipe_lr = Pipeline([('label_encoder',label_encoder_all),('clf', lr)])

get_best_model_and_accuracy(pipe_lr, lr_params, train_X, train_y)

Best Accuracy: 0.9588529239485467
Best Parameters: {'clf__C': 100.0, 'clf__penalty': 'l2'}


### 卡方檢驗(chi square)
檢測類別變數之間的相關性

卡方值:相關分數

p-value:相關性是否顯著

In [11]:
#手動檢定，觀察不同特徵之間的score與p-value
chi2_select = SelectKBest(chi2)
chi2_select.fit_transform(train_X, train_y)

#卡方值 chi2_select.scores_
#p-value chi2_select.pvalues_
chi2_score_pvalue = pd.DataFrame({'column':train_X.columns, 'score':chi2_select.scores_, 'p_value':chi2_select.pvalues_}).sort_values('p_value')

high_chi2_select = chi2_score_pvalue[chi2_score_pvalue['p_value'] < .001]['column']

chi2_score_pvalue

Unnamed: 0,column,score,p_value
19,spore-print-color,4978.817379,0.0
7,gill-size,1361.188705,5.705462e-298
18,ring-type,1347.310438,5.9175249999999995e-295
10,stalk-root,1110.024737,2.188136e-243
8,gill-color,1017.546063,2.756876e-223
6,gill-spacing,782.706735,3.104413e-172
3,bruises,594.486486,2.648752e-131
13,stalk-color-above-ring,434.008467,2.179991e-96
14,stalk-color-below-ring,359.364741,3.871775e-80
20,population,311.441655,1.0596740000000001e-69


In [12]:
chi2_select_pipe = Pipeline([('select', chi2_select),
                            ('clf', lr)])

chi2_select_pipe_params = deepcopy(lr_params)

chi2_select_pipe_params.update({
    'select__k':[i for i in range(1, len(train_X.columns)+1, 2)]
})

print(chi2_select_pipe_params)
get_best_model_and_accuracy(chi2_select_pipe, chi2_select_pipe_params, train_X, train_y)

{'clf__C': [0.1, 1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2'], 'select__k': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
Best Accuracy: 0.9622692439640785
Best Parameters: {'clf__C': 100.0, 'clf__penalty': 'l2', 'select__k': 21}


In [13]:
#使用到了哪些特徵的結果最好
k=22
list(chi2_score_pvalue['column'][:k])

['spore-print-color',
 'gill-size',
 'ring-type',
 'stalk-root',
 'gill-color',
 'gill-spacing',
 'bruises',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'population',
 'cap-surface',
 'cap-shape',
 'cap-color',
 'stalk-surface-above-ring',
 'odor',
 'stalk-shape',
 'stalk-surface-below-ring',
 'ring-number',
 'habitat',
 'veil-color',
 'gill-attachment',
 'veil-type']

### ANOVA假設檢定

與chi2差不多，都是觀察數值與p-value

In [14]:
#手動檢定，觀察不同特徵之間的score與p-value
anova_select = SelectKBest(f_classif)
anova_select.fit_transform(train_X, train_y)

anova_score_pvalue = pd.DataFrame({'column':train_X.columns, 'score':anova_select.scores_, 'p_value':anova_select.pvalues_}).sort_values('p_value')

high_anova_select = anova_score_pvalue[anova_score_pvalue['p_value'] < .001]['column']

anova_score_pvalue

Unnamed: 0,column,score,p_value
0,cap-shape,254.951111,0.0
19,spore-print-color,2594.117676,0.0
18,ring-type,1049.536499,0.0
17,ring-number,303.47934,0.0
14,stalk-color-below-ring,430.228821,0.0
13,stalk-color-above-ring,569.45636,0.0
20,population,730.020569,0.0
8,gill-color,454.954559,0.0
10,stalk-root,1300.074707,0.0
6,gill-spacing,1125.007935,0.0


In [15]:
anova_select_pipe = Pipeline([('select', anova_select),
                            ('clf', lr)])

anova_select_pipe_params = deepcopy(lr_params)

anova_select_pipe_params.update({
    'select__k':[i for i in range(1, len(train_X.columns)+1, 3)]
})

print(anova_select_pipe_params)
get_best_model_and_accuracy(anova_select_pipe, anova_select_pipe_params, train_X, train_y)

{'clf__C': [0.1, 1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2'], 'select__k': [1, 4, 7, 10, 13, 16, 19, 22]}
Best Accuracy: 0.9628649213319788
Best Parameters: {'clf__C': 100.0, 'clf__penalty': 'l2', 'select__k': 19}


In [16]:
#使用到了哪些特徵的結果最好
k=22
anova_score_pvalue['column'][:k]

0                    cap-shape
19           spore-print-color
18                   ring-type
17                 ring-number
14      stalk-color-below-ring
13      stalk-color-above-ring
20                  population
8                   gill-color
10                  stalk-root
6                 gill-spacing
3                      bruises
1                  cap-surface
7                    gill-size
11    stalk-surface-above-ring
16                  veil-color
9                  stalk-shape
5              gill-attachment
12    stalk-surface-below-ring
4                         odor
2                    cap-color
21                     habitat
15                   veil-type
Name: column, dtype: object

### 決策樹(Gini)

In [17]:
#觀察各個特徵重要程度排序
tree = DecisionTreeClassifier()

tree.fit(train_X, train_y)

importances = pd.DataFrame({'importance':tree.feature_importances_,
                           'feature':train_X.columns}).sort_values('importance', ascending=False)

importances

Unnamed: 0,importance,feature
19,0.527497,spore-print-color
17,0.158141,ring-number
7,0.141146,gill-size
6,0.047927,gill-spacing
16,0.042995,veil-color
20,0.023266,population
9,0.015939,stalk-shape
18,0.015797,ring-type
4,0.012326,odor
0,0.006966,cap-shape


In [18]:
tree_select = SelectFromModel(DecisionTreeClassifier())

tree_select_pipe = Pipeline([('select', tree_select),
                            ('clf', lr)])

tree_select_pipe_params = deepcopy(lr_params)

tree_select_pipe_params.update({
    'select__threshold': [.001, .01, .05, .1],
    'select__estimator__max_depth': [None, 1, 3, 5, 7]
})

#print(tree_select_pipe_params)
get_best_model_and_accuracy(tree_select_pipe, tree_select_pipe_params, train_X, train_y)

Best Accuracy: 0.937913871671239
Best Parameters: {'clf__C': 10.0, 'clf__penalty': 'l2', 'select__estimator__max_depth': 7, 'select__threshold': 0.01}


In [19]:
#取得最佳結果之變數
tree_select_pipe.set_params(**{'clf__C': 100.0,
                               'clf__penalty': 'l2',
                               'select__estimator__max_depth': 7,
                               'select__threshold': 0.01})

#擬合資料
tree_select_pipe.steps[0][1].fit(train_X, train_y)

#列出所選擇的特徵
train_X.columns[tree_select_pipe.steps[0][1].get_support()]

Index(['odor', 'gill-spacing', 'gill-size', 'stalk-shape', 'stalk-root',
       'veil-color', 'ring-number', 'spore-print-color', 'population'],
      dtype='object')

### 邏輯迴歸 L1L2正規化

In [20]:
#觀察各個特徵重要程度排序
logistic = LogisticRegression()

logistic.fit(train_X, train_y)

coefficient = pd.DataFrame({'coefficient':logistic.coef_[0],
                           'feature':train_X.columns}).sort_values('coefficient', ascending=False)

coefficient

Unnamed: 0,coefficient,feature
7,5.544608,gill-size
5,5.258407,gill-attachment
16,4.204435,veil-color
18,1.348069,ring-type
3,0.889163,bruises
1,0.588029,cap-surface
19,0.419415,spore-print-color
20,0.133681,population
0,0.044285,cap-shape
2,0.037897,cap-color


In [21]:
logistic_select = SelectFromModel(LogisticRegression())

regularization_pipe = Pipeline([('select', logistic_select),
                               ('clf', lr)])

regularization_pipe_params = deepcopy(lr_params)

#L1和L2正規化
regularization_pipe_params.update({
    'select__threshold': [.001, .01, .05, .1],
    'select__estimator__penalty':['l1', 'l2']
})

#print(regularization_pipe_params)
get_best_model_and_accuracy(regularization_pipe, regularization_pipe_params, train_X, train_y)

Best Accuracy: 0.9619720671438705
Best Parameters: {'clf__C': 100.0, 'clf__penalty': 'l2', 'select__estimator__penalty': 'l2', 'select__threshold': 0.001}


In [22]:
#取得最佳結果之變數
regularization_pipe.set_params(**{'clf__C': 100.0,
                                  'clf__penalty': 'l2',
                                  'select__estimator__penalty': 'l2',
                                  'select__threshold': 0.01})

#擬合資料
regularization_pipe.steps[0][1].fit(train_X, train_y)

#列出所選擇的特徵
train_X.columns[regularization_pipe.steps[0][1].get_support()]

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'gill-attachment',
       'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root',
       'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'veil-color', 'ring-number', 'ring-type',
       'spore-print-color', 'population', 'habitat'],
      dtype='object')

> 用邏輯回歸的結果最好，把veil-type拿掉