## 讀取資料 (Default：Label Encoding)

In [1]:
#讀取資料
import pandas as pd
import numpy as np
# plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

df= pd.read_csv('winequality-red.csv')
# quality：7分以上好酒(1)，6分以下劣酒(0)
# POs%：217/1599(13.57%)

In [2]:
# all columns
print(list(df.columns))

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


In [3]:
#資料前處理 #沒有 Label Encoder
df=df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['quality']#應變數

X=df.drop(columns='quality') 

In [4]:
y

0       0
1       0
2       0
3       0
4       0
       ..
1594    0
1595    0
1596    0
1597    0
1598    0
Name: quality, Length: 1599, dtype: int64

In [5]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


## P1：標準化是否影響結果

In [6]:
#先不經標準化做XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf1= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores1 = cross_val_score(clf1,X,y,cv=5,scoring='accuracy')
print(scores1)
print('Accuracy of XGBoost cross-vaild test:',scores1.mean())
# Accuracy of XGBoost cross-vaild test: 0.935065196935072

[0.875      0.821875   0.865625   0.84375    0.88714734]
Accuracy of XGBoost cross-vaild test: 0.8586794670846395


In [7]:
#使用經過LabelEncoder編碼的特徵，標準化 (因為變數值間有大有小)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X1 = sc.fit_transform(X)

In [8]:
#標準化後XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf11= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores11 = cross_val_score(clf11,X1,y,cv=5,scoring='accuracy')
print(scores11)
print('Accuracy of XGBoost cross-vaild test:',scores11.mean())

# Accuracy of XGBoost cross-vaild test: 0.9362212662992337
##標準化在XGBoost的交叉驗證上沒有差異太大

[0.875      0.821875   0.865625   0.84375    0.88714734]
Accuracy of XGBoost cross-vaild test: 0.8586794670846395


## P3：Feature Binning 有沒有效果

In [9]:
import pandas as pd
import numpy as np
# For 繪製敘述統計
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import scipy.stats as stats
from sklearn.model_selection import train_test_split
# for discretization
from sklearn.preprocessing import KBinsDiscretizer

In [10]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [11]:
# Equal width Binning
disc1 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc1.fit(X[['free sulfur dioxide']])
disc2 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc2.fit(X[['total sulfur dioxide']])

KBinsDiscretizer(encode='ordinal', n_bins=10, strategy='uniform')

In [12]:
# Equal Frequency Binning
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['free sulfur dioxide']])
disc21 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc21.fit(X[['total sulfur dioxide']])

KBinsDiscretizer(encode='ordinal', n_bins=10)

In [13]:
disc1.bin_edges_

array([array([ 1. ,  8.1, 15.2, 22.3, 29.4, 36.5, 43.6, 50.7, 57.8, 64.9, 72. ])],
      dtype=object)

In [14]:
disc11.bin_edges_

array([array([ 1.,  5.,  6.,  9., 11., 14., 16., 19., 24., 31., 72.])],
      dtype=object)

In [16]:
X3 = pd.DataFrame(X)
X3[['free sulfur dioxide']]=disc1.transform(X[['free sulfur dioxide']])
X3[['total sulfur dioxide']]=disc2.transform(X[['total sulfur dioxide']])
X3 = pd.DataFrame(X3)

In [17]:
X3

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,1.0,0.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,3.0,2.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,1.0,1.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,2.0,1.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,1.0,0.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,4.0,1.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,5.0,1.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,3.0,1.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,4.0,1.0,0.99547,3.57,0.71,10.2


In [18]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores3 = cross_val_score(clf3,X3,y,cv=5,scoring='accuracy')
print(scores3)
print('Accuracy of XGBoost cross-vaild test:',scores3.mean())

#Accuracy of XGBoost cross-vaild test: 0.9304140341443743
# Frequency較佳

[0.884375   0.853125   0.86875    0.859375   0.87147335]
Accuracy of XGBoost cross-vaild test: 0.867419670846395


In [20]:
#資料前處理 #沒有 Label Encoder
df= pd.read_csv('winequality-red.csv')
df=df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['quality']#應變數
X=df.drop(columns='quality') 
X31=pd.DataFrame(X)

X31[['free sulfur dioxide']]=disc11.transform(X[['free sulfur dioxide']])
X31[['total sulfur dioxide']]=disc21.transform(X[['total sulfur dioxide']])
X31 = pd.DataFrame(X31)

In [21]:
X31

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,4.0,4.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,8.0,7.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,5.0,6.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,6.0,7.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,4.0,4.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,9.0,5.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,9.0,6.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,8.0,5.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,9.0,5.0,0.99547,3.57,0.71,10.2


In [22]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores31 = cross_val_score(clf3,X31,y,cv=5,scoring='accuracy')
print(scores31)
print('Accuracy of XGBoost cross-vaild test:',scores31.mean())

#Accuracy of XGBoost cross-vaild test:  0.935065196935072
# Frequency較佳

[0.875      0.815625   0.8875     0.86875    0.86833856]
Accuracy of XGBoost cross-vaild test: 0.863042711598746


Source：iT幫幫忙--Day12 - Feature Engineering -- 4. 分隔方法(Discretization),https://ithelp.ithome.com.tw/articles/10235726

## P2：One-hot Encoding vs. Label Encoding on Tree-based method

In [23]:
X3

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,1.0,0.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,3.0,2.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,1.0,1.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,2.0,1.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,1.0,0.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,4.0,1.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,5.0,1.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,3.0,1.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,4.0,1.0,0.99547,3.57,0.71,10.2


In [24]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.5.1.post0


In [25]:
from category_encoders import *
enc2 = OneHotEncoder(cols=['free sulfur dioxide','total sulfur dioxide']).fit(X3,y)
X2 = enc2.transform(X3)
X2 = pd.DataFrame(X2)
X2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide_1,free sulfur dioxide_2,free sulfur dioxide_3,free sulfur dioxide_4,free sulfur dioxide_5,...,total sulfur dioxide_2,total sulfur dioxide_3,total sulfur dioxide_4,total sulfur dioxide_5,total sulfur dioxide_6,total sulfur dioxide_7,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,1,0,0,0,0,...,0,0,0,0,0,0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,0,1,0,0,0,...,1,0,0,0,0,0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,1,0,0,0,0,...,0,1,0,0,0,0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,0,0,1,0,0,...,0,1,0,0,0,0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,1,0,0,0,0,...,0,0,0,0,0,0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,0,0,0,0,1,...,0,1,0,0,0,0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,0,0,0,0,0,...,0,1,0,0,0,0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,0,1,0,0,0,...,0,1,0,0,0,0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,0,0,0,0,1,...,0,1,0,0,0,0,0.99547,3.57,0.71,10.2


In [26]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf2= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores2 = cross_val_score(clf2,X3,y,cv=5,scoring='accuracy')
print(scores2)
print('Accuracy of XGBoost cross-vaild test:',scores2.mean())

# Accuracy of XGBoost cross-vaild test for Label Encoder: 0.9362212662992337
# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.9420217771205808
# 交叉驗證為 Label Encoder 結果略佳
# 可能在 one-hot encoding 會陷入 curse of dimensionaility

[0.884375   0.853125   0.86875    0.859375   0.87147335]
Accuracy of XGBoost cross-vaild test: 0.867419670846395


## P4：Label Encoding (這裡不用)

In [27]:
df= pd.read_csv('winequality-red.csv')
df=df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['quality']#應變數
X=df.drop(columns='quality') 
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [28]:
# One hot encoding 後 XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4 = cross_val_score(clf4,X,y,cv=5,scoring='accuracy')
print(scores4)
print('Accuracy of XGBoost cross-vaild test:',scores4.mean())

# Accuracy of XGBoost cross-vaild test: 0.9408522650893938

[0.875      0.821875   0.865625   0.84375    0.88714734]
Accuracy of XGBoost cross-vaild test: 0.8586794670846395


In [29]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X,y)
scores41 = cross_val_score(clf41,X,y,cv=5,scoring='accuracy')
print(scores41)
print('Accuracy of RandomForest cross-vaild test:',scores41.mean())

# Accuracy of RandomForest cross-vaild test: 0.9327261728726979

[0.875      0.846875   0.871875   0.85625    0.88087774]
Accuracy of RandomForest cross-vaild test: 0.8661755485893418


In [30]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X,y)
scores42 = cross_val_score(clf42,X,y,cv=5,scoring='accuracy')
print(scores42)
print('Accuracy of Lightgbm cross-vaild test:',scores42.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9373706143298831

[0.875      0.81875    0.878125   0.853125   0.88087774]
Accuracy of Lightgbm cross-vaild test: 0.8611755485893419


In [31]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X,y)
scores43 = cross_val_score(clf43,X,y,cv=5,scoring='accuracy')
print(scores43)
print('Accuracy of MLP cross-vaild test:',scores43.mean())
# Accuracy of MLP cross-vaild test: 0.7034816507595107



[0.8875     0.846875   0.9        0.84375    0.84639498]
Accuracy of MLP cross-vaild test: 0.8649039968652037


In [32]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X,y)
scores44 = cross_val_score(clf44,X,y,cv=5,scoring='accuracy')
print(scores44)
print('Accuracy of SVM cross-vaild test:',scores44.mean())
# Accuracy of SVM cross-vaild test: 0.8364296276381233

[0.865625   0.865625   0.8625     0.8625     0.86520376]
Accuracy of SVM cross-vaild test: 0.8642907523510971


## P4：One Hot Encoding

In [33]:
# 清除 Label Encoder 後，再做 One Hot Encoding
X2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide_1,free sulfur dioxide_2,free sulfur dioxide_3,free sulfur dioxide_4,free sulfur dioxide_5,...,total sulfur dioxide_2,total sulfur dioxide_3,total sulfur dioxide_4,total sulfur dioxide_5,total sulfur dioxide_6,total sulfur dioxide_7,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,1,0,0,0,0,...,0,0,0,0,0,0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,0,1,0,0,0,...,1,0,0,0,0,0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,1,0,0,0,0,...,0,1,0,0,0,0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,0,0,1,0,0,...,0,1,0,0,0,0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,1,0,0,0,0,...,0,0,0,0,0,0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,0,0,0,0,1,...,0,1,0,0,0,0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,0,0,0,0,0,...,0,1,0,0,0,0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,0,1,0,0,0,...,0,1,0,0,0,0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,0,0,0,0,1,...,0,1,0,0,0,0,0.99547,3.57,0.71,10.2


In [34]:
# One hot encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4a = cross_val_score(clf4,X2,y,cv=5,scoring='accuracy')
print(scores4a)
print('Accuracy of XGBoost cross-vaild test:',scores4a.mean())
# Accuracy of XGBoost cross-vaild test: 0.9420217771205808

[0.853125   0.84375    0.8625     0.8375     0.85579937]
Accuracy of XGBoost cross-vaild test: 0.8505348746081506


In [35]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X2,y)
scores41a = cross_val_score(clf41,X2,y,cv=5,scoring='accuracy')
print(scores41a)
print('Accuracy of RandomForest cross-vaild test:',scores41a.mean())

# Accuracy of RandomForest cross-vaild test: 0.9223215485952412

[0.871875   0.859375   0.88125    0.85625    0.86833856]
Accuracy of RandomForest cross-vaild test: 0.8674177115987461


In [36]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X2,y)
scores42a = cross_val_score(clf42,X2,y,cv=5,scoring='accuracy')
print(scores42a)
print('Accuracy of Lightgbm cross-vaild test:',scores42a.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9362212662992337

[0.86875    0.84375    0.86875    0.85       0.87774295]
Accuracy of Lightgbm cross-vaild test: 0.8617985893416927


In [37]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X2,y)
scores43a = cross_val_score(clf43,X2,y,cv=5,scoring='accuracy')
print(scores43a)
print('Accuracy of MLP cross-vaild test:',scores43a.mean())
# Accuracy of MLP cross-vaild test: 0.8364296276381233



[0.86875    0.81875    0.9        0.7375     0.86206897]
Accuracy of MLP cross-vaild test: 0.8374137931034482




In [38]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X2,y)
scores44a = cross_val_score(clf44,X2,y,cv=5,scoring='accuracy')
print(scores44a)
print('Accuracy of SVM cross-vaild test:',scores44a.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding: 0.8364296276381233

[0.865625   0.865625   0.8625     0.8625     0.86520376]
Accuracy of SVM cross-vaild test: 0.8642907523510971


## P4：Frequency Encoding

In [39]:
# Frequency encoding：用類別出現頻率當作該類別數值
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [40]:
X4b=pd.DataFrame(X)
encwork=X4b['total sulfur dioxide'].value_counts()
X4b['total sulfur dioxide']=X4b['total sulfur dioxide'].apply(lambda x : encwork[x]) 
encwork=X4b['total sulfur dioxide'].value_counts()
X4b['total sulfur dioxide']=X4b['total sulfur dioxide'].apply(lambda x : encwork[x]) 

In [41]:
X4b

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,100,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,24,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,100,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,36,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,100,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,23,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,22,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,68,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,23,0.99547,3.57,0.71,10.2


In [42]:
# Frequency encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4b= XGBClassifier(**params)

# 5-fold validation
scores4b = cross_val_score(clf4b,X4b,y,cv=5,scoring='accuracy')
print(scores4b)
print('Accuracy of XGBoost cross-vaild test:',scores4b.mean())

# Accuracy of XGBoost cross-vaild test: 0.936194380965183

[0.871875   0.821875   0.8625     0.834375   0.85266458]
Accuracy of XGBoost cross-vaild test: 0.8486579153605017


In [43]:
# Frequency encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41b=RF.fit(X4b,y)
scores41b = cross_val_score(clf41b,X4b,y,cv=5,scoring='accuracy')
print(scores41b)
print('Accuracy of RandomForest cross-vaild test:',scores41b.mean())

# Accuracy of RandomForest cross-vaild test: 0.9315633821750235

[0.884375   0.846875   0.871875   0.859375   0.88087774]
Accuracy of RandomForest cross-vaild test: 0.8686755485893418


In [44]:
# Frequency encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42b=LGBM.fit(X4b,y)
scores42b = cross_val_score(clf42b,X4b,y,cv=5,scoring='accuracy')
print(scores42b)
print('Accuracy of Lightgbm cross-vaild test:',scores42b.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9408321010888561

[0.86875    0.85       0.871875   0.85       0.87460815]
Accuracy of Lightgbm cross-vaild test: 0.8630466300940439


In [45]:
# Frequency encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43b=MLP.fit(X4b,y)
scores43b = cross_val_score(clf43b,X4b,y,cv=5,scoring='accuracy')
print(scores43b)
print('Accuracy of MLP cross-vaild test:',scores43b.mean())
# Accuracy of MLP cross-vaild test: 0.8364296276381233



[0.865625   0.859375   0.859375   0.846875   0.86520376]
Accuracy of MLP cross-vaild test: 0.8592907523510972




In [46]:
# Frequency encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44b=svm.fit(X4b,y)
scores44b = cross_val_score(clf44b,X4b,y,cv=5,scoring='accuracy')
print(scores44b)
print('Accuracy of SVM cross-vaild test:',scores44b.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding: 0.8364296276381233

[0.865625   0.865625   0.8625     0.8625     0.86520376]
Accuracy of SVM cross-vaild test: 0.8642907523510971


## P4：Target Encoding

In [47]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [48]:
df= pd.read_csv('winequality-red.csv')
df=df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['quality']#應變數
X=df.drop(columns='quality') 
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [50]:
# Target encoding：使用 Target (預測目標) 來達成 Features 的 Encoding
# 清除 Label Encoder
# 類別特徵：workclass, marital-status, occupation, relationship, race, sex, native-country
from category_encoders import *

enc = TargetEncoder(cols=['free sulfur dioxide','total sulfur dioxide'],
                    min_samples_leaf=20, smoothing=10).fit(X, y)
X4c = enc.transform(X)

In [51]:
X4c

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,0.135596,0.167855,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,0.104353,0.171143,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,0.089882,0.117855,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,0.100642,0.074618,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,0.135596,0.167855,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,0.111077,0.157655,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,0.110953,0.096483,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,0.107704,0.077957,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,0.111077,0.157655,0.99547,3.57,0.71,10.2


In [52]:
# Target encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier

params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores4c = cross_val_score(clf4,X4c,y,cv=5,scoring='accuracy')
print(scores4c)
print('Accuracy of XGBoost cross-vaild test:',scores4c.mean())

# Accuracy of XGBoost cross-vaild test: 0.9512770533673882

[0.9        0.859375   0.88125    0.825      0.87774295]
Accuracy of XGBoost cross-vaild test: 0.8686735893416928


In [53]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4c,y)
scores41c = cross_val_score(clf41,X4c,y,cv=5,scoring='accuracy')
print(scores41c)
print('Accuracy of RandomForest cross-vaild test:',scores41c.mean())

# Accuracy of RandomForest cross-vaild test for target encoding: 0.8591567243513352

[0.884375   0.871875   0.9        0.871875   0.89028213]
Accuracy of RandomForest cross-vaild test: 0.8836814263322884


In [54]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4c,y)
scores42c = cross_val_score(clf42,X4c,y,cv=5,scoring='accuracy')
print(scores42c)
print('Accuracy of Lightgbm cross-vaild test:',scores42c.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454765425460412
# Accuracy of Lightgbm cross-vaild test for target encoding: 0.8740826747563275

[0.884375   0.865625   0.89375    0.85       0.89028213]
Accuracy of Lightgbm cross-vaild test: 0.8768064263322884


In [55]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4c,y)
scores43c = cross_val_score(clf43,X4c,y,cv=5,scoring='accuracy')
print(scores43c)
print('Accuracy of MLP cross-vaild test:',scores43c.mean())
# Accuracy of MLP cross-vaild test: 0.5685979298292781 



[0.88125    0.83125    0.90625    0.784375   0.88714734]
Accuracy of MLP cross-vaild test: 0.8580544670846395


In [56]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4c,y)
scores44c = cross_val_score(clf44,X4c,y,cv=5,scoring='accuracy')
print(scores44c)
print('Accuracy of SVM cross-vaild test:',scores44c.mean())
# Accuracy of SVM cross-vaild test: 0.8364296276381233

[0.865625   0.865625   0.8625     0.8625     0.86520376]
Accuracy of SVM cross-vaild test: 0.8642907523510971


## P4：Leave-One-Out Encoding

In [57]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [59]:
encc = LeaveOneOutEncoder(cols=['free sulfur dioxide','total sulfur dioxide'],sigma=0.05).fit(X, y)
X4d = encc.transform(X)

Source：http://contrib.scikit-learn.org/category_encoders/leaveoneout.html 

In [60]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4d = cross_val_score(clf4,X4d,y,cv=5,scoring='accuracy')
print(scores4d)
print('Accuracy of XGBoost cross-vaild test:',scores4d.mean())

# Accuracy of XGBoost cross-vaild test for LOO encoding:  0.9466258905766904

[0.909375   0.821875   0.8875     0.85625    0.87774295]
Accuracy of XGBoost cross-vaild test: 0.8705485893416929


In [61]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4d,y)
scores41d = cross_val_score(clf41,X4d,y,cv=5,scoring='accuracy')
print(scores41d)
print('Accuracy of RandomForest cross-vaild test:',scores41d.mean())

# Accuracy of RandomForest cross-vaild test: 0.9559214948245731

[0.884375   0.865625   0.896875   0.875      0.89028213]
Accuracy of RandomForest cross-vaild test: 0.8824314263322884


In [62]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4d,y)
scores42d = cross_val_score(clf42,X4d,y,cv=5,scoring='accuracy')
print(scores42d)
print('Accuracy of Lightgbm cross-vaild test:',scores42d.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454899852130663

[0.890625   0.853125   0.896875   0.86875    0.90282132]
Accuracy of Lightgbm cross-vaild test: 0.8824392633228839


In [63]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4d,y)
scores43d = cross_val_score(clf43,X4d,y,cv=5,scoring='accuracy')
print(scores43d)
print('Accuracy of MLP cross-vaild test:',scores43d.mean())
# Accuracy of MLP cross-vaild test: 0.5685979298292781



[0.8875     0.825      0.91875    0.8        0.89341693]
Accuracy of MLP cross-vaild test: 0.8649333855799372




In [64]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4d,y)
scores44d = cross_val_score(clf44,X4d,y,cv=5,scoring='accuracy')
print(scores44d)
print('Accuracy of SVM cross-vaild test:',scores44d.mean())
# Accuracy of SVM cross-vaild test: 0.8364296276381233

[0.865625   0.865625   0.8625     0.8625     0.86520376]
Accuracy of SVM cross-vaild test: 0.8642907523510971


## P6：Categorical values of a feature is high (超過20種)

這邊為 native country 共有 42種類別，所以將其進行 encoding，所有 label encoder 的結果視為 baseline

In [92]:
# Standardization *  Label Encoding
# Standardization *  Label Encoding
df= pd.read_csv('winequality-red.csv')
df=df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['quality']#應變數
X=df.drop(columns='quality') 

X6 = pd.DataFrame(X)

disc6a = KBinsDiscretizer(n_bins=30, encode='ordinal', strategy='quantile')
disc6a.fit(X6[['free sulfur dioxide']])
X6[['free sulfur dioxide']] = disc6a.transform(X6[['free sulfur dioxide']])
disc6b = KBinsDiscretizer(n_bins=30, encode='ordinal', strategy='quantile')
disc6b.fit(X6[['total sulfur dioxide']])
X6[['free sulfur dioxide']] = disc6b.transform(X6[['free sulfur dioxide']])

Feature names unseen at fit time:
- free sulfur dioxide
Feature names seen at fit time, yet now missing:
- total sulfur dioxide



In [93]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [94]:
# One Hot Encoder
from category_encoders import * 
enc6 = OneHotEncoder(cols=['free sulfur dioxide','total sulfur dioxide']).fit(X,y)
X6 = enc6.transform(X)

In [95]:
X6

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide_1,free sulfur dioxide_2,free sulfur dioxide_3,free sulfur dioxide_4,free sulfur dioxide_5,...,total sulfur dioxide_139,total sulfur dioxide_140,total sulfur dioxide_141,total sulfur dioxide_142,total sulfur dioxide_143,total sulfur dioxide_144,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,1,0,0,0,0,...,0,0,0,0,0,0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,0,1,0,0,0,...,0,0,0,0,0,0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,0,0,1,0,0,...,0,0,0,0,0,0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,0,0,0,1,0,...,0,0,0,0,0,0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,1,0,0,0,0,...,0,0,0,0,0,0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,0,0,0,0,0,...,0,0,0,0,0,0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,0,0,0,0,0,...,0,0,0,0,0,0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,0,0,0,0,0,...,0,0,0,0,0,0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,0,0,0,0,0,...,0,0,0,0,0,0,0.99547,3.57,0.71,10.2


In [96]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf6= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores6 = cross_val_score(clf6,X6,y,cv=5,scoring='accuracy')
print(scores6)
print('Accuracy of XGBoost cross-vaild test:',scores6.mean())

# Accuracy of XGBoost cross-vaild test: 0.935031590267509

[0.85       0.834375   0.878125   0.84375    0.85579937]
Accuracy of XGBoost cross-vaild test: 0.8524098746081504


In [97]:
# Target Encoder
enc6a = TargetEncoder(cols=['free sulfur dioxide','total sulfur dioxide'],min_samples_leaf=20, smoothing=10).fit(X, y)
X6a = enc6a.transform(X)

In [98]:
# XGBoost 
scores6a = cross_val_score(clf6,X6a,y,cv=5,scoring='accuracy')
print(scores6a)
print('Accuracy of XGBoost cross-vaild test:',scores6a.mean())

# Accuracy of XGBoost cross-vaild test: 0.9396894743917192

[0.884375   0.84375    0.875      0.834375   0.90282132]
Accuracy of XGBoost cross-vaild test: 0.8680642633228841


## P7：Undersampling

In [99]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,0.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,6.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,2.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,3.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,0.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,7.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,9.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,7.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,7.0,44.0,0.99547,3.57,0.71,10.2


In [100]:
# 記得跑上面
# Nearmiss
from imblearn.under_sampling import NearMiss
nm = NearMiss(sampling_strategy = 'majority')
X7, y7 = nm.fit_resample(X, y)

In [101]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7 = cross_val_score(clf7,X7,y7,cv=5,scoring='accuracy')
print(scores7)
print('Accuracy of XGBoost cross-vaild test:',scores7.mean())

# Accuracy of XGBoost cross-vaild test: 0.957330827067669

[0.5862069  0.70114943 0.71264368 0.70114943 0.6744186 ]
Accuracy of XGBoost cross-vaild test: 0.6751136059877039


In [102]:
# ClusterCentroids
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(voting='hard')
X7a, y7a = cc.fit_resample(X, y)

In [103]:
scores7a = cross_val_score(clf7,X7a,y7a,cv=5,scoring='accuracy')
print(scores7a)
print('Accuracy of XGBoost cross-vaild test:',scores7a.mean())

# Accuracy of XGBoost cross-vaild test: 0.7978070175438596

[0.63218391 0.68965517 0.75862069 0.74712644 0.74418605]
Accuracy of XGBoost cross-vaild test: 0.7143544506816359


In [104]:
# EditedNN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X7c, y7c = en.fit_resample(X, y)

In [105]:
scores7c = cross_val_score(clf7,X7c,y7c,cv=5,scoring='accuracy')
print(scores7c)
print('Accuracy of XGBoost cross-vaild test:',scores7c.mean())

# Accuracy: 0.9586666666666668

[0.88167939 0.86641221 0.91603053 0.8129771  0.86641221]
Accuracy of XGBoost cross-vaild test: 0.8687022900763358


In [106]:
# Neighbourhood Cleaning Rule
from imblearn.under_sampling import NeighbourhoodCleaningRule
ecr = NeighbourhoodCleaningRule()
X7d, y7d = ecr.fit_resample(X, y)

In [107]:
scores7d = cross_val_score(clf7,X7d,y7d,cv=5,scoring='accuracy')
print(scores7d)
print('Accuracy of XGBoost cross-vaild test:',scores7d.mean())

# Accuracy: 0.9472198368398456

[0.86538462 0.83846154 0.91153846 0.85384615 0.88803089]
Accuracy of XGBoost cross-vaild test: 0.8714523314523316


In [108]:
# Tomek Links
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X7e, y7e = tl.fit_resample(X, y)

In [109]:
scores7e = cross_val_score(clf7,X7e,y7e,cv=5,scoring='accuracy')
print(scores7e)
print('Accuracy of XGBoost cross-vaild test:',scores7e.mean())

# Accuracy: 0.9322485207100591

[0.87859425 0.84345048 0.88782051 0.84615385 0.87179487]
Accuracy of XGBoost cross-vaild test: 0.8655627918407471


In [110]:
# One Sided Selection
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X7f, y7f = oss.fit_resample(X, y)

In [111]:
scores7f = cross_val_score(clf7,X7f,y7f,cv=5,scoring='accuracy')
print(scores7f)
print('Accuracy of XGBoost cross-vaild test:',scores7f.mean())

# Accuracy: 0.9385542168674699

[0.87179487 0.82315113 0.8778135  0.84244373 0.86173633]
Accuracy of XGBoost cross-vaild test: 0.8553879132657268


## P7：Oversampling

In [112]:
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X7g, y7g = smote.fit_resample(X, y)

In [113]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7g = cross_val_score(clf7,X7g,y7g,cv=5,scoring='accuracy')
print(scores7g)
print('Accuracy of XGBoost cross-vaild test:',scores7g.mean())

# Accuracy: 0.948709630911188

[0.90958409 0.88969259 0.93309222 0.86799277 0.91847826]
Accuracy of XGBoost cross-vaild test: 0.9037679849044735


In [114]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X7h, y7h = bsmote.fit_resample(X, y)

In [115]:
scores7h = cross_val_score(clf7,X7h,y7h,cv=5,scoring='accuracy')
print(scores7h)
print('Accuracy of XGBoost cross-vaild test:',scores7h.mean())

# Accuracy: 0.9473327566320646

[0.90235081 0.8915009  0.93851718 0.86980108 0.92028986]
Accuracy of XGBoost cross-vaild test: 0.9044919673978562


In [116]:
# ADASYN
from imblearn.over_sampling import ADASYN
adasyn = ADASYN()
X7i, y7i = adasyn.fit_resample(X, y)

In [117]:
scores7i = cross_val_score(clf7,X7i,y7i,cv=5,scoring='accuracy')
print(scores7i)
print('Accuracy of XGBoost cross-vaild test:',scores7i.mean())

# Accuracy: 0.9323432664896079

[0.90235081 0.89492754 0.9384058  0.85869565 0.91485507]
Accuracy of XGBoost cross-vaild test: 0.9018469743428466


## P7：Ensemble

In [118]:
# SMOTE + ENN
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X7j, y7j = smotenn.fit_resample(X, y)

In [119]:
scores7j = cross_val_score(clf7,X7j,y7j,cv=5,scoring='accuracy')
print(scores7j)
print('Accuracy of XGBoost cross-vaild test:',scores7j.mean())

# Accuracy: 0.9785811232163164

[0.97130243 0.94260486 0.97130243 0.91169978 0.90929204]
Accuracy of XGBoost cross-vaild test: 0.9412403055343921


In [120]:
# SMOTE + Tomek Links
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X7k, y7k = smotetl.fit_resample(X, y)

In [121]:
scores7k = cross_val_score(clf7,X7k,y7k,cv=5,scoring='accuracy')
print(scores7k)
print('Accuracy of XGBoost cross-vaild test:',scores7k.mean())

# Accuracy: 0.9412366580787633

[0.90217391 0.86594203 0.94192377 0.87477314 0.92558984]
Accuracy of XGBoost cross-vaild test: 0.9020805386780294


SMOTE+ENN 效果最好

## P8：SMOTE‐based Oversampling

In [122]:
# 記得跑上面
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X8, y8 = smote.fit_resample(X, y)

In [123]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf8= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores8 = cross_val_score(clf8,X8,y8,cv=5,scoring='accuracy')
print(scores8)
print('Accuracy of XGBoost cross-vaild test:',scores8.mean())

# Accuracy: 0.8759304207119742

[0.91681736 0.88788427 0.9403255  0.86256781 0.91123188]
Accuracy of XGBoost cross-vaild test: 0.9037653641533664


In [124]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X8a, y8a = bsmote.fit_resample(X, y)

In [125]:
# XGBoost 
scores8a = cross_val_score(clf8,X8a,y8a,cv=5,scoring='accuracy')
print(scores8a)
print('Accuracy of XGBoost cross-vaild test:',scores8a.mean())

# Accuracy: 0.8759304207119742

[0.89330922 0.88607595 0.94755877 0.86980108 0.92210145]
Accuracy of XGBoost cross-vaild test: 0.9037692952800273


## P9：Imbalance Ratio vs. Resampling Strategy

In [126]:
# 見 P7
# 要與其他資料集比較

## P10：ML algorithms vs. different resampling strategies (ENN)

In [127]:
# 使用方法 (參考最好的組合與相關的方法)
# ENN
# Tomek Links
# One Sided Selection
# SMOTE
# Borderline-SMOTE
# SMOTE + ENN
# SMOTE + Tomek Links

In [128]:
# ENN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X0, y0 = en.fit_resample(X, y)

In [129]:
# XGBoost Accuracy: 0.8589394520028113
# Random Forest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf0=RF.fit(X0,y0)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores0 = cross_val_score(clf0,X0,y0,cv=5,scoring='accuracy')
print(scores0)
print('Accuracy of RandomForest cross-vaild test:',scores0.mean())

# Accuracy of RandomForest cross-vaild test : 0.952

[0.8778626  0.86259542 0.93129771 0.85114504 0.89312977]
Accuracy of RandomForest cross-vaild test: 0.8832061068702292


In [130]:
# Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf0a=LGBM.fit(X0,y0)
scores0a = cross_val_score(clf0a,X0,y0,cv=5,scoring='accuracy')
print(scores0a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.86641221 0.83969466 0.91603053 0.85496183 0.89312977]
Accuracy of Lightgbm cross-vaild test: 0.8740458015267176


In [131]:
# MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf0b=MLP.fit(X0,y0)
scores0b = cross_val_score(clf0b,X0,y0,cv=5,scoring='accuracy')
print(scores0b)
print('Accuracy of MLP cross-vaild test:',scores0b.mean())
# Accuracy of MLP cross-vaild test: 0.8733333333333334



[0.86259542 0.91221374 0.93129771 0.88931298 0.83969466]
Accuracy of MLP cross-vaild test: 0.8870229007633588




In [132]:
# SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf0c=svm.fit(X0,y0)
scores0c = cross_val_score(clf0c,X0,y0,cv=5,scoring='accuracy')
print(scores0c)
print('Accuracy of SVM cross-vaild test:',scores0c.mean())
# Accuracy of SVM cross-vaild test : 0.852

[0.83587786 0.83587786 0.83587786 0.83206107 0.83206107]
Accuracy of SVM cross-vaild test: 0.834351145038168


## P10：Tomek Links

In [133]:
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X01, y01 = tl.fit_resample(X, y)

In [134]:
scores01 = cross_val_score(clf0,X01,y01,cv=5,scoring='accuracy')
print(scores01)
print('Accuracy of RandomForest cross-vaild test:',scores01.mean())

# Accuracy of RandomForest cross-vaild test : 0.9370245139475909

[0.88817891 0.84025559 0.88461538 0.88141026 0.88141026]
Accuracy of RandomForest cross-vaild test: 0.875174080445646


In [135]:
scores01a = cross_val_score(clf0a,X01,y01,cv=5,scoring='accuracy')
print(scores01a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test :  0.9426666666666668

[0.87539936 0.83386581 0.88461538 0.84294872 0.86538462]
Accuracy of Lightgbm cross-vaild test: 0.8740458015267176


In [136]:
scores01b = cross_val_score(clf0b,X01,y01,cv=5,scoring='accuracy')
print(scores01b)
print('Accuracy of MLP cross-vaild test:',scores01b.mean())
# Accuracy of MLP cross-vaild test: 0.8692096365173289



[0.88817891 0.8370607  0.91346154 0.85576923 0.84615385]
Accuracy of MLP cross-vaild test: 0.8681248463996069




In [137]:
scores01c = cross_val_score(clf0c,X01,y01,cv=5,scoring='accuracy')
print(scores01c)
print('Accuracy of SVM cross-vaild test:',scores01c.mean())
# Accuracy of SVM cross-vaild test : 0.8478021978021978

[0.85942492 0.85942492 0.86217949 0.86217949 0.86217949]
Accuracy of SVM cross-vaild test: 0.8610776603588105


## P10：One Sided Selection

In [138]:
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X02, y02 = oss.fit_resample(X, y)

In [139]:
scores02 = cross_val_score(clf0,X02,y02,cv=5,scoring='accuracy')
print(scores02)
print('Accuracy of RandomForest cross-vaild test:',scores02.mean())
# Accuracy of RandomForest cross-vaild test : 0.9268595339441598

[0.87741935 0.83870968 0.86451613 0.8516129  0.87741935]
Accuracy of RandomForest cross-vaild test: 0.8619354838709679


In [140]:
scores02a = cross_val_score(clf0a,X02,y02,cv=5,scoring='accuracy')
print(scores02a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.88064516 0.83870968 0.88064516 0.83870968 0.87741935]
Accuracy of Lightgbm cross-vaild test: 0.8740458015267176


In [141]:
scores02b = cross_val_score(clf0b,X02,y02,cv=5,scoring='accuracy')
print(scores02b)
print('Accuracy of MLP cross-vaild test:',scores02b.mean())
# Accuracy of MLP cross-vaild test:  0.8573118822595773



[0.87741935 0.8483871  0.89677419 0.85806452 0.8516129 ]
Accuracy of MLP cross-vaild test: 0.8664516129032258




In [142]:
scores02c = cross_val_score(clf0c,X02,y02,cv=5,scoring='accuracy')
print(scores02c)
print('Accuracy of SVM cross-vaild test:',scores02c.mean())
# Accuracy of SVM cross-vaild test : 0.8441165861048987

[0.86129032 0.86129032 0.86129032 0.85806452 0.85806452]
Accuracy of SVM cross-vaild test: 0.86


## P10：SMOTE

In [143]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X03, y03 = smote.fit_resample(X, y)

In [144]:
scores03 = cross_val_score(clf0,X03,y03,cv=5,scoring='accuracy')
print(scores03)
print('Accuracy of RandomForest cross-vaild test:',scores03.mean())
# Accuracy of RandomForest cross-vaild test : 0.9403714917339485

[0.95660036 0.88065099 0.94213382 0.84267631 0.91666667]
Accuracy of RandomForest cross-vaild test: 0.9077456298975287


In [145]:
scores03a = cross_val_score(clf0a,X03,y03,cv=5,scoring='accuracy')
print(scores03a)
print('Accuracy of Lightgbm cross-vaild test:',scores03a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9535539215686274

[0.88969259 0.88065099 0.94755877 0.85714286 0.91304348]
Accuracy of Lightgbm cross-vaild test: 0.8976177372434939


In [146]:
scores03b = cross_val_score(clf0b,X03,y03,cv=5,scoring='accuracy')
print(scores03b)
print('Accuracy of MLP cross-vaild test:',scores03b.mean())
# Accuracy of MLP cross-vaild test: 0.8217920991926182



[0.87522604 0.7323689  0.8318264  0.73779385 0.80797101]
Accuracy of MLP cross-vaild test: 0.7970372408732344




In [147]:
scores03c = cross_val_score(clf0c,X03,y03,cv=5,scoring='accuracy')
print(scores03c)
print('Accuracy of SVM cross-vaild test:',scores03c.mean())
# Accuracy of SVM cross-vaild test : 0.6685097078046904

[0.83363472 0.67088608 0.7522604  0.67811935 0.73007246]
Accuracy of SVM cross-vaild test: 0.7329946012527191


## P10：Borderline-SMOTE

In [148]:
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X04, y04 = bsmote.fit_resample(X, y)

In [149]:
scores04 = cross_val_score(clf0,X04,y04,cv=5,scoring='accuracy')
print(scores04)
print('Accuracy of RandomForest cross-vaild test:',scores04.mean())
# Accuracy of RandomForest cross-vaild test : 0.9396914648212226

[0.93490054 0.86980108 0.94213382 0.86256781 0.9057971 ]
Accuracy of RandomForest cross-vaild test: 0.9030400712844301


In [150]:
scores04a = cross_val_score(clf0a,X04,y04,cv=5,scoring='accuracy')
print(scores04a)
print('Accuracy of Lightgbm cross-vaild test:',scores04a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.948027201076509

[0.88969259 0.8915009  0.9403255  0.8517179  0.91666667]
Accuracy of Lightgbm cross-vaild test: 0.8979807112718505


In [151]:
scores04b = cross_val_score(clf0b,X04,y04,cv=5,scoring='accuracy')
print(scores04b)
print('Accuracy of MLP cross-vaild test:',scores04b.mean())
# Accuracy of MLP cross-vaild test: 0.8252619184928873



[0.88245931 0.74502712 0.84267631 0.73598553 0.78985507]
Accuracy of MLP cross-vaild test: 0.7992006709122834




In [152]:
scores04c = cross_val_score(clf0c,X04,y04,cv=5,scoring='accuracy')
print(scores04c)
print('Accuracy of SVM cross-vaild test:',scores04c.mean())
# Accuracy of SVM cross-vaild test : 0.6733732218377547

[0.86075949 0.66184448 0.7721519  0.65641953 0.72826087]
Accuracy of SVM cross-vaild test: 0.7358872552873654


## P10：SMOTE + ENN

In [153]:
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X05, y05 = smotenn.fit_resample(X, y)

In [154]:
scores05 = cross_val_score(clf0,X05,y05,cv=5,scoring='accuracy')
print(scores05)
print('Accuracy of RandomForest cross-vaild test:',scores05.mean())
# Accuracy of RandomForest cross-vaild test : 0.9791304347826086

[0.9800885  0.92920354 0.97339246 0.89578714 0.92461197]
Accuracy of RandomForest cross-vaild test: 0.9406167219355218


In [155]:
scores05a = cross_val_score(clf0a,X05,y05,cv=5,scoring='accuracy')
print(scores05a)
print('Accuracy of Lightgbm cross-vaild test:',scores05a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.982608695652174

[0.9800885  0.93141593 0.98004435 0.90465632 0.91574279]
Accuracy of Lightgbm cross-vaild test: 0.942389576751761


In [156]:
scores05b = cross_val_score(clf0b,X05,y05,cv=5,scoring='accuracy')
print(scores05b)
print('Accuracy of MLP cross-vaild test:',scores05b.mean())
# Accuracy of MLP cross-vaild test:  0.8626086956521739



[0.8960177  0.78982301 0.90909091 0.79379157 0.77827051]
Accuracy of MLP cross-vaild test: 0.8333987402625436




In [157]:
scores05c = cross_val_score(clf0c,X05,y05,cv=5,scoring='accuracy')
print(scores05c)
print('Accuracy of SVM cross-vaild test:',scores05c.mean())
# Accuracy of SVM cross-vaild test : 0.7417391304347826

[0.88274336 0.76327434 0.77161863 0.74722838 0.65853659]
Accuracy of SVM cross-vaild test: 0.7646802582265565


## P10：SMOTE + Tomek Links

In [158]:
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X06, y06 = smotetl.fit_resample(X, y)

In [159]:
scores06 = cross_val_score(clf0,X06,y06,cv=5,scoring='accuracy')
print(scores06)
print('Accuracy of RandomForest cross-vaild test:',scores06.mean())
# Accuracy of RandomForest cross-vaild test : 0.9413830072366658

[0.94746377 0.87137681 0.9491833  0.86751361 0.91288566]
Accuracy of RandomForest cross-vaild test: 0.9096846313685262


In [160]:
scores06a = cross_val_score(clf0a,X06,y06,cv=5,scoring='accuracy')
print(scores06a)
print('Accuracy of Lightgbm cross-vaild test:',scores06a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9490777515167759

[0.90036232 0.87862319 0.95099819 0.86388385 0.92740472]
Accuracy of Lightgbm cross-vaild test: 0.9042544517215078


In [161]:
scores06b = cross_val_score(clf0b,X06,y06,cv=5,scoring='accuracy')
print(scores06b)
print('Accuracy of MLP cross-vaild test:',scores06b.mean())
# Accuracy of MLP cross-vaild test: 0.7955191150313101



[0.86050725 0.63949275 0.8076225  0.73684211 0.79310345]
Accuracy of MLP cross-vaild test: 0.767513611615245




In [162]:
scores06c = cross_val_score(clf0c,X06,y06,cv=5,scoring='accuracy')
print(scores06c)
print('Accuracy of SVM cross-vaild test:',scores06c.mean())
# Accuracy of SVM cross-vaild test : 0.6887673302307448

[0.83152174 0.67934783 0.74410163 0.67150635 0.65517241]
Accuracy of SVM cross-vaild test: 0.7163299928982877


## 輸出

In [165]:
data=pd.DataFrame([[scores1.mean(),scores11.mean()],
         [scores1.mean(),scores2.mean()],         
         [scores3.mean(),scores31.mean()],
         [scores4.mean(),scores41.mean(),scores42.mean(),scores43.mean(),scores44.mean()],
         [scores4a.mean(),scores41a.mean(),scores42a.mean(),scores43a.mean(),scores44a.mean()],
         [scores4b.mean(),scores41b.mean(),scores42b.mean(),scores43b.mean(),scores44b.mean()],
         [scores4c.mean(),scores41c.mean(),scores42c.mean(),scores43c.mean(),scores44c.mean()],
         [scores4d.mean(),scores41d.mean(),scores42d.mean(),scores43d.mean(),scores44d.mean()],
         #[scores5.mean(),scores5a.mean(),scores5b.mean(),scores5c.mean(),scores5d.mean(),scores5e.mean()],
         [scores1.mean(),scores6.mean(),scores6a.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
          scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores7g.mean(),scores7h.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
          scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores0.mean(),scores0a.mean(),scores0b.mean(),scores0c.mean()],
         [scores01.mean(),scores01a.mean(),scores01b.mean(),scores01c.mean()],
         [scores02.mean(),scores02a.mean(),scores02b.mean(),scores02c.mean()],
         [scores03.mean(),scores03a.mean(),scores03b.mean(),scores03c.mean()],
         [scores04.mean(),scores04a.mean(),scores04b.mean(),scores04c.mean()],
         [scores05.mean(),scores05a.mean(),scores05b.mean(),scores05c.mean()],
         [scores06.mean(),scores06a.mean(),scores06b.mean(),scores06c.mean()]],
         index=['Q1','Q2','Q3','Q4Label','Q4One','Q4F','Q4T','Q4LOL','Q6','Q7','Q8','Q9','Q10-1','Q10-2','Q10-3','Q10-4','Q10-5','Q10-6','Q10-7']     )

In [166]:
data.to_csv('Winequality_Result.csv')