## 讀取資料 (Default：Label Encoding)

In [1]:
#讀取資料
import pandas as pd
import numpy as np

df= pd.read_csv('income_evaluation.csv')

In [2]:
# Label Emcoder，將部分字串轉數值
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' native-country',' income']]:
    df[col]=le.fit_transform(df[col])

In [3]:
#資料前處理 
df=df.drop(columns=' education') #此列已經數值化(education-num)，所以丟棄
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df[' income']#應變數
X=df.drop(columns=' income') #剩下的為自變數

In [4]:
y

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name:  income, Length: 32561, dtype: int64

## P1：標準化是否影響結果

In [None]:
#先不經標準化做XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf1= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores1 = cross_val_score(clf1,X,y,cv=5,scoring='accuracy')
print(scores1)
print('Accuracy of XGBoost cross-vaild test:',scores1.mean())

[0.85997236 0.85841523 0.86194717 0.86624693 0.86624693]
Accuracy of XGBoost cross-vaild test: 0.8625657256645279


In [None]:
#使用經過LabelEncoder編碼的特徵，標準化 (因為變數值間有大有小)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X1 = sc.fit_transform(X)

In [None]:
#標準化後XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf11= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores11 = cross_val_score(clf11,X1,y,cv=5,scoring='accuracy')
print(scores11)
print('Accuracy of XGBoost cross-vaild test:',scores11.mean())

##標準化在XGBoost的交叉驗證上沒有差異太大

[0.85997236 0.8585688  0.86210074 0.86624693 0.86624693]
Accuracy of XGBoost cross-vaild test: 0.8626271507259531


## P2：One-hot Encoding vs. Label Encoding on Tree-based method

In [None]:
# 清除 Label Encoder 後，再做 One Hot Encoding
df= pd.read_csv('income_evaluation.csv')
X2=df.drop(columns=' income')
X2 = pd.get_dummies(X2)
pd.DataFrame(X2)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,9,0,0,20,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf2= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores2 = cross_val_score(clf2,X2,y,cv=5,scoring='accuracy')
print(scores2)
print('Accuracy of XGBoost cross-vaild test:',scores2.mean())

# Accuracy of XGBoost cross-vaild test for Label Encoder: 0.8625657256645279
# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.8605081039961279
# 交叉驗證為 Label Encoder 結果略佳
# 可能在 one-hot encoding 會陷入 curse of dimensionaility

[0.85613389 0.85749386 0.86394349 0.86578624 0.85918305]
Accuracy of XGBoost cross-vaild test: 0.8605081039961279


## P3：Feature Binning 有沒有效果

In [None]:
import pandas as pd
import numpy as np
# For 繪製敘述統計
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import scipy.stats as stats
from sklearn.model_selection import train_test_split
# for discretization
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Equal width Binning
disc1 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc1.fit(X[['age']])
disc2 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc2.fit(X[[' fnlwgt']])
disc3 = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
disc3.fit(X[[' capital-gain']])
disc4 = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
disc4.fit(X[[' capital-loss']])
disc5 = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
disc5.fit(X[[' hours-per-week']])

KBinsDiscretizer(encode='ordinal', strategy='uniform')

In [None]:
X3 = X #inherit
X3[['age']]=disc1.transform(X3[['age']])
X3[[' fnlwgt']]=disc2.transform(X3[[' fnlwgt']])
X3[[' capital-gain']]=disc3.transform(X3[[' capital-gain']])
X3[[' capital-loss']]=disc4.transform(X3[[' capital-loss']])
X3[[' capital-loss']]=disc5.transform(X3[[' hours-per-week']])
X3 = pd.DataFrame(X3)

In [None]:
disc1.bin_edges_

array([array([17. , 24.3, 31.6, 38.9, 46.2, 53.5, 60.8, 68.1, 75.4, 82.7, 90. ])],
      dtype=object)

In [None]:
X3

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,3.0,7,0.0,13,4,1,1,4,1,0.0,1.0,40,39
1,4.0,6,0.0,13,2,4,0,4,1,0.0,0.0,13,39
2,2.0,4,1.0,9,0,6,1,4,1,0.0,1.0,40,39
3,4.0,4,1.0,7,2,6,0,2,1,0.0,1.0,40,39
4,1.0,4,2.0,13,2,10,5,2,0,0.0,1.0,40,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1.0,4,1.0,12,2,13,5,4,0,0.0,1.0,38,39
32557,3.0,4,0.0,9,2,7,0,4,1,0.0,1.0,40,39
32558,5.0,4,0.0,9,6,1,4,4,0,0.0,1.0,40,39
32559,0.0,4,1.0,9,4,1,3,4,1,0.0,0.0,20,39


In [None]:
#X3=pd.concat([X3,X],axis=1)
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

In [None]:
#X3

Unnamed: 0,age_wb,fnlwgt_wb,capital-gain_wb,capital-loss_wb,hours-per-week_wb,workclass,education-num,marital-status,occupation,relationship,race,sex,native-country,age_wb.1,fnlwgt_wb.1,capital-gain_wb.1,capital-loss_wb.1,hours-per-week_wb.1
0,3.0,0.0,0.0,0.0,1.0,7,13,4,1,1,4,1,39,3.0,0.0,0.0,0.0,1.0
1,4.0,0.0,0.0,0.0,0.0,6,13,2,4,0,4,1,39,4.0,0.0,0.0,0.0,0.0
2,2.0,1.0,0.0,0.0,1.0,4,9,0,6,1,4,1,39,2.0,1.0,0.0,0.0,1.0
3,4.0,1.0,0.0,0.0,1.0,4,7,2,6,0,2,1,39,4.0,1.0,0.0,0.0,1.0
4,1.0,2.0,0.0,0.0,1.0,4,13,2,10,5,2,0,5,1.0,2.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1.0,1.0,0.0,0.0,1.0,4,12,2,13,5,4,0,39,1.0,1.0,0.0,0.0,1.0
32557,3.0,0.0,0.0,0.0,1.0,4,9,2,7,0,4,1,39,3.0,0.0,0.0,0.0,1.0
32558,5.0,0.0,0.0,0.0,1.0,4,9,6,1,4,4,0,39,5.0,0.0,0.0,0.0,1.0
32559,0.0,1.0,0.0,0.0,0.0,4,9,4,1,3,4,1,39,0.0,1.0,0.0,0.0,0.0


In [None]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores3 = cross_val_score(clf3,X3,y,cv=5,scoring='accuracy')
print(scores3)
print('Accuracy of XGBoost cross-vaild test:',scores3.mean())

##Equal Width Binning 在 XGBoost 的交叉驗證上表現較差

[0.83325656 0.83353808 0.8264742  0.84090909 0.84444103]
Accuracy of XGBoost cross-vaild test: 0.8357237943315787


In [None]:
# Equal Frequency Binning
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['age']])
disc21 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc21.fit(X[[' fnlwgt']])
disc31 = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')
disc31.fit(X[[' capital-gain']])
disc41 = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')
disc41.fit(X[[' capital-loss']])
disc51 = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
disc51.fit(X[[' hours-per-week']])



KBinsDiscretizer(encode='ordinal')

In [None]:
disc11.bin_edges_

array([array([17., 22., 26., 30., 33., 37., 41., 45., 50., 58., 90.])],
      dtype=object)

In [None]:
X31 = X #inherit
X31[['age']]=disc11.transform(X31[['age']])
X31[[' fnlwgt']]=disc21.transform(X31[[' fnlwgt']])
X31[[' capital-gain']]=disc31.transform(X31[[' capital-gain']])
X31[[' capital-loss']]=disc41.transform(X31[[' capital-loss']])
X31[[' capital-loss']]=disc51.transform(X31[[' hours-per-week']])
X31 = pd.DataFrame(X31)

In [None]:
X31

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,5.0,7,1.0,13,4,1,1,4,1,0.0,2.0,40,39
1,8.0,6,1.0,13,2,4,0,4,1,0.0,0.0,13,39
2,5.0,4,6.0,9,0,6,1,4,1,0.0,2.0,40,39
3,8.0,4,7.0,7,2,6,0,2,1,0.0,2.0,40,39
4,2.0,4,9.0,13,2,10,5,2,0,0.0,2.0,40,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,2.0,4,7.0,12,2,13,5,4,0,0.0,1.0,38,39
32557,5.0,4,3.0,9,2,7,0,4,1,0.0,2.0,40,39
32558,9.0,4,3.0,9,6,1,4,4,0,0.0,2.0,40,39
32559,1.0,4,6.0,9,4,1,3,4,1,0.0,0.0,20,39


In [None]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf31= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores31 = cross_val_score(clf31,X31,y,cv=5,scoring='accuracy')
print(scores31)
print('Accuracy of XGBoost cross-vaild test:',scores31.mean())

## Accuracy of XGBoost cross-vaild test: 0.8330826675886556
##Equal Frequency Binning 在 XGBoost 的交叉驗證上表現較原本差

[0.82834331 0.83154177 0.83046683 0.8355344  0.83952703]
Accuracy of XGBoost cross-vaild test: 0.8330826675886556


Source：iT幫幫忙--Day12 - Feature Engineering -- 4. 分隔方法(Discretization),https://ithelp.ithome.com.tw/articles/10235726

## P4：Label Encoding

In [None]:
# One hot encoding 後 XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4 = cross_val_score(clf4,X,y,cv=5,scoring='accuracy')
print(scores4)
print('Accuracy of XGBoost cross-vaild test:',scores4.mean())

# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.8605081039961279

[0.85613389 0.85749386 0.86394349 0.86578624 0.85918305]
Accuracy of XGBoost cross-vaild test: 0.8605081039961279


In [None]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4,y)
scores41 = cross_val_score(clf41,X,y,cv=5,scoring='accuracy')
print(scores41)
print('Accuracy of RandomForest cross-vaild test:',scores41.mean())

# Accuracy of RandomForest cross-vaild test for one-hot encoding: 0.8537207195889831

[0.85337018 0.8470516  0.85304054 0.85749386 0.85764742]
Accuracy of RandomForest cross-vaild test: 0.8537207195889831


In [None]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4,y)
scores42 = cross_val_score(clf42,X,y,cv=5,scoring='accuracy')
print(scores42)
print('Accuracy of Lightgbm cross-vaild test:',scores42.mean())
# Accuracy of Lightgbm cross-vaild test for one-hot encoding: 0.8741748264951859

[0.86764932 0.87315725 0.87423219 0.87776413 0.87807125]
Accuracy of Lightgbm cross-vaild test: 0.8741748264951859


In [None]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4,y)
scores43 = cross_val_score(clf43,X,y,cv=5,scoring='accuracy')
print(scores43)
print('Accuracy of MLP cross-vaild test:',scores43.mean())
# Accuracy of MLP cross-vaild test for one-hot encoding: 0.7891032997320423



[0.79656072 0.77902334 0.79499386 0.77134521 0.80359337]
Accuracy of Lightgbm cross-vaild test: 0.7891032997320423


In [None]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4,y)
scores44 = cross_val_score(clf44,X,y,cv=5,scoring='accuracy')
print(scores44)
print('Accuracy of SVM cross-vaild test:',scores44.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding:

[0.79548595 0.79376536 0.79468673 0.79637592 0.79422604]
Accuracy of Lightgbm cross-vaild test: 0.7949080010457257


## P4：One Hot Encoding

In [None]:
# One hot encoding 後 XGBoost / 前處理在 P2
X4=X2 #inherit
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4 = cross_val_score(clf4,X4,y,cv=5,scoring='accuracy')
print(scores4)
print('Accuracy of XGBoost cross-vaild test:',scores4.mean())

# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.8605081039961279

[0.85613389 0.85749386 0.86394349 0.86578624 0.85918305]
Accuracy of XGBoost cross-vaild test: 0.8605081039961279


In [None]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4,y)
scores41 = cross_val_score(clf41,X4,y,cv=5,scoring='accuracy')
print(scores41)
print('Accuracy of RandomForest cross-vaild test:',scores41.mean())

# Accuracy of RandomForest cross-vaild test for one-hot encoding: 0.8537207195889831

[0.85337018 0.8470516  0.85304054 0.85749386 0.85764742]
Accuracy of RandomForest cross-vaild test: 0.8537207195889831


In [None]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4,y)
scores42 = cross_val_score(clf42,X4,y,cv=5,scoring='accuracy')
print(scores42)
print('Accuracy of Lightgbm cross-vaild test:',scores42.mean())
# Accuracy of Lightgbm cross-vaild test for one-hot encoding: 0.8741748264951859

[0.86764932 0.87315725 0.87423219 0.87776413 0.87807125]
Accuracy of Lightgbm cross-vaild test: 0.8741748264951859


In [None]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4,y)
scores43 = cross_val_score(clf43,X4,y,cv=5,scoring='accuracy')
print(scores43)
print('Accuracy of MLP cross-vaild test:',scores43.mean())
# Accuracy of MLP cross-vaild test for one-hot encoding: 0.7891032997320423



[0.79656072 0.77902334 0.79499386 0.77134521 0.80359337]
Accuracy of Lightgbm cross-vaild test: 0.7891032997320423


In [None]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4,y)
scores44 = cross_val_score(clf44,X4,y,cv=5,scoring='accuracy')
print(scores44)
print('Accuracy of SVM cross-vaild test:',scores44.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding:

[0.79548595 0.79376536 0.79468673 0.79637592 0.79422604]
Accuracy of Lightgbm cross-vaild test: 0.7949080010457257


## P4：Frequency Encoding

In [None]:
# Frequency encoding：用類別出現頻率當作該類別數值
# 清除 Label Encoder
# 類別特徵：workclass, marital-status, occupation, relationship, race, sex, native-country
df= pd.read_csv('income_evaluation.csv')
y=df[' income']
X4a=df.drop(columns=' income')
X4a=X4a.drop(columns=' education')

encwork=X4a[' workclass'].value_counts()
X4a[' workclass']=X4a[' workclass'].apply(lambda x : encwork[x]) 

encmarry=X4a[' marital-status'].value_counts()
X4a[' marital-status']=X4a[' marital-status'].apply(lambda x : encmarry[x])

encrel=X4a[' relationship'].value_counts()
X4a[' relationship']=X4a[' relationship'].apply(lambda x : encrel[x])

encrace=X4a[' race'].value_counts()
X4a[' race']=X4a[' race'].apply(lambda x : encrace[x]) 

encsex=X4a[' sex'].value_counts()
X4a[' sex']=X4a[' sex'].apply(lambda x : encsex[x])

enccountry=X4a[' native-country'].value_counts()
X4a[' native-country']=X4a[' native-country'].apply(lambda x : enccountry[x])

encocc=X4a[' occupation'].value_counts()
X4a[' occupation']=X4a[' occupation'].apply(lambda x : encocc[x])

In [None]:
X4a

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,1298,77516,13,10683,3770,8305,27816,21790,2174,0,40,29170
1,50,2541,83311,13,14976,4066,13193,27816,21790,0,0,13,29170
2,38,22696,215646,9,4443,1370,8305,27816,21790,0,0,40,29170
3,53,22696,234721,7,14976,1370,13193,3124,21790,0,0,40,29170
4,28,22696,338409,13,14976,4140,1568,3124,10771,0,0,40,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,22696,257302,12,14976,928,1568,27816,10771,0,0,38,29170
32557,40,22696,154374,9,14976,2002,13193,27816,21790,0,0,40,29170
32558,58,22696,151910,9,993,3770,3446,27816,10771,0,0,40,29170
32559,22,22696,201490,9,10683,3770,5068,27816,21790,0,0,20,29170


In [None]:
# Frequency encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4a= XGBClassifier(**params)

# 5-fold validation
scores4a = cross_val_score(clf4a,X4a,y,cv=5,scoring='accuracy')
print(scores4a)
print('Accuracy of XGBoost cross-vaild test:',scores4a.mean())

# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.8607843611586127

[0.86120068 0.85841523 0.86010442 0.86363636 0.86056511]
Accuracy of XGBoost cross-vaild test: 0.8607843611586127


In [None]:
# Frequency encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41a=RF.fit(X4,y)
scores41a = cross_val_score(clf41a,X4a,y,cv=5,scoring='accuracy')
print(scores41a)
print('Accuracy of RandomForest cross-vaild test:',scores41a.mean())

# Accuracy of RandomForest cross-vaild test for one-hot encoding: 0.8589110288212083

[0.85690158 0.85611179 0.85841523 0.86363636 0.85949017]
Accuracy of RandomForest cross-vaild test: 0.8589110288212083


In [None]:
# Frequency encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42a=LGBM.fit(X4a,y)
scores42a = cross_val_score(clf42a,X4a,y,cv=5,scoring='accuracy')
print(scores42a)
print('Accuracy of Lightgbm cross-vaild test:',scores42a.mean())
# Accuracy of Lightgbm cross-vaild test for one-hot encoding: 0.8735605711653616

[0.86780286 0.87331081 0.872543   0.87668919 0.877457  ]
Accuracy of Lightgbm cross-vaild test: 0.8735605711653616


In [None]:
# Frequency encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43a=MLP.fit(X4a,y)
scores43a = cross_val_score(clf43a,X4a,y,cv=5,scoring='accuracy')
print(scores43a)
print('Accuracy of MLP cross-vaild test:',scores43a.mean())
# Accuracy of MLP cross-vaild test for one-hot encoding: 0.7785384343767576
# 還沒 converge



[0.78857669 0.69256757 0.7997543  0.80605037 0.80574324]
Accuracy of Lightgbm cross-vaild test: 0.7785384343767576




In [None]:
# Frequency encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44a=svm.fit(X4a,y)
scores44a = cross_val_score(clf44a,X4a,y,cv=5,scoring='accuracy')
print(scores44a)
print('Accuracy of SVM cross-vaild test:',scores44a.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding: 0.795430109352265
# 還沒 converge



[0.79563949 0.79407248 0.79545455 0.79760442 0.79437961]
Accuracy of Lightgbm cross-vaild test: 0.795430109352265


## P4：Target Encoding

In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 631 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


In [None]:
# Target encoding：使用 Target (預測目標) 來達成 Features 的 Encoding
# 清除 Label Encoder
# 類別特徵：workclass, marital-status, occupation, relationship, race, sex, native-country
from category_encoders import *

enc = TargetEncoder(cols=[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' native-country'],
                    min_samples_leaf=20, smoothing=10).fit(X, y)
X4b = enc.transform(X)

In [None]:
X4b

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,0.271957,77516,13,0.045961,0.134483,0.103070,0.25586,0.305737,2174,0,40,0.245835
1,50,0.284927,83311,13,0.446848,0.484014,0.448571,0.25586,0.305737,0,0,13,0.245835
2,38,0.218673,215646,9,0.104209,0.062774,0.103070,0.25586,0.305737,0,0,40,0.245835
3,53,0.218673,234721,7,0.446848,0.062774,0.448571,0.12388,0.305737,0,0,40,0.245835
4,28,0.218673,338409,13,0.446848,0.449034,0.475128,0.12388,0.109461,0,0,40,0.263146
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,0.218673,257302,12,0.446848,0.304957,0.475128,0.25586,0.109461,0,0,38,0.245835
32557,40,0.218673,154374,9,0.446848,0.124875,0.448571,0.25586,0.305737,0,0,40,0.245835
32558,58,0.218673,151910,9,0.085599,0.134483,0.063262,0.25586,0.109461,0,0,40,0.245835
32559,22,0.218673,201490,9,0.045961,0.134483,0.013220,0.25586,0.305737,0,0,20,0.245835


In [None]:
# Target encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier

params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores4b = cross_val_score(clf4,X4b,y,cv=5,scoring='accuracy')
print(scores4b)
print('Accuracy of XGBoost cross-vaild test:',scores4b.mean())

# Accuracy of XGBoost cross-vaild test for Target encoding: 0.8601396196456077

[0.85398434 0.85995086 0.86041155 0.86240786 0.86394349]
Accuracy of XGBoost cross-vaild test: 0.8601396196456077


In [None]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4b,y)
scores41b = cross_val_score(clf41,X4b,y,cv=5,scoring='accuracy')
print(scores41b)
print('Accuracy of RandomForest cross-vaild test:',scores41b.mean())

# Accuracy of RandomForest cross-vaild test for target encoding: 0.8591567243513352

[0.85705512 0.85595823 0.85902948 0.86271499 0.8610258 ]
Accuracy of RandomForest cross-vaild test: 0.8591567243513352


In [None]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4b,y)
scores42b = cross_val_score(clf42,X4b,y,cv=5,scoring='accuracy')
print(scores42b)
print('Accuracy of Lightgbm cross-vaild test:',scores42b.mean())
# Accuracy of Lightgbm cross-vaild test for target encoding: 0.8740826747563275

[0.86810993 0.87285012 0.87638206 0.87638206 0.87668919]
Accuracy of Lightgbm cross-vaild test: 0.8740826747563275


In [None]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4b,y)
scores43b = cross_val_score(clf43,X4b,y,cv=5,scoring='accuracy')
print(scores43b)
print('Accuracy of MLP cross-vaild test:',scores43b.mean())
# Accuracy of MLP cross-vaild test for target encoding: 0.7930344800853782 



[0.79732842 0.80113636 0.7718059  0.79484029 0.80006143]
Accuracy of MLP cross-vaild test: 0.7930344800853782




In [None]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44b=svm.fit(X4b,y)
scores44b = cross_val_score(clf44,X4b,y,cv=5,scoring='accuracy')
print(scores44b)
print('Accuracy of SVM cross-vaild test:',scores44.mean())
# Accuracy of SVM cross-vaild test for target encoding: 0.7949080010457257

[0.79548595 0.79376536 0.79468673 0.79637592 0.79422604]
Accuracy of Lightgbm cross-vaild test: 0.7949080010457257


## P4：Beta Target Encoding (擱置)

In [None]:
# One hot encoding 後 XGBoost / 前處理在 P2
X4=X2 #inherit
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4 = cross_val_score(clf4,X4,y,cv=5,scoring='accuracy')
print(scores4)
print('Accuracy of XGBoost cross-vaild test:',scores4.mean())

# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.8605081039961279

[0.85613389 0.85749386 0.86394349 0.86578624 0.85918305]
Accuracy of XGBoost cross-vaild test: 0.8605081039961279


In [None]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4,y)
scores41 = cross_val_score(clf41,X4,y,cv=5,scoring='accuracy')
print(scores41)
print('Accuracy of RandomForest cross-vaild test:',scores41.mean())

# Accuracy of RandomForest cross-vaild test for one-hot encoding: 0.8537207195889831

[0.85337018 0.8470516  0.85304054 0.85749386 0.85764742]
Accuracy of RandomForest cross-vaild test: 0.8537207195889831


In [None]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4,y)
scores42 = cross_val_score(clf42,X4,y,cv=5,scoring='accuracy')
print(scores42)
print('Accuracy of Lightgbm cross-vaild test:',scores42.mean())
# Accuracy of Lightgbm cross-vaild test for one-hot encoding: 0.8741748264951859

[0.86764932 0.87315725 0.87423219 0.87776413 0.87807125]
Accuracy of Lightgbm cross-vaild test: 0.8741748264951859


In [None]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4,y)
scores43 = cross_val_score(clf43,X4,y,cv=5,scoring='accuracy')
print(scores43)
print('Accuracy of MLP cross-vaild test:',scores43.mean())
# Accuracy of MLP cross-vaild test for one-hot encoding: 0.7891032997320423



[0.79656072 0.77902334 0.79499386 0.77134521 0.80359337]
Accuracy of Lightgbm cross-vaild test: 0.7891032997320423


In [None]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4,y)
scores44 = cross_val_score(clf44,X4,y,cv=5,scoring='accuracy')
print(scores44)
print('Accuracy of SVM cross-vaild test:',scores44.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding:

[0.79548595 0.79376536 0.79468673 0.79637592 0.79422604]
Accuracy of Lightgbm cross-vaild test: 0.7949080010457257


## P4：Leave-One-Out Encoding

In [None]:
encc = LeaveOneOutEncoder(cols=[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' native-country']
                          ,sigma=0.05).fit(X, y)
X4c = encc.transform(X)

Source：http://contrib.scikit-learn.org/category_encoders/leaveoneout.html 

In [None]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4c = cross_val_score(clf4,X4c,y,cv=5,scoring='accuracy')
print(scores4c)
print('Accuracy of XGBoost cross-vaild test:',scores4c.mean())

# Accuracy of XGBoost cross-vaild test for LOO encoding:  0.8608765459064861

[0.85966528 0.8585688  0.86240786 0.86194717 0.86179361]
Accuracy of XGBoost cross-vaild test: 0.8608765459064861


In [None]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41c=RF.fit(X4c,y)
scores41c = cross_val_score(clf41,X4c,y,cv=5,scoring='accuracy')
print(scores41c)
print('Accuracy of RandomForest cross-vaild test:',scores41c.mean())

# Accuracy of RandomForest cross-vaild test for LOO encoding: 0.8599245894904577

[0.85536619 0.85826167 0.86087224 0.8622543  0.86286855]
Accuracy of RandomForest cross-vaild test: 0.8599245894904577


In [None]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4c,y)
scores42c = cross_val_score(clf42,X4c,y,cv=5,scoring='accuracy')
print(scores42c)
print('Accuracy of Lightgbm cross-vaild test:',scores42c.mean())
# Accuracy of Lightgbm cross-vaild test for LOO encoding: 0.8745433061301325

[0.8699524  0.87285012 0.87684275 0.87668919 0.87638206]
Accuracy of Lightgbm cross-vaild test: 0.8745433061301325


In [None]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4c,y)
scores43c = cross_val_score(clf43,X4c,y,cv=5,scoring='accuracy')
print(scores43c)
print('Accuracy of MLP cross-vaild test:',scores43c.mean())
# Accuracy of MLP cross-vaild test for LOO encoding: 0.7934030446006494



[0.7968678  0.76812039 0.79929361 0.80052211 0.8022113 ]
Accuracy of MLP cross-vaild test: 0.7934030446006494




In [None]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4c,y)
scores44c = cross_val_score(clf44,X4c,y,cv=5,scoring='accuracy')
print(scores44c)
print('Accuracy of SVM cross-vaild test:',scores44c.mean())
# Accuracy of SVM cross-vaild test for LOO encoding: 0.7952458341679899

[0.79563949 0.79407248 0.79530098 0.79699017 0.79422604]
Accuracy of SVM cross-vaild test: 0.7952458341679899


## P5：Combinations of numerical and categorical feature transformation

挑選較常用的六個組合

In [None]:
# Standardization *  Label Encoding
import pandas as pd
import numpy as np
df= pd.read_csv('income_evaluation.csv')

#Label encoding
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
for col in df[[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' native-country',' income']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns=' education') #此列已經數值化(education-num)，所以丟棄
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df[' income']#應變數
X=df.drop(columns=' income') #剩下的為自變數

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5 = sc.fit_transform(X)

In [None]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores5 = cross_val_score(clf5,X5,y,cv=5,scoring='accuracy')
print(scores5)
print('Accuracy of XGBoost cross-vaild test:',scores5.mean())

# Accuracy: 0.8626271507259531

[0.85997236 0.8585688  0.86210074 0.86624693 0.86624693]
Accuracy of XGBoost cross-vaild test: 0.8626271507259531


In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.1 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


In [None]:
# Standardization *  LOO
from category_encoders import *
encc = LeaveOneOutEncoder(cols=[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' native-country']
                          ,sigma=0.05).fit(X, y)
X5a = encc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5a = sc.fit_transform(X5a)

In [None]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5a = cross_val_score(clf5,X5a,y,cv=5,scoring='accuracy')
print(scores5a)
print('Accuracy of XGBoost cross-vaild test:',scores5a.mean())

# Accuracy: 0.8608765459064861

[0.85966528 0.8585688  0.86240786 0.86194717 0.86179361]
Accuracy of XGBoost cross-vaild test: 0.8608765459064861


In [None]:
# Standardization *  Target Encoding
enc = TargetEncoder(cols=[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' native-country'],
                    min_samples_leaf=20, smoothing=10).fit(X, y)
X5b = enc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5b = sc.fit_transform(X5b)

In [None]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5b = cross_val_score(clf5,X5b,y,cv=5,scoring='accuracy')
print(scores5b)
print('Accuracy of XGBoost cross-vaild test:',scores5b.mean())

# Accuracy: 0.8601089071148952

[0.85398434 0.85995086 0.86025799 0.86240786 0.86394349]
Accuracy of XGBoost cross-vaild test: 0.8601089071148952


In [None]:
# Equal‐Frequency Binning *  Label Encoding
df= pd.read_csv('income_evaluation.csv')

#Label encoding
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
for col in df[[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' native-country',' income']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns=' education') #此列已經數值化(education-num)，所以丟棄
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df[' income']#應變數
X=df.drop(columns=' income') #剩下的為自變數

# Equal Frequency Binning
from sklearn.preprocessing import KBinsDiscretizer
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['age']])
disc21 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc21.fit(X[[' fnlwgt']])
disc31 = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')
disc31.fit(X[[' capital-gain']])
disc41 = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')
disc41.fit(X[[' capital-loss']])
disc51 = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
disc51.fit(X[[' hours-per-week']])

X31 = X #inherit
X31[['age']]=disc11.transform(X31[['age']])
X31[[' fnlwgt']]=disc21.transform(X31[[' fnlwgt']])
X31[[' capital-gain']]=disc31.transform(X31[[' capital-gain']])
X31[[' capital-loss']]=disc41.transform(X31[[' capital-loss']])
X31[[' capital-loss']]=disc51.transform(X31[[' hours-per-week']])
X5c = pd.DataFrame(X31)



In [None]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5c = cross_val_score(clf5,X5c,y,cv=5,scoring='accuracy')
print(scores5c)
print('Accuracy of XGBoost cross-vaild test:',scores5c.mean())

# Accuracy: 0.8330826675886556

[0.82834331 0.83154177 0.83046683 0.8355344  0.83952703]
Accuracy of XGBoost cross-vaild test: 0.8330826675886556


In [None]:
# Equal‐Frequency Binning * Leave‐One‐Out Encoding
encc = LeaveOneOutEncoder(cols=[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' native-country']
                          ,sigma=0.05).fit(X, y)
X5d = encc.transform(X)

In [None]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5d = cross_val_score(clf5,X5d,y,cv=5,scoring='accuracy')
print(scores5d)
print('Accuracy of XGBoost cross-vaild test:',scores5d.mean())

# Accuracy: 0.8345874495575094

[0.83264241 0.83323096 0.83123464 0.83860565 0.83722359]
Accuracy of XGBoost cross-vaild test: 0.8345874495575094


In [None]:
# Equal‐Frequency Binning * Target Encoding
enc = TargetEncoder(cols=[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' native-country'],
                    min_samples_leaf=20, smoothing=10).fit(X, y)
X5e = enc.transform(X)

In [None]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5e = cross_val_score(clf5,X5e,y,cv=5,scoring='accuracy')
print(scores5e)
print('Accuracy of XGBoost cross-vaild test:',scores5e.mean())

# Accuracy: 0.8319155159724023

[0.83079994 0.83200246 0.82493857 0.83645577 0.83538084]
Accuracy of XGBoost cross-vaild test: 0.8319155159724023


Standard Scalar 效果較佳，且 Label Encoder 效果又較 Target Encoding, LOO 佳

## P6：Categorical values of a feature is high (超過20種)

這邊為 native country 共有 42種類別，所以將其進行 encoding，所有 label encoder 的結果視為 baseline

In [None]:
# One hot encoding
import pandas as pd
import numpy as np
df= pd.read_csv('income_evaluation.csv')

from sklearn import preprocessing
le=preprocessing.LabelEncoder()
for col in df[[' workclass',' marital-status',' occupation',' relationship',' race',' sex',' income']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns=' education') #此列已經數值化(education-num)，所以丟棄
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df[' income']#應變數
X=df.drop(columns=' income') #剩下的為自變數

In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.5.1.post0


In [None]:
X

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,7,77516,13,4,1,1,4,1,2174,0,40,United-States
1,50,6,83311,13,2,4,0,4,1,0,0,13,United-States
2,38,4,215646,9,0,6,1,4,1,0,0,40,United-States
3,53,4,234721,7,2,6,0,2,1,0,0,40,United-States
4,28,4,338409,13,2,10,5,2,0,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,12,2,13,5,4,0,0,0,38,United-States
32557,40,4,154374,9,2,7,0,4,1,0,0,40,United-States
32558,58,4,151910,9,6,1,4,4,0,0,0,40,United-States
32559,22,4,201490,9,4,1,3,4,1,0,0,20,United-States


In [None]:
# One Hot Encoder
from category_encoders import * 
enc6 = OneHotEncoder(cols=[' native-country']).fit(X,y)
X6 = enc6.transform(X)

In [None]:
X6

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,...,native-country_33,native-country_34,native-country_35,native-country_36,native-country_37,native-country_38,native-country_39,native-country_40,native-country_41,native-country_42
0,39,7,77516,13,4,1,1,4,1,2174,...,0,0,0,0,0,0,0,0,0,0
1,50,6,83311,13,2,4,0,4,1,0,...,0,0,0,0,0,0,0,0,0,0
2,38,4,215646,9,0,6,1,4,1,0,...,0,0,0,0,0,0,0,0,0,0
3,53,4,234721,7,2,6,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
4,28,4,338409,13,2,10,5,2,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,12,2,13,5,4,0,0,...,0,0,0,0,0,0,0,0,0,0
32557,40,4,154374,9,2,7,0,4,1,0,...,0,0,0,0,0,0,0,0,0,0
32558,58,4,151910,9,6,1,4,4,0,0,...,0,0,0,0,0,0,0,0,0,0
32559,22,4,201490,9,4,1,3,4,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf6= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores6 = cross_val_score(clf6,X6,y,cv=5,scoring='accuracy')
print(scores6)
print('Accuracy of XGBoost cross-vaild test:',scores6.mean())

# Accuracy: 0.8609994856252342

[0.85674804 0.85703317 0.86302211 0.86440418 0.86378993]
Accuracy of XGBoost cross-vaild test: 0.8609994856252342


In [None]:
# Target Encoder
enc6a = TargetEncoder(cols=[' native-country'],min_samples_leaf=20, smoothing=10).fit(X, y)
X6a = enc6a.transform(X)

In [None]:
# XGBoost 
scores6a = cross_val_score(clf6,X6a,y,cv=5,scoring='accuracy')
print(scores6a)
print('Accuracy of XGBoost cross-vaild test:',scores6a.mean())

# Accuracy: 0.8606309305411102

[0.85690158 0.85933661 0.86409705 0.86087224 0.86194717]
Accuracy of XGBoost cross-vaild test: 0.8606309305411102


## P7：Undersampling

In [None]:
# 記得跑上面
# Nearmiss
from imblearn.under_sampling import NearMiss
nm = NearMiss(sampling_strategy = 'majority')
X7, y7 = nm.fit_resample(X, y)

In [None]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7 = cross_val_score(clf7,X7,y7,cv=5,scoring='accuracy')
print(scores7)
print('Accuracy of XGBoost cross-vaild test:',scores7.mean())

# Accuracy: 0.8088915503243056

[0.77303156 0.80745936 0.82270408 0.81983418 0.82142857]
Accuracy of XGBoost cross-vaild test: 0.8088915503243056


In [None]:
# ClusterCentroids
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(voting='hard')
X7a, y7a = cc.fit_resample(X, y)

In [None]:
scores7a = cross_val_score(clf7,X7a,y7a,cv=5,scoring='accuracy')
print(scores7a)
print('Accuracy of XGBoost cross-vaild test:',scores7a.mean())

# Accuracy: 0.8359887420062064

[0.84475614 0.85176921 0.83992347 0.83195153 0.81154337]
Accuracy of XGBoost cross-vaild test: 0.8359887420062064


In [None]:
# Condensed Nearest Neighbour (跑不出來先略過)
from imblearn.under_sampling import CondensedNearestNeighbour
cnn = CondensedNearestNeighbour()
X7b, y7b = cnn.fit_resample(X, y)

KeyboardInterrupt: ignored

In [None]:
scores7b = cross_val_score(clf7,X7b,y7b,cv=5,scoring='accuracy')
print(scores7b)
print('Accuracy of XGBoost cross-vaild test:',scores7b.mean())


In [None]:
# EditedNN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X7c, y7c = en.fit_resample(X, y)

In [None]:
scores7c = cross_val_score(clf7,X7c,y7c,cv=5,scoring='accuracy')
print(scores7c)
print('Accuracy of XGBoost cross-vaild test:',scores7c.mean())

# Accuracy: 0.8589394520028113

[0.85760518 0.86199723 0.85899214 0.85575589 0.86034682]
Accuracy of XGBoost cross-vaild test: 0.8589394520028113


In [None]:
# Neighbourhood Cleaning Rule
from imblearn.under_sampling import NeighbourhoodCleaningRule
ecr = NeighbourhoodCleaningRule()
X7d, y7d = ecr.fit_resample(X, y)

In [None]:
scores7d = cross_val_score(clf7,X7d,y7d,cv=5,scoring='accuracy')
print(scores7d)
print('Accuracy of XGBoost cross-vaild test:',scores7d.mean())

# Accuracy: 0.8569771067096544

[0.85184358 0.85564246 0.85743017 0.8572067  0.86276263]
Accuracy of XGBoost cross-vaild test: 0.8569771067096544


In [None]:
# Tomek Links
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X7e, y7e = tl.fit_resample(X, y)

In [None]:
scores7e = cross_val_score(clf7,X7e,y7e,cv=5,scoring='accuracy')
print(scores7e)
print('Accuracy of XGBoost cross-vaild test:',scores7e.mean())

# Accuracy: 0.8625555942121375

[0.85569746 0.86000995 0.85868303 0.86877903 0.86960849]
Accuracy of XGBoost cross-vaild test: 0.8625555942121375


In [None]:
# One Sided Selection
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X7f, y7f = oss.fit_resample(X, y)

In [None]:
scores7f = cross_val_score(clf7,X7f,y7f,cv=5,scoring='accuracy')
print(scores7f)
print('Accuracy of XGBoost cross-vaild test:',scores7f.mean())

# Accuracy: 0.8619305086852584

[0.85965494 0.86446583 0.8624523  0.8599635  0.86311598]
Accuracy of XGBoost cross-vaild test: 0.8619305086852584


## P7：Oversampling

In [None]:
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X7g, y7g = smote.fit_resample(X, y)

In [None]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7g = cross_val_score(clf7,X7g,y7g,cv=5,scoring='accuracy')
print(scores7g)
print('Accuracy of XGBoost cross-vaild test:',scores7g.mean())

# Accuracy: 0.8759304207119742

[0.78226133 0.85072816 0.90857605 0.91828479 0.91980178]
Accuracy of XGBoost cross-vaild test: 0.8759304207119742


In [None]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X7h, y7h = bsmote.fit_resample(X, y)

In [None]:
scores7h = cross_val_score(clf7,X7h,y7h,cv=5,scoring='accuracy')
print(scores7h)
print('Accuracy of XGBoost cross-vaild test:',scores7h.mean())

# Accuracy: 0.8751415857605178

[0.78418285 0.85254854 0.91454288 0.90877832 0.91565534]
Accuracy of XGBoost cross-vaild test: 0.8751415857605178


In [None]:
# ADASYN
from imblearn.over_sampling import ADASYN
adasyn = ADASYN()
X7i, y7i = adasyn.fit_resample(X, y)

In [None]:
scores7i = cross_val_score(clf7,X7i,y7i,cv=5,scoring='accuracy')
print(scores7i)
print('Accuracy of XGBoost cross-vaild test:',scores7i.mean())

# Accuracy: 0.871004842615012

[0.78198144 0.84705408 0.90617433 0.91010896 0.90970541]
Accuracy of XGBoost cross-vaild test: 0.871004842615012


## P7：Ensemble

In [None]:
# SMOTE + ENN
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X7j, y7j = smotenn.fit_resample(X, y)

In [None]:
scores7j = cross_val_score(clf7,X7j,y7j,cv=5,scoring='accuracy')
print(scores7j)
print('Accuracy of XGBoost cross-vaild test:',scores7j.mean())

# Accuracy: 0.928167809408128

[0.86907175 0.92055128 0.95135792 0.94871275 0.95114535]
Accuracy of XGBoost cross-vaild test: 0.928167809408128


In [None]:
# SMOTE + Tomek Links
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X7k, y7k = smotetl.fit_resample(X, y)

In [None]:
scores7k = cross_val_score(clf7,X7k,y7k,cv=5,scoring='accuracy')
print(scores7k)
print('Accuracy of XGBoost cross-vaild test:',scores7k.mean())

# Accuracy: 0.8800619387467432

[0.78728814 0.85741525 0.9154661  0.91663136 0.92350885]
Accuracy of XGBoost cross-vaild test: 0.8800619387467432


SMOTE+ENN 效果最好

## P8：SMOTE‐based Oversampling

In [None]:
# 記得跑上面
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X8, y8 = smote.fit_resample(X, y)

In [None]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf8= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores8 = cross_val_score(clf8,X8,y8,cv=5,scoring='accuracy')
print(scores8)
print('Accuracy of XGBoost cross-vaild test:',scores8.mean())

# Accuracy: 0.8759304207119742

In [None]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X8a, y8a = bsmote.fit_resample(X, y)

In [None]:
# XGBoost 
scores8a = cross_val_score(clf8,X8a,y8a,cv=5,scoring='accuracy')
print(scores8a)
print('Accuracy of XGBoost cross-vaild test:',scores8a.mean())

# Accuracy: 0.8759304207119742

## P9：Imbalance Ratio vs. Resampling Strategy

In [None]:
# 見 P7
# 要與其他資料集比較

## P10：ML algorithms vs. different resampling strategies (ENN)

In [5]:
# 使用方法 (參考最好的組合與相關的方法)
# ENN
# Tomek Links
# One Sided Selection
# SMOTE
# Borderline-SMOTE
# SMOTE + ENN
# SMOTE + Tomek Links

In [8]:
# ENN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X0, y0 = en.fit_resample(X, y)

In [9]:
# XGBoost Accuracy: 0.8589394520028113
# Random Forest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf0=RF.fit(X0,y0)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores0 = cross_val_score(clf0,X0,y0,cv=5,scoring='accuracy')
print(scores0)
print('Accuracy of RandomForest cross-vaild test:',scores0.mean())

# Accuracy of RandomForest cross-vaild test : 0.8518656757500688

[0.84835876 0.845816   0.85460009 0.85598706 0.85456647]
Accuracy of RandomForest cross-vaild test: 0.8518656757500688


In [10]:
# Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf0a=LGBM.fit(X0,y0)
scores0a = cross_val_score(clf0a,X0,y0,cv=5,scoring='accuracy')
print(scores0a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.8650426644646299

[0.85829866 0.86338419 0.86430883 0.86638927 0.87283237]
Accuracy of Lightgbm cross-vaild test: 0.8650426644646299


In [11]:
# MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf0b=MLP.fit(X0,y0)
scores0b = cross_val_score(clf0b,X0,y0,cv=5,scoring='accuracy')
print(scores0b)
print('Accuracy of MLP cross-vaild test:',scores0b.mean())
# Accuracy of MLP cross-vaild test: 0.7249531292173417



[0.73509015 0.73000462 0.69417476 0.72931114 0.73618497]
Accuracy of MLP cross-vaild test: 0.7249531292173417




In [12]:
# SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf0c=svm.fit(X0,y0)
scores0c = cross_val_score(clf0c,X0,y0,cv=5,scoring='accuracy')
print(scores0c)
print('Accuracy of SVM cross-vaild test:',scores0c.mean())
# Accuracy of SVM cross-vaild test : 0.6980904064414923



[0.69972261 0.6962552  0.69810448 0.70087841 0.69549133]
Accuracy of SVM cross-vaild test: 0.6980904064414923


## P10：Tomek Links

In [13]:
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X01, y01 = tl.fit_resample(X, y)

In [14]:
scores01 = cross_val_score(clf0,X01,y01,cv=5,scoring='accuracy')
print(scores01)
print('Accuracy of RandomForest cross-vaild test:',scores01.mean())

# Accuracy of RandomForest cross-vaild test : 0.8588067538637352

[0.85486814 0.85072151 0.8588489  0.86662243 0.86297279]
Accuracy of RandomForest cross-vaild test: 0.8588067538637352


In [15]:
scores01a = cross_val_score(clf0a,X01,y01,cv=5,scoring='accuracy')
print(scores01a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.8650426644646299

[0.86863493 0.87045945 0.87477194 0.87375581 0.87856669]
Accuracy of Lightgbm cross-vaild test: 0.8650426644646299


In [16]:
scores01b = cross_val_score(clf0b,X01,y01,cv=5,scoring='accuracy')
print(scores01b)
print('Accuracy of MLP cross-vaild test:',scores01b.mean())
# Accuracy of MLP cross-vaild test: 0.77845532701212



[0.78503898 0.78238514 0.78901974 0.7876576  0.74817518]
Accuracy of MLP cross-vaild test: 0.77845532701212


In [18]:
scores01c = cross_val_score(clf0c,X01,y01,cv=5,scoring='accuracy')
print(scores01c)
print('Accuracy of SVM cross-vaild test:',scores01c.mean())
# Accuracy of SVM cross-vaild test : 0.7797830503594494

[0.78089235 0.77857024 0.77939957 0.78185136 0.77820173]
Accuracy of SVM cross-vaild test: 0.7797830503594494


## P10：One Sided Selection

In [19]:
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X02, y02 = oss.fit_resample(X, y)

In [20]:
scores02 = cross_val_score(clf0,X02,y02,cv=5,scoring='accuracy')
print(scores02)
print('Accuracy of RandomForest cross-vaild test:',scores02.mean())
# Accuracy of RandomForest cross-vaild test : 0.8579344328116383

[0.85619506 0.85351692 0.8561712  0.86463172 0.85915727]
Accuracy of RandomForest cross-vaild test: 0.8579344328116383


In [21]:
scores02a = cross_val_score(clf0a,X02,y02,cv=5,scoring='accuracy')
print(scores02a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.8650426644646299

[0.86730801 0.87126742 0.87408759 0.87425348 0.87806901]
Accuracy of Lightgbm cross-vaild test: 0.8650426644646299


In [22]:
scores02b = cross_val_score(clf0b,X02,y02,cv=5,scoring='accuracy')
print(scores02b)
print('Accuracy of MLP cross-vaild test:',scores02b.mean())
# Accuracy of MLP cross-vaild test:  0.7698834916791799



[0.71156079 0.78848706 0.7919708  0.77853351 0.7788653 ]
Accuracy of MLP cross-vaild test: 0.7698834916791799




In [23]:
scores02c = cross_val_score(clf0c,X02,y02,cv=5,scoring='accuracy')
print(scores02c)
print('Accuracy of SVM cross-vaild test:',scores02c.mean())
# Accuracy of SVM cross-vaild test : 0.7798015684642124

[0.78072649 0.7786994  0.77952887 0.78185136 0.77820173]
Accuracy of SVM cross-vaild test: 0.7798015684642124


## P10：SMOTE

In [24]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X03, y03 = smote.fit_resample(X, y)

In [25]:
scores03 = cross_val_score(clf0,X03,y03,cv=5,scoring='accuracy')
print(scores03)
print('Accuracy of RandomForest cross-vaild test:',scores03.mean())
# Accuracy of RandomForest cross-vaild test : 0.8794498381877022

[0.7967233  0.8595267  0.91110437 0.9170712  0.91282362]
Accuracy of RandomForest cross-vaild test: 0.8794498381877022


In [26]:
scores03a = cross_val_score(clf0a,X03,y03,cv=5,scoring='accuracy')
print(scores03a)
print('Accuracy of Lightgbm cross-vaild test:',scores03a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.8788632686084142

[0.79419498 0.85730178 0.91312702 0.9167678  0.91292476]
Accuracy of Lightgbm cross-vaild test: 0.8788632686084142


In [27]:
scores03b = cross_val_score(clf0b,X03,y03,cv=5,scoring='accuracy')
print(scores03b)
print('Accuracy of MLP cross-vaild test:',scores03b.mean())
# Accuracy of MLP cross-vaild test: 0.6025687702265372



[0.62459547 0.61701052 0.52912621 0.62095469 0.62115696]
Accuracy of MLP cross-vaild test: 0.6025687702265372




In [28]:
scores03c = cross_val_score(clf0c,X03,y03,cv=5,scoring='accuracy')
print(scores03c)
print('Accuracy of SVM cross-vaild test:',scores03c.mean())
# Accuracy of SVM cross-vaild test : 0.5028519417475728



[0.51375405 0.50040453 0.49403317 0.50323625 0.50283172]
Accuracy of SVM cross-vaild test: 0.5028519417475728


## P10：Borderline-SMOTE

In [29]:
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X04, y04 = bsmote.fit_resample(X, y)

In [30]:
scores04 = cross_val_score(clf0,X04,y04,cv=5,scoring='accuracy')
print(scores04)
print('Accuracy of RandomForest cross-vaild test:',scores04.mean())
# Accuracy of RandomForest cross-vaild test : 0.8787014563106796

[0.79894822 0.85912217 0.91110437 0.91292476 0.91140777]
Accuracy of RandomForest cross-vaild test: 0.8787014563106796


In [31]:
scores04a = cross_val_score(clf0a,X04,y04,cv=5,scoring='accuracy')
print(scores04a)
print('Accuracy of Lightgbm cross-vaild test:',scores04a.mean())
# Accuracy of Lightgbm cross-vaild test :  0.876779935275081

[0.79470065 0.85558252 0.91069984 0.9118123  0.91110437]
Accuracy of Lightgbm cross-vaild test: 0.876779935275081


In [32]:
scores04b = cross_val_score(clf0b,X04,y04,cv=5,scoring='accuracy')
print(scores04b)
print('Accuracy of MLP cross-vaild test:',scores04b.mean())
# Accuracy of MLP cross-vaild test: 0.5436893203883495



[0.6203479  0.53205906 0.53509304 0.5309466  0.5       ]
Accuracy of MLP cross-vaild test: 0.5436893203883495


In [33]:
scores04c = cross_val_score(clf0c,X04,y04,cv=5,scoring='accuracy')
print(scores04c)
print('Accuracy of SVM cross-vaild test:',scores04c.mean())
# Accuracy of SVM cross-vaild test : 0.502366504854369



[0.50616909 0.5006068  0.49888754 0.50546117 0.50070793]
Accuracy of SVM cross-vaild test: 0.502366504854369


## P10：SMOTE + ENN

In [34]:
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X05, y05 = smotenn.fit_resample(X, y)

In [35]:
scores05 = cross_val_score(clf0,X05,y05,cv=5,scoring='accuracy')
print(scores05)
print('Accuracy of RandomForest cross-vaild test:',scores05.mean())
# Accuracy of RandomForest cross-vaild test : 0.924471223176495

[0.87344101 0.92107953 0.94029851 0.94377428 0.94376278]
Accuracy of RandomForest cross-vaild test: 0.924471223176495


In [36]:
scores05a = cross_val_score(clf0a,X05,y05,cv=5,scoring='accuracy')
print(scores05a)
print('Accuracy of Lightgbm cross-vaild test:',scores05a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9217723718578299

[0.8587201  0.91985279 0.94213862 0.94479657 0.94335378]
Accuracy of Lightgbm cross-vaild test: 0.9217723718578299


In [38]:
scores05b = cross_val_score(clf0b,X05,y05,cv=5,scoring='accuracy')
print(scores05b)
print('Accuracy of MLP cross-vaild test:',scores05b.mean())
# Accuracy of MLP cross-vaild test: 0.6699521553506524



[0.68268248 0.67450419 0.6505827  0.6620323  0.6799591 ]
Accuracy of MLP cross-vaild test: 0.6699521553506524




In [39]:
scores05c = cross_val_score(clf0c,X05,y05,cv=5,scoring='accuracy')
print(scores05c)
print('Accuracy of SVM cross-vaild test:',scores05c.mean())
# Accuracy of SVM cross-vaild test : 0.6277498046367874



[0.63995093 0.6248211  0.62645676 0.62604784 0.62147239]
Accuracy of SVM cross-vaild test: 0.6277498046367874


## P10：SMOTE + Tomek Links

In [40]:
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X06, y06 = smotetl.fit_resample(X, y)

In [41]:
scores06 = cross_val_score(clf0,X06,y06,cv=5,scoring='accuracy')
print(scores06)
print('Accuracy of RandomForest cross-vaild test:',scores06.mean())
# Accuracy of RandomForest cross-vaild test : 0.8834445503654275

[0.80139816 0.86240864 0.91833492 0.91748755 0.91759348]
Accuracy of RandomForest cross-vaild test: 0.8834445503654275


In [42]:
scores06a = cross_val_score(clf0a,X06,y06,cv=5,scoring='accuracy')
print(scores06a)
print('Accuracy of Lightgbm cross-vaild test:',scores06a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.8823429721427815

[0.79493698 0.86399746 0.91801716 0.91854676 0.9162165 ]
Accuracy of Lightgbm cross-vaild test: 0.8823429721427815


In [43]:
scores06b = cross_val_score(clf0b,X06,y06,cv=5,scoring='accuracy')
print(scores06b)
print('Accuracy of MLP cross-vaild test:',scores06b.mean())
# Accuracy of MLP cross-vaild test: 0.6152949899375068



[0.6138121  0.61148183 0.61730749 0.61148183 0.6223917 ]
Accuracy of MLP cross-vaild test: 0.6152949899375068




In [44]:
scores06c = cross_val_score(clf0c,X06,y06,cv=5,scoring='accuracy')
print(scores06c)
print('Accuracy of SVM cross-vaild test:',scores06c.mean())
# Accuracy of SVM cross-vaild test : 0.5239275500476644



[0.52632136 0.52335558 0.52536808 0.52113124 0.5234615 ]
Accuracy of SVM cross-vaild test: 0.5239275500476644
