## 讀取資料 (Default：Label Encoding)

In [1]:
#讀取資料
import pandas as pd
import numpy as np

df= pd.read_csv('bank-full.csv',sep=';') 

In [2]:
# all columns
print(list(df.columns))

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


In [3]:
#資料前處理  # Label Encoder
#將部分字串轉數值
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[["job","marital","education","default","housing","month","loan","contact","duration","poutcome","y"]]:
    df[col]=le.fit_transform(df[col])

df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['y']#應變數
X=df.drop(columns='y') #剩下的為自變數

In [4]:
y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [5]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,1729,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,5715,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3


## P1：標準化是否影響結果

In [6]:
#先不經標準化做XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf1= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores1 = cross_val_score(clf1,X,y,cv=5,scoring='accuracy')
print(scores1)
print('Accuracy of XGBoost cross-vaild test:',scores1.mean())
# Accuracy of XGBoost cross-vaild test: 0.935065196935072

[0.85192967 0.66213227 0.69951338 0.51183367 0.40024331]
Accuracy of XGBoost cross-vaild test: 0.6251304594189481


In [7]:
#使用經過LabelEncoder編碼的特徵，標準化 (因為變數值間有大有小)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X1 = sc.fit_transform(X)

In [8]:
#標準化後XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf11= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores11 = cross_val_score(clf11,X1,y,cv=5,scoring='accuracy')
print(scores11)
print('Accuracy of XGBoost cross-vaild test:',scores11.mean())

# Accuracy of XGBoost cross-vaild test: 0.9362212662992337
##標準化在XGBoost的交叉驗證上沒有差異太大

[0.85192967 0.66213227 0.69962398 0.51183367 0.40024331]
Accuracy of XGBoost cross-vaild test: 0.6251525784191693


## P3：Feature Binning 有沒有效果

In [9]:
import pandas as pd
import numpy as np
# For 繪製敘述統計
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import scipy.stats as stats
from sklearn.model_selection import train_test_split
# for discretization
from sklearn.preprocessing import KBinsDiscretizer

In [10]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,1729,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,5715,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3


In [11]:
# Equal width Binning
disc1 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc1.fit(X[['balance']])

KBinsDiscretizer(encode='ordinal', n_bins=10, strategy='uniform')

In [12]:
# Equal Frequency Binning
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['balance']])

KBinsDiscretizer(encode='ordinal', n_bins=10)

In [13]:
disc1.bin_edges_

array([array([ -8019. ,   2995.6,  14010.2,  25024.8,  36039.4,  47054. ,
               58068.6,  69083.2,  80097.8,  91112.4, 102127. ])         ],
      dtype=object)

In [14]:
disc11.bin_edges_

array([array([-8.01900e+03,  0.00000e+00,  2.20000e+01,  1.31000e+02,
               2.72000e+02,  4.48000e+02,  7.01000e+02,  1.12600e+03,
               1.85900e+03,  3.57400e+03,  1.02127e+05])             ],
      dtype=object)

In [15]:
X3 = pd.DataFrame(X)
X3[['balance']]=disc1.transform(X[['balance']])
X3 = pd.DataFrame(X3)

In [16]:
X3

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,0.0,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,0.0,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,0.0,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,0.0,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,0.0,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,0.0,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,0.0,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,1.0,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,0.0,0,0,1,17,9,508,4,-1,0,3


In [17]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores3 = cross_val_score(clf3,X3,y,cv=5,scoring='accuracy')
print(scores3)
print('Accuracy of XGBoost cross-vaild test:',scores3.mean())

#Accuracy of XGBoost cross-vaild test: 0.9304140341443743
# Frequency較佳

[0.84971801 0.69387304 0.67772617 0.44215881 0.4071002 ]
Accuracy of XGBoost cross-vaild test: 0.6141152462284023


In [18]:
#資料前處理 # Label Encoder
df= pd.read_csv('bank-full.csv',sep=';') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[["job","marital","education","default","housing","month","loan","contact","duration","poutcome","y"]]:
    df[col]=le.fit_transform(df[col])

df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['y']#應變數
X=df.drop(columns='y') #剩下的為自變數

X31=pd.DataFrame(X)
X31[['balance']]=disc11.transform(X[['balance']])
X31 = pd.DataFrame(X31)

In [19]:
X31

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,8.0,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,2.0,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,1.0,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,7.0,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1.0,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,6.0,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,7.0,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,9.0,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,5.0,0,0,1,17,9,508,4,-1,0,3


In [20]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores31 = cross_val_score(clf3,X31,y,cv=5,scoring='accuracy')
print(scores31)
print('Accuracy of XGBoost cross-vaild test:',scores31.mean())

#Accuracy of XGBoost cross-vaild test:  0.935065196935072
# Frequency較佳

[0.83666925 0.68624198 0.68922805 0.48318956 0.41240876]
Accuracy of XGBoost cross-vaild test: 0.6215475189284023


Source：iT幫幫忙--Day12 - Feature Engineering -- 4. 分隔方法(Discretization),https://ithelp.ithome.com.tw/articles/10235726

## P2：One-hot Encoding vs. Label Encoding on Tree-based method

In [21]:
X3

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,0.0,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,0.0,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,0.0,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,0.0,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,0.0,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,0.0,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,0.0,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,1.0,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,0.0,0,0,1,17,9,508,4,-1,0,3


In [22]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 KB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.5.1.post0


In [23]:
from category_encoders import *
enc2 = OneHotEncoder(cols=['balance']).fit(X3,y)
X2 = enc2.transform(X3)
X2 = pd.DataFrame(X2)
X2

Unnamed: 0,age,job,marital,education,default,balance_1,balance_2,balance_3,balance_4,balance_5,...,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,1,0,0,0,0,...,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,1,0,0,0,0,...,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,1,0,0,0,0,...,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1,0,0,0,0,...,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,0,0,...,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,1,0,0,0,0,...,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,1,0,0,0,0,...,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,0,1,0,0,0,...,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,1,0,0,0,0,...,0,0,1,17,9,508,4,-1,0,3


In [24]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf2= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores2 = cross_val_score(clf2,X3,y,cv=5,scoring='accuracy')
print(scores2)
print('Accuracy of XGBoost cross-vaild test:',scores2.mean())

# Accuracy of XGBoost cross-vaild test for Label Encoder: 0.9362212662992337
# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.9420217771205808
# 交叉驗證為 Label Encoder 結果略佳
# 可能在 one-hot encoding 會陷入 curse of dimensionaility

[0.84971801 0.69387304 0.67772617 0.44215881 0.4071002 ]
Accuracy of XGBoost cross-vaild test: 0.6141152462284023


## P4：Label Encoding (這裡不用)

In [25]:
df= pd.read_csv('bank-full.csv',sep=';') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[["job","marital","education","default","housing","month","loan","contact","duration","poutcome","y"]]:
    df[col]=le.fit_transform(df[col])

df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['y']#應變數
X=df.drop(columns='y') #剩下的為自變數


In [26]:
# One hot encoding 後 XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4 = cross_val_score(clf4,X,y,cv=5,scoring='accuracy')
print(scores4)
print('Accuracy of XGBoost cross-vaild test:',scores4.mean())

# Accuracy of XGBoost cross-vaild test: 0.9408522650893938

[0.85192967 0.66213227 0.69951338 0.51183367 0.40024331]
Accuracy of XGBoost cross-vaild test: 0.6251304594189481


In [27]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X,y)
scores41 = cross_val_score(clf41,X,y,cv=5,scoring='accuracy')
print(scores41)
print('Accuracy of RandomForest cross-vaild test:',scores41.mean())

# Accuracy of RandomForest cross-vaild test: 0.9327261728726979

[0.88377751 0.800929   0.7589029  0.60230038 0.42623313]
Accuracy of RandomForest cross-vaild test: 0.6944285826695983


In [28]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X,y)
scores42 = cross_val_score(clf42,X,y,cv=5,scoring='accuracy')
print(scores42)
print('Accuracy of Lightgbm cross-vaild test:',scores42.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9373706143298831

[0.88311401 0.74242424 0.73965937 0.5199071  0.38155275]
Accuracy of Lightgbm cross-vaild test: 0.6533314949345093


## P4：One Hot Encoding

In [31]:
# 清除 Label Encoder 後，再做 One Hot Encoding
X2

Unnamed: 0,age,job,marital,education,default,balance_1,balance_2,balance_3,balance_4,balance_5,...,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,1,0,0,0,0,...,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,1,0,0,0,0,...,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,1,0,0,0,0,...,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1,0,0,0,0,...,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,0,0,...,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,1,0,0,0,0,...,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,1,0,0,0,0,...,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,0,1,0,0,0,...,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,1,0,0,0,0,...,0,0,1,17,9,508,4,-1,0,3


In [32]:
# One hot encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4a = cross_val_score(clf4,X2,y,cv=5,scoring='accuracy')
print(scores4a)
print('Accuracy of XGBoost cross-vaild test:',scores4a.mean())
# Accuracy of XGBoost cross-vaild test: 0.9420217771205808

[0.76324229 0.64742314 0.68414068 0.46936518 0.36396815]
Accuracy of XGBoost cross-vaild test: 0.5856278867001359


In [33]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X2,y)
scores41a = cross_val_score(clf41,X2,y,cv=5,scoring='accuracy')
print(scores41a)
print('Accuracy of RandomForest cross-vaild test:',scores41a.mean())

# Accuracy of RandomForest cross-vaild test: 0.9223215485952412

[0.88333518 0.79440389 0.76653395 0.6128069  0.48153063]
Accuracy of RandomForest cross-vaild test: 0.7077221115864548


In [34]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X2,y)
scores42a = cross_val_score(clf42,X2,y,cv=5,scoring='accuracy')
print(scores42a)
print('Accuracy of Lightgbm cross-vaild test:',scores42a.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9362212662992337

[0.87703196 0.74651626 0.76929883 0.51205486 0.41307233]
Accuracy of Lightgbm cross-vaild test: 0.6635948455660602


## P4：Frequency Encoding

In [37]:
# Frequency encoding：用類別出現頻率當作該類別數值
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,1729,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,5715,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3


In [38]:
# "job","marital","education","default","housing","month","loan","contact","duration","poutcome"
X4b=pd.DataFrame(X)
enc1=X4b['job'].value_counts()
X4b['job']=X4b['job'].apply(lambda x : enc1[x]) 
enc2=X4b['education'].value_counts()
X4b['education']=X4b['education'].apply(lambda x : enc2[x]) 
enc3=X4b['marital'].value_counts()
X4b['marital']=X4b['marital'].apply(lambda x : enc3[x]) 
enc4=X4b['default'].value_counts()
X4b['default']=X4b['default'].apply(lambda x : enc4[x]) 
enc5=X4b['housing'].value_counts()
X4b['housing']=X4b['housing'].apply(lambda x : enc5[x]) 
enc6=X4b['month'].value_counts()
X4b['month']=X4b['month'].apply(lambda x : enc6[x]) 
enc7=X4b['loan'].value_counts()
X4b['loan']=X4b['loan'].apply(lambda x : enc7[x]) 
enc7=X4b['contact'].value_counts()
X4b['contact']=X4b['contact'].apply(lambda x : enc7[x]) 
enc8=X4b['duration'].value_counts()
X4b['duration']=X4b['duration'].apply(lambda x : enc8[x]) 
enc9=X4b['poutcome'].value_counts()
X4b['poutcome']=X4b['poutcome'].apply(lambda x : enc9[x]) 

In [39]:
X4b

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,9458,27214,13301,44396,2143,25130,37967,13020,5,13766,73,1,-1,0,36959
1,44,7597,12790,23202,44396,29,25130,37967,13020,5,13766,157,1,-1,0,36959
2,33,1487,27214,23202,44396,2,25130,7244,13020,5,13766,160,1,-1,0,36959
3,47,9732,27214,1857,44396,1506,25130,37967,13020,5,13766,168,1,-1,0,36959
4,33,288,12790,1857,44396,1,20081,37967,13020,5,13766,120,1,-1,0,36959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,7597,27214,13301,44396,825,20081,37967,29285,17,3970,3,3,-1,0,36959
45207,71,2264,5207,6851,44396,1729,20081,37967,29285,17,3970,27,2,-1,0,36959
45208,72,2264,27214,23202,44396,5715,20081,37967,29285,17,3970,3,5,184,3,1511
45209,57,9732,27214,23202,44396,668,20081,37967,2906,17,3970,19,4,-1,0,36959


In [40]:
# Frequency encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4b= XGBClassifier(**params)

# 5-fold validation
scores4b = cross_val_score(clf4b,X4b,y,cv=5,scoring='accuracy')
print(scores4b)
print('Accuracy of XGBoost cross-vaild test:',scores4b.mean())

# Accuracy of XGBoost cross-vaild test: 0.936194380965183

[0.87670021 0.72152179 0.75702278 0.46715328 0.24928113]
Accuracy of XGBoost cross-vaild test: 0.614335839411411


In [41]:
# Frequency encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41b=RF.fit(X4b,y)
scores41b = cross_val_score(clf41b,X4b,y,cv=5,scoring='accuracy')
print(scores41b)
print('Accuracy of RandomForest cross-vaild test:',scores41b.mean())

# Accuracy of RandomForest cross-vaild test: 0.9315633821750235

[0.88300343 0.80601637 0.80778589 0.49059942 0.23379783]
Accuracy of RandomForest cross-vaild test: 0.6442405882895804


In [42]:
# Frequency encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42b=LGBM.fit(X4b,y)
scores42b = cross_val_score(clf42b,X4b,y,cv=5,scoring='accuracy')
print(scores42b)
print('Accuracy of Lightgbm cross-vaild test:',scores42b.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9408321010888561

[0.88289285 0.75812873 0.72904225 0.45708914 0.32625525]
Accuracy of Lightgbm cross-vaild test: 0.6306816435999714


In [43]:
# Frequency encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43b=MLP.fit(X4b,y)
scores43b = cross_val_score(clf43b,X4b,y,cv=5,scoring='accuracy')
print(scores43b)
print('Accuracy of MLP cross-vaild test:',scores43b.mean())
# Accuracy of MLP cross-vaild test: 0.8364296276381233



[0.88300343 0.88310108 0.84925901 0.8826587  0.65361646]
Accuracy of MLP cross-vaild test: 0.8303277371504519


In [44]:
# Frequency encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44b=svm.fit(X4b,y)
scores44b = cross_val_score(clf44b,X4b,y,cv=5,scoring='accuracy')
print(scores44b)
print('Accuracy of SVM cross-vaild test:',scores44b.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding: 0.8364296276381233



[0.88300343 0.88310108 0.88299049 0.88299049 0.63912851]
Accuracy of SVM cross-vaild test: 0.8342428001896026


## P4：Target Encoding

In [45]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [46]:
df= pd.read_csv('bank-full.csv',sep=';') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[["job","marital","education","default","housing","month","loan","contact","duration","poutcome","y"]]:
    df[col]=le.fit_transform(df[col])

df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['y']#應變數
X=df.drop(columns='y') #剩下的為自變數
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,1729,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,5715,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3


In [47]:
# Target encoding：使用 Target (預測目標) 來達成 Features 的 Encoding
# 清除 Label Encoder
# 類別特徵：workclass, marital-status, occupation, relationship, race, sex, native-country
from category_encoders import *

enc = TargetEncoder(cols=["job","marital","education","default","housing","month","loan","contact","duration","poutcome"],
                    min_samples_leaf=20, smoothing=10).fit(X, y)
X4c = enc.transform(X)

In [48]:
X4c

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,0.137556,0.101235,0.150064,0.117961,2143,0.077000,0.126557,0.040707,5,0.067195,0.259562,1,-1,0,0.091615
1,44,0.110570,0.149492,0.105594,0.117961,29,0.077000,0.126557,0.040707,5,0.067195,0.031847,1,-1,0,0.091615
2,33,0.082717,0.101235,0.105594,0.117961,2,0.077000,0.066814,0.040707,5,0.067195,0.006250,1,-1,0,0.091615
3,47,0.072750,0.101235,0.135703,0.117961,1506,0.077000,0.126557,0.040707,5,0.067195,0.017857,1,-1,0,0.091615
4,33,0.118056,0.149492,0.135703,0.117961,1,0.167024,0.126557,0.040707,5,0.067195,0.033337,1,-1,0,0.091615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0.110570,0.101235,0.150064,0.117961,825,0.167024,0.126557,0.149189,17,0.101511,0.150403,3,-1,0,0.091615
45207,71,0.227915,0.119455,0.086265,0.117961,1729,0.167024,0.126557,0.149189,17,0.101511,0.212051,2,-1,0,0.091615
45208,72,0.227915,0.101235,0.105594,0.117961,5715,0.167024,0.126557,0.149189,17,0.101511,0.253380,5,184,3,0.647253
45209,57,0.072750,0.101235,0.105594,0.117961,668,0.167024,0.126557,0.134205,17,0.101511,0.161419,4,-1,0,0.091615


In [49]:
# Target encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier

params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores4c = cross_val_score(clf4,X4c,y,cv=5,scoring='accuracy')
print(scores4c)
print('Accuracy of XGBoost cross-vaild test:',scores4c.mean())

# Accuracy of XGBoost cross-vaild test: 0.9512770533673882

[0.88665266 0.58770184 0.71112586 0.5200177  0.32581287]
Accuracy of XGBoost cross-vaild test: 0.606262184192446


In [50]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4c,y)
scores41c = cross_val_score(clf41,X4c,y,cv=5,scoring='accuracy')
print(scores41c)
print('Accuracy of RandomForest cross-vaild test:',scores41c.mean())

# Accuracy of RandomForest cross-vaild test for target encoding: 0.8591567243513352

[0.88433042 0.75193541 0.7543685  0.54888299 0.36540588]
Accuracy of RandomForest cross-vaild test: 0.6609846421052572


In [51]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4c,y)
scores42c = cross_val_score(clf42,X4c,y,cv=5,scoring='accuracy')
print(scores42c)
print('Accuracy of Lightgbm cross-vaild test:',scores42c.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454765425460412
# Accuracy of Lightgbm cross-vaild test for target encoding: 0.8740826747563275

[0.88521508 0.67120106 0.72185357 0.52167662 0.42756027]
Accuracy of Lightgbm cross-vaild test: 0.6455013223825816


## P4：Leave-One-Out Encoding

In [54]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,1729,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,5715,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3


In [55]:
encc = LeaveOneOutEncoder(cols=["job","marital","education","default","housing","month","loan","contact","duration","poutcome"],sigma=0.05).fit(X, y)
X4d = encc.transform(X)

Source：http://contrib.scikit-learn.org/category_encoders/leaveoneout.html 

In [56]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4d = cross_val_score(clf4,X4d,y,cv=5,scoring='accuracy')
print(scores4d)
print('Accuracy of XGBoost cross-vaild test:',scores4d.mean())

# Accuracy of XGBoost cross-vaild test for LOO encoding:  0.9466258905766904

[0.83578458 0.65726609 0.7323601  0.54346384 0.32570228]
Accuracy of XGBoost cross-vaild test: 0.6189153774699234


In [57]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4d,y)
scores41d = cross_val_score(clf41,X4d,y,cv=5,scoring='accuracy')
print(scores41d)
print('Accuracy of RandomForest cross-vaild test:',scores41d.mean())

# Accuracy of RandomForest cross-vaild test: 0.9559214948245731

[0.89306646 0.79285556 0.76211015 0.57365627 0.3473789 ]
Accuracy of RandomForest cross-vaild test: 0.6738134690011005


In [58]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4d,y)
scores42d = cross_val_score(clf42,X4d,y,cv=5,scoring='accuracy')
print(scores42d)
print('Accuracy of Lightgbm cross-vaild test:',scores42d.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454899852130663

[0.8979321  0.69995576 0.74131829 0.53262553 0.42003981]
Accuracy of Lightgbm cross-vaild test: 0.6583742992235749


## P5：Combinations of numerical and categorical feature transformation

挑選較常用的六個組合

In [61]:
# Standardization *  Label Encoding
df= pd.read_csv('bank-full.csv',sep=';') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[["job","marital","education","default","housing","month","loan","contact","duration","poutcome","y"]]:
    df[col]=le.fit_transform(df[col])

df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['y']#應變數
X=df.drop(columns='y') #剩下的為自變數

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5 = sc.fit_transform(X)

In [62]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores5 = cross_val_score(clf5,X5,y,cv=5,scoring='accuracy')
print(scores5)
print('Accuracy of XGBoost cross-vaild test:',scores5.mean())

# Accuracy of XGBoost cross-vaild test: 0.9420083344535556

[0.85192967 0.66213227 0.69962398 0.51183367 0.40024331]
Accuracy of XGBoost cross-vaild test: 0.6251525784191693


In [63]:
# Standardization *  LOO
from category_encoders import *
encc = LeaveOneOutEncoder(cols=["job","marital","education","default","housing","month","loan","contact","duration","poutcome"],sigma=0.05).fit(X, y)
X5a = encc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5a = sc.fit_transform(X5a)

In [64]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5a = cross_val_score(clf5,X5a,y,cv=5,scoring='accuracy')
print(scores5a)
print('Accuracy of XGBoost cross-vaild test:',scores5a.mean())

# Accuracy of XGBoost cross-vaild test: 0.9466258905766904

[0.83578458 0.65726609 0.7323601  0.54346384 0.32570228]
Accuracy of XGBoost cross-vaild test: 0.6189153774699234


In [65]:
# Standardization *  Target Encoding
enc = TargetEncoder(cols=["job","marital","education","default","housing","month","loan","contact","duration","poutcome"],min_samples_leaf=20, smoothing=10).fit(X, y)
X5b = enc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5b = sc.fit_transform(X5b)

In [66]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5b = cross_val_score(clf5,X5b,y,cv=5,scoring='accuracy')
print(scores5b)
print('Accuracy of XGBoost cross-vaild test:',scores5b.mean())

# Accuracy of XGBoost cross-vaild test: 0.9512770533673882

[0.88665266 0.58770184 0.71112586 0.5200177  0.32581287]
Accuracy of XGBoost cross-vaild test: 0.606262184192446


In [71]:
# Equal‐Frequency Binning *  Label Encoding
from sklearn.preprocessing import KBinsDiscretizer
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['balance']])
X31=pd.DataFrame(X)
X31[['balance']]=disc11.transform(X[['balance']])
X5c = pd.DataFrame(X31)

In [72]:
X5c

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,8.0,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,2.0,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,1.0,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,7.0,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1.0,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,6.0,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,7.0,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,9.0,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,5.0,0,0,1,17,9,508,4,-1,0,3


In [73]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5c = cross_val_score(clf5,X5c,y,cv=5,scoring='accuracy')
print(scores5c)
print('Accuracy of XGBoost cross-vaild test:',scores5c.mean())

# Accuracy of XGBoost cross-vaild test: 0.935065196935072

[0.83666925 0.68624198 0.68922805 0.48318956 0.41240876]
Accuracy of XGBoost cross-vaild test: 0.6215475189284023


In [74]:
# Equal‐Frequency Binning * Leave‐One‐Out Encoding
encc = LeaveOneOutEncoder(cols=["job","marital","education","default","housing","month","loan","contact","duration","poutcome"],sigma=0.05).fit(X5c, y)
X5d = encc.transform(X5c)

In [75]:
X5d

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,0.137556,0.101235,0.150064,0.117961,8.0,0.077000,0.126557,0.040707,5,0.067195,0.260274,1,-1,0,0.091615
1,44,0.110570,0.149492,0.105594,0.117961,2.0,0.077000,0.126557,0.040707,5,0.067195,0.031847,1,-1,0,0.091615
2,33,0.082717,0.101235,0.105594,0.117961,1.0,0.077000,0.066814,0.040707,5,0.067195,0.006250,1,-1,0,0.091615
3,47,0.072750,0.101235,0.135703,0.117961,7.0,0.077000,0.126557,0.040707,5,0.067195,0.017857,1,-1,0,0.091615
4,33,0.118056,0.149492,0.135703,0.117961,1.0,0.167024,0.126557,0.040707,5,0.067195,0.033333,1,-1,0,0.091615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0.110570,0.101235,0.150064,0.117961,6.0,0.167024,0.126557,0.149189,17,0.101511,0.333333,3,-1,0,0.091615
45207,71,0.227915,0.119455,0.086265,0.117961,7.0,0.167024,0.126557,0.149189,17,0.101511,0.259259,2,-1,0,0.091615
45208,72,0.227915,0.101235,0.105594,0.117961,9.0,0.167024,0.126557,0.149189,17,0.101511,1.000000,5,184,3,0.647253
45209,57,0.072750,0.101235,0.105594,0.117961,5.0,0.167024,0.126557,0.134205,17,0.101511,0.210526,4,-1,0,0.091615


In [76]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5d = cross_val_score(clf5,X5d,y,cv=5,scoring='accuracy')
print(scores5d)
print('Accuracy of XGBoost cross-vaild test:',scores5d.mean())

# Accuracy of XGBoost cross-vaild test: 0.9408522650893938

[0.88952781 0.62983853 0.75392612 0.50398142 0.27615572]
Accuracy of XGBoost cross-vaild test: 0.6106859206411951


In [77]:
# Equal‐Frequency Binning * Target Encoding
enc = TargetEncoder(cols=["job","marital","education","default","housing","month","loan","contact","duration","poutcome"],min_samples_leaf=20, smoothing=10).fit(X5c, y)
X5e = enc.transform(X5c)

In [78]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5e = cross_val_score(clf5,X5e,y,cv=5,scoring='accuracy')
print(scores5e)
print('Accuracy of XGBoost cross-vaild test:',scores5e.mean())

# Accuracy of XGBoost cross-vaild test: 0.9420083344535556

[0.88455159 0.59201504 0.69166114 0.47887635 0.39559832]
Accuracy of XGBoost cross-vaild test: 0.6085404876888549


Standard Scalar 效果較佳，且 Label Encoder 效果又較 Target Encoding, LOO 佳

## P6：Categorical values of a feature is high (超過20種)

所有 label encoder 的結果視為 baseline

In [79]:
df= pd.read_csv('bank-full.csv',sep=';') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[["job","marital","education","default","housing","month","loan","contact","duration","poutcome","y"]]:
    df[col]=le.fit_transform(df[col])

df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['y']#應變數
X=df.drop(columns='y') #剩下的為自變數

X6 = pd.DataFrame(X)

disc6a = KBinsDiscretizer(n_bins=30, encode='ordinal', strategy='quantile')
disc6a.fit(X6[['balance']])
X6[['balance']] = disc6a.transform(X6[['balance']])



In [80]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [81]:
# One Hot Encoder
from category_encoders import * 
enc6 = OneHotEncoder(cols=["job","marital","education","default","housing","month","loan","contact","duration","poutcome"]).fit(X,y)
X6 = enc6.transform(X)

In [82]:
X6

Unnamed: 0,age,job_1,job_2,job_3,job_4,job_5,job_6,job_7,job_8,job_9,...,duration_1571,duration_1572,duration_1573,campaign,pdays,previous,poutcome_1,poutcome_2,poutcome_3,poutcome_4
0,58,1,0,0,0,0,0,0,0,0,...,0,0,0,1,-1,0,1,0,0,0
1,44,0,1,0,0,0,0,0,0,0,...,0,0,0,1,-1,0,1,0,0,0
2,33,0,0,1,0,0,0,0,0,0,...,0,0,0,1,-1,0,1,0,0,0
3,47,0,0,0,1,0,0,0,0,0,...,0,0,0,1,-1,0,1,0,0,0
4,33,0,0,0,0,1,0,0,0,0,...,0,0,0,1,-1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0,1,0,0,0,0,0,0,0,...,0,0,0,3,-1,0,1,0,0,0
45207,71,0,0,0,0,0,1,0,0,0,...,0,0,0,2,-1,0,1,0,0,0
45208,72,0,0,0,0,0,1,0,0,0,...,0,0,0,5,184,3,0,0,0,1
45209,57,0,0,0,1,0,0,0,0,0,...,0,0,0,4,-1,0,1,0,0,0


In [83]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf6= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores6 = cross_val_score(clf6,X6,y,cv=5,scoring='accuracy')
print(scores6)
print('Accuracy of XGBoost cross-vaild test:',scores6.mean())

# Accuracy of XGBoost cross-vaild test: 0.935031590267509

[0.88134469 0.21212121 0.31818182 0.22284893 0.18292413]
Accuracy of XGBoost cross-vaild test: 0.3634841551717209


In [84]:
# Target Encoder
enc6a = TargetEncoder(cols=["job","marital","education","default","housing","month","loan","contact","duration","poutcome"],min_samples_leaf=20, smoothing=10).fit(X, y)
X6a = enc6a.transform(X)

In [85]:
# XGBoost 
scores6a = cross_val_score(clf6,X6a,y,cv=5,scoring='accuracy')
print(scores6a)
print('Accuracy of XGBoost cross-vaild test:',scores6a.mean())

# Accuracy of XGBoost cross-vaild test: 0.9396894743917192

[0.87979653 0.60440168 0.70050874 0.49292192 0.37016147]
Accuracy of XGBoost cross-vaild test: 0.6095580668761844


## P7：Undersampling

In [86]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,23.0,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,5.0,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,4.0,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,21.0,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,3.0,0,0,2,5,8,198,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,18.0,0,0,0,17,9,975,3,-1,0,3
45207,71,5,0,0,0,22.0,0,0,0,17,9,456,2,-1,0,3
45208,72,5,1,1,0,27.0,0,0,0,17,9,1116,5,184,3,2
45209,57,1,1,1,0,16.0,0,0,1,17,9,508,4,-1,0,3


In [87]:
# 記得跑上面
# Nearmiss
from imblearn.under_sampling import NearMiss
nm = NearMiss(sampling_strategy = 'majority')
X7, y7 = nm.fit_resample(X, y)

In [88]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7 = cross_val_score(clf7,X7,y7,cv=5,scoring='accuracy')
print(scores7)
print('Accuracy of XGBoost cross-vaild test:',scores7.mean())

# Accuracy of XGBoost cross-vaild test: 0.957330827067669

[0.60349716 0.74196597 0.74574669 0.86619385 0.89408983]
Accuracy of XGBoost cross-vaild test: 0.7702987035621874


In [89]:
# ClusterCentroids
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(voting='hard')
X7a, y7a = cc.fit_resample(X, y)

In [90]:
scores7a = cross_val_score(clf7,X7a,y7a,cv=5,scoring='accuracy')
print(scores7a)
print('Accuracy of XGBoost cross-vaild test:',scores7a.mean())

# Accuracy of XGBoost cross-vaild test: 0.7978070175438596

[0.48818526 0.5973535  0.58837429 0.63167849 0.71820331]
Accuracy of XGBoost cross-vaild test: 0.6047589680337137


In [91]:
# EditedNN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X7c, y7c = en.fit_resample(X, y)

In [92]:
scores7c = cross_val_score(clf7,X7c,y7c,cv=5,scoring='accuracy')
print(scores7c)
print('Accuracy of XGBoost cross-vaild test:',scores7c.mean())

# Accuracy: 0.9586666666666668

[0.87609121 0.61954397 0.76752671 0.57232213 0.4151681 ]
Accuracy of XGBoost cross-vaild test: 0.6501304238942103


In [93]:
# Neighbourhood Cleaning Rule
from imblearn.under_sampling import NeighbourhoodCleaningRule
ecr = NeighbourhoodCleaningRule()
X7d, y7d = ecr.fit_resample(X, y)

In [94]:
scores7d = cross_val_score(clf7,X7d,y7d,cv=5,scoring='accuracy')
print(scores7d)
print('Accuracy of XGBoost cross-vaild test:',scores7d.mean())

# Accuracy: 0.9472198368398456

[0.8349219  0.62973259 0.767408   0.5694996  0.3839534 ]
Accuracy of XGBoost cross-vaild test: 0.6371030964626953


In [95]:
# Tomek Links
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X7e, y7e = tl.fit_resample(X, y)

In [96]:
scores7e = cross_val_score(clf7,X7e,y7e,cv=5,scoring='accuracy')
print(scores7e)
print('Accuracy of XGBoost cross-vaild test:',scores7e.mean())

# Accuracy: 0.9322485207100591

[0.82708143 0.6808097  0.69647758 0.51955627 0.35948759]
Accuracy of XGBoost cross-vaild test: 0.616682513439846


In [97]:
# One Sided Selection
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X7f, y7f = oss.fit_resample(X, y)

In [98]:
scores7f = cross_val_score(clf7,X7f,y7f,cv=5,scoring='accuracy')
print(scores7f)
print('Accuracy of XGBoost cross-vaild test:',scores7f.mean())

# Accuracy: 0.9385542168674699

[0.77193585 0.68018328 0.69656357 0.50601443 0.41448047]
Accuracy of XGBoost cross-vaild test: 0.613835521074503


## P7：Oversampling

In [99]:
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X7g, y7g = smote.fit_resample(X, y)

In [100]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7g = cross_val_score(clf7,X7g,y7g,cv=5,scoring='accuracy')
print(scores7g)
print('Accuracy of XGBoost cross-vaild test:',scores7g.mean())

# Accuracy: 0.948709630911188

[0.6937817  0.86361075 0.87118793 0.69422005 0.65311874]
Accuracy of XGBoost cross-vaild test: 0.7551838326599972


In [101]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X7h, y7h = bsmote.fit_resample(X, y)

In [102]:
scores7h = cross_val_score(clf7,X7h,y7h,cv=5,scoring='accuracy')
print(scores7h)
print('Accuracy of XGBoost cross-vaild test:',scores7h.mean())

# Accuracy: 0.9473327566320646

[0.70693218 0.84651512 0.86354812 0.72703363 0.63558367]
Accuracy of XGBoost cross-vaild test: 0.7559225447262488


In [103]:
# ADASYN
from imblearn.over_sampling import ADASYN
adasyn = ADASYN()
X7i, y7i = adasyn.fit_resample(X, y)

In [104]:
scores7i = cross_val_score(clf7,X7i,y7i,cv=5,scoring='accuracy')
print(scores7i)
print('Accuracy of XGBoost cross-vaild test:',scores7i.mean())

# Accuracy: 0.9323432664896079

[0.71155544 0.87618267 0.85031224 0.71267268 0.62511827]
Accuracy of XGBoost cross-vaild test: 0.7551682616777866


## P7：Ensemble

In [105]:
# SMOTE + ENN
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X7j, y7j = smotenn.fit_resample(X, y)

In [106]:
scores7j = cross_val_score(clf7,X7j,y7j,cv=5,scoring='accuracy')
print(scores7j)
print('Accuracy of XGBoost cross-vaild test:',scores7j.mean())

# Accuracy: 0.9785811232163164

[0.81824205 0.89391932 0.91342756 0.80918728 0.71841873]
Accuracy of XGBoost cross-vaild test: 0.8306389870435806


In [107]:
# SMOTE + Tomek Links
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X7k, y7k = smotetl.fit_resample(X, y)

In [108]:
scores7k = cross_val_score(clf7,X7k,y7k,cv=5,scoring='accuracy')
print(scores7k)
print('Accuracy of XGBoost cross-vaild test:',scores7k.mean())

# Accuracy: 0.9412366580787633

[0.70931693 0.86168142 0.86193257 0.68870472 0.64450305]
Accuracy of XGBoost cross-vaild test: 0.7532277350797241


SMOTE+ENN 效果最好

## P8：SMOTE‐based Oversampling

In [109]:
# 記得跑上面
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X8, y8 = smote.fit_resample(X, y)

In [110]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf8= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores8 = cross_val_score(clf8,X8,y8,cv=5,scoring='accuracy')
print(scores8)
print('Accuracy of XGBoost cross-vaild test:',scores8.mean())

# Accuracy: 0.8759304207119742

[0.69603607 0.84050348 0.86298453 0.69929238 0.64835922]
Accuracy of XGBoost cross-vaild test: 0.7494351350644177


In [111]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X8a, y8a = bsmote.fit_resample(X, y)

In [112]:
# XGBoost 
scores8a = cross_val_score(clf8,X8a,y8a,cv=5,scoring='accuracy')
print(scores8a)
print('Accuracy of XGBoost cross-vaild test:',scores8a.mean())

# Accuracy: 0.8759304207119742

[0.69616131 0.831486   0.87688647 0.71588703 0.64716934]
Accuracy of XGBoost cross-vaild test: 0.7535180308014112


## P9：Imbalance Ratio vs. Resampling Strategy

In [113]:
# 見 P7
# 要與其他資料集比較

## P10：ML algorithms vs. different resampling strategies (ENN)

In [114]:
# 使用方法 (參考最好的組合與相關的方法)
# ENN
# Tomek Links
# One Sided Selection
# SMOTE
# Borderline-SMOTE
# SMOTE + ENN
# SMOTE + Tomek Links

In [115]:
# ENN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X0, y0 = en.fit_resample(X, y)

In [116]:
# XGBoost Accuracy: 0.8589394520028113
# Random Forest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf0=RF.fit(X0,y0)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores0 = cross_val_score(clf0,X0,y0,cv=5,scoring='accuracy')
print(scores0)
print('Accuracy of RandomForest cross-vaild test:',scores0.mean())

# Accuracy of RandomForest cross-vaild test : 0.952

[0.89980456 0.79530945 0.80375293 0.72270003 0.54417514]
Accuracy of RandomForest cross-vaild test: 0.7531484202760877


In [117]:
# Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf0a=LGBM.fit(X0,y0)
scores0a = cross_val_score(clf0a,X0,y0,cv=5,scoring='accuracy')
print(scores0a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.89511401 0.7562215  0.78107897 0.62627052 0.44813657]
Accuracy of Lightgbm cross-vaild test: 0.7013643123402427


## P10：Tomek Links

In [None]:
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X01, y01 = tl.fit_resample(X, y)

In [None]:
scores01 = cross_val_score(clf0,X01,y01,cv=5,scoring='accuracy')
print(scores01)
print('Accuracy of RandomForest cross-vaild test:',scores01.mean())

# Accuracy of RandomForest cross-vaild test : 0.9370245139475909

[0.88094694 0.80409424 0.77150046 0.63769442 0.43326089]
Accuracy of RandomForest cross-vaild test: 0.705499388401062


In [None]:
scores01a = cross_val_score(clf0a,X01,y01,cv=5,scoring='accuracy')
print(scores01a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test :  0.9426666666666668

[0.88026075 0.71786368 0.74451052 0.56461574 0.3343246 ]
Accuracy of Lightgbm cross-vaild test: 0.7013643123402427


## P10：One Sided Selection

In [None]:
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X02, y02 = oss.fit_resample(X, y)

In [None]:
scores02 = cross_val_score(clf0,X02,y02,cv=5,scoring='accuracy')
print(scores02)
print('Accuracy of RandomForest cross-vaild test:',scores02.mean())
# Accuracy of RandomForest cross-vaild test : 0.9268595339441598

[0.88055077 0.80585198 0.77246127 0.63350545 0.42639128]
Accuracy of RandomForest cross-vaild test: 0.7037521514629949


In [None]:
scores02a = cross_val_score(clf0a,X02,y02,cv=5,scoring='accuracy')
print(scores02a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.87550201 0.74308663 0.75318417 0.56052783 0.34423408]
Accuracy of Lightgbm cross-vaild test: 0.7013643123402427


## P10：SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X03, y03 = smote.fit_resample(X, y)

In [None]:
scores03 = cross_val_score(clf0,X03,y03,cv=5,scoring='accuracy')
print(scores03)
print('Accuracy of RandomForest cross-vaild test:',scores03.mean())
# Accuracy of RandomForest cross-vaild test : 0.9403714917339485

[0.8473292  0.88847141 0.86273405 0.72390256 0.64735721]
Accuracy of RandomForest cross-vaild test: 0.7939588873093422


In [None]:
scores03a = cross_val_score(clf0a,X03,y03,cv=5,scoring='accuracy')
print(scores03a)
print('Accuracy of Lightgbm cross-vaild test:',scores03a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9535539215686274

[0.72002004 0.90362577 0.86736803 0.71363266 0.63902806]
Accuracy of Lightgbm cross-vaild test: 0.768734911742202


In [None]:
scores03c = cross_val_score(clf0c,X03,y03,cv=5,scoring='accuracy')
print(scores03c)
print('Accuracy of SVM cross-vaild test:',scores03c.mean())
# Accuracy of SVM cross-vaild test : 0.6685097078046904



[0.45769929 0.47654831 0.37146972 0.49464588 0.64810872]
Accuracy of SVM cross-vaild test: 0.4896943842284105


## P10：Borderline-SMOTE

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X04, y04 = bsmote.fit_resample(X, y)

In [None]:
scores04 = cross_val_score(clf0,X04,y04,cv=5,scoring='accuracy')
print(scores04)
print('Accuracy of RandomForest cross-vaild test:',scores04.mean())
# Accuracy of RandomForest cross-vaild test : 0.9396914648212226

[0.82635106 0.91114033 0.88089423 0.75214478 0.66075902]
Accuracy of RandomForest cross-vaild test: 0.8062578841382433


In [None]:
scores04a = cross_val_score(clf0a,X04,y04,cv=5,scoring='accuracy')
print(scores04a)
print('Accuracy of Lightgbm cross-vaild test:',scores04a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.948027201076509

[0.72615693 0.90249859 0.87400589 0.74450498 0.65581162]
Accuracy of Lightgbm cross-vaild test: 0.780595601623436


In [None]:
scores04b = cross_val_score(clf0b,X04,y04,cv=5,scoring='accuracy')
print(scores04b)
print('Accuracy of MLP cross-vaild test:',scores04b.mean())
# Accuracy of MLP cross-vaild test: 0.8252619184928873



[0.86373599 0.87613501 0.83699668 0.71889285 0.64172094]
Accuracy of MLP cross-vaild test: 0.7874962955844685


In [None]:
scores04c = cross_val_score(clf0c,X04,y04,cv=5,scoring='accuracy')
print(scores04c)
print('Accuracy of SVM cross-vaild test:',scores04c.mean())
# Accuracy of SVM cross-vaild test : 0.6733732218377547

## P10：SMOTE + ENN

In [None]:
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X05, y05 = smotenn.fit_resample(X, y)

In [None]:
scores05 = cross_val_score(clf0,X05,y05,cv=5,scoring='accuracy')
print(scores05)
print('Accuracy of RandomForest cross-vaild test:',scores05.mean())
# Accuracy of RandomForest cross-vaild test : 0.9791304347826086

In [None]:
scores05a = cross_val_score(clf0a,X05,y05,cv=5,scoring='accuracy')
print(scores05a)
print('Accuracy of Lightgbm cross-vaild test:',scores05a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.982608695652174

In [None]:
scores05b = cross_val_score(clf0b,X05,y05,cv=5,scoring='accuracy')
print(scores05b)
print('Accuracy of MLP cross-vaild test:',scores05b.mean())
# Accuracy of MLP cross-vaild test:  0.8626086956521739

In [None]:
scores05c = cross_val_score(clf0c,X05,y05,cv=5,scoring='accuracy')
print(scores05c)
print('Accuracy of SVM cross-vaild test:',scores05c.mean())
# Accuracy of SVM cross-vaild test : 0.7417391304347826

## P10：SMOTE + Tomek Links

In [None]:
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X06, y06 = smotetl.fit_resample(X, y)

In [None]:
scores06 = cross_val_score(clf0,X06,y06,cv=5,scoring='accuracy')
print(scores06)
print('Accuracy of RandomForest cross-vaild test:',scores06.mean())
# Accuracy of RandomForest cross-vaild test : 0.9413830072366658

In [None]:
scores06a = cross_val_score(clf0a,X06,y06,cv=5,scoring='accuracy')
print(scores06a)
print('Accuracy of Lightgbm cross-vaild test:',scores06a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9490777515167759

In [None]:
scores06b = cross_val_score(clf0b,X06,y06,cv=5,scoring='accuracy')
print(scores06b)
print('Accuracy of MLP cross-vaild test:',scores06b.mean())
# Accuracy of MLP cross-vaild test: 0.7955191150313101

In [None]:
scores06c = cross_val_score(clf0c,X06,y06,cv=5,scoring='accuracy')
print(scores06c)
print('Accuracy of SVM cross-vaild test:',scores06c.mean())
# Accuracy of SVM cross-vaild test : 0.6887673302307448

## 輸出

In [None]:
data=pd.DataFrame([[scores1.mean(),scores11.mean()],
         [scores1.mean(),scores2.mean()],         
         [scores3.mean(),scores31.mean()],
         [scores4.mean(),scores41.mean(),scores42.mean(),scores43.mean(),scores44.mean()],
         [scores4a.mean(),scores41a.mean(),scores42a.mean(),scores43a.mean(),scores44a.mean()],
         [scores4b.mean(),scores41b.mean(),scores42b.mean(),scores43b.mean(),scores44b.mean()]
         [scores4c.mean(),scores41c.mean(),scores42c.mean(),scores43c.mean(),scores44c.mean()],
         [scores4d.mean(),scores41d.mean(),scores42d.mean(),scores43d.mean(),scores44d.mean()],
         [scores5.mean(),scores5a.mean(),scores5b.mean(),scores5c.mean(),scores5d.mean(),scores5e.mean()],
         [scores1.mean(),scores6.mean(),scores6a.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
         scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores7g.mean(),scores7h.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
         scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores0.mean(),scores0a.mean(),scores0b.mean(),scores0c.mean()],
         [scores01.mean(),scores01a.mean(),scores01b.mean(),scores01c.mean()],
         [scores02.mean(),scores02a.mean(),scores02b.mean(),scores02c.mean()],
         [scores03.mean(),scores03a.mean(),scores03b.mean(),scores03c.mean()],
         [scores04.mean(),scores04a.mean(),scores04b.mean(),scores04c.mean()],
         [scores05.mean(),scores05a.mean(),scores05b.mean(),scores05c.mean()],
         [scores06.mean(),scores06a.mean(),scores06b.mean(),scores06c.mean()]],
         index=['Q1','Q2','Q3','Q4L','Q4O','Q4F','Q4T','Q4LOL','Q6','Q7','Q8','Q9','Q10-1','Q10-2','Q10-3','Q10-4','Q10-5','Q10-6','Q10-7']     )

In [None]:
data.to_csv('Bank_Result.csv')