## 讀取資料 (Default：Label Encoding)

In [1]:
#讀取資料
import pandas as pd
import numpy as np

df= pd.read_csv('online_shoppers_intention.csv',sep=',') 

In [2]:
# all columns
print(list(df.columns))

['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue']


In [3]:
#資料前處理  # Label Encoder
#將部分字串轉數值
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Month','VisitorType','Weekend','Revenue']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='Administrative_Duration') #此三列與另外三行有高度關係，所以丟棄
df=df.drop(columns='Informational_Duration') 
df=df.drop(columns='ProductRelated') 
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['Revenue']#應變數
X=df.drop(columns='Revenue') #剩下的為自變數

In [4]:
y

0        0
1        0
2        0
3        0
4        0
        ..
12325    0
12326    0
12327    0
12328    0
12329    0
Name: Revenue, Length: 12330, dtype: int64

In [5]:
X

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,64.000000,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,2.666667,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,627.500000,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,1783.791667,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,465.750000,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,184.250000,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,346.000000,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


## P1：標準化是否影響結果

In [6]:
#先不經標準化做XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf1= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores1 = cross_val_score(clf1,X,y,cv=5,scoring='accuracy')
print(scores1)
print('Accuracy of XGBoost cross-vaild test:',scores1.mean())
# Accuracy of XGBoost cross-vaild test: 0.935065196935072

[0.918897   0.88848337 0.86739659 0.85563666 0.85523114]
Accuracy of XGBoost cross-vaild test: 0.8771289537712894


In [7]:
#使用經過LabelEncoder編碼的特徵，標準化 (因為變數值間有大有小)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X1 = sc.fit_transform(X)

In [8]:
#標準化後XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf11= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores11 = cross_val_score(clf11,X1,y,cv=5,scoring='accuracy')
print(scores11)
print('Accuracy of XGBoost cross-vaild test:',scores11.mean())

# Accuracy of XGBoost cross-vaild test: 0.9362212662992337
##標準化在XGBoost的交叉驗證上沒有差異太大

[0.918897   0.88848337 0.86739659 0.85563666 0.85523114]
Accuracy of XGBoost cross-vaild test: 0.8771289537712894


## P3：Feature Binning 有沒有效果

In [9]:
import pandas as pd
import numpy as np
# For 繪製敘述統計
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import scipy.stats as stats
from sklearn.model_selection import train_test_split
# for discretization
from sklearn.preprocessing import KBinsDiscretizer

In [10]:
X

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,64.000000,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,2.666667,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,627.500000,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,1783.791667,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,465.750000,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,184.250000,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,346.000000,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [11]:
# Equal width Binning
disc1 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc1.fit(X[['ProductRelated_Duration']])

KBinsDiscretizer(encode='ordinal', n_bins=10, strategy='uniform')

In [12]:
# Equal Frequency Binning
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['ProductRelated_Duration']])

KBinsDiscretizer(encode='ordinal', n_bins=10)

In [13]:
disc1.bin_edges_

array([array([    0.      ,  6397.352223, 12794.704446, 19192.056669,
              25589.408892, 31986.761115, 38384.113338, 44781.465561,
              51178.817784, 57576.170007, 63973.52223 ])             ],
      dtype=object)

In [14]:
disc11.bin_edges_

array([array([0.00000000e+00, 3.60000000e+01, 1.29000000e+02, 2.46980000e+02,
              4.02117216e+02, 5.98936905e+02, 8.52000000e+02, 1.22701417e+03,
              1.77592867e+03, 2.87786333e+03, 6.39735222e+04])               ],
      dtype=object)

In [15]:
X3 = pd.DataFrame(X)
X3[['ProductRelated_Duration']]=disc1.transform(X3[['ProductRelated_Duration']])
X3 = pd.DataFrame(X3)

In [16]:
X3

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,0.0,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,0.0,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,0.0,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,0.0,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,0.0,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,0.0,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,0.0,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [17]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores3 = cross_val_score(clf3,X3,y,cv=5,scoring='accuracy')
print(scores3)
print('Accuracy of XGBoost cross-vaild test:',scores3.mean())

#Accuracy of XGBoost cross-vaild test: 0.9304140341443743
# Frequency較佳

[0.91930251 0.89132198 0.86820762 0.85725872 0.84914842]
Accuracy of XGBoost cross-vaild test: 0.8770478507704784


In [18]:
#資料前處理 # Label Encoder
df= pd.read_csv('online_shoppers_intention.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Month','VisitorType','Weekend','Revenue']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='Administrative_Duration') #此三列與另外三行有高度關係，所以丟棄
df=df.drop(columns='Informational_Duration') 
df=df.drop(columns='ProductRelated') 
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['Revenue']#應變數
X=df.drop(columns='Revenue') #剩下的為自變數

X31 = pd.DataFrame(X)
X31[['ProductRelated_Duration']]=disc11.transform(X31[['ProductRelated_Duration']])
X31 = pd.DataFrame(X31)

In [19]:
X31

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,1.0,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,0.0,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,5.0,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,8.0,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,4.0,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,2.0,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,3.0,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [20]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores31 = cross_val_score(clf3,X31,y,cv=5,scoring='accuracy')
print(scores31)
print('Accuracy of XGBoost cross-vaild test:',scores31.mean())

#Accuracy of XGBoost cross-vaild test:  0.935065196935072
# Frequency較佳

[0.91484185 0.89294404 0.86820762 0.85360908 0.84833739]
Accuracy of XGBoost cross-vaild test: 0.87558799675588


Source：iT幫幫忙--Day12 - Feature Engineering -- 4. 分隔方法(Discretization),https://ithelp.ithome.com.tw/articles/10235726

## P2：One-hot Encoding vs. Label Encoding on Tree-based method

In [21]:
X3

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,0.0,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,0.0,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,0.0,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,0.0,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,0.0,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,0.0,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,0.0,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [22]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.5.1.post0


In [23]:
from category_encoders import *
enc2 = OneHotEncoder(cols=['TrafficType']).fit(X3,y)
X2 = enc2.transform(X3)
X2 = pd.DataFrame(X2)
X2

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,...,TrafficType_13,TrafficType_14,TrafficType_15,TrafficType_16,TrafficType_17,TrafficType_18,TrafficType_19,TrafficType_20,VisitorType,Weekend
0,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,1,1,...,0,0,0,0,0,0,0,0,2,0
1,0,0,0.0,0.000000,0.100000,0.000000,0.0,2,2,2,...,0,0,0,0,0,0,0,0,2,0
2,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,4,1,...,0,0,0,0,0,0,0,0,2,0
3,0,0,0.0,0.050000,0.140000,0.000000,0.0,2,3,2,...,0,0,0,0,0,0,0,0,2,0
4,0,0,0.0,0.020000,0.050000,0.000000,0.0,2,3,3,...,0,0,0,0,0,0,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,0.0,0.007143,0.029031,12.241717,0.0,1,4,6,...,0,0,0,0,0,0,0,0,2,1
12326,0,0,0.0,0.000000,0.021333,0.000000,0.0,7,3,2,...,0,0,0,0,0,0,0,0,2,1
12327,0,0,0.0,0.083333,0.086667,0.000000,0.0,7,3,2,...,1,0,0,0,0,0,0,0,2,1
12328,4,0,0.0,0.000000,0.021053,0.000000,0.0,7,2,2,...,0,0,0,0,0,0,0,0,2,0


In [24]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf2= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores2 = cross_val_score(clf2,X3,y,cv=5,scoring='accuracy')
print(scores2)
print('Accuracy of XGBoost cross-vaild test:',scores2.mean())

# Accuracy of XGBoost cross-vaild test for Label Encoder: 0.9362212662992337
# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.9420217771205808
# 交叉驗證為 Label Encoder 結果略佳
# 可能在 one-hot encoding 會陷入 curse of dimensionaility

[0.91930251 0.89132198 0.86820762 0.85725872 0.84914842]
Accuracy of XGBoost cross-vaild test: 0.8770478507704784


## P4：Label Encoding (這裡不用)

In [25]:
df= pd.read_csv('online_shoppers_intention.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Month','VisitorType','Weekend','Revenue']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='Administrative_Duration') #此三列與另外三行有高度關係，所以丟棄
df=df.drop(columns='Informational_Duration') 
df=df.drop(columns='ProductRelated') 
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['Revenue']#應變數
X=df.drop(columns='Revenue') #剩下的為自變數

In [26]:
# One hot encoding 後 XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4 = cross_val_score(clf4,X,y,cv=5,scoring='accuracy')
print(scores4)
print('Accuracy of XGBoost cross-vaild test:',scores4.mean())

# Accuracy of XGBoost cross-vaild test: 0.9408522650893938

[0.918897   0.88848337 0.86739659 0.85563666 0.85523114]
Accuracy of XGBoost cross-vaild test: 0.8771289537712894


In [27]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X,y)
scores41 = cross_val_score(clf41,X,y,cv=5,scoring='accuracy')
print(scores41)
print('Accuracy of RandomForest cross-vaild test:',scores41.mean())

# Accuracy of RandomForest cross-vaild test: 0.9327261728726979

[0.91484185 0.89983779 0.89578264 0.87591241 0.88199513]
Accuracy of RandomForest cross-vaild test: 0.8936739659367398


In [28]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X,y)
scores42 = cross_val_score(clf42,X,y,cv=5,scoring='accuracy')
print(scores42)
print('Accuracy of Lightgbm cross-vaild test:',scores42.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9373706143298831

[0.92214112 0.89699919 0.89172749 0.87996756 0.87915653]
Accuracy of Lightgbm cross-vaild test: 0.8939983779399838


In [29]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X,y)
scores43 = cross_val_score(clf43,X,y,cv=5,scoring='accuracy')
print(scores43)
print('Accuracy of MLP cross-vaild test:',scores43.mean())
# Accuracy of MLP cross-vaild test: 0.7034816507595107



[0.9026764  0.90835361 0.88199513 0.86415247 0.87550689]
Accuracy of MLP cross-vaild test: 0.8865369018653689




In [30]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X,y)
scores44 = cross_val_score(clf44,X,y,cv=5,scoring='accuracy')
print(scores44)
print('Accuracy of SVM cross-vaild test:',scores44.mean())
# Accuracy of SVM cross-vaild test: 0.8364296276381233

[0.84752636 0.84671533 0.84468775 0.85036496 0.84793187]
Accuracy of SVM cross-vaild test: 0.8474452554744525


## P4：One Hot Encoding

In [31]:
# 清除 Label Encoder 後，再做 One Hot Encoding
X2

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,...,TrafficType_13,TrafficType_14,TrafficType_15,TrafficType_16,TrafficType_17,TrafficType_18,TrafficType_19,TrafficType_20,VisitorType,Weekend
0,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,1,1,...,0,0,0,0,0,0,0,0,2,0
1,0,0,0.0,0.000000,0.100000,0.000000,0.0,2,2,2,...,0,0,0,0,0,0,0,0,2,0
2,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,4,1,...,0,0,0,0,0,0,0,0,2,0
3,0,0,0.0,0.050000,0.140000,0.000000,0.0,2,3,2,...,0,0,0,0,0,0,0,0,2,0
4,0,0,0.0,0.020000,0.050000,0.000000,0.0,2,3,3,...,0,0,0,0,0,0,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,0.0,0.007143,0.029031,12.241717,0.0,1,4,6,...,0,0,0,0,0,0,0,0,2,1
12326,0,0,0.0,0.000000,0.021333,0.000000,0.0,7,3,2,...,0,0,0,0,0,0,0,0,2,1
12327,0,0,0.0,0.083333,0.086667,0.000000,0.0,7,3,2,...,1,0,0,0,0,0,0,0,2,1
12328,4,0,0.0,0.000000,0.021053,0.000000,0.0,7,2,2,...,0,0,0,0,0,0,0,0,2,0


In [32]:
# One hot encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4a = cross_val_score(clf4,X2,y,cv=5,scoring='accuracy')
print(scores4a)
print('Accuracy of XGBoost cross-vaild test:',scores4a.mean())
# Accuracy of XGBoost cross-vaild test: 0.9420217771205808

[0.91443633 0.89213301 0.86942417 0.85725872 0.8459043 ]
Accuracy of XGBoost cross-vaild test: 0.875831305758313


In [33]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X2,y)
scores41a = cross_val_score(clf41,X2,y,cv=5,scoring='accuracy')
print(scores41a)
print('Accuracy of RandomForest cross-vaild test:',scores41a.mean())

# Accuracy of RandomForest cross-vaild test: 0.9223215485952412

[0.91403082 0.89618816 0.89132198 0.87956204 0.87510138]
Accuracy of RandomForest cross-vaild test: 0.8912408759124087


In [34]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X2,y)
scores42a = cross_val_score(clf42,X2,y,cv=5,scoring='accuracy')
print(scores42a)
print('Accuracy of Lightgbm cross-vaild test:',scores42a.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9362212662992337

[0.91524736 0.89659367 0.88077859 0.88240065 0.87793998]
Accuracy of Lightgbm cross-vaild test: 0.8905920519059206


In [35]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X2,y)
scores43a = cross_val_score(clf43,X2,y,cv=5,scoring='accuracy')
print(scores43a)
print('Accuracy of MLP cross-vaild test:',scores43a.mean())
# Accuracy of MLP cross-vaild test: 0.8364296276381233



[0.92660178 0.89618816 0.86455799 0.85847526 0.84752636]
Accuracy of MLP cross-vaild test: 0.8786699107866992




In [36]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X2,y)
scores44a = cross_val_score(clf44,X2,y,cv=5,scoring='accuracy')
print(scores44a)
print('Accuracy of SVM cross-vaild test:',scores44a.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding: 0.8364296276381233

[0.9217356  0.91605839 0.87591241 0.86861314 0.87793998]
Accuracy of SVM cross-vaild test: 0.8920519059205191


## P4：Frequency Encoding

In [37]:
# Frequency encoding：用類別出現頻率當作該類別數值
X

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,64.000000,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,2.666667,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,627.500000,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,1783.791667,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,465.750000,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,184.250000,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,346.000000,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [39]:
X4b=pd.DataFrame(X)
enc1=X4b['ProductRelated_Duration'].value_counts()
X4b['ProductRelated_Duration']=X4b['ProductRelated_Duration'].apply(lambda x : enc1[x]) 

In [40]:
X4b

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,755,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,10,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,755,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,1,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,1,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,1,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,1,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,1,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,1,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [41]:
# Frequency encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4b= XGBClassifier(**params)

# 5-fold validation
scores4b = cross_val_score(clf4b,X4b,y,cv=5,scoring='accuracy')
print(scores4b)
print('Accuracy of XGBoost cross-vaild test:',scores4b.mean())

# Accuracy of XGBoost cross-vaild test: 0.936194380965183

[0.918897   0.89294404 0.87429035 0.85766423 0.85077048]
Accuracy of XGBoost cross-vaild test: 0.8789132197891323


In [42]:
# Frequency encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41b=RF.fit(X4b,y)
scores41b = cross_val_score(clf41b,X4b,y,cv=5,scoring='accuracy')
print(scores41b)
print('Accuracy of RandomForest cross-vaild test:',scores41b.mean())

# Accuracy of RandomForest cross-vaild test: 0.9315633821750235

[0.91281427 0.90064882 0.88848337 0.87915653 0.88037307]
Accuracy of RandomForest cross-vaild test: 0.8922952149229522


In [43]:
# Frequency encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42b=LGBM.fit(X4b,y)
scores42b = cross_val_score(clf42b,X4b,y,cv=5,scoring='accuracy')
print(scores42b)
print('Accuracy of Lightgbm cross-vaild test:',scores42b.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9408321010888561

[0.92092457 0.8945661  0.88361719 0.87915653 0.88037307]
Accuracy of Lightgbm cross-vaild test: 0.8917274939172749


In [44]:
# Frequency encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43b=MLP.fit(X4b,y)
scores43b = cross_val_score(clf43b,X4b,y,cv=5,scoring='accuracy')
print(scores43b)
print('Accuracy of MLP cross-vaild test:',scores43b.mean())
# Accuracy of MLP cross-vaild test: 0.8364296276381233



[0.92497972 0.88888889 0.86658556 0.8568532  0.87550689]
Accuracy of MLP cross-vaild test: 0.8825628548256285




In [45]:
# Frequency encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44b=svm.fit(X4b,y)
scores44b = cross_val_score(clf44b,X4b,y,cv=5,scoring='accuracy')
print(scores44b)
print('Accuracy of SVM cross-vaild test:',scores44b.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding: 0.8364296276381233

[0.9107867  0.90510949 0.88483374 0.87145174 0.87266829]
Accuracy of SVM cross-vaild test: 0.8889699918896999


## P4：Target Encoding

In [46]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [48]:
#資料前處理 # Label Encoder
df= pd.read_csv('online_shoppers_intention.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Month','VisitorType','Weekend','Revenue']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='Administrative_Duration') #此三列與另外三行有高度關係，所以丟棄
df=df.drop(columns='Informational_Duration') 
df=df.drop(columns='ProductRelated') 
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['Revenue']#應變數
X=df.drop(columns='Revenue') #剩下的為自變數

In [52]:
X

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,64.000000,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,2.666667,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,627.500000,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,1783.791667,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,465.750000,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,184.250000,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,346.000000,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [53]:
# Target encoding：使用 Target (預測目標) 來達成 Features 的 Encoding
# 清除 Label Encoder
# 類別特徵：workclass, marital-status, occupation, relationship, race, sex, native-country
from category_encoders import *

enc = TargetEncoder(cols=['Month','VisitorType','Weekend'],
                    min_samples_leaf=20, smoothing=10).fit(X, y)
X4c = enc.transform(X)

In [54]:
X4c

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.000000,0.200000,0.200000,0.000000,0.0,0.016304,1,1,1,1,0.139323,0.148911
1,0,0,64.000000,0.000000,0.100000,0.000000,0.0,0.016304,2,2,1,2,0.139323,0.148911
2,0,0,0.000000,0.200000,0.200000,0.000000,0.0,0.016304,4,1,9,3,0.139323,0.148911
3,0,0,2.666667,0.050000,0.140000,0.000000,0.0,0.016304,3,2,2,4,0.139323,0.148911
4,0,0,627.500000,0.020000,0.050000,0.000000,0.0,0.016304,3,3,1,4,0.139323,0.173989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,1783.791667,0.007143,0.029031,12.241717,0.0,0.125072,4,6,1,1,0.139323,0.173989
12326,0,0,465.750000,0.000000,0.021333,0.000000,0.0,0.253502,3,2,1,8,0.139323,0.173989
12327,0,0,184.250000,0.083333,0.086667,0.000000,0.0,0.253502,3,2,1,13,0.139323,0.173989
12328,4,0,346.000000,0.000000,0.021053,0.000000,0.0,0.253502,2,2,3,11,0.139323,0.148911


In [55]:
# Target encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier

params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores4c = cross_val_score(clf4,X4c,y,cv=5,scoring='accuracy')
print(scores4c)
print('Accuracy of XGBoost cross-vaild test:',scores4c.mean())

# Accuracy of XGBoost cross-vaild test: 0.9512770533673882

[0.91443633 0.89213301 0.87510138 0.84712084 0.85117599]
Accuracy of XGBoost cross-vaild test: 0.8759935117599351


In [56]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4c,y)
scores41c = cross_val_score(clf41,X4c,y,cv=5,scoring='accuracy')
print(scores41c)
print('Accuracy of RandomForest cross-vaild test:',scores41c.mean())

# Accuracy of RandomForest cross-vaild test for target encoding: 0.8591567243513352

[0.91200324 0.905515   0.89699919 0.8783455  0.88199513]
Accuracy of RandomForest cross-vaild test: 0.8949716139497161


In [57]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4c,y)
scores42c = cross_val_score(clf42,X4c,y,cv=5,scoring='accuracy')
print(scores42c)
print('Accuracy of Lightgbm cross-vaild test:',scores42c.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454765425460412
# Accuracy of Lightgbm cross-vaild test for target encoding: 0.8740826747563275

[0.91524736 0.90186537 0.88969992 0.8811841  0.87753447]
Accuracy of Lightgbm cross-vaild test: 0.8931062449310625


In [58]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4c,y)
scores43c = cross_val_score(clf43,X4c,y,cv=5,scoring='accuracy')
print(scores43c)
print('Accuracy of MLP cross-vaild test:',scores43c.mean())
# Accuracy of MLP cross-vaild test: 0.5685979298292781 



[0.92822384 0.90429846 0.8811841  0.8649635  0.87388483]
Accuracy of MLP cross-vaild test: 0.8905109489051094


In [59]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4c,y)
scores44c = cross_val_score(clf44,X4c,y,cv=5,scoring='accuracy')
print(scores44c)
print('Accuracy of SVM cross-vaild test:',scores44c.mean())
# Accuracy of SVM cross-vaild test: 0.8364296276381233

[0.84752636 0.84671533 0.84468775 0.85036496 0.84793187]
Accuracy of SVM cross-vaild test: 0.8474452554744525


## P4：Leave-One-Out Encoding

In [60]:
X

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,64.000000,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,2.666667,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,627.500000,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,1783.791667,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,465.750000,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,184.250000,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,346.000000,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [62]:
encc = LeaveOneOutEncoder(cols=['Month','VisitorType','Weekend'],sigma=0.05).fit(X, y)
X4d = encc.transform(X)

Source：http://contrib.scikit-learn.org/category_encoders/leaveoneout.html 

In [63]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4d = cross_val_score(clf4,X4d,y,cv=5,scoring='accuracy')
print(scores4d)
print('Accuracy of XGBoost cross-vaild test:',scores4d.mean())

# Accuracy of XGBoost cross-vaild test for LOO encoding:  0.9466258905766904

[0.91443633 0.89213301 0.87510138 0.84712084 0.85117599]
Accuracy of XGBoost cross-vaild test: 0.8759935117599351


In [64]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4d,y)
scores41d = cross_val_score(clf41,X4d,y,cv=5,scoring='accuracy')
print(scores41d)
print('Accuracy of RandomForest cross-vaild test:',scores41d.mean())

# Accuracy of RandomForest cross-vaild test: 0.9559214948245731

[0.91646391 0.90754258 0.89902676 0.88158962 0.88037307]
Accuracy of RandomForest cross-vaild test: 0.896999188969992


In [65]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4d,y)
scores42d = cross_val_score(clf42,X4d,y,cv=5,scoring='accuracy')
print(scores42d)
print('Accuracy of Lightgbm cross-vaild test:',scores42d.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454899852130663

[0.91524736 0.90186537 0.88969992 0.8811841  0.87753447]
Accuracy of Lightgbm cross-vaild test: 0.8931062449310625


In [66]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4d,y)
scores43d = cross_val_score(clf43,X4d,y,cv=5,scoring='accuracy')
print(scores43d)
print('Accuracy of MLP cross-vaild test:',scores43d.mean())
# Accuracy of MLP cross-vaild test: 0.5685979298292781



[0.90673155 0.90835361 0.81832928 0.8621249  0.87996756]
Accuracy of MLP cross-vaild test: 0.8751013787510139




In [67]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4d,y)
scores44d = cross_val_score(clf44,X4d,y,cv=5,scoring='accuracy')
print(scores44d)
print('Accuracy of SVM cross-vaild test:',scores44d.mean())
# Accuracy of SVM cross-vaild 

[0.84752636 0.84671533 0.84468775 0.85036496 0.84793187]
Accuracy of SVM cross-vaild test: 0.8474452554744525


## P5：Combinations of numerical and categorical feature transformation

挑選較常用的六個組合

In [69]:
df= pd.read_csv('online_shoppers_intention.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Month','VisitorType','Weekend','Revenue']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='Administrative_Duration') #此三列與另外三行有高度關係，所以丟棄
df=df.drop(columns='Informational_Duration') 
df=df.drop(columns='ProductRelated') 
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['Revenue']#應變數
X=df.drop(columns='Revenue') #剩下的為自變數

In [77]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores5 = cross_val_score(clf5,X,y,cv=5,scoring='accuracy')
print(scores5)
print('Accuracy of XGBoost cross-vaild test:',scores5.mean())

# Accuracy of XGBoost cross-vaild test: 0.9420083344535556

[0.918897   0.88848337 0.86739659 0.85563666 0.85523114]
Accuracy of XGBoost cross-vaild test: 0.8771289537712894


In [72]:
# Standardization *  LOO
from category_encoders import *
encc = LeaveOneOutEncoder(cols=['Month','VisitorType','Weekend'],sigma=0.05).fit(X, y)
X5a = encc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5a = sc.fit_transform(X5a)

In [73]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5a = cross_val_score(clf5,X5a,y,cv=5,scoring='accuracy')
print(scores5a)
print('Accuracy of XGBoost cross-vaild test:',scores5a.mean())

# Accuracy of XGBoost cross-vaild test: 0.9466258905766904

[0.91443633 0.89213301 0.87510138 0.84712084 0.85117599]
Accuracy of XGBoost cross-vaild test: 0.8759935117599351


In [74]:
# Standardization *  Target Encoding
enc = TargetEncoder(cols=['Month','VisitorType','Weekend'],min_samples_leaf=20, smoothing=10).fit(X, y)
X5b = enc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5b = sc.fit_transform(X5b)

In [75]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5b = cross_val_score(clf5,X5b,y,cv=5,scoring='accuracy')
print(scores5b)
print('Accuracy of XGBoost cross-vaild test:',scores5b.mean())

# Accuracy of XGBoost cross-vaild test: 0.9512770533673882

[0.91443633 0.89213301 0.87510138 0.84712084 0.85117599]
Accuracy of XGBoost cross-vaild test: 0.8759935117599351


In [78]:
# Equal‐Frequency Binning *  Label Encoding
from sklearn.preprocessing import KBinsDiscretizer
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['ProductRelated_Duration']])

X31 = pd.DataFrame(X)
X31[['ProductRelated_Duration']]=disc11.transform(X31[['ProductRelated_Duration']])
X5c = pd.DataFrame(X31)

In [79]:
X5c

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,1.0,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.0,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,0.0,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,5.0,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,8.0,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,4.0,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,2.0,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,3.0,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [80]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5c = cross_val_score(clf5,X5c,y,cv=5,scoring='accuracy')
print(scores5c)
print('Accuracy of XGBoost cross-vaild test:',scores5c.mean())

# Accuracy of XGBoost cross-vaild test: 0.935065196935072

[0.91484185 0.89294404 0.86820762 0.85360908 0.84833739]
Accuracy of XGBoost cross-vaild test: 0.87558799675588


In [82]:
# Equal‐Frequency Binning * Leave‐One‐Out Encoding
encc = LeaveOneOutEncoder(cols=['Month','VisitorType','Weekend'],sigma=0.05).fit(X5c, y)
X5d = encc.transform(X5c)

In [83]:
X5d

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.0,0.200000,0.200000,0.000000,0.0,0.016304,1,1,1,1,0.139323,0.148911
1,0,0,1.0,0.000000,0.100000,0.000000,0.0,0.016304,2,2,1,2,0.139323,0.148911
2,0,0,0.0,0.200000,0.200000,0.000000,0.0,0.016304,4,1,9,3,0.139323,0.148911
3,0,0,0.0,0.050000,0.140000,0.000000,0.0,0.016304,3,2,2,4,0.139323,0.148911
4,0,0,5.0,0.020000,0.050000,0.000000,0.0,0.016304,3,3,1,4,0.139323,0.173989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,8.0,0.007143,0.029031,12.241717,0.0,0.125072,4,6,1,1,0.139323,0.173989
12326,0,0,4.0,0.000000,0.021333,0.000000,0.0,0.253502,3,2,1,8,0.139323,0.173989
12327,0,0,2.0,0.083333,0.086667,0.000000,0.0,0.253502,3,2,1,13,0.139323,0.173989
12328,4,0,3.0,0.000000,0.021053,0.000000,0.0,0.253502,2,2,3,11,0.139323,0.148911


In [84]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5d = cross_val_score(clf5,X5d,y,cv=5,scoring='accuracy')
print(scores5d)
print('Accuracy of XGBoost cross-vaild test:',scores5d.mean())

# Accuracy of XGBoost cross-vaild test: 0.9408522650893938

[0.91727494 0.89943228 0.8730738  0.8487429  0.86658556]
Accuracy of XGBoost cross-vaild test: 0.881021897810219


In [86]:
# Equal‐Frequency Binning * Target Encoding
enc = TargetEncoder(cols=['Month','VisitorType','Weekend'],min_samples_leaf=20, smoothing=10).fit(X5c, y)
X5e = enc.transform(X5c)

In [87]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5e = cross_val_score(clf5,X5e,y,cv=5,scoring='accuracy')
print(scores5e)
print('Accuracy of XGBoost cross-vaild test:',scores5e.mean())

# Accuracy of XGBoost cross-vaild test: 0.9420083344535556

[0.91727494 0.89943228 0.8730738  0.8487429  0.86658556]
Accuracy of XGBoost cross-vaild test: 0.881021897810219


Standard Scalar 效果較佳，且 Label Encoder 效果又較 Target Encoding, LOO 佳

## P6：Categorical values of a feature is high (超過20種)

所有 label encoder 的結果視為 baseline

In [89]:
df= pd.read_csv('online_shoppers_intention.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Month','VisitorType','Weekend','Revenue']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='Administrative_Duration') #此三列與另外三行有高度關係，所以丟棄
df=df.drop(columns='Informational_Duration') 
df=df.drop(columns='ProductRelated') 
df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['Revenue']#應變數
X=df.drop(columns='Revenue') #剩下的為自變數

X6 = pd.DataFrame(X)

In [90]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [91]:
# One Hot Encoder
from category_encoders import * 
enc6 = OneHotEncoder(cols=['TrafficType']).fit(X,y)
X6 = enc6.transform(X)

In [92]:
X6

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,...,TrafficType_13,TrafficType_14,TrafficType_15,TrafficType_16,TrafficType_17,TrafficType_18,TrafficType_19,TrafficType_20,VisitorType,Weekend
0,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,1,1,...,0,0,0,0,0,0,0,0,2,0
1,0,0,64.000000,0.000000,0.100000,0.000000,0.0,2,2,2,...,0,0,0,0,0,0,0,0,2,0
2,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,4,1,...,0,0,0,0,0,0,0,0,2,0
3,0,0,2.666667,0.050000,0.140000,0.000000,0.0,2,3,2,...,0,0,0,0,0,0,0,0,2,0
4,0,0,627.500000,0.020000,0.050000,0.000000,0.0,2,3,3,...,0,0,0,0,0,0,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,1783.791667,0.007143,0.029031,12.241717,0.0,1,4,6,...,0,0,0,0,0,0,0,0,2,1
12326,0,0,465.750000,0.000000,0.021333,0.000000,0.0,7,3,2,...,0,0,0,0,0,0,0,0,2,1
12327,0,0,184.250000,0.083333,0.086667,0.000000,0.0,7,3,2,...,1,0,0,0,0,0,0,0,2,1
12328,4,0,346.000000,0.000000,0.021053,0.000000,0.0,7,2,2,...,0,0,0,0,0,0,0,0,2,0


In [93]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf6= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores6 = cross_val_score(clf6,X6,y,cv=5,scoring='accuracy')
print(scores6)
print('Accuracy of XGBoost cross-vaild test:',scores6.mean())

# Accuracy of XGBoost cross-vaild test: 0.935031590267509

[0.90673155 0.89781022 0.85847526 0.85563666 0.85928629]
Accuracy of XGBoost cross-vaild test: 0.87558799675588


In [95]:
# Target Encoder
enc6a = TargetEncoder(cols=['Month','VisitorType','Weekend'],min_samples_leaf=20, smoothing=10).fit(X, y)
X6a = enc6a.transform(X)

In [96]:
# XGBoost 
scores6a = cross_val_score(clf6,X6a,y,cv=5,scoring='accuracy')
print(scores6a)
print('Accuracy of XGBoost cross-vaild test:',scores6a.mean())

# Accuracy of XGBoost cross-vaild test: 0.9396894743917192

[0.91443633 0.89213301 0.87510138 0.84712084 0.85117599]
Accuracy of XGBoost cross-vaild test: 0.8759935117599351


## P7：Undersampling

In [97]:
X

Unnamed: 0,Administrative,Informational,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0
1,0,0,64.000000,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0
2,0,0,0.000000,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0
3,0,0,2.666667,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0
4,0,0,627.500000,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,0,1783.791667,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1
12326,0,0,465.750000,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1
12327,0,0,184.250000,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1
12328,4,0,346.000000,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0


In [98]:
# 記得跑上面
# Nearmiss
from imblearn.under_sampling import NearMiss
nm = NearMiss(sampling_strategy = 'majority')
X7, y7 = nm.fit_resample(X, y)

In [99]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7 = cross_val_score(clf7,X7,y7,cv=5,scoring='accuracy')
print(scores7)
print('Accuracy of XGBoost cross-vaild test:',scores7.mean())

# Accuracy of XGBoost cross-vaild test: 0.957330827067669

[0.95942408 0.94495413 0.91612058 0.93577982 0.91480996]
Accuracy of XGBoost cross-vaild test: 0.9342177132152635


In [100]:
# ClusterCentroids
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(voting='hard')
X7a, y7a = cc.fit_resample(X, y)

In [101]:
scores7a = cross_val_score(clf7,X7a,y7a,cv=5,scoring='accuracy')
print(scores7a)
print('Accuracy of XGBoost cross-vaild test:',scores7a.mean())

# Accuracy of XGBoost cross-vaild test: 0.7978070175438596

[0.57722513 0.71166448 0.70773263 0.71952818 0.66448231]
Accuracy of XGBoost cross-vaild test: 0.6761265464925583


In [102]:
# EditedNN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X7c, y7c = en.fit_resample(X, y)

In [103]:
scores7c = cross_val_score(clf7,X7c,y7c,cv=5,scoring='accuracy')
print(scores7c)
print('Accuracy of XGBoost cross-vaild test:',scores7c.mean())

# Accuracy: 0.9586666666666668

[0.96005706 0.94196004 0.92102759 0.88915319 0.88201713]
Accuracy of XGBoost cross-vaild test: 0.9188430012310806


In [104]:
# Neighbourhood Cleaning Rule
from imblearn.under_sampling import NeighbourhoodCleaningRule
ecr = NeighbourhoodCleaningRule()
X7d, y7d = ecr.fit_resample(X, y)

In [105]:
scores7d = cross_val_score(clf7,X7d,y7d,cv=5,scoring='accuracy')
print(scores7d)
print('Accuracy of XGBoost cross-vaild test:',scores7d.mean())

# Accuracy: 0.9472198368398456

[0.96745562 0.93343195 0.91666667 0.88905325 0.88110508]
Accuracy of XGBoost cross-vaild test: 0.9175425152940239


In [106]:
# Tomek Links
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X7e, y7e = tl.fit_resample(X, y)

In [107]:
scores7e = cross_val_score(clf7,X7e,y7e,cv=5,scoring='accuracy')
print(scores7e)
print('Accuracy of XGBoost cross-vaild test:',scores7e.mean())

# Accuracy: 0.9322485207100591

[0.92767031 0.90916737 0.88178376 0.86748002 0.85065208]
Accuracy of XGBoost cross-vaild test: 0.8873507078099518


In [108]:
# One Sided Selection
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X7f, y7f = oss.fit_resample(X, y)

In [109]:
scores7f = cross_val_score(clf7,X7f,y7f,cv=5,scoring='accuracy')
print(scores7f)
print('Accuracy of XGBoost cross-vaild test:',scores7f.mean())

# Accuracy: 0.9385542168674699

[0.9270965  0.9047619  0.88284871 0.86298482 0.8494941 ]
Accuracy of XGBoost cross-vaild test: 0.8854372085057516


## P7：Oversampling

In [110]:
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X7g, y7g = smote.fit_resample(X, y)

In [111]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7g = cross_val_score(clf7,X7g,y7g,cv=5,scoring='accuracy')
print(scores7g)
print('Accuracy of XGBoost cross-vaild test:',scores7g.mean())

# Accuracy: 0.948709630911188

[0.54545455 0.93691533 0.9042936  0.89949628 0.8884357 ]
Accuracy of XGBoost cross-vaild test: 0.8349190902231026


In [112]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X7h, y7h = bsmote.fit_resample(X, y)

In [113]:
scores7h = cross_val_score(clf7,X7h,y7h,cv=5,scoring='accuracy')
print(scores7h)
print('Accuracy of XGBoost cross-vaild test:',scores7h.mean())

# Accuracy: 0.9473327566320646

[0.51163349 0.94075318 0.9182058  0.91460782 0.88915547]
Accuracy of XGBoost cross-vaild test: 0.834871151617666


In [114]:
# ADASYN
from imblearn.over_sampling import ADASYN
adasyn = ADASYN()
X7i, y7i = adasyn.fit_resample(X, y)

In [115]:
scores7i = cross_val_score(clf7,X7i,y7i,cv=5,scoring='accuracy')
print(scores7i)
print('Accuracy of XGBoost cross-vaild test:',scores7i.mean())

# Accuracy: 0.9323432664896079

[0.50622307 0.93847259 0.88795786 0.8893943  0.87263586]
Accuracy of XGBoost cross-vaild test: 0.8189367381817936


## P7：Ensemble

In [116]:
# SMOTE + ENN
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X7j, y7j = smotenn.fit_resample(X, y)

In [117]:
scores7j = cross_val_score(clf7,X7j,y7j,cv=5,scoring='accuracy')
print(scores7j)
print('Accuracy of XGBoost cross-vaild test:',scores7j.mean())

# Accuracy: 0.9785811232163164

[0.92105263 0.98115465 0.96918935 0.96260844 0.95782232]
Accuracy of XGBoost cross-vaild test: 0.9583654769589245


In [118]:
# SMOTE + Tomek Links
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X7k, y7k = smotetl.fit_resample(X, y)

In [119]:
scores7k = cross_val_score(clf7,X7k,y7k,cv=5,scoring='accuracy')
print(scores7k)
print('Accuracy of XGBoost cross-vaild test:',scores7k.mean())

# Accuracy: 0.9412366580787633

[0.61670702 0.93970944 0.9086946  0.90893679 0.88156939]
Accuracy of XGBoost cross-vaild test: 0.8511234479794192


SMOTE+ENN 效果最好

## P8：SMOTE‐based Oversampling

In [120]:
# 記得跑上面
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X8, y8 = smote.fit_resample(X, y)

In [121]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf8= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores8 = cross_val_score(clf8,X8,y8,cv=5,scoring='accuracy')
print(scores8)
print('Accuracy of XGBoost cross-vaild test:',scores8.mean())

# Accuracy: 0.8759304207119742

[0.64955625 0.92492204 0.90237467 0.90405373 0.89035509]
Accuracy of XGBoost cross-vaild test: 0.85425235572494


In [122]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X8a, y8a = bsmote.fit_resample(X, y)

In [123]:
# XGBoost 
scores8a = cross_val_score(clf8,X8a,y8a,cv=5,scoring='accuracy')
print(scores8a)
print('Accuracy of XGBoost cross-vaild test:',scores8a.mean())

# Accuracy: 0.8759304207119742

[0.58815064 0.94291197 0.91220916 0.91628688 0.89851248]
Accuracy of XGBoost cross-vaild test: 0.8516142246330538


## P9：Imbalance Ratio vs. Resampling Strategy

In [124]:
# 見 P7
# 要與其他資料集比較

## P10：ML algorithms vs. different resampling strategies (ENN)

In [125]:
# 使用方法 (參考最好的組合與相關的方法)
# ENN
# Tomek Links
# One Sided Selection
# SMOTE
# Borderline-SMOTE
# SMOTE + ENN
# SMOTE + Tomek Links

In [126]:
# ENN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X0, y0 = en.fit_resample(X, y)

In [127]:
# XGBoost Accuracy: 0.8589394520028113
# Random Forest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf0=RF.fit(X0,y0)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores0 = cross_val_score(clf0,X0,y0,cv=5,scoring='accuracy')
print(scores0)
print('Accuracy of RandomForest cross-vaild test:',scores0.mean())

# Accuracy of RandomForest cross-vaild test : 0.952

[0.96766524 0.93625119 0.92911513 0.91151284 0.90865842]
Accuracy of RandomForest cross-vaild test: 0.9306405646774374


In [128]:
# Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf0a=LGBM.fit(X0,y0)
scores0a = cross_val_score(clf0a,X0,y0,cv=5,scoring='accuracy')
print(scores0a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.96528768 0.9376784  0.92673644 0.91008563 0.91294006]
Accuracy of Lightgbm cross-vaild test: 0.9305456434172921


In [129]:
# MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf0b=MLP.fit(X0,y0)
scores0b = cross_val_score(clf0b,X0,y0,cv=5,scoring='accuracy')
print(scores0b)
print('Accuracy of MLP cross-vaild test:',scores0b.mean())
# Accuracy of MLP cross-vaild test: 0.8733333333333334



[0.95435093 0.91912464 0.91103711 0.89105614 0.90104662]
Accuracy of MLP cross-vaild test: 0.9153230874474552


In [130]:
# SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf0c=svm.fit(X0,y0)
scores0c = cross_val_score(clf0c,X0,y0,cv=5,scoring='accuracy')
print(scores0c)
print('Accuracy of SVM cross-vaild test:',scores0c.mean())
# Accuracy of SVM cross-vaild test : 0.852



[0.83594864 0.82968601 0.83491912 0.85061846 0.8434824 ]
Accuracy of SVM cross-vaild test: 0.8389309278168607


## P10：Tomek Links

In [131]:
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X01, y01 = tl.fit_resample(X, y)

In [132]:
scores01 = cross_val_score(clf0,X01,y01,cv=5,scoring='accuracy')
print(scores01)
print('Accuracy of RandomForest cross-vaild test:',scores01.mean())

# Accuracy of RandomForest cross-vaild test : 0.9370245139475909

[0.9255677  0.91463415 0.8969289  0.89061843 0.88641144]
Accuracy of RandomForest cross-vaild test: 0.9028321243710312


In [133]:
scores01a = cross_val_score(clf0a,X01,y01,cv=5,scoring='accuracy')
print(scores01a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test :  0.9426666666666668

[0.93061396 0.91126997 0.89145982 0.88893563 0.87673538]
Accuracy of Lightgbm cross-vaild test: 0.9305456434172921


In [134]:
scores01b = cross_val_score(clf0b,X01,y01,cv=5,scoring='accuracy')
print(scores01b)
print('Accuracy of MLP cross-vaild test:',scores01b.mean())
# Accuracy of MLP cross-vaild test: 0.8692096365173289



[0.94196804 0.90664424 0.88136306 0.87505259 0.87589398]
Accuracy of MLP cross-vaild test: 0.8961843826437337


In [135]:
scores01c = cross_val_score(clf0c,X01,y01,cv=5,scoring='accuracy')
print(scores01c)
print('Accuracy of SVM cross-vaild test:',scores01c.mean())
# Accuracy of SVM cross-vaild test : 0.8478021978021978

[0.84230446 0.84230446 0.84223812 0.84812789 0.84476231]
Accuracy of SVM cross-vaild test: 0.8439474456108493


## P10：One Sided Selection

In [136]:
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X02, y02 = oss.fit_resample(X, y)

In [137]:
scores02 = cross_val_score(clf0,X02,y02,cv=5,scoring='accuracy')
print(scores02)
print('Accuracy of RandomForest cross-vaild test:',scores02.mean())
# Accuracy of RandomForest cross-vaild test : 0.9268595339441598

[0.92736486 0.91342905 0.89991554 0.88809122 0.88006757]
Accuracy of RandomForest cross-vaild test: 0.9017736486486487


In [138]:
scores02a = cross_val_score(clf0a,X02,y02,cv=5,scoring='accuracy')
print(scores02a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.92905405 0.91131757 0.89358108 0.88766892 0.88091216]
Accuracy of Lightgbm cross-vaild test: 0.9305456434172921


In [139]:
scores02b = cross_val_score(clf0b,X02,y02,cv=5,scoring='accuracy')
print(scores02b)
print('Accuracy of MLP cross-vaild test:',scores02b.mean())
# Accuracy of MLP cross-vaild test:  0.8573118822595773



[0.92398649 0.91849662 0.88555743 0.87711149 0.87415541]
Accuracy of MLP cross-vaild test: 0.8958614864864864


In [140]:
scores02c = cross_val_score(clf0c,X02,y02,cv=5,scoring='accuracy')
print(scores02c)
print('Accuracy of SVM cross-vaild test:',scores02c.mean())
# Accuracy of SVM cross-vaild test : 0.8441165861048987

[0.84206081 0.84206081 0.84079392 0.84712838 0.8441723 ]
Accuracy of SVM cross-vaild test: 0.8432432432432433


## P10：SMOTE

In [141]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X03, y03 = smote.fit_resample(X, y)

In [142]:
scores03 = cross_val_score(clf0,X03,y03,cv=5,scoring='accuracy')
print(scores03)
print('Accuracy of RandomForest cross-vaild test:',scores03.mean())
# Accuracy of RandomForest cross-vaild test : 0.9403714917339485

[0.84672583 0.92420245 0.91604701 0.919645   0.91170825]
Accuracy of RandomForest cross-vaild test: 0.9036657091990097


In [143]:
scores03a = cross_val_score(clf0a,X03,y03,cv=5,scoring='accuracy')
print(scores03a)
print('Accuracy of Lightgbm cross-vaild test:',scores03a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9535539215686274

[0.48189014 0.91988486 0.90501319 0.91172943 0.89299424]
Accuracy of Lightgbm cross-vaild test: 0.8223023743939478


In [144]:
scores03b = cross_val_score(clf0b,X03,y03,cv=5,scoring='accuracy')
print(scores03b)
print('Accuracy of MLP cross-vaild test:',scores03b.mean())
# Accuracy of MLP cross-vaild test: 0.8217920991926182



[0.83209403 0.87574958 0.85919885 0.85440154 0.87116123]
Accuracy of MLP cross-vaild test: 0.8585210439543491




In [145]:
scores03c = cross_val_score(clf0c,X03,y03,cv=5,scoring='accuracy')
print(scores03c)
print('Accuracy of SVM cross-vaild test:',scores03c.mean())
# Accuracy of SVM cross-vaild test : 0.6685097078046904



[0.76349244 0.73566803 0.70976253 0.70256656 0.69337812]
Accuracy of SVM cross-vaild test: 0.7209735369690096


## P10：Borderline-SMOTE

In [146]:
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X04, y04 = bsmote.fit_resample(X, y)

In [147]:
scores04 = cross_val_score(clf0,X04,y04,cv=5,scoring='accuracy')
print(scores04)
print('Accuracy of RandomForest cross-vaild test:',scores04.mean())
# Accuracy of RandomForest cross-vaild test : 0.9396914648212226

[0.80450947 0.9342768  0.92612137 0.92852003 0.92394434]
Accuracy of RandomForest cross-vaild test: 0.903474403662164


In [148]:
scores04a = cross_val_score(clf0a,X04,y04,cv=5,scoring='accuracy')
print(scores04a)
print('Accuracy of Lightgbm cross-vaild test:',scores04a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.948027201076509

[0.63156632 0.93067882 0.91676661 0.91628688 0.90475048]
Accuracy of Lightgbm cross-vaild test: 0.8600098225224201


In [149]:
scores04b = cross_val_score(clf0b,X04,y04,cv=5,scoring='accuracy')
print(scores04b)
print('Accuracy of MLP cross-vaild test:',scores04b.mean())
# Accuracy of MLP cross-vaild test: 0.8252619184928873



[0.81530343 0.88390501 0.85440154 0.87047254 0.88411708]
Accuracy of MLP cross-vaild test: 0.8616399192651732




In [150]:
scores04c = cross_val_score(clf0c,X04,y04,cv=5,scoring='accuracy')
print(scores04c)
print('Accuracy of SVM cross-vaild test:',scores04c.mean())
# Accuracy of SVM cross-vaild test : 0.6733732218377547



[0.68673543 0.74502279 0.73854641 0.70760374 0.71329175]
Accuracy of SVM cross-vaild test: 0.7182400235906281


## P10：SMOTE + ENN

In [151]:
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X05, y05 = smotenn.fit_resample(X, y)

In [152]:
scores05 = cross_val_score(clf0,X05,y05,cv=5,scoring='accuracy')
print(scores05)
print('Accuracy of RandomForest cross-vaild test:',scores05.mean())
# Accuracy of RandomForest cross-vaild test : 0.9791304347826086

[0.94532417 0.97311025 0.97101882 0.96712493 0.96622833]
Accuracy of RandomForest cross-vaild test: 0.9645612998660067


In [153]:
scores05a = cross_val_score(clf0a,X05,y05,cv=5,scoring='accuracy')
print(scores05a)
print('Accuracy of Lightgbm cross-vaild test:',scores05a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.982608695652174

[0.90319689 0.97340902 0.96952495 0.96323969 0.95696354]
Accuracy of Lightgbm cross-vaild test: 0.9532668182388846


In [154]:
scores05b = cross_val_score(clf0b,X05,y05,cv=5,scoring='accuracy')
print(scores05b)
print('Accuracy of MLP cross-vaild test:',scores05b.mean())
# Accuracy of MLP cross-vaild test:  0.8626086956521739



[0.94054377 0.93277562 0.92769644 0.93006575 0.90824866]
Accuracy of MLP cross-vaild test: 0.9278660480672398




In [155]:
scores05c = cross_val_score(clf0c,X05,y05,cv=5,scoring='accuracy')
print(scores05c)
print('Accuracy of SVM cross-vaild test:',scores05c.mean())
# Accuracy of SVM cross-vaild test : 0.7417391304347826

[0.81953989 0.81416194 0.79623543 0.77794381 0.77256426]
Accuracy of SVM cross-vaild test: 0.796089065316363


## P10：SMOTE + Tomek Links

In [156]:
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X06, y06 = smotetl.fit_resample(X, y)

In [157]:
scores06 = cross_val_score(clf0,X06,y06,cv=5,scoring='accuracy')
print(scores06)
print('Accuracy of RandomForest cross-vaild test:',scores06.mean())
# Accuracy of RandomForest cross-vaild test : 0.9413830072366658

[0.85119768 0.92886523 0.92209049 0.9199129  0.91505324]
Accuracy of RandomForest cross-vaild test: 0.9074239077301225


In [158]:
scores06a = cross_val_score(clf0a,X06,y06,cv=5,scoring='accuracy')
print(scores06a)
print('Accuracy of Lightgbm cross-vaild test:',scores06a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9490777515167759

[0.5785144  0.92547786 0.91217034 0.91628357 0.89762827]
Accuracy of Lightgbm cross-vaild test: 0.846014886439254


In [159]:
scores06b = cross_val_score(clf0b,X06,y06,cv=5,scoring='accuracy')
print(scores06b)
print('Accuracy of MLP cross-vaild test:',scores06b.mean())
# Accuracy of MLP cross-vaild test: 0.7955191150313101



[0.87999032 0.88192596 0.85506896 0.84902008 0.86616651]
Accuracy of MLP cross-vaild test: 0.8664343656668436


In [160]:
scores06c = cross_val_score(clf0c,X06,y06,cv=5,scoring='accuracy')
print(scores06c)
print('Accuracy of SVM cross-vaild test:',scores06c.mean())
# Accuracy of SVM cross-vaild test : 0.6887673302307448



[0.76869102 0.73747883 0.72586499 0.70771836 0.7018393 ]
Accuracy of SVM cross-vaild test: 0.7283185017809339


## 輸出

In [163]:
data=pd.DataFrame([[scores1.mean(),scores11.mean()],
         [scores1.mean(),scores2.mean()],         
         [scores3.mean(),scores31.mean()],
         [scores4.mean(),scores41.mean(),scores42.mean(),scores43.mean(),scores44.mean()],
         [scores4a.mean(),scores41a.mean(),scores42a.mean(),scores43a.mean(),scores44a.mean()],
         [scores4b.mean(),scores41b.mean(),scores42b.mean(),scores43b.mean(),scores44b.mean()],
         [scores4c.mean(),scores41c.mean(),scores42c.mean(),scores43c.mean(),scores44c.mean()],
         [scores4d.mean(),scores41d.mean(),scores42d.mean(),scores43d.mean(),scores44d.mean()],
         [scores5.mean(),scores5a.mean(),scores5b.mean(),scores5c.mean(),scores5d.mean(),scores5e.mean()],
         [scores1.mean(),scores6.mean(),scores6a.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
         scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores7g.mean(),scores7h.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
         scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores0.mean(),scores0a.mean(),scores0b.mean(),scores0c.mean()],
         [scores01.mean(),scores01a.mean(),scores01b.mean(),scores01c.mean()],
         [scores02.mean(),scores02a.mean(),scores02b.mean(),scores02c.mean()],
         [scores03.mean(),scores03a.mean(),scores03b.mean(),scores03c.mean()],
         [scores04.mean(),scores04a.mean(),scores04b.mean(),scores04c.mean()],
         [scores05.mean(),scores05a.mean(),scores05b.mean(),scores05c.mean()],
         [scores06.mean(),scores06a.mean(),scores06b.mean(),scores06c.mean()]],
         index=['Q1','Q2','Q3','Q4L','Q4O','Q4F','Q4T','Q4LOL','Q5','Q6','Q7','Q8','Q9','Q10-1','Q10-2','Q10-3','Q10-4','Q10-5','Q10-6','Q10-7']     )

In [164]:
data.to_csv('online_shoppers_intention_Result.csv')