## 讀取資料 (Default：Label Encoding)

In [3]:
#讀取資料
import pandas as pd
import numpy as np

df= pd.read_csv('Churn_Modelling.csv',sep=',') 

In [4]:
# all columns
print(list(df.columns))

['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']


In [5]:
#資料前處理  # Label Encoder
#將部分字串轉數值
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Geography','Gender']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='RowNumber')  #此三變數為個人資料，故刪除
df=df.drop(columns='Surname') 
df=df.drop(columns='CustomerId') 
y=np.array(df['Exited'])#應變數
X=df.drop(columns='Exited') #剩下的為自變數

In [6]:
y

array([1, 0, 1, ..., 1, 1, 0])

In [7]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.00,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.80,3,1,0,113931.57
3,699,0,0,39,1,0.00,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.00,2,1,0,96270.64
9996,516,0,1,35,10,57369.61,1,1,1,101699.77
9997,709,0,0,36,7,0.00,1,0,1,42085.58
9998,772,1,1,42,3,75075.31,2,1,0,92888.52


## P1：標準化是否影響結果

In [8]:
#先不經標準化做XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf1= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores1 = cross_val_score(clf1,X,y,cv=5,scoring='accuracy')
print(scores1)
print('Accuracy of XGBoost cross-vaild test:',scores1.mean())
# Accuracy of XGBoost cross-vaild test: 0.935065196935072

[0.842 0.842 0.836 0.845 0.837]
Accuracy of XGBoost cross-vaild test: 0.8404


In [9]:
#使用經過LabelEncoder編碼的特徵，標準化 (因為變數值間有大有小)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X1 = sc.fit_transform(X)

In [10]:
#標準化後XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf11= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores11 = cross_val_score(clf11,X1,y,cv=5,scoring='accuracy')
print(scores11)
print('Accuracy of XGBoost cross-vaild test:',scores11.mean())

# Accuracy of XGBoost cross-vaild test: 0.9362212662992337
##標準化在XGBoost的交叉驗證上沒有差異太大

[0.842 0.842 0.836 0.845 0.837]
Accuracy of XGBoost cross-vaild test: 0.8404


## P3：Feature Binning 有沒有效果

In [11]:
import pandas as pd
import numpy as np
# For 繪製敘述統計
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import scipy.stats as stats
from sklearn.model_selection import train_test_split
# for discretization
from sklearn.preprocessing import KBinsDiscretizer

In [12]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.00,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.80,3,1,0,113931.57
3,699,0,0,39,1,0.00,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.00,2,1,0,96270.64
9996,516,0,1,35,10,57369.61,1,1,1,101699.77
9997,709,0,0,36,7,0.00,1,0,1,42085.58
9998,772,1,1,42,3,75075.31,2,1,0,92888.52


In [13]:
# Equal width Binning
disc1 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc1.fit(X[['Balance']])
disc2 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc2.fit(X[['EstimatedSalary']])

KBinsDiscretizer(encode='ordinal', n_bins=10, strategy='uniform')

In [14]:
# Equal Frequency Binning
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['Balance']])
disc21 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc21.fit(X[['EstimatedSalary']])



KBinsDiscretizer(encode='ordinal', n_bins=10)

In [15]:
disc1.bin_edges_

array([array([     0.   ,  25089.809,  50179.618,  75269.427, 100359.236,
              125449.045, 150538.854, 175628.663, 200718.472, 225808.281,
              250898.09 ])                                               ],
      dtype=object)

In [16]:
disc11.bin_edges_

array([array([     0.   ,  73080.908,  97198.54 , 110138.926, 122029.87 ,
              133710.358, 149244.792, 250898.09 ])                       ],
      dtype=object)

In [17]:
X3 = pd.DataFrame(X)
X3[['Balance']]=disc1.transform(X3[['Balance']])
X3[['EstimatedSalary']]=disc1.transform(X3[['EstimatedSalary']])
X3 = pd.DataFrame(X3)

Feature names unseen at fit time:
- EstimatedSalary
Feature names seen at fit time, yet now missing:
- Balance



In [18]:
X3

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.0,1,1,1,4.0
1,608,2,0,41,1,3.0,1,0,1,4.0
2,502,0,0,42,8,6.0,3,1,0,4.0
3,699,0,0,39,1,0.0,2,0,0,3.0
4,850,2,0,43,2,5.0,1,1,1,3.0
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.0,2,1,0,3.0
9996,516,0,1,35,10,2.0,1,1,1,4.0
9997,709,0,0,36,7,0.0,1,0,1,1.0
9998,772,1,1,42,3,2.0,2,1,0,3.0


In [19]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores3 = cross_val_score(clf3,X3,y,cv=5,scoring='accuracy')
print(scores3)
print('Accuracy of XGBoost cross-vaild test:',scores3.mean())

#Accuracy of XGBoost cross-vaild test: 0.9304140341443743
# Frequency較佳

[0.8445 0.8475 0.844  0.8415 0.8365]
Accuracy of XGBoost cross-vaild test: 0.8428000000000001


In [21]:
#資料前處理 # Label Encoder
df= pd.read_csv('Churn_Modelling.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Geography','Gender']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='RowNumber')  #此三變數為個人資料，故刪除
df=df.drop(columns='Surname') 
df=df.drop(columns='CustomerId') 
y=np.array(df['Exited'])#應變數
X=df.drop(columns='Exited') #剩下的為自變數

X31 = pd.DataFrame(X)
X31[['Balance']]=disc11.transform(X3[['Balance']])
X31[['EstimatedSalary']]=disc21.transform(X3[['EstimatedSalary']])
X31 = pd.DataFrame(X31)

In [22]:
X31

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.0,1,1,1,0.0
1,608,2,0,41,1,0.0,1,0,1,0.0
2,502,0,0,42,8,0.0,3,1,0,0.0
3,699,0,0,39,1,0.0,2,0,0,0.0
4,850,2,0,43,2,0.0,1,1,1,0.0
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.0,2,1,0,0.0
9996,516,0,1,35,10,0.0,1,1,1,0.0
9997,709,0,0,36,7,0.0,1,0,1,0.0
9998,772,1,1,42,3,0.0,2,1,0,0.0


In [23]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores31 = cross_val_score(clf3,X31,y,cv=5,scoring='accuracy')
print(scores31)
print('Accuracy of XGBoost cross-vaild test:',scores31.mean())

#Accuracy of XGBoost cross-vaild test:  0.935065196935072
# Frequency較佳

[0.8415 0.8395 0.828  0.8485 0.837 ]
Accuracy of XGBoost cross-vaild test: 0.8389


Source：iT幫幫忙--Day12 - Feature Engineering -- 4. 分隔方法(Discretization),https://ithelp.ithome.com.tw/articles/10235726

## P2：One-hot Encoding vs. Label Encoding on Tree-based method

In [24]:
X3

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.0,1,1,1,4.0
1,608,2,0,41,1,3.0,1,0,1,4.0
2,502,0,0,42,8,6.0,3,1,0,4.0
3,699,0,0,39,1,0.0,2,0,0,3.0
4,850,2,0,43,2,5.0,1,1,1,3.0
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.0,2,1,0,3.0
9996,516,0,1,35,10,2.0,1,1,1,4.0
9997,709,0,0,36,7,0.0,1,0,1,1.0
9998,772,1,1,42,3,2.0,2,1,0,3.0


In [25]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.5.1.post0


In [27]:
from category_encoders import *
enc2 = OneHotEncoder(cols=['Geography']).fit(X3,y)
X2 = enc2.transform(X3)
X2 = pd.DataFrame(X2)
X2

Unnamed: 0,CreditScore,Geography_1,Geography_2,Geography_3,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,1,0,0,0,42,2,0.0,1,1,1,4.0
1,608,0,1,0,0,41,1,3.0,1,0,1,4.0
2,502,1,0,0,0,42,8,6.0,3,1,0,4.0
3,699,1,0,0,0,39,1,0.0,2,0,0,3.0
4,850,0,1,0,0,43,2,5.0,1,1,1,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,0,0,1,39,5,0.0,2,1,0,3.0
9996,516,1,0,0,1,35,10,2.0,1,1,1,4.0
9997,709,1,0,0,0,36,7,0.0,1,0,1,1.0
9998,772,0,0,1,1,42,3,2.0,2,1,0,3.0


In [28]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf2= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores2 = cross_val_score(clf2,X3,y,cv=5,scoring='accuracy')
print(scores2)
print('Accuracy of XGBoost cross-vaild test:',scores2.mean())

# Accuracy of XGBoost cross-vaild test for Label Encoder: 0.9362212662992337
# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.9420217771205808
# 交叉驗證為 Label Encoder 結果略佳
# 可能在 one-hot encoding 會陷入 curse of dimensionaility

[0.8445 0.8475 0.844  0.8415 0.8365]
Accuracy of XGBoost cross-vaild test: 0.8428000000000001


## P4：Label Encoding (這裡不用)

In [30]:
df= pd.read_csv('Churn_Modelling.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Geography','Gender']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='RowNumber')  #此三變數為個人資料，故刪除
df=df.drop(columns='Surname') 
df=df.drop(columns='CustomerId') 
y=np.array(df['Exited'])#應變數
X=df.drop(columns='Exited') #剩下的為自變數

In [31]:
# One hot encoding 後 XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4 = cross_val_score(clf4,X,y,cv=5,scoring='accuracy')
print(scores4)
print('Accuracy of XGBoost cross-vaild test:',scores4.mean())

# Accuracy of XGBoost cross-vaild test: 0.9408522650893938

[0.842 0.842 0.836 0.845 0.837]
Accuracy of XGBoost cross-vaild test: 0.8404


In [32]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X,y)
scores41 = cross_val_score(clf41,X,y,cv=5,scoring='accuracy')
print(scores41)
print('Accuracy of RandomForest cross-vaild test:',scores41.mean())

# Accuracy of RandomForest cross-vaild test: 0.9327261728726979

[0.859  0.8675 0.859  0.8655 0.855 ]
Accuracy of RandomForest cross-vaild test: 0.8612


In [33]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X,y)
scores42 = cross_val_score(clf42,X,y,cv=5,scoring='accuracy')
print(scores42)
print('Accuracy of Lightgbm cross-vaild test:',scores42.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9373706143298831

[0.87   0.8665 0.8595 0.869  0.855 ]
Accuracy of Lightgbm cross-vaild test: 0.8640000000000001


In [34]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X,y)
scores43 = cross_val_score(clf43,X,y,cv=5,scoring='accuracy')
print(scores43)
print('Accuracy of MLP cross-vaild test:',scores43.mean())
# Accuracy of MLP cross-vaild test: 0.7034816507595107



[0.795  0.7965 0.727  0.596  0.7955]
Accuracy of MLP cross-vaild test: 0.742




In [35]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X,y)
scores44 = cross_val_score(clf44,X,y,cv=5,scoring='accuracy')
print(scores44)
print('Accuracy of SVM cross-vaild test:',scores44.mean())
# Accuracy of SVM cross-vaild test: 0.8364296276381233

[0.796  0.796  0.7965 0.7965 0.7965]
Accuracy of SVM cross-vaild test: 0.7963


## P4：One Hot Encoding

In [36]:
# 清除 Label Encoder 後，再做 One Hot Encoding
X2

Unnamed: 0,CreditScore,Geography_1,Geography_2,Geography_3,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,1,0,0,0,42,2,0.0,1,1,1,4.0
1,608,0,1,0,0,41,1,3.0,1,0,1,4.0
2,502,1,0,0,0,42,8,6.0,3,1,0,4.0
3,699,1,0,0,0,39,1,0.0,2,0,0,3.0
4,850,0,1,0,0,43,2,5.0,1,1,1,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,0,0,1,39,5,0.0,2,1,0,3.0
9996,516,1,0,0,1,35,10,2.0,1,1,1,4.0
9997,709,1,0,0,0,36,7,0.0,1,0,1,1.0
9998,772,0,0,1,1,42,3,2.0,2,1,0,3.0


In [37]:
# One hot encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4a = cross_val_score(clf4,X2,y,cv=5,scoring='accuracy')
print(scores4a)
print('Accuracy of XGBoost cross-vaild test:',scores4a.mean())
# Accuracy of XGBoost cross-vaild test: 0.9420217771205808

[0.835  0.8395 0.8365 0.8435 0.829 ]
Accuracy of XGBoost cross-vaild test: 0.8367000000000001


In [38]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X2,y)
scores41a = cross_val_score(clf41,X2,y,cv=5,scoring='accuracy')
print(scores41a)
print('Accuracy of RandomForest cross-vaild test:',scores41a.mean())

# Accuracy of RandomForest cross-vaild test: 0.9223215485952412

[0.8545 0.871  0.8615 0.863  0.858 ]
Accuracy of RandomForest cross-vaild test: 0.8615999999999999


In [39]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X2,y)
scores42a = cross_val_score(clf42,X2,y,cv=5,scoring='accuracy')
print(scores42a)
print('Accuracy of Lightgbm cross-vaild test:',scores42a.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9362212662992337

[0.867  0.873  0.8605 0.8705 0.857 ]
Accuracy of Lightgbm cross-vaild test: 0.8656


In [40]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X2,y)
scores43a = cross_val_score(clf43,X2,y,cv=5,scoring='accuracy')
print(scores43a)
print('Accuracy of MLP cross-vaild test:',scores43a.mean())
# Accuracy of MLP cross-vaild test: 0.8364296276381233



[0.813  0.8175 0.8225 0.7995 0.8135]
Accuracy of MLP cross-vaild test: 0.8132000000000001


In [41]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X2,y)
scores44a = cross_val_score(clf44,X2,y,cv=5,scoring='accuracy')
print(scores44a)
print('Accuracy of SVM cross-vaild test:',scores44a.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding: 0.8364296276381233

[0.796  0.796  0.7965 0.7965 0.7965]
Accuracy of SVM cross-vaild test: 0.7963


## P4：Frequency Encoding

In [42]:
# Frequency encoding：用類別出現頻率當作該類別數值
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.00,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.80,3,1,0,113931.57
3,699,0,0,39,1,0.00,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.00,2,1,0,96270.64
9996,516,0,1,35,10,57369.61,1,1,1,101699.77
9997,709,0,0,36,7,0.00,1,0,1,42085.58
9998,772,1,1,42,3,75075.31,2,1,0,92888.52


In [44]:
X4b=pd.DataFrame(X)
enc1=X4b['Geography'].value_counts()
X4b['Geography']=X4b['Geography'].apply(lambda x : enc1[x]) 

In [45]:
X4b

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,5014,0,42,2,0.00,1,1,1,101348.88
1,608,2477,0,41,1,83807.86,1,0,1,112542.58
2,502,5014,0,42,8,159660.80,3,1,0,113931.57
3,699,5014,0,39,1,0.00,2,0,0,93826.63
4,850,2477,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,5014,1,39,5,0.00,2,1,0,96270.64
9996,516,5014,1,35,10,57369.61,1,1,1,101699.77
9997,709,5014,0,36,7,0.00,1,0,1,42085.58
9998,772,2509,1,42,3,75075.31,2,1,0,92888.52


In [46]:
# Frequency encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4b= XGBClassifier(**params)

# 5-fold validation
scores4b = cross_val_score(clf4b,X4b,y,cv=5,scoring='accuracy')
print(scores4b)
print('Accuracy of XGBoost cross-vaild test:',scores4b.mean())

# Accuracy of XGBoost cross-vaild test: 0.936194380965183

[0.842 0.842 0.836 0.845 0.837]
Accuracy of XGBoost cross-vaild test: 0.8404


In [47]:
# Frequency encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41b=RF.fit(X4b,y)
scores41b = cross_val_score(clf41b,X4b,y,cv=5,scoring='accuracy')
print(scores41b)
print('Accuracy of RandomForest cross-vaild test:',scores41b.mean())

# Accuracy of RandomForest cross-vaild test: 0.9315633821750235

[0.8605 0.869  0.8605 0.8655 0.855 ]
Accuracy of RandomForest cross-vaild test: 0.8620999999999999


In [48]:
# Frequency encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42b=LGBM.fit(X4b,y)
scores42b = cross_val_score(clf42b,X4b,y,cv=5,scoring='accuracy')
print(scores42b)
print('Accuracy of Lightgbm cross-vaild test:',scores42b.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9408321010888561

[0.87   0.8665 0.8595 0.869  0.855 ]
Accuracy of Lightgbm cross-vaild test: 0.8640000000000001


In [49]:
# Frequency encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43b=MLP.fit(X4b,y)
scores43b = cross_val_score(clf43b,X4b,y,cv=5,scoring='accuracy')
print(scores43b)
print('Accuracy of MLP cross-vaild test:',scores43b.mean())
# Accuracy of MLP cross-vaild test: 0.8364296276381233



[0.6125 0.796  0.7825 0.752  0.564 ]
Accuracy of MLP cross-vaild test: 0.7013999999999999




In [50]:
# Frequency encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44b=svm.fit(X4b,y)
scores44b = cross_val_score(clf44b,X4b,y,cv=5,scoring='accuracy')
print(scores44b)
print('Accuracy of SVM cross-vaild test:',scores44b.mean())
# Accuracy of SVM cross-vaild test for one-hot encoding: 0.8364296276381233

[0.796  0.796  0.7965 0.7965 0.7965]
Accuracy of SVM cross-vaild test: 0.7963


## P4：Target Encoding

In [51]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [53]:
#資料前處理 # Label Encoder
df= pd.read_csv('Churn_Modelling.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Geography','Gender']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='RowNumber')  #此三變數為個人資料，故刪除
df=df.drop(columns='Surname') 
df=df.drop(columns='CustomerId') 
y=np.array(df['Exited'])#應變數
X=df.drop(columns='Exited') #剩下的為自變數

In [54]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.00,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.80,3,1,0,113931.57
3,699,0,0,39,1,0.00,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.00,2,1,0,96270.64
9996,516,0,1,35,10,57369.61,1,1,1,101699.77
9997,709,0,0,36,7,0.00,1,0,1,42085.58
9998,772,1,1,42,3,75075.31,2,1,0,92888.52


In [55]:
# Target encoding：使用 Target (預測目標) 來達成 Features 的 Encoding
# 清除 Label Encoder
# 類別特徵：workclass, marital-status, occupation, relationship, race, sex, native-country
from category_encoders import *

enc = TargetEncoder(cols=['Geography','Gender'],
                    min_samples_leaf=20, smoothing=10).fit(X, y)
X4c = enc.transform(X)

In [56]:
X4c

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0.161548,0.250715,42,2,0.00,1,1,1,101348.88
1,608,0.166734,0.250715,41,1,83807.86,1,0,1,112542.58
2,502,0.161548,0.250715,42,8,159660.80,3,1,0,113931.57
3,699,0.161548,0.250715,39,1,0.00,2,0,0,93826.63
4,850,0.166734,0.250715,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0.161548,0.164559,39,5,0.00,2,1,0,96270.64
9996,516,0.161548,0.164559,35,10,57369.61,1,1,1,101699.77
9997,709,0.161548,0.250715,36,7,0.00,1,0,1,42085.58
9998,772,0.324432,0.164559,42,3,75075.31,2,1,0,92888.52


In [57]:
# Target encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier

params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores4c = cross_val_score(clf4,X4c,y,cv=5,scoring='accuracy')
print(scores4c)
print('Accuracy of XGBoost cross-vaild test:',scores4c.mean())

# Accuracy of XGBoost cross-vaild test: 0.9512770533673882

[0.843  0.8375 0.839  0.851  0.834 ]
Accuracy of XGBoost cross-vaild test: 0.8408999999999999


In [58]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4c,y)
scores41c = cross_val_score(clf41,X4c,y,cv=5,scoring='accuracy')
print(scores41c)
print('Accuracy of RandomForest cross-vaild test:',scores41c.mean())

# Accuracy of RandomForest cross-vaild test for target encoding: 0.8591567243513352

[0.8615 0.874  0.859  0.8635 0.8585]
Accuracy of RandomForest cross-vaild test: 0.8633000000000001


In [59]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4c,y)
scores42c = cross_val_score(clf42,X4c,y,cv=5,scoring='accuracy')
print(scores42c)
print('Accuracy of Lightgbm cross-vaild test:',scores42c.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454765425460412
# Accuracy of Lightgbm cross-vaild test for target encoding: 0.8740826747563275

[0.8645 0.868  0.86   0.868  0.8575]
Accuracy of Lightgbm cross-vaild test: 0.8635999999999999


In [60]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4c,y)
scores43c = cross_val_score(clf43,X4c,y,cv=5,scoring='accuracy')
print(scores43c)
print('Accuracy of MLP cross-vaild test:',scores43c.mean())
# Accuracy of MLP cross-vaild test: 0.5685979298292781 



[0.7075 0.6645 0.533  0.772  0.7475]
Accuracy of MLP cross-vaild test: 0.6849




In [61]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4c,y)
scores44c = cross_val_score(clf44,X4c,y,cv=5,scoring='accuracy')
print(scores44c)
print('Accuracy of SVM cross-vaild test:',scores44c.mean())
# Accuracy of SVM cross-vaild test: 0.8364296276381233

[0.796  0.796  0.7965 0.7965 0.7965]
Accuracy of SVM cross-vaild test: 0.7963


## P4：Leave-One-Out Encoding

In [62]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.00,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.80,3,1,0,113931.57
3,699,0,0,39,1,0.00,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.00,2,1,0,96270.64
9996,516,0,1,35,10,57369.61,1,1,1,101699.77
9997,709,0,0,36,7,0.00,1,0,1,42085.58
9998,772,1,1,42,3,75075.31,2,1,0,92888.52


In [64]:
encc = LeaveOneOutEncoder(cols=['Geography','Gender'],sigma=0.05).fit(X, y)
X4d = encc.transform(X)

Source：http://contrib.scikit-learn.org/category_encoders/leaveoneout.html 

In [65]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4d = cross_val_score(clf4,X4d,y,cv=5,scoring='accuracy')
print(scores4d)
print('Accuracy of XGBoost cross-vaild test:',scores4d.mean())

# Accuracy of XGBoost cross-vaild test for LOO encoding:  0.9466258905766904

[0.843  0.8375 0.839  0.851  0.834 ]
Accuracy of XGBoost cross-vaild test: 0.8408999999999999


In [66]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4d,y)
scores41d = cross_val_score(clf41,X4d,y,cv=5,scoring='accuracy')
print(scores41d)
print('Accuracy of RandomForest cross-vaild test:',scores41d.mean())

# Accuracy of RandomForest cross-vaild test: 0.9559214948245731

[0.8595 0.8715 0.862  0.8655 0.859 ]
Accuracy of RandomForest cross-vaild test: 0.8634999999999999


In [67]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4d,y)
scores42d = cross_val_score(clf42,X4d,y,cv=5,scoring='accuracy')
print(scores42d)
print('Accuracy of Lightgbm cross-vaild test:',scores42d.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454899852130663

[0.8645 0.868  0.86   0.868  0.8575]
Accuracy of Lightgbm cross-vaild test: 0.8635999999999999


In [68]:
# One hot encoding 後 MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf43=MLP.fit(X4d,y)
scores43d = cross_val_score(clf43,X4d,y,cv=5,scoring='accuracy')
print(scores43d)
print('Accuracy of MLP cross-vaild test:',scores43d.mean())
# Accuracy of MLP cross-vaild test: 0.5685979298292781



[0.7075 0.6645 0.533  0.772  0.7475]
Accuracy of MLP cross-vaild test: 0.6849




In [69]:
# One hot encoding 後 SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf44=svm.fit(X4d,y)
scores44d = cross_val_score(clf44,X4d,y,cv=5,scoring='accuracy')
print(scores44d)
print('Accuracy of SVM cross-vaild test:',scores44d.mean())
# Accuracy of SVM cross-vaild 

[0.796  0.796  0.7965 0.7965 0.7965]
Accuracy of SVM cross-vaild test: 0.7963


## P5：Combinations of numerical and categorical feature transformation

挑選較常用的六個組合

In [71]:
df= pd.read_csv('Churn_Modelling.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Geography','Gender']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='RowNumber')  #此三變數為個人資料，故刪除
df=df.drop(columns='Surname') 
df=df.drop(columns='CustomerId') 
y=np.array(df['Exited'])#應變數
X=df.drop(columns='Exited') #剩下的為自變數

In [72]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores5 = cross_val_score(clf5,X,y,cv=5,scoring='accuracy')
print(scores5)
print('Accuracy of XGBoost cross-vaild test:',scores5.mean())

# Accuracy of XGBoost cross-vaild test: 0.9420083344535556

[0.842 0.842 0.836 0.845 0.837]
Accuracy of XGBoost cross-vaild test: 0.8404


In [74]:
# Standardization *  LOO
from category_encoders import *
encc = LeaveOneOutEncoder(cols=['Geography','Gender'],sigma=0.05).fit(X, y)
X5a = encc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5a = sc.fit_transform(X5a)

In [75]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5a = cross_val_score(clf5,X5a,y,cv=5,scoring='accuracy')
print(scores5a)
print('Accuracy of XGBoost cross-vaild test:',scores5a.mean())

# Accuracy of XGBoost cross-vaild test: 0.9466258905766904

[0.843  0.8375 0.839  0.851  0.834 ]
Accuracy of XGBoost cross-vaild test: 0.8408999999999999


In [76]:
# Standardization *  Target Encoding
enc = TargetEncoder(cols=['Geography','Gender'],min_samples_leaf=20, smoothing=10).fit(X, y)
X5b = enc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5b = sc.fit_transform(X5b)

In [77]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5b = cross_val_score(clf5,X5b,y,cv=5,scoring='accuracy')
print(scores5b)
print('Accuracy of XGBoost cross-vaild test:',scores5b.mean())

# Accuracy of XGBoost cross-vaild test: 0.9512770533673882

[0.843  0.8375 0.839  0.851  0.834 ]
Accuracy of XGBoost cross-vaild test: 0.8408999999999999


In [78]:
# Equal‐Frequency Binning *  Label Encoding
from sklearn.preprocessing import KBinsDiscretizer
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['Geography']])

X31 = pd.DataFrame(X)
X31[['Geography']]=disc11.transform(X31[['Geography']])
X5c = pd.DataFrame(X31)



In [79]:
X5c

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0.0,0,42,2,0.00,1,1,1,101348.88
1,608,1.0,0,41,1,83807.86,1,0,1,112542.58
2,502,0.0,0,42,8,159660.80,3,1,0,113931.57
3,699,0.0,0,39,1,0.00,2,0,0,93826.63
4,850,1.0,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0.0,1,39,5,0.00,2,1,0,96270.64
9996,516,0.0,1,35,10,57369.61,1,1,1,101699.77
9997,709,0.0,0,36,7,0.00,1,0,1,42085.58
9998,772,1.0,1,42,3,75075.31,2,1,0,92888.52


In [80]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5c = cross_val_score(clf5,X5c,y,cv=5,scoring='accuracy')
print(scores5c)
print('Accuracy of XGBoost cross-vaild test:',scores5c.mean())

# Accuracy of XGBoost cross-vaild test: 0.935065196935072

[0.827  0.827  0.8355 0.838  0.8275]
Accuracy of XGBoost cross-vaild test: 0.8310000000000001


In [81]:
# Equal‐Frequency Binning * Leave‐One‐Out Encoding
encc = LeaveOneOutEncoder(cols=['Geography','Gender'],sigma=0.05).fit(X5c, y)
X5d = encc.transform(X5c)

In [82]:
X5d

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0.161548,0.250715,42,2,0.00,1,1,1,101348.88
1,608,0.246089,0.250715,41,1,83807.86,1,0,1,112542.58
2,502,0.161548,0.250715,42,8,159660.80,3,1,0,113931.57
3,699,0.161548,0.250715,39,1,0.00,2,0,0,93826.63
4,850,0.246089,0.250715,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0.161548,0.164559,39,5,0.00,2,1,0,96270.64
9996,516,0.161548,0.164559,35,10,57369.61,1,1,1,101699.77
9997,709,0.161548,0.250715,36,7,0.00,1,0,1,42085.58
9998,772,0.246089,0.164559,42,3,75075.31,2,1,0,92888.52


In [83]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5d = cross_val_score(clf5,X5d,y,cv=5,scoring='accuracy')
print(scores5d)
print('Accuracy of XGBoost cross-vaild test:',scores5d.mean())

# Accuracy of XGBoost cross-vaild test: 0.9408522650893938

[0.827  0.827  0.8355 0.838  0.8275]
Accuracy of XGBoost cross-vaild test: 0.8310000000000001


In [85]:
# Equal‐Frequency Binning * Target Encoding
enc = TargetEncoder(cols=['Geography','Gender'],min_samples_leaf=20, smoothing=10).fit(X5c, y)
X5e = enc.transform(X5c)

In [86]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5e = cross_val_score(clf5,X5e,y,cv=5,scoring='accuracy')
print(scores5e)
print('Accuracy of XGBoost cross-vaild test:',scores5e.mean())

# Accuracy of XGBoost cross-vaild test: 0.9420083344535556

[0.827  0.827  0.8355 0.838  0.8275]
Accuracy of XGBoost cross-vaild test: 0.8310000000000001


Standard Scalar 效果較佳，且 Label Encoder 效果又較 Target Encoding, LOO 佳

## P6：Categorical values of a feature is high (超過20種)

所有 label encoder 的結果視為 baseline

In [87]:
df= pd.read_csv('Churn_Modelling.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['Geography','Gender']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='RowNumber')  #此三變數為個人資料，故刪除
df=df.drop(columns='Surname') 
df=df.drop(columns='CustomerId') 
y=np.array(df['Exited'])#應變數
X=df.drop(columns='Exited') #剩下的為自變數

In [88]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [89]:
# One Hot Encoder
from category_encoders import * 
enc6 = OneHotEncoder(cols=['Geography']).fit(X,y)
X6 = enc6.transform(X)

In [90]:
X6

Unnamed: 0,CreditScore,Geography_1,Geography_2,Geography_3,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,1,0,0,0,42,2,0.00,1,1,1,101348.88
1,608,0,1,0,0,41,1,83807.86,1,0,1,112542.58
2,502,1,0,0,0,42,8,159660.80,3,1,0,113931.57
3,699,1,0,0,0,39,1,0.00,2,0,0,93826.63
4,850,0,1,0,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,0,0,1,39,5,0.00,2,1,0,96270.64
9996,516,1,0,0,1,35,10,57369.61,1,1,1,101699.77
9997,709,1,0,0,0,36,7,0.00,1,0,1,42085.58
9998,772,0,0,1,1,42,3,75075.31,2,1,0,92888.52


In [91]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf6= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores6 = cross_val_score(clf6,X6,y,cv=5,scoring='accuracy')
print(scores6)
print('Accuracy of XGBoost cross-vaild test:',scores6.mean())

# Accuracy of XGBoost cross-vaild test: 0.935031590267509

[0.852  0.848  0.836  0.848  0.8305]
Accuracy of XGBoost cross-vaild test: 0.8429


In [92]:
# Target Encoder
enc6a = TargetEncoder(cols=['Geography','Gender'],min_samples_leaf=20, smoothing=10).fit(X, y)
X6a = enc6a.transform(X)

In [93]:
# XGBoost 
scores6a = cross_val_score(clf6,X6a,y,cv=5,scoring='accuracy')
print(scores6a)
print('Accuracy of XGBoost cross-vaild test:',scores6a.mean())

# Accuracy of XGBoost cross-vaild test: 0.9396894743917192

[0.843  0.8375 0.839  0.851  0.834 ]
Accuracy of XGBoost cross-vaild test: 0.8408999999999999


## P7：Undersampling

In [94]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.00,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.80,3,1,0,113931.57
3,699,0,0,39,1,0.00,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.00,2,1,0,96270.64
9996,516,0,1,35,10,57369.61,1,1,1,101699.77
9997,709,0,0,36,7,0.00,1,0,1,42085.58
9998,772,1,1,42,3,75075.31,2,1,0,92888.52


In [95]:
# 記得跑上面
# Nearmiss
from imblearn.under_sampling import NearMiss
nm = NearMiss(sampling_strategy = 'majority')
X7, y7 = nm.fit_resample(X, y)

In [96]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7 = cross_val_score(clf7,X7,y7,cv=5,scoring='accuracy')
print(scores7)
print('Accuracy of XGBoost cross-vaild test:',scores7.mean())

# Accuracy of XGBoost cross-vaild test: 0.957330827067669

[0.90674847 0.90429448 0.90674847 0.91411043 0.91769042]
Accuracy of XGBoost cross-vaild test: 0.9099184516362431


In [97]:
# ClusterCentroids
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(voting='hard')
X7a, y7a = cc.fit_resample(X, y)

In [98]:
scores7a = cross_val_score(clf7,X7a,y7a,cv=5,scoring='accuracy')
print(scores7a)
print('Accuracy of XGBoost cross-vaild test:',scores7a.mean())

# Accuracy of XGBoost cross-vaild test: 0.7978070175438596

[0.73865031 0.75337423 0.7607362  0.73496933 0.73710074]
Accuracy of XGBoost cross-vaild test: 0.7449661596900861


In [99]:
# EditedNN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X7c, y7c = en.fit_resample(X, y)

In [100]:
scores7c = cross_val_score(clf7,X7c,y7c,cv=5,scoring='accuracy')
print(scores7c)
print('Accuracy of XGBoost cross-vaild test:',scores7c.mean())

# Accuracy: 0.9586666666666668

[0.79146538 0.784219   0.79549114 0.80676329 0.77840451]
Accuracy of XGBoost cross-vaild test: 0.7912686641727037


In [101]:
# Neighbourhood Cleaning Rule
from imblearn.under_sampling import NeighbourhoodCleaningRule
ecr = NeighbourhoodCleaningRule()
X7d, y7d = ecr.fit_resample(X, y)

In [102]:
scores7d = cross_val_score(clf7,X7d,y7d,cv=5,scoring='accuracy')
print(scores7d)
print('Accuracy of XGBoost cross-vaild test:',scores7d.mean())

# Accuracy: 0.9472198368398456

[0.79936558 0.79064235 0.78984933 0.81111111 0.78571429]
Accuracy of XGBoost cross-vaild test: 0.7953365305942626


In [103]:
# Tomek Links
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X7e, y7e = tl.fit_resample(X, y)

In [104]:
scores7e = cross_val_score(clf7,X7e,y7e,cv=5,scoring='accuracy')
print(scores7e)
print('Accuracy of XGBoost cross-vaild test:',scores7e.mean())

# Accuracy: 0.9322485207100591

[0.83683628 0.82853982 0.82964602 0.83130531 0.82567792]
Accuracy of XGBoost cross-vaild test: 0.8304010705662834


In [105]:
# One Sided Selection
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X7f, y7f = oss.fit_resample(X, y)

In [106]:
scores7f = cross_val_score(clf7,X7f,y7f,cv=5,scoring='accuracy')
print(scores7f)
print('Accuracy of XGBoost cross-vaild test:',scores7f.mean())

# Accuracy: 0.9385542168674699

[0.83683628 0.83075221 0.83065855 0.83674599 0.82567792]
Accuracy of XGBoost cross-vaild test: 0.832134190537291


## P7：Oversampling

In [107]:
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X7g, y7g = smote.fit_resample(X, y)

In [108]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7g = cross_val_score(clf7,X7g,y7g,cv=5,scoring='accuracy')
print(scores7g)
print('Accuracy of XGBoost cross-vaild test:',scores7g.mean())

# Accuracy: 0.948709630911188

[0.76177024 0.83767661 0.86091052 0.85965463 0.87032967]
Accuracy of XGBoost cross-vaild test: 0.8380683346785041


In [109]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X7h, y7h = bsmote.fit_resample(X, y)

In [110]:
scores7h = cross_val_score(clf7,X7h,y7h,cv=5,scoring='accuracy')
print(scores7h)
print('Accuracy of XGBoost cross-vaild test:',scores7h.mean())

# Accuracy: 0.9473327566320646

[0.7611425  0.83673469 0.87158556 0.86310832 0.85400314]
Accuracy of XGBoost cross-vaild test: 0.8373148419153262


In [111]:
# ADASYN
from imblearn.over_sampling import ADASYN
adasyn = ADASYN()
X7i, y7i = adasyn.fit_resample(X, y)

In [112]:
scores7i = cross_val_score(clf7,X7i,y7i,cv=5,scoring='accuracy')
print(scores7i)
print('Accuracy of XGBoost cross-vaild test:',scores7i.mean())

# Accuracy: 0.9323432664896079

[0.75407779 0.82434128 0.85790464 0.86386449 0.8531995 ]
Accuracy of XGBoost cross-vaild test: 0.8306775407779172


## P7：Ensemble

In [113]:
# SMOTE + ENN
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X7j, y7j = smotenn.fit_resample(X, y)

In [114]:
scores7j = cross_val_score(clf7,X7j,y7j,cv=5,scoring='accuracy')
print(scores7j)
print('Accuracy of XGBoost cross-vaild test:',scores7j.mean())

# Accuracy: 0.9785811232163164

[0.80402385 0.88673621 0.91132638 0.88971684 0.89858315]
Accuracy of XGBoost cross-vaild test: 0.8780772851187638


In [115]:
# SMOTE + Tomek Links
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X7k, y7k = smotetl.fit_resample(X, y)

In [116]:
scores7k = cross_val_score(clf7,X7k,y7k,cv=5,scoring='accuracy')
print(scores7k)
print('Accuracy of XGBoost cross-vaild test:',scores7k.mean())

# Accuracy: 0.9412366580787633

[0.75682968 0.8381113  0.86673414 0.87280702 0.86673414]
Accuracy of XGBoost cross-vaild test: 0.8402432563442593


SMOTE+ENN 效果最好

## P8：SMOTE‐based Oversampling

In [117]:
# 記得跑上面
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X8, y8 = smote.fit_resample(X, y)

In [118]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf8= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores8 = cross_val_score(clf8,X8,y8,cv=5,scoring='accuracy')
print(scores8)
print('Accuracy of XGBoost cross-vaild test:',scores8.mean())

# Accuracy: 0.8759304207119742

[0.76177024 0.83296703 0.87472527 0.86091052 0.87158556]
Accuracy of XGBoost cross-vaild test: 0.8403917255733238


In [119]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X8a, y8a = bsmote.fit_resample(X, y)

In [120]:
# XGBoost 
scores8a = cross_val_score(clf8,X8a,y8a,cv=5,scoring='accuracy')
print(scores8a)
print('Accuracy of XGBoost cross-vaild test:',scores8a.mean())

# Accuracy: 0.8759304207119742

[0.74513497 0.84678179 0.87284144 0.86907378 0.87158556]
Accuracy of XGBoost cross-vaild test: 0.8410835080084474


## P9：Imbalance Ratio vs. Resampling Strategy

In [121]:
# 見 P7
# 要與其他資料集比較

## P10：ML algorithms vs. different resampling strategies (ENN)

In [122]:
# 使用方法 (參考最好的組合與相關的方法)
# ENN
# Tomek Links
# One Sided Selection
# SMOTE
# Borderline-SMOTE
# SMOTE + ENN
# SMOTE + Tomek Links

In [123]:
# ENN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X0, y0 = en.fit_resample(X, y)

In [124]:
# XGBoost Accuracy: 0.8589394520028113
# Random Forest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf0=RF.fit(X0,y0)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores0 = cross_val_score(clf0,X0,y0,cv=5,scoring='accuracy')
print(scores0)
print('Accuracy of RandomForest cross-vaild test:',scores0.mean())

# Accuracy of RandomForest cross-vaild test : 0.952

[0.81964573 0.81078905 0.80676329 0.81078905 0.80821918]
Accuracy of RandomForest cross-vaild test: 0.8112412591269054


In [125]:
# Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf0a=LGBM.fit(X0,y0)
scores0a = cross_val_score(clf0a,X0,y0,cv=5,scoring='accuracy')
print(scores0a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.83494364 0.82286634 0.80193237 0.8236715  0.80660757]
Accuracy of Lightgbm cross-vaild test: 0.8180042846335807


In [126]:
# MLP
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(hidden_layer_sizes = (256,128,64,32), activation="relu",max_iter=50, random_state=1)
clf0b=MLP.fit(X0,y0)
scores0b = cross_val_score(clf0b,X0,y0,cv=5,scoring='accuracy')
print(scores0b)
print('Accuracy of MLP cross-vaild test:',scores0b.mean())
# Accuracy of MLP cross-vaild test: 0.8733333333333334



[0.67230274 0.4041868  0.66988728 0.67149758 0.67365028]
Accuracy of MLP cross-vaild test: 0.6183049356331771




In [127]:
# SVM
from sklearn import svm
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score

svm = svm.SVC(kernel='rbf',max_iter=10000) #Default: rbf
clf0c=svm.fit(X0,y0)
scores0c = cross_val_score(clf0c,X0,y0,cv=5,scoring='accuracy')
print(scores0c)
print('Accuracy of SVM cross-vaild test:',scores0c.mean())
# Accuracy of SVM cross-vaild test : 0.852

[0.67230274 0.67230274 0.67149758 0.67149758 0.67203868]
Accuracy of SVM cross-vaild test: 0.6719278645214952


## P10：Tomek Links

In [128]:
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X01, y01 = tl.fit_resample(X, y)

In [129]:
scores01 = cross_val_score(clf0,X01,y01,cv=5,scoring='accuracy')
print(scores01)
print('Accuracy of RandomForest cross-vaild test:',scores01.mean())

# Accuracy of RandomForest cross-vaild test : 0.9370245139475909

[0.84679204 0.86227876 0.85121681 0.85619469 0.84615385]
Accuracy of RandomForest cross-vaild test: 0.8525272294077604


In [130]:
scores01a = cross_val_score(clf0a,X01,y01,cv=5,scoring='accuracy')
print(scores01a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test :  0.9426666666666668

[0.86393805 0.85674779 0.85176991 0.86006637 0.84394023]
Accuracy of Lightgbm cross-vaild test: 0.8180042846335807


In [131]:
scores01b = cross_val_score(clf0b,X01,y01,cv=5,scoring='accuracy')
print(scores01b)
print('Accuracy of MLP cross-vaild test:',scores01b.mean())
# Accuracy of MLP cross-vaild test: 0.8692096365173289



[0.76714602 0.77433628 0.6050885  0.77212389 0.77033758]
Accuracy of MLP cross-vaild test: 0.7378064532716918




In [132]:
scores01c = cross_val_score(clf0c,X01,y01,cv=5,scoring='accuracy')
print(scores01c)
print('Accuracy of SVM cross-vaild test:',scores01c.mean())
# Accuracy of SVM cross-vaild test : 0.8478021978021978

[0.77433628 0.77433628 0.77488938 0.77488938 0.7747648 ]
Accuracy of SVM cross-vaild test: 0.7746432261950821


## P10：One Sided Selection

In [133]:
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X02, y02 = oss.fit_resample(X, y)

In [134]:
scores02 = cross_val_score(clf0,X02,y02,cv=5,scoring='accuracy')
print(scores02)
print('Accuracy of RandomForest cross-vaild test:',scores02.mean())
# Accuracy of RandomForest cross-vaild test : 0.9268595339441598

[0.84819945 0.86371191 0.8531856  0.85096953 0.84866962]
Accuracy of RandomForest cross-vaild test: 0.8529472210108654


In [135]:
scores02a = cross_val_score(clf0a,X02,y02,cv=5,scoring='accuracy')
print(scores02a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.86204986 0.86094183 0.84487535 0.86260388 0.84201774]
Accuracy of Lightgbm cross-vaild test: 0.8180042846335807


In [136]:
scores02b = cross_val_score(clf0b,X02,y02,cv=5,scoring='accuracy')
print(scores02b)
print('Accuracy of MLP cross-vaild test:',scores02b.mean())
# Accuracy of MLP cross-vaild test:  0.8573118822595773



[0.47977839 0.65096953 0.77396122 0.49806094 0.77161863]
Accuracy of MLP cross-vaild test: 0.6348777416759309




In [137]:
scores02c = cross_val_score(clf0c,X02,y02,cv=5,scoring='accuracy')
print(scores02c)
print('Accuracy of SVM cross-vaild test:',scores02c.mean())
# Accuracy of SVM cross-vaild test : 0.8441165861048987

[0.77451524 0.77451524 0.77396122 0.77396122 0.77439024]
Accuracy of SVM cross-vaild test: 0.7742686304979394


## P10：SMOTE

In [138]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X03, y03 = smote.fit_resample(X, y)

In [139]:
scores03 = cross_val_score(clf0,X03,y03,cv=5,scoring='accuracy')
print(scores03)
print('Accuracy of RandomForest cross-vaild test:',scores03.mean())
# Accuracy of RandomForest cross-vaild test : 0.9403714917339485

[0.7611425  0.84866562 0.88508634 0.88759812 0.88602826]
Accuracy of RandomForest cross-vaild test: 0.8537041668760799


In [140]:
scores03a = cross_val_score(clf0a,X03,y03,cv=5,scoring='accuracy')
print(scores03a)
print('Accuracy of Lightgbm cross-vaild test:',scores03a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9535539215686274

[0.7645951  0.84646782 0.87880691 0.88006279 0.87378336]
Accuracy of Lightgbm cross-vaild test: 0.8487431965398068


In [141]:
scores03b = cross_val_score(clf0b,X03,y03,cv=5,scoring='accuracy')
print(scores03b)
print('Accuracy of MLP cross-vaild test:',scores03b.mean())
# Accuracy of MLP cross-vaild test: 0.8217920991926182



[0.5257376  0.55478807 0.56389325 0.52935636 0.54568289]
Accuracy of MLP cross-vaild test: 0.5438916334315851




In [142]:
scores03c = cross_val_score(clf0c,X03,y03,cv=5,scoring='accuracy')
print(scores03c)
print('Accuracy of SVM cross-vaild test:',scores03c.mean())
# Accuracy of SVM cross-vaild test : 0.6685097078046904



[0.56434401 0.56357928 0.57833595 0.57425432 0.58492936]
Accuracy of SVM cross-vaild test: 0.5730885812241745


## P10：Borderline-SMOTE

In [143]:
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X04, y04 = bsmote.fit_resample(X, y)

In [144]:
scores04 = cross_val_score(clf0,X04,y04,cv=5,scoring='accuracy')
print(scores04)
print('Accuracy of RandomForest cross-vaild test:',scores04.mean())
# Accuracy of RandomForest cross-vaild test : 0.9396914648212226

[0.76647834 0.86059655 0.88602826 0.888854   0.88006279]
Accuracy of RandomForest cross-vaild test: 0.8564039888010833


In [145]:
scores04a = cross_val_score(clf0a,X04,y04,cv=5,scoring='accuracy')
print(scores04a)
print('Accuracy of Lightgbm cross-vaild test:',scores04a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.948027201076509

[0.75580665 0.84646782 0.8744113  0.88351648 0.87535322]
Accuracy of Lightgbm cross-vaild test: 0.8471110953435408


In [146]:
scores04b = cross_val_score(clf0b,X04,y04,cv=5,scoring='accuracy')
print(scores04b)
print('Accuracy of MLP cross-vaild test:',scores04b.mean())
# Accuracy of MLP cross-vaild test: 0.8252619184928873



[0.54959196 0.56640502 0.59434851 0.5255887  0.60062794]
Accuracy of MLP cross-vaild test: 0.5673124275061321




In [147]:
scores04c = cross_val_score(clf0c,X04,y04,cv=5,scoring='accuracy')
print(scores04c)
print('Accuracy of SVM cross-vaild test:',scores04c.mean())
# Accuracy of SVM cross-vaild test : 0.6733732218377547



[0.56371626 0.57425432 0.60094192 0.5877551  0.60031397]
Accuracy of SVM cross-vaild test: 0.5853963129507924


## P10：SMOTE + ENN

In [148]:
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X05, y05 = smotenn.fit_resample(X, y)

In [149]:
scores05 = cross_val_score(clf0,X05,y05,cv=5,scoring='accuracy')
print(scores05)
print('Accuracy of RandomForest cross-vaild test:',scores05.mean())
# Accuracy of RandomForest cross-vaild test : 0.9791304347826086

[0.75383436 0.88266871 0.90107362 0.8857362  0.89178818]
Accuracy of RandomForest cross-vaild test: 0.8630202129112148


In [150]:
scores05a = cross_val_score(clf0a,X05,y05,cv=5,scoring='accuracy')
print(scores05a)
print('Accuracy of Lightgbm cross-vaild test:',scores05a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.982608695652174

[0.77377301 0.86733129 0.89033742 0.88420245 0.89102072]
Accuracy of Lightgbm cross-vaild test: 0.8613329786382534


In [151]:
scores05b = cross_val_score(clf0b,X05,y05,cv=5,scoring='accuracy')
print(scores05b)
print('Accuracy of MLP cross-vaild test:',scores05b.mean())
# Accuracy of MLP cross-vaild test:  0.8626086956521739



[0.64493865 0.57131902 0.56518405 0.62039877 0.55180353]
Accuracy of MLP cross-vaild test: 0.5907288042224409




In [152]:
scores05c = cross_val_score(clf0c,X05,y05,cv=5,scoring='accuracy')
print(scores05c)
print('Accuracy of SVM cross-vaild test:',scores05c.mean())
# Accuracy of SVM cross-vaild test : 0.7417391304347826

[0.64340491 0.66104294 0.67868098 0.63420245 0.65771297]
Accuracy of SVM cross-vaild test: 0.6550088516825259


## P10：SMOTE + Tomek Links

In [153]:
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X06, y06 = smotetl.fit_resample(X, y)

In [154]:
scores06 = cross_val_score(clf0,X06,y06,cv=5,scoring='accuracy')
print(scores06)
print('Accuracy of RandomForest cross-vaild test:',scores06.mean())
# Accuracy of RandomForest cross-vaild test : 0.9413830072366658

[0.76637407 0.85887914 0.88990206 0.88821344 0.87875718]
Accuracy of RandomForest cross-vaild test: 0.8564251770883289


In [155]:
scores06a = cross_val_score(clf0a,X06,y06,cv=5,scoring='accuracy')
print(scores06a)
print('Accuracy of Lightgbm cross-vaild test:',scores06a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9490777515167759

[0.75928427 0.85347738 0.88145897 0.88179669 0.87808173]
Accuracy of Lightgbm cross-vaild test: 0.8508198067107372


In [156]:
scores06b = cross_val_score(clf0b,X06,y06,cv=5,scoring='accuracy')
print(scores06b)
print('Accuracy of MLP cross-vaild test:',scores06b.mean())
# Accuracy of MLP cross-vaild test: 0.7955191150313101



[0.48548278 0.57224848 0.54711246 0.5504897  0.46977373]
Accuracy of MLP cross-vaild test: 0.5250214298370375




In [157]:
scores06c = cross_val_score(clf0c,X06,y06,cv=5,scoring='accuracy')
print(scores06c)
print('Accuracy of SVM cross-vaild test:',scores06c.mean())
# Accuracy of SVM cross-vaild test : 0.6887673302307448



[0.58170155 0.5833896  0.60013509 0.58797703 0.58763931]
Accuracy of SVM cross-vaild test: 0.5881685179902314


## 輸出

In [158]:
data=pd.DataFrame([[scores1.mean(),scores11.mean()],
         [scores1.mean(),scores2.mean()],         
         [scores3.mean(),scores31.mean()],
         [scores4.mean(),scores41.mean(),scores42.mean(),scores43.mean(),scores44.mean()],
         [scores4a.mean(),scores41a.mean(),scores42a.mean(),scores43a.mean(),scores44a.mean()],
         [scores4b.mean(),scores41b.mean(),scores42b.mean(),scores43b.mean(),scores44b.mean()],
         [scores4c.mean(),scores41c.mean(),scores42c.mean(),scores43c.mean(),scores44c.mean()],
         [scores4d.mean(),scores41d.mean(),scores42d.mean(),scores43d.mean(),scores44d.mean()],
         [scores5.mean(),scores5a.mean(),scores5b.mean(),scores5c.mean(),scores5d.mean(),scores5e.mean()],
         [scores1.mean(),scores6.mean(),scores6a.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
         scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores7g.mean(),scores7h.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
         scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores0.mean(),scores0a.mean(),scores0b.mean(),scores0c.mean()],
         [scores01.mean(),scores01a.mean(),scores01b.mean(),scores01c.mean()],
         [scores02.mean(),scores02a.mean(),scores02b.mean(),scores02c.mean()],
         [scores03.mean(),scores03a.mean(),scores03b.mean(),scores03c.mean()],
         [scores04.mean(),scores04a.mean(),scores04b.mean(),scores04c.mean()],
         [scores05.mean(),scores05a.mean(),scores05b.mean(),scores05c.mean()],
         [scores06.mean(),scores06a.mean(),scores06b.mean(),scores06c.mean()]],
         index=['Q1','Q2','Q3','Q4L','Q4O','Q4F','Q4T','Q4LOL','Q5','Q6','Q7','Q8','Q9','Q10-1','Q10-2','Q10-3','Q10-4','Q10-5','Q10-6','Q10-7']     )

In [159]:
data.to_csv('online_shoppers_intention_Result.csv')