## 讀取資料 (Default：Label Encoding)

In [1]:
#讀取資料
import pandas as pd
import numpy as np

df= pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',sep=',') 

In [2]:
# all columns
print(list(df.columns))

['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [3]:
#資料前處理  # Label Encoder
#將部分字串轉數值
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod','Churn']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='customerID') #此列為個人資料，所以丟棄
df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['Churn']#應變數
X=df.drop(columns='Churn') #剩下的為自變數

X['TotalCharges'].replace(" ", 0, inplace=True)
X['TotalCharges'] = X['TotalCharges'].astype(float)

In [4]:
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

In [5]:
X

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.50
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1840.75
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1990.50
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,7362.90
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,346.45
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,306.60


## P1：標準化是否影響結果

In [6]:
#先不經標準化做XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf1= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores1 = cross_val_score(clf1,X,y,cv=5,scoring='accuracy')
print(scores1)
print('Accuracy of XGBoost cross-vaild test:',scores1.mean())
# Accuracy of XGBoost cross-vaild test: 0.935065196935072

[0.75514549 0.7707594  0.75372605 0.77201705 0.7734375 ]
Accuracy of XGBoost cross-vaild test: 0.7650170978772823


In [7]:
#使用經過LabelEncoder編碼的特徵，標準化 (因為變數值間有大有小)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X1 = sc.fit_transform(X)

In [8]:
#標準化後XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf11= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores11 = cross_val_score(clf11,X1,y,cv=5,scoring='accuracy')
print(scores11)
print('Accuracy of XGBoost cross-vaild test:',scores11.mean())

# Accuracy of XGBoost cross-vaild test: 0.9362212662992337
##標準化在XGBoost的交叉驗證上沒有差異太大

[0.75443577 0.77146913 0.75301632 0.77201705 0.77272727]
Accuracy of XGBoost cross-vaild test: 0.7647331077811472


## P3：Feature Binning 有沒有效果

In [9]:
import pandas as pd
import numpy as np
# For 繪製敘述統計
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import scipy.stats as stats
from sklearn.model_selection import train_test_split
# for discretization
from sklearn.preprocessing import KBinsDiscretizer

In [10]:
X

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.50
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1840.75
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1990.50
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,7362.90
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,346.45
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,306.60


In [11]:
# Equal width Binning
disc1 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc1.fit(X[['MonthlyCharges']])
disc2 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc2.fit(X[['TotalCharges']])

KBinsDiscretizer(encode='ordinal', n_bins=10, strategy='uniform')

In [12]:
# Equal Frequency Binning
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['MonthlyCharges']])
disc21 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc21.fit(X[['TotalCharges']])

KBinsDiscretizer(encode='ordinal', n_bins=10)

In [13]:
disc1.bin_edges_

array([array([ 18.25,  28.3 ,  38.35,  48.4 ,  58.45,  68.5 ,  78.55,  88.6 ,
               98.65, 108.7 , 118.75])                                       ],
      dtype=object)

In [14]:
disc11.bin_edges_

array([array([ 18.25,  20.05,  25.05,  45.85,  58.83,  70.35,  79.1 ,  85.5 ,
               94.25, 102.6 , 118.75])                                       ],
      dtype=object)

In [15]:
X3 = pd.DataFrame(X)
X3[['MonthlyCharges']]=disc1.transform(X3[['MonthlyCharges']])
X3[['TotalCharges']]=disc1.transform(X3[['TotalCharges']])
X3 = pd.DataFrame(X3)

Feature names unseen at fit time:
- TotalCharges
Feature names seen at fit time, yet now missing:
- MonthlyCharges



In [16]:
X3

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,1.0,1.0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,3.0,9.0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,3.0,8.0
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,2.0,9.0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,5.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,6.0,9.0
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,8.0,9.0
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,1.0,9.0
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,5.0,9.0


In [17]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores3 = cross_val_score(clf3,X3,y,cv=5,scoring='accuracy')
print(scores3)
print('Accuracy of XGBoost cross-vaild test:',scores3.mean())

#Accuracy of XGBoost cross-vaild test: 0.9304140341443743
# Frequency較佳

[0.7735983  0.76721079 0.74520937 0.76420455 0.78551136]
Accuracy of XGBoost cross-vaild test: 0.7671468723788631


In [18]:
#資料前處理 # Label Encoder
df= pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod','Churn']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='customerID')
df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['Churn']#應變數
X=df.drop(columns='Churn') #剩下的為自變數

X['TotalCharges'].replace(" ", 0, inplace=True)
X['TotalCharges'] = X['TotalCharges'].astype(float)

X31 = pd.DataFrame(X)
X31[['MonthlyCharges']]=disc11.transform(X31[['MonthlyCharges']])
X31[['TotalCharges']]=disc21.transform(X31[['TotalCharges']])
X31 = pd.DataFrame(X31)

In [19]:
X31

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,2.0,0.0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,3.0,5.0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,3.0,1.0
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,2.0,5.0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,6.0,5.0
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,9.0,9.0
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,2.0,2.0
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,5.0,2.0


In [20]:
# Binning 後進行 XGBoost
#X3=X3.drop(columns=['age',' fnlwgt',' capital-gain',' capital-loss',' hours-per-week']) #原先的特徵丟掉

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf3= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores31 = cross_val_score(clf3,X31,y,cv=5,scoring='accuracy')
print(scores31)
print('Accuracy of XGBoost cross-vaild test:',scores31.mean())

#Accuracy of XGBoost cross-vaild test:  0.935065196935072
# Frequency較佳

[0.76863023 0.7707594  0.75514549 0.77201705 0.7734375 ]
Accuracy of XGBoost cross-vaild test: 0.7679979353506677


Source：iT幫幫忙--Day12 - Feature Engineering -- 4. 分隔方法(Discretization),https://ithelp.ithome.com.tw/articles/10235726

## P2：One-hot Encoding vs. Label Encoding on Tree-based method

In [21]:
X3

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,1.0,1.0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,3.0,9.0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,3.0,8.0
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,2.0,9.0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,5.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,6.0,9.0
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,8.0,9.0
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,1.0,9.0
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,5.0,9.0


In [22]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [23]:
from category_encoders import *
enc2 = OneHotEncoder(cols=['PaymentMethod']).fit(X3,y)
X2 = enc2.transform(X3)
X2 = pd.DataFrame(X2)
X2

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,...,0,0,0,1,1,0,0,0,1.0,1.0
1,1,0,0,0,34,1,0,0,2,0,...,0,0,1,0,0,1,0,0,3.0,9.0
2,1,0,0,0,2,1,0,0,2,2,...,0,0,0,1,0,1,0,0,3.0,8.0
3,1,0,0,0,45,0,1,0,2,0,...,0,0,1,0,0,0,1,0,2.0,9.0
4,0,0,0,0,2,1,0,1,0,0,...,0,0,0,1,1,0,0,0,5.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,...,2,2,1,1,0,1,0,0,6.0,9.0
7039,0,0,1,1,72,1,2,1,0,2,...,2,2,1,1,0,0,0,1,8.0,9.0
7040,0,0,1,1,11,0,1,0,2,0,...,0,0,0,1,1,0,0,0,1.0,9.0
7041,1,1,1,0,4,1,2,1,0,0,...,0,0,0,1,0,1,0,0,5.0,9.0


In [24]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf2= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores2 = cross_val_score(clf2,X3,y,cv=5,scoring='accuracy')
print(scores2)
print('Accuracy of XGBoost cross-vaild test:',scores2.mean())

# Accuracy of XGBoost cross-vaild test for Label Encoder: 0.9362212662992337
# Accuracy of XGBoost cross-vaild test for one-hot encoding: 0.9420217771205808
# 交叉驗證為 Label Encoder 結果略佳
# 可能在 one-hot encoding 會陷入 curse of dimensionaility

[0.7735983  0.76721079 0.74520937 0.76420455 0.78551136]
Accuracy of XGBoost cross-vaild test: 0.7671468723788631


## P4：Label Encoding (這裡不用)

In [25]:
df= pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod','Churn']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='customerID')
df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['Churn']#應變數
X=df.drop(columns='Churn') #剩下的為自變數

X['TotalCharges'].replace(" ", 0, inplace=True)
X['TotalCharges'] = X['TotalCharges'].astype(float)

In [26]:
# One hot encoding 後 XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4 = cross_val_score(clf4,X,y,cv=5,scoring='accuracy')
print(scores4)
print('Accuracy of XGBoost cross-vaild test:',scores4.mean())

# Accuracy of XGBoost cross-vaild test: 0.9408522650893938

[0.75514549 0.7707594  0.75372605 0.77201705 0.7734375 ]
Accuracy of XGBoost cross-vaild test: 0.7650170978772823


In [27]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X,y)
scores41 = cross_val_score(clf41,X,y,cv=5,scoring='accuracy')
print(scores41)
print('Accuracy of RandomForest cross-vaild test:',scores41.mean())

# Accuracy of RandomForest cross-vaild test: 0.9327261728726979

[0.79276082 0.79276082 0.77714691 0.79261364 0.79332386]
Accuracy of RandomForest cross-vaild test: 0.7897212118523775


In [28]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X,y)
scores42 = cross_val_score(clf42,X,y,cv=5,scoring='accuracy')
print(scores42)
print('Accuracy of Lightgbm cross-vaild test:',scores42.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9373706143298831

[0.79701916 0.8012775  0.77572747 0.80042614 0.80255682]
Accuracy of Lightgbm cross-vaild test: 0.7954014170269049


## P4：One Hot Encoding

In [29]:
# 清除 Label Encoder 後，再做 One Hot Encoding
X2

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,...,0,0,0,1,1,0,0,0,1.0,1.0
1,1,0,0,0,34,1,0,0,2,0,...,0,0,1,0,0,1,0,0,3.0,9.0
2,1,0,0,0,2,1,0,0,2,2,...,0,0,0,1,0,1,0,0,3.0,8.0
3,1,0,0,0,45,0,1,0,2,0,...,0,0,1,0,0,0,1,0,2.0,9.0
4,0,0,0,0,2,1,0,1,0,0,...,0,0,0,1,1,0,0,0,5.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,...,2,2,1,1,0,1,0,0,6.0,9.0
7039,0,0,1,1,72,1,2,1,0,2,...,2,2,1,1,0,0,0,1,8.0,9.0
7040,0,0,1,1,11,0,1,0,2,0,...,0,0,0,1,1,0,0,0,1.0,9.0
7041,1,1,1,0,4,1,2,1,0,0,...,0,0,0,1,0,1,0,0,5.0,9.0


In [30]:
# One hot encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4a = cross_val_score(clf4,X2,y,cv=5,scoring='accuracy')
print(scores4a)
print('Accuracy of XGBoost cross-vaild test:',scores4a.mean())
# Accuracy of XGBoost cross-vaild test: 0.9420217771205808

[0.7735983  0.77572747 0.74875798 0.77485795 0.77769886]
Accuracy of XGBoost cross-vaild test: 0.7701281131040713


In [31]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X2,y)
scores41a = cross_val_score(clf41,X2,y,cv=5,scoring='accuracy')
print(scores41a)
print('Accuracy of RandomForest cross-vaild test:',scores41a.mean())

# Accuracy of RandomForest cross-vaild test: 0.9223215485952412

[0.79347055 0.78637331 0.76792051 0.79332386 0.77769886]
Accuracy of RandomForest cross-vaild test: 0.7837574198335376


In [32]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X2,y)
scores42a = cross_val_score(clf42,X2,y,cv=5,scoring='accuracy')
print(scores42a)
print('Accuracy of Lightgbm cross-vaild test:',scores42a.mean())

# Accuracy of Lightgbm cross-vaild test: 0.9362212662992337

[0.79488999 0.8012775  0.78069553 0.80752841 0.79616477]
Accuracy of Lightgbm cross-vaild test: 0.7961112410478096


## P4：Frequency Encoding

In [33]:
# Frequency encoding：用類別出現頻率當作該類別數值
X

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.50
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1840.75
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1990.50
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,7362.90
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,346.45
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,306.60


In [34]:
X4b=pd.DataFrame(X)
enc1=X4b['PaymentMethod'].value_counts()
X4b['PaymentMethod']=X4b['PaymentMethod'].apply(lambda x : enc1[x]) 

In [35]:
X4b

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2365,29.85,29.85
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,1612,56.95,1889.50
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,1612,53.85,108.15
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,1544,42.30,1840.75
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2365,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,1612,84.80,1990.50
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1522,103.20,7362.90
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2365,29.60,346.45
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,1612,74.40,306.60


In [36]:
# Frequency encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4b= XGBClassifier(**params)

# 5-fold validation
scores4b = cross_val_score(clf4b,X4b,y,cv=5,scoring='accuracy')
print(scores4b)
print('Accuracy of XGBoost cross-vaild test:',scores4b.mean())

# Accuracy of XGBoost cross-vaild test: 0.936194380965183

[0.75301632 0.78566359 0.74946771 0.76420455 0.77698864]
Accuracy of XGBoost cross-vaild test: 0.7658681608490869


In [37]:
# Frequency encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41b=RF.fit(X4b,y)
scores41b = cross_val_score(clf41b,X4b,y,cv=5,scoring='accuracy')
print(scores41b)
print('Accuracy of RandomForest cross-vaild test:',scores41b.mean())

# Accuracy of RandomForest cross-vaild test: 0.9315633821750235

[0.79630944 0.78779276 0.7735983  0.79332386 0.79332386]
Accuracy of RandomForest cross-vaild test: 0.7888696448157946


In [38]:
# Frequency encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42b=LGBM.fit(X4b,y)
scores42b = cross_val_score(clf42b,X4b,y,cv=5,scoring='accuracy')
print(scores42b)
print('Accuracy of Lightgbm cross-vaild test:',scores42b.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9408321010888561

[0.80056778 0.80553584 0.78424414 0.80184659 0.80184659]
Accuracy of Lightgbm cross-vaild test: 0.7988081892380153


## P4：Target Encoding

In [39]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [40]:
df= pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod','Churn']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='customerID')
df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['Churn']#應變數
X=df.drop(columns='Churn') #剩下的為自變數

X['TotalCharges'].replace(" ", 0, inplace=True)
X['TotalCharges'] = X['TotalCharges'].astype(float)

In [42]:
# Target encoding：使用 Target (預測目標) 來達成 Features 的 Encoding
# 清除 Label Encoder
# 類別特徵：workclass, marital-status, occupation, relationship, race, sex, native-country
from category_encoders import *

enc = TargetEncoder(cols=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod'],
                    min_samples_leaf=20, smoothing=10).fit(X, y)
X4c = enc.transform(X)

In [43]:
X4c

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0.269209,0,0.196649,0.312791,1,0.249267,0.249267,0.189591,0.417667,0.215315,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.452854,29.85,29.85
1,0.261603,0,0.329580,0.312791,34,0.267096,0.250442,0.189591,0.146112,0.399288,0.225021,0.416355,0.335231,0.336804,0.112695,0.163301,0.191067,56.95,1889.50
2,0.261603,0,0.329580,0.312791,2,0.267096,0.250442,0.189591,0.146112,0.215315,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.191067,53.85,108.15
3,0.261603,0,0.329580,0.312791,45,0.249267,0.249267,0.189591,0.146112,0.399288,0.225021,0.151663,0.335231,0.336804,0.112695,0.163301,0.167098,42.30,1840.75
4,0.269209,0,0.329580,0.312791,2,0.267096,0.250442,0.418928,0.417667,0.399288,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.452854,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.261603,0,0.196649,0.154502,24,0.267096,0.286099,0.189591,0.146112,0.399288,0.225021,0.151663,0.300702,0.299414,0.112695,0.335651,0.191067,84.80,1990.50
7039,0.269209,0,0.196649,0.154502,72,0.267096,0.286099,0.418928,0.417667,0.215315,0.225021,0.416355,0.300702,0.299414,0.112695,0.335651,0.152431,103.20,7362.90
7040,0.269209,0,0.196649,0.154502,11,0.249267,0.249267,0.189591,0.146112,0.399288,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.452854,29.60,346.45
7041,0.261603,1,0.196649,0.312791,4,0.267096,0.286099,0.418928,0.417667,0.399288,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.191067,74.40,306.60


In [44]:
# Target encoding 後 XGBoost / 前處理在 P2
from xgboost import XGBClassifier

params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores4c = cross_val_score(clf4,X4c,y,cv=5,scoring='accuracy')
print(scores4c)
print('Accuracy of XGBoost cross-vaild test:',scores4c.mean())

# Accuracy of XGBoost cross-vaild test: 0.9512770533673882

[0.75727466 0.75869411 0.73953158 0.77485795 0.77627841]
Accuracy of XGBoost cross-vaild test: 0.7613273436995935


In [45]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4c,y)
scores41c = cross_val_score(clf41,X4c,y,cv=5,scoring='accuracy')
print(scores41c)
print('Accuracy of RandomForest cross-vaild test:',scores41c.mean())

# Accuracy of RandomForest cross-vaild test for target encoding: 0.8591567243513352

[0.79630944 0.79063165 0.76933996 0.79900568 0.80042614]
Accuracy of RandomForest cross-vaild test: 0.7911425737144332


In [46]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4c,y)
scores42c = cross_val_score(clf42,X4c,y,cv=5,scoring='accuracy')
print(scores42c)
print('Accuracy of Lightgbm cross-vaild test:',scores42c.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454765425460412
# Accuracy of Lightgbm cross-vaild test for target encoding: 0.8740826747563275

[0.80837473 0.80198722 0.77785664 0.80752841 0.79900568]
Accuracy of Lightgbm cross-vaild test: 0.7989505371314278


## P4：Leave-One-Out Encoding

In [47]:
X

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.50
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1840.75
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1990.50
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,7362.90
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,346.45
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,306.60


In [49]:
encc = LeaveOneOutEncoder(cols=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod'],sigma=0.05).fit(X, y)
X4d = encc.transform(X)

Source：http://contrib.scikit-learn.org/category_encoders/leaveoneout.html 

In [50]:
# One hot encoding 後 XGBoost
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf4= XGBClassifier(**params)

# 5-fold validation
scores4d = cross_val_score(clf4,X4d,y,cv=5,scoring='accuracy')
print(scores4d)
print('Accuracy of XGBoost cross-vaild test:',scores4d.mean())

# Accuracy of XGBoost cross-vaild test for LOO encoding:  0.9466258905766904

[0.75727466 0.75869411 0.73953158 0.77485795 0.77627841]
Accuracy of XGBoost cross-vaild test: 0.7613273436995935


In [51]:
# One hot encoding 後 RandomForest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf41=RF.fit(X4d,y)
scores41d = cross_val_score(clf41,X4d,y,cv=5,scoring='accuracy')
print(scores41d)
print('Accuracy of RandomForest cross-vaild test:',scores41d.mean())

# Accuracy of RandomForest cross-vaild test: 0.9559214948245731

[0.79630944 0.7920511  0.77501774 0.79971591 0.79119318]
Accuracy of RandomForest cross-vaild test: 0.7908574746757855


In [52]:
# One hot encoding 後 Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf42=LGBM.fit(X4d,y)
scores42d = cross_val_score(clf42,X4d,y,cv=5,scoring='accuracy')
print(scores42d)
print('Accuracy of Lightgbm cross-vaild test:',scores42d.mean())
# Accuracy of Lightgbm cross-vaild test: 0.9454899852130663

[0.80837473 0.80198722 0.77785664 0.80752841 0.79900568]
Accuracy of Lightgbm cross-vaild test: 0.7989505371314278


## P5：Combinations of numerical and categorical feature transformation

挑選較常用的六個組合

In [53]:
df= pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod','Churn']]:
    df[col]=le.fit_transform(df[col])

df=df.drop(columns='customerID')
df.dropna(axis=0) #有遺失值，即丟棄該樣本
y=df['Churn']#應變數
X=df.drop(columns='Churn') #剩下的為自變數

X['TotalCharges'].replace(" ", 0, inplace=True)
X['TotalCharges'] = X['TotalCharges'].astype(float)

In [55]:
#XGBoost 

from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores5 = cross_val_score(clf5,X,y,cv=5,scoring='accuracy')
print(scores5)
print('Accuracy of XGBoost cross-vaild test:',scores5.mean())

# Accuracy of XGBoost cross-vaild test: 0.9420083344535556

[0.75514549 0.7707594  0.75372605 0.77201705 0.7734375 ]
Accuracy of XGBoost cross-vaild test: 0.7650170978772823


In [57]:
# Standardization *  LOO
from category_encoders import *
encc = LeaveOneOutEncoder(cols=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod'],sigma=0.05).fit(X, y)
X5a = encc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5a = sc.fit_transform(X5a)

In [58]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5a = cross_val_score(clf5,X5a,y,cv=5,scoring='accuracy')
print(scores5a)
print('Accuracy of XGBoost cross-vaild test:',scores5a.mean())

# Accuracy of XGBoost cross-vaild test: 0.9466258905766904

[0.75727466 0.75869411 0.73953158 0.77485795 0.77627841]
Accuracy of XGBoost cross-vaild test: 0.7613273436995935


In [59]:
# Standardization *  Target Encoding
enc = TargetEncoder(cols=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod'],min_samples_leaf=20, smoothing=10).fit(X, y)
X5b = enc.transform(X)

#StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X5b = sc.fit_transform(X5b)

In [60]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5b = cross_val_score(clf5,X5b,y,cv=5,scoring='accuracy')
print(scores5b)
print('Accuracy of XGBoost cross-vaild test:',scores5b.mean())

# Accuracy of XGBoost cross-vaild test: 0.9512770533673882

[0.75727466 0.75869411 0.73953158 0.77485795 0.77627841]
Accuracy of XGBoost cross-vaild test: 0.7613273436995935


In [61]:
# Equal‐Frequency Binning *  Label Encoding
from sklearn.preprocessing import KBinsDiscretizer
disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['MonthlyCharges']])
disc21 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc21.fit(X[['TotalCharges']])

X31 = pd.DataFrame(X)
X31[['MonthlyCharges']]=disc11.transform(X31[['MonthlyCharges']])
X31[['TotalCharges']]=disc21.transform(X31[['TotalCharges']])
X5c = pd.DataFrame(X31)

In [62]:
X5c

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,2.0,0.0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,3.0,5.0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,3.0,1.0
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,2.0,5.0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,6.0,5.0
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,9.0,9.0
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,2.0,2.0
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,5.0,2.0


In [63]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5c = cross_val_score(clf5,X5c,y,cv=5,scoring='accuracy')
print(scores5c)
print('Accuracy of XGBoost cross-vaild test:',scores5c.mean())

# Accuracy of XGBoost cross-vaild test: 0.935065196935072

[0.76863023 0.7707594  0.75514549 0.77201705 0.7734375 ]
Accuracy of XGBoost cross-vaild test: 0.7679979353506677


In [64]:
# Equal‐Frequency Binning * Leave‐One‐Out Encoding
encc = LeaveOneOutEncoder(cols=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod'],sigma=0.05).fit(X5c, y)
X5d = encc.transform(X5c)

In [65]:
X5d

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0.269209,0,0.196649,0.312791,1,0.249267,0.249267,0.189591,0.417667,0.215315,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.452854,2.0,0.0
1,0.261603,0,0.329580,0.312791,34,0.267096,0.250442,0.189591,0.146112,0.399288,0.225021,0.416355,0.335231,0.336804,0.112695,0.163301,0.191067,3.0,5.0
2,0.261603,0,0.329580,0.312791,2,0.267096,0.250442,0.189591,0.146112,0.215315,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.191067,3.0,1.0
3,0.261603,0,0.329580,0.312791,45,0.249267,0.249267,0.189591,0.146112,0.399288,0.225021,0.151663,0.335231,0.336804,0.112695,0.163301,0.167098,2.0,5.0
4,0.269209,0,0.329580,0.312791,2,0.267096,0.250442,0.418928,0.417667,0.399288,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.452854,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.261603,0,0.196649,0.154502,24,0.267096,0.286099,0.189591,0.146112,0.399288,0.225021,0.151663,0.300702,0.299414,0.112695,0.335651,0.191067,6.0,5.0
7039,0.269209,0,0.196649,0.154502,72,0.267096,0.286099,0.418928,0.417667,0.215315,0.225021,0.416355,0.300702,0.299414,0.112695,0.335651,0.152431,9.0,9.0
7040,0.269209,0,0.196649,0.154502,11,0.249267,0.249267,0.189591,0.146112,0.399288,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.452854,2.0,2.0
7041,0.261603,1,0.196649,0.312791,4,0.267096,0.286099,0.418928,0.417667,0.399288,0.391276,0.416355,0.335231,0.336804,0.427097,0.335651,0.191067,5.0,2.0


In [66]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5d = cross_val_score(clf5,X5d,y,cv=5,scoring='accuracy')
print(scores5d)
print('Accuracy of XGBoost cross-vaild test:',scores5d.mean())

# Accuracy of XGBoost cross-vaild test: 0.9408522650893938

[0.76508162 0.76792051 0.76082328 0.77485795 0.76491477]
Accuracy of XGBoost cross-vaild test: 0.7667196270727142


In [67]:
# Equal‐Frequency Binning * Target Encoding
enc = TargetEncoder(cols=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod'],min_samples_leaf=20, smoothing=10).fit(X5c, y)
X5e = enc.transform(X5c)

In [68]:
#XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf5= XGBClassifier(**params)

# 5-fold validation
scores5e = cross_val_score(clf5,X5e,y,cv=5,scoring='accuracy')
print(scores5e)
print('Accuracy of XGBoost cross-vaild test:',scores5e.mean())

# Accuracy of XGBoost cross-vaild test: 0.9420083344535556

[0.76508162 0.76792051 0.76082328 0.77485795 0.76491477]
Accuracy of XGBoost cross-vaild test: 0.7667196270727142


Standard Scalar 效果較佳，且 Label Encoder 效果又較 Target Encoding, LOO 佳

## P6：Categorical values of a feature is high (超過20種)

所有 label encoder 的結果視為 baseline

In [91]:
df= pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',sep=',') 
from sklearn import preprocessing
le=preprocessing.LabelEncoder()

for col in df[['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod','Churn']]:
    df[col]=le.fit_transform(df[col])

df.dropna(axis=1) #有遺失值，即丟棄該樣本
y=df['Churn']#應變數
X=df.drop(columns='Churn') #剩下的為自變數
X=X.drop(columns='customerID')

X['TotalCharges'].replace(" ", 0, inplace=True)
X['TotalCharges'] = X['TotalCharges'].astype(float)

X6 = pd.DataFrame(X)

disc11 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc11.fit(X[['MonthlyCharges']])
disc21 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
disc21.fit(X[['TotalCharges']])

X6[['MonthlyCharges']]=disc11.transform(X[['MonthlyCharges']])
X6[['TotalCharges']]=disc21.transform(X[['TotalCharges']])
X6 = pd.DataFrame(X6)

In [92]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [93]:
# One Hot Encoder
from category_encoders import * 
enc6 = OneHotEncoder(cols=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod']).fit(X6,y)
X6 = enc6.transform(X6)

In [94]:
X6

Unnamed: 0,gender_1,gender_2,SeniorCitizen,Partner_1,Partner_2,Dependents_1,Dependents_2,tenure,PhoneService_1,PhoneService_2,...,Contract_2,Contract_3,PaperlessBilling_1,PaperlessBilling_2,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4,MonthlyCharges,TotalCharges
0,1,0,0,1,0,1,0,1,1,0,...,0,0,1,0,1,0,0,0,2.0,0.0
1,0,1,0,0,1,1,0,34,0,1,...,1,0,0,1,0,1,0,0,3.0,5.0
2,0,1,0,0,1,1,0,2,0,1,...,0,0,1,0,0,1,0,0,3.0,1.0
3,0,1,0,0,1,1,0,45,1,0,...,1,0,0,1,0,0,1,0,2.0,5.0
4,1,0,0,0,1,1,0,2,0,1,...,0,0,1,0,1,0,0,0,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,0,1,0,0,1,24,0,1,...,1,0,1,0,0,1,0,0,6.0,5.0
7039,1,0,0,1,0,0,1,72,0,1,...,1,0,1,0,0,0,0,1,9.0,9.0
7040,1,0,0,1,0,0,1,11,1,0,...,0,0,1,0,1,0,0,0,2.0,2.0
7041,0,1,1,1,0,1,0,4,0,1,...,0,0,1,0,0,1,0,0,5.0,2.0


In [95]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf6= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores6 = cross_val_score(clf6,X6,y,cv=5,scoring='accuracy')
print(scores6)
print('Accuracy of XGBoost cross-vaild test:',scores6.mean())

# Accuracy of XGBoost cross-vaild test: 0.935031590267509

[0.76295245 0.77643719 0.73882186 0.765625   0.77698864]
Accuracy of XGBoost cross-vaild test: 0.764165026775921


In [96]:
# Target Encoder
enc6a = TargetEncoder(cols=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
               'StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod'],min_samples_leaf=20, smoothing=10).fit(X, y)
X6a = enc6a.transform(X)

In [97]:
# XGBoost 
scores6a = cross_val_score(clf6,X6a,y,cv=5,scoring='accuracy')
print(scores6a)
print('Accuracy of XGBoost cross-vaild test:',scores6a.mean())

# Accuracy of XGBoost cross-vaild test: 0.9396894743917192

[0.75727466 0.75869411 0.73953158 0.77485795 0.77627841]
Accuracy of XGBoost cross-vaild test: 0.7613273436995935


## P7：Undersampling

In [98]:
X

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.50
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1840.75
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1990.50
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,7362.90
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,346.45
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,306.60


In [99]:
# 記得跑上面
# Nearmiss
from imblearn.under_sampling import NearMiss
nm = NearMiss(sampling_strategy = 'majority')
X7, y7 = nm.fit_resample(X, y)

In [100]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7 = cross_val_score(clf7,X7,y7,cv=5,scoring='accuracy')
print(scores7)
print('Accuracy of XGBoost cross-vaild test:',scores7.mean())

# Accuracy of XGBoost cross-vaild test: 0.957330827067669

[0.39705882 0.60561497 0.60160428 0.62382865 0.61579652]
Accuracy of XGBoost cross-vaild test: 0.5687806484404642


In [101]:
# ClusterCentroids
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(voting='hard')
X7a, y7a = cc.fit_resample(X, y)

In [102]:
scores7a = cross_val_score(clf7,X7a,y7a,cv=5,scoring='accuracy')
print(scores7a)
print('Accuracy of XGBoost cross-vaild test:',scores7a.mean())

# Accuracy of XGBoost cross-vaild test: 0.7978070175438596

[0.73128342 0.70855615 0.69919786 0.71485944 0.72824632]
Accuracy of XGBoost cross-vaild test: 0.7164286379027697


In [103]:
# EditedNN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X7c, y7c = en.fit_resample(X, y)

In [104]:
scores7c = cross_val_score(clf7,X7c,y7c,cv=5,scoring='accuracy')
print(scores7c)
print('Accuracy of XGBoost cross-vaild test:',scores7c.mean())

# Accuracy: 0.9586666666666668

[0.84764826 0.86912065 0.85261003 0.85875128 0.85568066]
Accuracy of XGBoost cross-vaild test: 0.856762176271002


In [105]:
# Neighbourhood Cleaning Rule
from imblearn.under_sampling import NeighbourhoodCleaningRule
ecr = NeighbourhoodCleaningRule()
X7d, y7d = ecr.fit_resample(X, y)

In [106]:
scores7d = cross_val_score(clf7,X7d,y7d,cv=5,scoring='accuracy')
print(scores7d)
print('Accuracy of XGBoost cross-vaild test:',scores7d.mean())

# Accuracy: 0.9472198368398456

[0.8343254  0.84722222 0.84126984 0.8550149  0.84508441]
Accuracy of XGBoost cross-vaild test: 0.8445833530366798


In [107]:
# Tomek Links
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X7e, y7e = tl.fit_resample(X, y)

In [108]:
scores7e = cross_val_score(clf7,X7e,y7e,cv=5,scoring='accuracy')
print(scores7e)
print('Accuracy of XGBoost cross-vaild test:',scores7e.mean())

# Accuracy: 0.9322485207100591

[0.79024768 0.78947368 0.76625387 0.78560372 0.8024787 ]
Accuracy of XGBoost cross-vaild test: 0.7868115292103225


In [109]:
# One Sided Selection
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X7f, y7f = oss.fit_resample(X, y)

In [110]:
scores7f = cross_val_score(clf7,X7f,y7f,cv=5,scoring='accuracy')
print(scores7f)
print('Accuracy of XGBoost cross-vaild test:',scores7f.mean())

# Accuracy: 0.9385542168674699

[0.79318358 0.79705655 0.77691712 0.79550736 0.80232558]
Accuracy of XGBoost cross-vaild test: 0.7929980364959559


## P7：Oversampling

In [111]:
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X7g, y7g = smote.fit_resample(X, y)

In [112]:
# XGBoost 
from xgboost import XGBClassifier
params = {'objective':'binary:logistic','max_depth': 4,'alpha': 10,'learning_rate': 1.0,'n_estimators':100}  
clf7= XGBClassifier(**params)

# 5-fold validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores7g = cross_val_score(clf7,X7g,y7g,cv=5,scoring='accuracy')
print(scores7g)
print('Accuracy of XGBoost cross-vaild test:',scores7g.mean())

# Accuracy: 0.948709630911188

[0.69903382 0.75120773 0.87101449 0.89656839 0.89560174]
Accuracy of XGBoost cross-vaild test: 0.8226852338290336


In [113]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X7h, y7h = bsmote.fit_resample(X, y)

In [114]:
scores7h = cross_val_score(clf7,X7h,y7h,cv=5,scoring='accuracy')
print(scores7h)
print('Accuracy of XGBoost cross-vaild test:',scores7h.mean())

# Accuracy: 0.9473327566320646

[0.68695652 0.747343   0.87874396 0.88158531 0.89028516]
Accuracy of XGBoost cross-vaild test: 0.8169827894172779


In [115]:
# ADASYN
from imblearn.over_sampling import ADASYN
adasyn = ADASYN()
X7i, y7i = adasyn.fit_resample(X, y)

In [116]:
scores7i = cross_val_score(clf7,X7i,y7i,cv=5,scoring='accuracy')
print(scores7i)
print('Accuracy of XGBoost cross-vaild test:',scores7i.mean())

# Accuracy: 0.9323432664896079

[0.69164265 0.74098991 0.87265738 0.87746276 0.89139837]
Accuracy of XGBoost cross-vaild test: 0.8148302121430758


## P7：Ensemble

In [117]:
# SMOTE + ENN
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X7j, y7j = smotenn.fit_resample(X, y)

In [118]:
scores7j = cross_val_score(clf7,X7j,y7j,cv=5,scoring='accuracy')
print(scores7j)
print('Accuracy of XGBoost cross-vaild test:',scores7j.mean())

# Accuracy: 0.9785811232163164

[0.90201005 0.94472362 0.97152429 0.95725063 0.97652976]
Accuracy of XGBoost cross-vaild test: 0.9504076684062952


In [119]:
# SMOTE + Tomek Links
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X7k, y7k = smotetl.fit_resample(X, y)

In [120]:
scores7k = cross_val_score(clf7,X7k,y7k,cv=5,scoring='accuracy')
print(scores7k)
print('Accuracy of XGBoost cross-vaild test:',scores7k.mean())

# Accuracy: 0.9412366580787633

[0.71378446 0.75288221 0.88621554 0.8877193  0.89819458]
Accuracy of XGBoost cross-vaild test: 0.8277592175021304


SMOTE+ENN 效果最好

## P9：Imbalance Ratio vs. Resampling Strategy

In [121]:
# 見 P7
# 要與其他資料集比較

## P10：ML algorithms vs. different resampling strategies (ENN)

In [122]:
# 使用方法 (參考最好的組合與相關的方法)
# ENN
# Tomek Links
# One Sided Selection
# SMOTE
# Borderline-SMOTE
# SMOTE + ENN
# SMOTE + Tomek Links

In [123]:
# ENN
from imblearn.under_sampling import EditedNearestNeighbours
en = EditedNearestNeighbours(kind_sel="all")
X0, y0 = en.fit_resample(X, y)

In [124]:
# XGBoost Accuracy: 0.8589394520028113
# Random Forest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100)
clf0=RF.fit(X0,y0)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
scores0 = cross_val_score(clf0,X0,y0,cv=5,scoring='accuracy')
print(scores0)
print('Accuracy of RandomForest cross-vaild test:',scores0.mean())

# Accuracy of RandomForest cross-vaild test : 0.952

[0.85787321 0.86400818 0.85363357 0.86489253 0.87512794]
Accuracy of RandomForest cross-vaild test: 0.8631070867163576


In [125]:
# Lightgbm
import lightgbm as lgb
LGBM = lgb.LGBMClassifier(application='multiclass', boosting='gbdt', learning_rate=0.1, max_depth=-5, feature_fraction=0.5, random_state=42)
clf0a=LGBM.fit(X0,y0)
scores0a = cross_val_score(clf0a,X0,y0,cv=5,scoring='accuracy')
print(scores0a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.85480573 0.87730061 0.86796315 0.86386899 0.86796315]
Accuracy of Lightgbm cross-vaild test: 0.8663803262355234


## P10：Tomek Links

In [126]:
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X01, y01 = tl.fit_resample(X, y)

In [127]:
scores01 = cross_val_score(clf0,X01,y01,cv=5,scoring='accuracy')
print(scores01)
print('Accuracy of RandomForest cross-vaild test:',scores01.mean())

# Accuracy of RandomForest cross-vaild test : 0.9370245139475909

[0.81888545 0.80727554 0.79102167 0.80804954 0.81254841]
Accuracy of RandomForest cross-vaild test: 0.8075561220452142


In [128]:
scores01a = cross_val_score(clf0a,X01,y01,cv=5,scoring='accuracy')
print(scores01a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test :  0.9426666666666668

[0.82043344 0.81346749 0.79489164 0.81346749 0.82494191]
Accuracy of Lightgbm cross-vaild test: 0.8663803262355234


## P10：One Sided Selection

In [129]:
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
X02, y02 = oss.fit_resample(X, y)

In [130]:
scores02 = cross_val_score(clf0,X02,y02,cv=5,scoring='accuracy')
print(scores02)
print('Accuracy of RandomForest cross-vaild test:',scores02.mean())
# Accuracy of RandomForest cross-vaild test : 0.9268595339441598

[0.81733746 0.81424149 0.7879257  0.80015492 0.8140976 ]
Accuracy of RandomForest cross-vaild test: 0.8067514322782398


In [131]:
scores02a = cross_val_score(clf0a,X02,y02,cv=5,scoring='accuracy')
print(scores02a)
print('Accuracy of Lightgbm cross-vaild test:',scores0a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9426666666666668

[0.81346749 0.81733746 0.79489164 0.81099923 0.82106894]
Accuracy of Lightgbm cross-vaild test: 0.8663803262355234


## P10：SMOTE

In [132]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X03, y03 = smote.fit_resample(X, y)

In [133]:
scores03 = cross_val_score(clf0,X03,y03,cv=5,scoring='accuracy')
print(scores03)
print('Accuracy of RandomForest cross-vaild test:',scores03.mean())
# Accuracy of RandomForest cross-vaild test : 0.9403714917339485

[0.73188406 0.77584541 0.88599034 0.89366844 0.89801837]
Accuracy of RandomForest cross-vaild test: 0.8370813223966396


In [134]:
scores03a = cross_val_score(clf0a,X03,y03,cv=5,scoring='accuracy')
print(scores03a)
print('Accuracy of Lightgbm cross-vaild test:',scores03a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9535539215686274

[0.70917874 0.7589372  0.88599034 0.89850169 0.88883519]
Accuracy of Lightgbm cross-vaild test: 0.8282886315823884


## P10：Borderline-SMOTE

In [135]:
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE()
X04, y04 = bsmote.fit_resample(X, y)

In [136]:
scores04 = cross_val_score(clf0,X04,y04,cv=5,scoring='accuracy')
print(scores04)
print('Accuracy of RandomForest cross-vaild test:',scores04.mean())
# Accuracy of RandomForest cross-vaild test : 0.9396914648212226

[0.72173913 0.76376812 0.88164251 0.90091832 0.90865152]
Accuracy of RandomForest cross-vaild test: 0.8353439197913529


In [137]:
scores04a = cross_val_score(clf0a,X04,y04,cv=5,scoring='accuracy')
print(scores04a)
print('Accuracy of Lightgbm cross-vaild test:',scores04a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.948027201076509

[0.7057971  0.77198068 0.88357488 0.89076849 0.89995167]
Accuracy of Lightgbm cross-vaild test: 0.830414562333784


## P10：SMOTE + ENN

In [138]:
from imblearn.combine import SMOTEENN
smotenn = SMOTEENN(smote = SMOTE(), enn = EditedNearestNeighbours(sampling_strategy='all'))
X05, y05 = smotenn.fit_resample(X, y)

In [139]:
scores05 = cross_val_score(clf0,X05,y05,cv=5,scoring='accuracy')
print(scores05)
print('Accuracy of RandomForest cross-vaild test:',scores05.mean())
# Accuracy of RandomForest cross-vaild test : 0.9791304347826086

[0.91471572 0.93896321 0.96237458 0.9590301  0.97742475]
Accuracy of RandomForest cross-vaild test: 0.9505016722408026


In [143]:
scores05a = cross_val_score(clf0a,X05,y05,cv=5,scoring='accuracy')
print(scores05a)
print('Accuracy of SVM cross-vaild test:',scores05a.mean())
# Accuracy of SVM cross-vaild test : 0.7417391304347826

[0.90217391 0.93561873 0.9632107  0.9590301  0.97324415]
Accuracy of SVM cross-vaild test: 0.946655518394649


## P10：SMOTE + Tomek Links

In [144]:
from imblearn.combine import SMOTETomek
smotetl = SMOTETomek(smote = SMOTE(), tomek = TomekLinks(sampling_strategy='majority'))
X06, y06 = smotetl.fit_resample(X, y)

In [145]:
scores06 = cross_val_score(clf0,X06,y06,cv=5,scoring='accuracy')
print(scores06)
print('Accuracy of RandomForest cross-vaild test:',scores06.mean())
# Accuracy of RandomForest cross-vaild test : 0.9413830072366658

[0.73082707 0.77243108 0.89267803 0.89819458 0.90972919]
Accuracy of RandomForest cross-vaild test: 0.8407719901559314


In [146]:
scores06a = cross_val_score(clf0a,X06,y06,cv=5,scoring='accuracy')
print(scores06a)
print('Accuracy of Lightgbm cross-vaild test:',scores06a.mean())
# Accuracy of Lightgbm cross-vaild test : 0.9490777515167759

[0.71428571 0.7764411  0.89067202 0.89919759 0.90371113]
Accuracy of Lightgbm cross-vaild test: 0.8368615118538573


## 輸出

In [150]:
data=pd.DataFrame([[scores1.mean(),scores11.mean()],
         [scores1.mean(),scores2.mean()],         
         [scores3.mean(),scores31.mean()],
         [scores4.mean(),scores41.mean(),scores42.mean()],
         [scores4a.mean(),scores41a.mean(),scores42a.mean()],
         [scores4b.mean(),scores41b.mean(),scores42b.mean()],
         [scores4c.mean(),scores41c.mean(),scores42c.mean()],
         [scores4d.mean(),scores41d.mean(),scores42d.mean()],
         [scores5.mean(),scores5a.mean(),scores5b.mean(),scores5c.mean(),scores5d.mean(),scores5e.mean()],
         [scores1.mean(),scores6.mean(),scores6a.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
         scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores7g.mean(),scores7h.mean()],
         [scores7.mean(),scores7a.mean(),#scores7b.mean(),
         scores7c.mean(),scores7d.mean(),scores7e.mean(),scores7f.mean(),scores7g.mean(),scores7h.mean(),scores7i.mean(),scores7j.mean(),scores7k.mean()],
         [scores0.mean(),scores0a.mean()],#scores0b.mean(),scores0c.mean()],
         [scores01.mean(),scores01a.mean()],#scores01b.mean(),scores01c.mean()],
         [scores02.mean(),scores02a.mean()],#scores02b.mean(),scores02c.mean()],
         [scores03.mean(),scores03a.mean()],#scores03b.mean(),scores03c.mean()],
         [scores04.mean(),scores04a.mean()],#scores04b.mean(),scores04c.mean()],
         [scores05.mean(),scores05a.mean()],#scores05b.mean(),scores05c.mean()],
         [scores06.mean(),scores06a.mean()]],#scores06b.mean(),scores06c.mean()]],
         index=['Q1','Q2','Q3','Q4L','Q4O','Q4F','Q4T','Q4LOL','Q5','Q6','Q7','Q8','Q9','Q10-1','Q10-2','Q10-3','Q10-4','Q10-5','Q10-6','Q10-7']     )

In [152]:
data.to_csv('C.csv')