In [19]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import SMOTE

In [20]:
# Data preprocessing
# Load the training data (final version)
train_data = pd.read_csv('./data/train.csv')

# Standardise the continuous columns (z-score) and write them back into train_data.
# The scaler is fitted here, on the training data only, so that the very same
# transformation can later be applied to the test data.
train_standard = train_data[['CreditScore', 'Balance', 'EstimatedSalary']]
z_scaler = StandardScaler()
# z_scaler = Normalizer()
z_scaler.fit(train_standard)
# Carry the source index over explicitly: the column write-back below aligns on
# index labels, so it must not depend on train_data having a default RangeIndex.
train_standard_scaled = pd.DataFrame(
    z_scaler.transform(train_standard),
    columns=train_standard.columns,
    index=train_standard.index,
)
for scaled_col in train_standard_scaled.columns:
    train_data[scaled_col] = train_standard_scaled[scaled_col]

train_features = train_data
# Optional: drop outliers lying more than 3 standard deviations out (all standardised columns)
# train_features = train_data[abs(train_data['CreditScore'])<3]
# train_features = train_features[abs(train_features['Balance'])<3]
# train_features = train_features[abs(train_features['EstimatedSalary'])<3]

# Convert non-numeric columns to numeric ones (one-hot encoding); identifier
# columns and the label are removed first.
train_features_drop = train_features.drop(['RowNumber', 'CustomerId', 'Surname', 'Exited'], axis=1)
train_features_sub = pd.get_dummies(train_features_drop)

# Build the label vector and the feature matrix for training.
train_data_class = train_features['Exited'].values
# Cast explicitly: pandas >= 2.0 emits boolean dummy columns, and `.values` on a
# frame that mixes bool with numeric columns yields an object-dtype array.
train_data_sub = train_features_sub.to_numpy(dtype=float)


# Load the test data
test_data = pd.read_csv('./data/test.csv')

# Standardise the continuous columns with the scaler that was fitted on the
# training data (it is NOT refitted here) and write them back into test_data.
test_standard = test_data[['CreditScore', 'Balance', 'EstimatedSalary']]
# Keep the source index so the label-aligned write-back below cannot misalign rows.
test_standard_scaled = pd.DataFrame(
    z_scaler.transform(test_standard),
    columns=test_standard.columns,
    index=test_standard.index,
)
for scaled_col in test_standard_scaled.columns:
    test_data[scaled_col] = test_standard_scaled[scaled_col]

# Convert non-numeric columns to numeric ones (one-hot encoding) after removing
# the identifier columns.
test_features = test_data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
# get_dummies only creates columns for categories present in *this* frame, so the
# result is re-indexed to the training feature columns: same set, same order,
# with categories absent from the test data filled with 0. Without this the
# models could receive a matrix whose columns do not match what they were fitted on.
test_features_sub = pd.get_dummies(test_features).reindex(
    columns=train_features_sub.columns, fill_value=0
)
# Explicit float cast: pandas >= 2.0 emits boolean dummies, which would turn
# `.values` into an object-dtype array.
test_data_sub = test_features_sub.to_numpy(dtype=float)

In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Hold out 25% of the training data as a validation split.
train_x, test_x, train_y, test_y = train_test_split(train_data_sub, train_data_class,
                                                    train_size=0.75,
                                                    random_state=123)

# Optionally balance the class counts with SMOTE
# resampled_x, resampled_y = SMOTE(random_state=42).fit_resample(train_x, train_y)

# Sweep max_depth over 3..10 and record a blended validation score for each depth.
depth_scores = []
for depth in range(3, 11):

    # Build and fit the XGBClassifier, then predict the hold-out split.
    xgb_Model = XGBClassifier(n_estimators=200, max_depth=depth, use_label_encoder=False, eval_metric="error")
    outcomes = xgb_Model.fit(train_x, train_y).predict(test_x)
#     outcomes = xgb_Model.fit(resampled_x, resampled_y).predict(test_x)

    # Accuracy, precision and F1 of the positive class, all computed from the
    # predictions made above (no second predict pass). f1_score returns 0
    # instead of NaN when precision + recall is 0, so such a depth is still
    # ranked rather than silently dropped.
    accuracy = accuracy_score(test_y, outcomes)
    precision = precision_score(test_y, outcomes, average='binary')
    f_score = f1_score(test_y, outcomes, average='binary')
    score = 0.3*accuracy + 0.3*precision + 0.4*f_score

    depth_scores.append((score, depth))

# Keep the three best depths. Sorting every candidate gives a true top 3; the
# stable sort keeps the smaller depth first when scores tie.
best_three = sorted(depth_scores, key=lambda pair: pair[0], reverse=True)[:3]

# Flattened as [best_score, best_depth, second_score, second_depth, third_score, third_depth].
max_parameters = [value for pair in best_three for value in pair]

print(max_parameters)

[0.7276699275362319, 3, 0.7187311564407896, 6, 0.7034474115436802, 7]


In [22]:
# XGBoost: fit on the full training set and predict the test set
from xgboost import XGBClassifier

# Build the XGBClassifier (200 trees, max_depth 3)
xgb_Model = XGBClassifier(
    n_estimators=200,
    max_depth=3,
    use_label_encoder=False,
    eval_metric="error",
)
xgb_Model.fit(train_data_sub, train_data_class)
outcomes = xgb_Model.predict(test_data_sub)
# Alternative: oversample with SMOTE before fitting
# resampled_x, resampled_y = SMOTE().fit_resample(train_data_sub, train_data_class)
# outcomes = xgb_Model.fit(resampled_x, resampled_y).predict(test_data_sub)

# Sum of the predicted labels, i.e. the number of rows predicted as 1 (assuming 0/1 labels).
print(outcomes.sum())

284


In [4]:
# Neural network: multi-layer perceptron with hidden layers of 30, 10 and 2 units
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(random_state=0, hidden_layer_sizes=(30, 10, 2))
# Fit on the full training set, then predict the test set; this overwrites
# `outcomes` from any model cell run earlier.
outcomes = nn.fit(train_data_sub, train_data_class).predict(test_data_sub)



In [48]:
# SVM: support vector classifier with a linear kernel
from sklearn import svm
classifier = svm.SVC(kernel='linear')
# Fit on the full training set, then predict the test set; this overwrites
# `outcomes` from any model cell run earlier.
outcomes = classifier.fit(train_data_sub, train_data_class).predict(test_data_sub)

In [55]:
# KNC: k-nearest-neighbours classifier voting over the 5 nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=5)
# Fit on the full training set, then predict the test set; this overwrites
# `outcomes` from any model cell run earlier.
outcomes = knc.fit(train_data_sub, train_data_class).predict(test_data_sub)

In [38]:
# Logistic regression with scikit-learn's default settings
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
# Fit on the full training set, then predict the test set; this overwrites
# `outcomes` from any model cell run earlier.
outcomes = classifier.fit(train_data_sub, train_data_class).predict(test_data_sub)

In [18]:
# Write the predictions to a CSV file.
# `outcomes` holds the predictions of whichever model cell was executed last.
test_data_class = pd.Series(outcomes)
submission_columns = {
    'RowNumber': test_data['RowNumber'],
    'Exited': test_data_class,
}
test_df = pd.DataFrame(submission_columns)
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first column
# by default, and the file name is fixed regardless of which model produced
# `outcomes` - confirm both are intended.
test_df.to_csv('./data/xgboost(200&3).csv')