In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from xgboost import XGBClassifier
import warnings

# Silence all library warnings so they do not clutter the cell output.
# NOTE(review): this also hides deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Feature columns fed to every model. Defined once so the train and test
# matrices are guaranteed to share the same columns in the same order.
feature_cols = ["CreditScore", "Gender", "Tenure", "Balance", "NumOfProducts",
                "HasCrCard", "IsActiveMember", "EstimatedSalary", "France", "Germany", "Spain"]

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# One-hot encode the country column of the training data.
# dtype = float keeps the resulting feature matrix purely numeric.
geo = pd.get_dummies(train["Geography"], dtype = float)
train = pd.concat([train, geo], axis = 1)

# Encode the gender column of the training data as a single 0/1 column.
# The encoder emits one indicator column per category; a single DataFrame
# column can only hold one of them, so the first indicator (the one for
# gen.categories_[0][0]) is selected explicitly.
gen = OneHotEncoder()
train["Gender"] = gen.fit_transform(train[["Gender"]]).toarray()[:, 0]

# Select the training features and the target.
x = train[feature_cols].values
y = train["Exited"].values

# One-hot encode the country column of the test data, aligned to the training
# columns: a country missing from the test set becomes an all-zero column and
# a country unseen during training is dropped.
geo_test = pd.get_dummies(test["Geography"], dtype = float).reindex(columns = geo.columns, fill_value = 0.0)
test = pd.concat([test, geo_test], axis = 1)

# Encode the gender column of the test data with the encoder fitted on the
# training data, so both sets share the same category-to-value mapping.
gen_test = gen  # same fitted encoder; name kept for backward compatibility
test["Gender"] = gen_test.transform(test[["Gender"]]).toarray()[:, 0]

# Select the test features.
x_test = test[feature_cols].values

# Standardize: fit the scaler on the training data only, then apply the same
# transformation to the test data so the models see both on the same scale.
scaler = StandardScaler()
x = scaler.fit_transform(x)
x_test = scaler.transform(x_test)

In [2]:
# Logistic regression baseline

# Fit with scikit-learn's default settings on the training features.
LRclassifier = LogisticRegression().fit(x, y)

# Predict the test set and wrap the labels in a one-column frame named "Exited".
result = pd.DataFrame({"Exited": LRclassifier.predict(x_test)})

# Put the predictions next to each row's RowNumber and write them to disk.
row_ids = test[["RowNumber"]]
out = pd.concat([row_ids, result], axis = 1)
out.to_csv("logistic_regression.csv")

# Mean accuracy on the training data itself (not a held-out estimate).
LRclassifier.score(x, y)

0.79575

In [9]:
# Random forest

# random_state pins the bootstrap samples and per-split feature subsets so a
# re-run reproduces the same model, predictions and training score.
# NOTE(review): scores recorded from earlier, unseeded runs may differ.
RFclassifier = ensemble.RandomForestClassifier(n_estimators = 10, min_samples_leaf = 9,
                                               max_features = "sqrt", random_state = 42)
RFclassifier = RFclassifier.fit(x, y)

# Predict the test set and store the labels in a one-column frame named "Exited".
result = RFclassifier.predict(x_test)

result = pd.Series(result, name = "Exited")
result = result.to_frame()

# Pair each prediction with its RowNumber and save the result.
out = pd.concat([test[["RowNumber"]], result], axis = 1)
out.to_csv("random_forest.csv")

# Mean accuracy on the training data itself (not a held-out estimate).
RFclassifier.score(x, y)

0.85375

In [13]:
# XGBoost

# Gradient-boosted trees with hand-picked hyperparameters; verbosity = 0
# silences XGBoost's own log output.
XGBclassifier = XGBClassifier(verbosity = 0, subsample = 0.5, min_child_weight = 5,
                              max_depth = 8, gamma = 7, eta = 0.8)
XGBclassifier = XGBclassifier.fit(x, y)

# Predict the test set and store the labels in a one-column frame named "Exited".
result = XGBclassifier.predict(x_test)

result = pd.Series(result, name = "Exited")
result = result.to_frame()

# Pair each prediction with its RowNumber and save the result.
# Written to its own file so the random-forest predictions are not overwritten.
out = pd.concat([test[["RowNumber"]], result], axis = 1)
out.to_csv("xgboost.csv")

# Mean accuracy on the training data itself (not a held-out estimate).
XGBclassifier.score(x, y)

0.865