In [None]:
import copy
import numpy as np
import pandas as pd
import pandas_profiling as pdp
from datetime import datetime
import warnings
%matplotlib inline
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the pandas / scikit-learn calls below.
warnings.filterwarnings('ignore')
# Raise the pandas display limits so wide / long frames are not truncated.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 2000)

In [None]:
import xgboost as xgb
from sklearn.svm import LinearSVC, libsvm, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score

# エグゼクティブサマリ
- モデル精度  
  mさん：0.89722（決定木）   
  nさん ：0.917   (バギング)  
  hさん ：0.91851（ロジスティック＆決定木）  
  ttさん ：0.93800 (勾配ブースティング＆アダブースト＆ランダムフォレスト＆Extra-trees)  
  全員のスタッキング：**0.93549**  
  
  
- チームの方針  
  それぞれがベストと思うモデルを持ち寄る。  
  集まったモデルをスタッキングしてメタモデルを作成し、精度検証する。  
  
  
- 疑問点  
  hさんの分は、２種類のモデルをスタッキングしたメタモデルの予測値を使用している。（予測値１つ）  
  ttさんの分は、４種類のモデルの予測値をそのまま使用している。（予測値４つ）  
  そのため、hさんの分に関しては「スタッキングのスタッキング」になっている。  
  このような多段のスタッキングは、一般的にはあまり行わないのだろうか？

In [None]:
# Build the hand-crafted explanatory variables (feature engineering).
def create_explain_variable(df):
    """Return a copy of ``df`` with engineered feature columns appended.

    Reads the columns ``age``, ``pdays``, ``previous``, ``day``, ``month``,
    ``balance`` and ``job``. The input frame is not modified.

    Columns added, in this order: ``ageclass``, ``age_abs``, ``pdaysclass``,
    ``new_customers``, ``daysplit``, ``md``, ``mean_balance``,
    ``mean_meet_num``.
    """
    out = df.copy()

    # Age bucket: right-closed bins (15,25], (25,30], ..., (60,120] -> 0..8.
    age_edges = [15, 25, 30, 35, 40, 45, 50, 55, 60, 120]
    age_labels = np.arange(len(age_edges) - 1)
    out["ageclass"] = pd.cut(out["age"], age_edges, labels=age_labels).astype(int)

    # Distance of the age from 50, i.e. age folded at 50 so it becomes linear.
    out["age_abs"] = np.abs(50 - out["age"])

    # Bucket for days since the previous contact (2 / 4 / 6 months, 1 / 2 years, ...).
    pdays_edges = [-2, 0, 60, 120, 180, 360, 720, 2000]
    pdays_labels = np.arange(len(pdays_edges) - 1)
    out["pdaysclass"] = pd.cut(out["pdays"], pdays_edges, labels=pdays_labels).astype(int)

    # New-customer flag: 1 when there was no previous contact.
    out["new_customers"] = (out["previous"] == 0).astype("int64")

    # Part of the month of the last contact, then combined with the month name.
    # 0: day <= 10, 1: 10 < day < 20, 2: day >= 20.
    day = out["day"]
    out["daysplit"] = (day > 10).astype("int64") + (day >= 20).astype("int64")
    out["md"] = out["month"] + out["daysplit"].astype(str)

    # Average yearly savings, treating 21 as the age at which work started;
    # anyone aged 21 or younger and every student is set to 0.
    no_working_years = (out["age"] <= 21) | (out["job"] == "student")
    yearly_balance = out["balance"] / (out["age"] - 21)
    out["mean_balance"] = yearly_balance.mask(no_working_years, 0)

    # Previous contacts per elapsed day; 0 where pdays is the -1 marker.
    contact_rate = out["previous"] / out["pdays"]
    out["mean_meet_num"] = contact_rate.mask(out["pdays"] == -1, 0)
    return out

In [None]:
# Label-encode the categorical columns (each category becomes an integer code).
def val2cate(df1, df2):
    """Integer-encode the categorical columns with codes learned from ``df1``.

    For every listed column present in ``df1``, the distinct non-missing
    training values are sorted and numbered 0, 1, 2, ...; the same mapping is
    then applied to ``df2``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training frame; defines the category -> code mapping.
    df2 : pandas.DataFrame
        Test frame; must contain every listed column that ``df1`` contains
        (a missing one raises ``KeyError``).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded copies of ``df1`` and ``df2``; the inputs are not modified.
        Encoded columns have dtype ``int64``. Missing values and test values
        never seen in training are encoded as ``-1``.
    """
    df_train = df1.copy()
    df_test = df2.copy()
    for col in ["job","marital","education","default","housing","loan","contact","month","poutcome","md"]:
        if col not in df_train.columns:
            continue
        categories = sorted(df_train[col].dropna().unique())
        code_of = {val: idx for idx, val in enumerate(categories)}
        # Map through a lookup table instead of overwriting value by value:
        # an already-written code can then never be mistaken for a later
        # category, and the column ends up numeric instead of ``object``.
        df_train[col] = df_train[col].map(code_of).fillna(-1).astype("int64")
        df_test[col] = df_test[col].map(code_of).fillna(-1).astype("int64")
    return df_train, df_test
# Standardise the numeric columns (not strictly needed here, done anyway).
def val2num(df1, df2):
    """Standardise numeric columns with the mean / std of the training frame.

    For every listed column present in ``df1`` the column is replaced by
    ``(value - train_mean) / train_std`` in both frames, where the statistics
    come from ``df1`` only (``Series.std``, i.e. the sample standard deviation).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training frame; source of the mean and standard deviation.
    df2 : pandas.DataFrame
        Test frame, scaled with the training statistics.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scaled copies of ``df1`` and ``df2``; the inputs are not modified.
    """
    df_train = df1.copy()
    df_test = df2.copy()
    for col in ["age","balance","duration","campaign","pdays"]:
        if col not in df_train.columns:
            continue
        train_mean = df_train[col].mean()
        train_std = df_train[col].std()
        # A constant column (std == 0) or a single-row frame (std is NaN)
        # would turn the whole column into NaN/inf; only centre it instead.
        if not np.isfinite(train_std) or train_std == 0:
            train_std = 1.0
        df_train[col] = (df_train[col] - train_mean) / train_std
        df_test[col] = (df_test[col] - train_mean) / train_std
    return df_train, df_test

In [None]:
# Read the train and test CSV files from the data directory.
path = "../../../../study/bank/motodata/"
df_train, df_test = (pd.read_csv(path + csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Data exploration (pdays and previous show fairly high Pearson / Spearman
# correlation, so pdays is dropped in a later cell).
pdp.ProfileReport(df_train)

In [None]:
# Optional removal of outlier rows from the training data.
#
# The original author's note on this cell says it was NOT executed for the run
# that stacks everyone's models, so it is now an explicit switch instead of a
# cell to be skipped by hand. It defaults to off to match that run.
# NOTE(review): presumably the reason is that the stacking frames are later
# joined to the team files on a 1-based row position (`idx + 1`), which
# dropping rows and resetting the index would shift - confirm.
REMOVE_OUTLIERS = False
dict_del = {"previous":[275],
            "duration":[4918],
            "balance" :[-6847,102127]}
if REMOVE_OUTLIERS:
    # Flag every row whose value in one of the listed columns is a listed outlier.
    outlier_mask = pd.Series(False, index=df_train.index)
    for col, vals in dict_del.items():
        outlier_mask |= df_train[col].isin(vals)
    # Build a new frame instead of dropping in place.
    df_train = df_train.loc[~outlier_mask].reset_index(drop=True)

In [None]:
# Append the hand-crafted features to the train frame and to the test frame.
df_train_add_var, df_test_add_var = (
    create_explain_variable(frame) for frame in (df_train, df_test)
)

In [None]:
# Remove the columns that are not used as features.
drop_cols = ["id","pdays"]
df_train_add_var = df_train_add_var.drop(columns=drop_cols)
df_test_add_var = df_test_add_var.drop(columns=drop_cols)

In [None]:
# Encode the categorical columns as integer codes, then standardise the
# numeric columns (both steps use statistics of the training frame).
df_train_add_var, df_test_add_var = val2num(*val2cate(df_train_add_var, df_test_add_var))

In [None]:
# Helper functions for stacking
def calc_proba(proba):
    """Return one minus the first column of a 2-D probability array.

    For the two-column output of a binary ``predict_proba`` this is the
    probability of the second class.
    """
    first_class_proba = proba[:, 0]
    return 1 - first_class_proba
def stacking(df_train, df_test, clf, name, seed=15, cv=4):
    """Produce out-of-fold and test-set predictions of ``clf`` for stacking.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain the target column ``y``.
    df_test : pandas.DataFrame
        Test frame with the same feature columns as ``df_train`` minus ``y``.
    clf : estimator
        Object with ``fit`` and ``predict_proba``. It is refitted in place
        several times and is left fitted on the full training data.
    name : str
        Name of the prediction column and suffix of the AUC columns.
    seed : int
        Seed used to shuffle the rows before they are split into folds.
    cv : int
        Number of stratified folds.

    Returns
    -------
    df_train_pred : pandas.DataFrame
        Columns ``idx`` (row position in ``df_train``), ``name`` (out-of-fold
        prediction) and ``y``; one row per training row.
    df_test_pred : pandas.DataFrame
        Columns ``idx`` (index of ``df_test``) and ``name`` (prediction of the
        model fitted on all training rows).
    df_auc : pandas.DataFrame
        One row per fold with train / validation ROC AUC, plus a final row
        ``k == "all"`` holding the training AUC of the full-data fit.
    """
    statime = datetime.now()
    x, y    = df_train.drop("y", axis=1), df_train.y
    # train: fit on k-1 folds, predict the held-out fold
    auc_rows   = []
    fold_preds = []
    # random_state only has an effect (and is only accepted by current
    # scikit-learn) together with shuffle=True. Note that shuffling changes
    # which rows land in which fold compared with an unshuffled split.
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    for k, (train, valid) in enumerate(skf.split(x, y), start=1):
        # `train` / `valid` are row positions, so both x and y are indexed with
        # iloc; this stays correct even if the index is not 0..n-1.
        x_train, y_train = x.iloc[train, :], y.iloc[train]
        x_valid, y_valid = x.iloc[valid, :], y.iloc[valid]
        clf.fit(x_train, y_train)
        train_pred = calc_proba(clf.predict_proba(x_train))
        valid_pred = calc_proba(clf.predict_proba(x_valid))
        auc_rows.append({"k" : k,
                         "train_"+name : roc_auc_score(y_train, train_pred),
                         "valid_"+name : roc_auc_score(y_valid, valid_pred)})
        fold_preds.append(pd.DataFrame({"idx" : valid, name : valid_pred, "y" : y_valid.values}))
    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append no longer exists in pandas 2.x).
    df_train_pred = pd.concat(fold_preds, ignore_index=True)
    # test: refit on all training rows and predict the test frame
    clf.fit(x, y)
    all_pred  = calc_proba(clf.predict_proba(x))
    test_pred = calc_proba(clf.predict_proba(df_test))
    auc_rows.append({"k" : "all", "train_"+name : roc_auc_score(y, all_pred)})
    df_auc = pd.DataFrame(auc_rows)
    df_test_pred = pd.DataFrame({"idx" : df_test.index, name : test_pred})
    print("clf:%s time:%s end" % (name, datetime.now() - statime))
    return df_train_pred, df_test_pred, df_auc

In [None]:
# Stacking with k=4 cross-validation (hyper-parameters were chosen by GridSearch).
seed=15
# (prediction column name, estimator class, constructor arguments), fitted in this order.
model_specs = [
    # gradient boosting
    ("xgb", xgb.XGBClassifier,
     dict(learning_rate=0.01, max_depth=8, min_child_weight=6, n_estimators=3000, random_state=seed)),
    # AdaBoost
    ("ab", AdaBoostClassifier,
     dict(learning_rate=0.9, n_estimators=3000, random_state=seed)),
    # random forest
    ("rf", RandomForestClassifier,
     dict(max_depth=10, min_samples_leaf=2, max_features="sqrt", n_estimators=3000, n_jobs=-1, random_state=seed)),
    # Extra-trees
    ("et", ExtraTreesClassifier,
     dict(max_depth=8, min_samples_leaf=1, n_estimators=1000, n_jobs=-1, random_state=seed)),
]
stacked = {}
for model_name, model_cls, model_params in model_specs:
    # Build each estimator right before it is used, one at a time.
    classifier = model_cls(**model_params)
    stacked[model_name] = stacking(df_train_add_var, df_test_add_var, classifier, model_name)
df_train_xgb, df_test_xgb, df_auc_xgb = stacked["xgb"]
df_train_ab,  df_test_ab,  df_auc_ab  = stacked["ab"]
df_train_rf,  df_test_rf,  df_auc_rf  = stacked["rf"]
df_train_et,  df_test_et,  df_auc_et  = stacked["et"]

In [None]:
# AUC of each model, side by side
#  xgb : gradient boosting
#  ab  : AdaBoost
#  rf  : random forest
#  et  : Extra-trees
pd.concat([df_auc_xgb, df_auc_ab, df_auc_rf, df_auc_et], axis=1)

In [None]:
# train: join the out-of-fold predictions of the four base models per row
df_train_sta = df_train_rf
for df_other in (df_train_xgb, df_train_et, df_train_ab):
    df_train_sta = pd.merge(df_train_sta, df_other, on=["idx","y"])
df_train_sta = df_train_sta[["idx", "xgb", "ab", "rf", "et", "y"]]
# test: same join for the test-set predictions (no target column there)
df_test_sta = df_test_rf
for df_other in (df_test_xgb, df_test_et, df_test_ab):
    df_test_sta = pd.merge(df_test_sta, df_other, on="idx")
df_test_sta = df_test_sta[["idx", "xgb", "ab", "rf", "et"]]
print("train shape :", df_train_sta.shape)
print("test  shape :", df_test_sta.shape)
df_train_sta.head()

In [None]:
# Add the predictions of the models built by the other team members.
path = "../../../../study/bank/team_predict/"
df_train_m = pd.read_csv(path+"train_miyata.csv",   names=["idx","pred_m"])
df_train_n = pd.read_csv(path+"train_nakamura.csv", names=["idx","pred_n"])
df_train_h = pd.read_csv(path+"train_hayashi.csv",  names=["idx","pred_h"])
df_test_m  = pd.read_csv(path+"test_miyata.csv",    names=["idx","pred_m"])
df_test_n  = pd.read_csv(path+"test_nakamura.csv",  names=["idx","pred_n"])
df_test_h  = pd.read_csv(path+"test_hayashi.csv",   names=["idx","pred_h"])
print("miyata   train/test model shape :", df_train_m.shape, df_test_m.shape)
print("nakamura train/test model shape :", df_train_n.shape, df_test_n.shape)
print("hayashi  train/test model shape :", df_train_h.shape, df_test_h.shape)
# The stacking frames carry a 0-based `idx`; it is shifted by one before the
# join. NOTE(review): presumably the team files are keyed by a 1-based row id - confirm.
# The shift and the joins are skipped when the team columns are already present,
# so re-running this cell does not shift `idx` a second time.
if "pred_m" not in df_train_sta.columns:
    df_train_sta = df_train_sta.assign(idx=df_train_sta.idx + 1)
    for df_team in (df_train_m, df_train_n, df_train_h):
        df_train_sta = pd.merge(df_train_sta, df_team, on="idx")
    df_train_sta = df_train_sta[["idx", "xgb", "ab", "rf", "et", "pred_m", "pred_n", "pred_h", "y"]]
if "pred_m" not in df_test_sta.columns:
    n_test_rows = len(df_test_sta)
    df_test_sta = df_test_sta.assign(idx=df_test_sta.idx + 1)
    for df_team in (df_test_m, df_test_n, df_test_h):
        df_test_sta = pd.merge(df_test_sta, df_team, on="idx")
    df_test_sta = df_test_sta[["idx", "xgb", "ab", "rf", "et", "pred_m", "pred_n", "pred_h"]]
    # The submission cell numbers the rows 1..n by position, so an inner join
    # that drops or duplicates test rows must not pass silently.
    assert len(df_test_sta) == n_test_rows, "team prediction files do not cover every test row exactly once"
df_train_sta.head()

In [None]:
# Inputs of the meta model: every prediction column of the stacked frames.
x = df_train_sta.drop(columns=["idx", "y"])
y = df_train_sta["y"]
test = df_test_sta.drop(columns="idx")
# meta model (hyper-parameters were chosen by GridSearch)
meta_params = dict(gamma=3, learning_rate=0.1, max_depth=4, min_child_weight=6,
                   n_estimators=500, random_state=seed)
classifier = xgb.XGBClassifier(**meta_params)
classifier.fit(x, y)
# This AUC is measured on the same rows the meta model was fitted on.
train_proba = classifier.predict_proba(x)
train_pred = calc_proba(train_proba)
print("train auc:", roc_auc_score(y, train_pred))
test_proba = classifier.predict_proba(test)
test_pred = calc_proba(test_proba)

In [None]:
# Write the submission file: rows are numbered 1..n by position, no header, no index.
n_test_rows = df_test_sta.shape[0]
submission_columns = {"idx": np.arange(1, n_test_rows + 1), "result": test_pred}
df_result = pd.DataFrame(submission_columns)
path = "../../../../study/bank/submit/"
df_result.to_csv(path + "result_20180627_2.csv", header=None, index=False)

# ここから説明不要  
GridSearchでハイパーパラメータを決定

In [None]:
##########################################
# cross validation
# Seed passed as random_state to the estimators and to the train/test split
# in the grid-search cells below.
seed  = 15

In [None]:
# Gradient boosting: estimator and search grid.
# This cell binds `classifier` and `parameters`, which the grid-search cells read.
classifier = xgb.XGBClassifier(random_state=seed)
parameters = dict(
    n_estimators=[500, 1000],
    learning_rate=[0.1, 0.3, 0.5, 0.9],
    max_depth=[4, 6],
    min_child_weight=[4, 6, 10],
    gamma=[0, 3, 10],
)

In [None]:
# Extra-trees: estimator and search grid.
# This cell binds `classifier` and `parameters`, which the grid-search cells read.
classifier = ExtraTreesClassifier(random_state=seed)
parameters = dict(
    n_estimators=[500, 1000],
    max_depth=[6, 8, 10],
    min_samples_leaf=[1, 3, 5, 9, 17],
    min_samples_split=[0.1, 0.3, 0.5, 0.7, 0.9],
)

In [None]:
# Random forest: estimator and search grid.
# This cell binds `classifier` and `parameters`, which the grid-search cells read.
classifier = RandomForestClassifier(random_state=seed)
parameters = dict(
    n_estimators=[500, 1000],
    max_depth=[4, 6, 8],
    min_samples_leaf=[100, 200, 300],
    min_samples_split=[0.1, 0.3, 0.5, 0.7, 0.9],
)

In [None]:
# k-nearest neighbours: estimator and search grid.
# This cell binds `classifier` and `parameters`, which the grid-search cells read.
classifier = KNeighborsClassifier()
parameters = dict(
    n_neighbors=[1, 2, 3, 4, 5],
    leaf_size=[1, 2, 3, 4, 5, 10],
    p=[1, 2, 3, 4, 5],
)

In [None]:
# SVM: estimator and search grid (three log-spaced values from 1e-4 to 1e4 each).
# This cell binds `classifier` and `parameters`, which the grid-search cells read.
classifier = SVC(random_state=seed)
parameters = dict(
    C=np.logspace(-4, 4, 3),
    gamma=np.logspace(-4, 4, 3),
)

In [None]:
# AdaBoost: estimator and search grid.
# This cell binds `classifier` and `parameters`, which the grid-search cells read.
classifier = AdaBoostClassifier(random_state=seed)
parameters = dict(
    n_estimators=[3000],
    learning_rate=[0.1, 0.2, 0.3, 0.5, 0.7, 0.9],
)

In [None]:
# Logistic regression: estimator and search grid.
# This cell binds `classifier` and `parameters`, which the grid-search cells read.
classifier = LogisticRegression(random_state=seed)
parameters = dict(C=[0.1, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

In [None]:
# Grid search on the engineered feature frame: hold out 20% of the rows, then
# run a 4-fold search scored by ROC AUC on the remaining 80%.
x = df_train_add_var.drop(columns="y")
y = df_train_add_var["y"]
holdout_split = train_test_split(x, y, test_size=0.2, random_state=seed)
train_X, test_X, train_y, test_y = holdout_split

cv = GridSearchCV(estimator=classifier, param_grid=parameters,
                  scoring="roc_auc", cv=4, n_jobs=-1, verbose=10)
cv.fit(train_X, train_y)

In [None]:
# Grid search on the stacked prediction frame (for the meta model): hold out
# 20% of the rows, then run a 4-fold search scored by ROC AUC on the rest.
x = df_train_sta.drop(columns=["idx", "y"])
y = df_train_sta["y"]
holdout_split = train_test_split(x, y, test_size=0.2, random_state=seed)
train_X, test_X, train_y, test_y = holdout_split

cv = GridSearchCV(estimator=classifier, param_grid=parameters,
                  scoring="roc_auc", cv=4, n_jobs=-1, verbose=10)
cv.fit(train_X, train_y)

In [None]:
# Score the best model of the grid search on the held-out split.
if hasattr(cv, "predict_proba"):
    pred = calc_proba(cv.predict_proba(test_X))
else:
    # Some estimators (e.g. SVC without probability=True) have no
    # predict_proba; ROC AUC only needs a ranking score, so use the decision
    # function instead.
    pred = cv.decision_function(test_X)
print("best model auc:", roc_auc_score(test_y, pred))
# Call get_params so the cell shows the parameter dict, not the bound method.
cv.best_estimator_.get_params()

In [None]:
############################################
# Deep learning (rough, ad-hoc experiment)

In [None]:
from keras.layers.core import Activation, Flatten, Dense, Dropout
from keras.optimizers import RMSprop
import keras

In [None]:
# One-hot encode the target into two columns for the two-unit softmax output.
# `keras.utils.to_categorical` is the public entry point; the
# `keras.utils.np_utils` path is not available in newer Keras releases.
y_cate = keras.utils.to_categorical(y, 2)

In [None]:
# Fully connected network: five ReLU layers, each followed by 50% dropout,
# then a two-unit softmax output.
hidden_units = [512, 512, 512, 256, 128]
model = keras.Sequential()
for layer_no, units in enumerate(hidden_units):
    # Only the first layer declares the input width.
    first_layer_args = {"input_dim": x.shape[1]} if layer_no == 0 else {}
    model.add(Dense(units, activation='relu', **first_layer_args))
    model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

In [None]:
# Run the training on the full x / one-hot target.
epochs=128
batch_size=100
fit_options = dict(epochs=epochs, batch_size=batch_size)
history1 = model.fit(x, y_cate, **fit_options)