In [None]:
import copy
import warnings
import numpy as np
import pandas as pd
import pandas_profiling as pdp
from datetime import datetime
from IPython.core.display import display

import xgboost as xgb
from sklearn.svm import LinearSVC, libsvm, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')
# Raise the pandas display limits so wide / long frames are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 2000)

In [None]:
# Build the hand-crafted explanatory variables
def create_explain_variable(df):
    """Return a copy of ``df`` with hand-crafted explanatory variables.

    The input frame is left untouched. The columns ``campaign``, ``duration``,
    ``month``, ``day``, ``balance``, ``age``, ``job``, ``previous`` and
    ``pdays`` are read.

    Modified columns
        campaign : values above 20 are collapsed to 21.
        duration : natural log, with 0 replaced by 1e-7 beforehand.
        month    : three-letter abbreviation replaced by the month number.

    Added columns
        md            : month * 100 + day (e.g. 405 for April 5th).
        md_num        : day of a 365-day year (1..365) for ``md``.
        md_num_r      : ``md_num`` modulo 7, used as a stand-in for weekday.
        mean_balance  : balance / (age - 21); 0 for age <= 21 and students.
        mean_meet_num : previous / pdays; 0 where pdays == -1.

    Raises
        KeyError : for a month abbreviation outside jan..dec, or a month/day
                   pair that is not a date of a non-leap year (e.g. Feb 29).
    """
    df_add_var = df.copy()

    # Cap the campaign count.
    df_add_var.loc[20 < df_add_var.campaign, "campaign"] = 21

    # Log-transform duration. The column is cast to float *before* the 1e-7
    # placeholder is written: storing a float into an integer column through
    # .loc relies on implicit upcasting, which pandas has deprecated.
    df_add_var["duration"] = df_add_var.duration.astype(float)
    df_add_var.loc[df_add_var.duration == 0, "duration"] = 1e-7
    df_add_var["duration"] = df_add_var.duration.apply(lambda x: np.log(x))

    # Month number and combined month+day key.
    dict_month = {"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12,"jan":1,"feb":2,"mar":3}
    df_add_var["month"] = df_add_var.month.apply(lambda x: dict_month[x]).astype(int)
    df_add_var["md"]    = (df_add_var.month * 100 + df_add_var.day).astype(int)

    # Map the month+day key onto the day number of a 365-day year.
    days_in_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    ch_days, day_of_year = {}, 0
    for month_no, n_days in enumerate(days_in_month, start=1):
        for day in range(1, n_days + 1):
            day_of_year += 1
            ch_days[month_no * 100 + day] = day_of_year
    # Plain dict lookup (not .map) so that an invalid date fails loudly.
    df_add_var["md_num"] = df_add_var.md.apply(lambda x: ch_days[x])

    # Treat the remainder modulo 7 as the day of the week.
    df_add_var["md_num_r"] = df_add_var.md_num % 7

    # Assume people start working at 21 and derive the average yearly savings
    # (age 21 or younger, and students, are treated as 0). The division by
    # zero at age == 21 is overwritten by the line that follows.
    df_add_var["mean_balance"] = df_add_var.balance / (df_add_var.age - 21)
    df_add_var.loc[(df_add_var.age <= 21)|(df_add_var.job=="student"), "mean_balance"] = 0

    # Average number of contacts; rows with pdays == -1 get 0.
    df_add_var["mean_meet_num"] = df_add_var.previous / df_add_var.pdays
    df_add_var.loc[df_add_var.pdays==-1, "mean_meet_num"] = 0
    return df_add_var
    
def add_col_high_rate(df_train, df_test):
    """Add an integer ``high_rate`` score column to both frames, in place.

    For ``job``, ``marital`` and ``education`` the categories whose share of
    ``y`` (sum of y / count of y, measured on ``df_train``) exceeds the
    threshold are collected; for ``poutcome`` the list is fixed to
    ``["success"]``. Every row then receives 2 points per column whose value
    is in the corresponding list.

    Returns the same two (mutated) frames as ``(df_train, df_test)``.
    """
    def calc_high_rate(df, col, threshold=0.12):
        # Categories of `col` whose sum(y) / count(y) is above `threshold`.
        totals = df.groupby(col)["y"].agg(["sum", "count"])
        rate = totals["sum"] / totals["count"]
        return list(rate.index[rate > threshold])

    # The rates are computed before the score column is created.
    dict_high_rate = {col: calc_high_rate(df_train, col)
                      for col in ("job", "marital", "education")}
    dict_high_rate["poutcome"] = ["success"]

    for frame in (df_train, df_test):
        frame["high_rate"] = 0
        for col, vals in dict_high_rate.items():
            frame.loc[frame[col].isin(vals), "high_rate"] += 2
    return df_train, df_test

In [None]:
# Dummy-encode the categorical columns
def dummies(df, cols):
    """Return ``df`` with ``cols`` replaced by their dummy (indicator) columns.

    The row index is reset to 0..n-1. The first level of every encoded column
    is dropped (``drop_first=True``); the indicator columns are placed after
    the remaining original columns.
    """
    renumbered = df.reset_index(drop=True)
    indicators = pd.get_dummies(renumbered[cols], drop_first=True)
    untouched  = renumbered.drop(cols, axis=1)
    return pd.merge(untouched, indicators, left_index=True, right_index=True)

# Standardise the numeric columns
def standardization(df, cols, df_test=None):
    """Z-score ``cols`` of ``df`` and, optionally, of ``df_test``.

    Mean and standard deviation (pandas default, ddof=1) are taken from
    ``df`` only and applied to both frames. Neither input is modified.

    A column that is constant in ``df`` has a standard deviation of 0; it is
    scaled by 1 instead, so it becomes all zeros rather than NaN / inf.

    Returns
        (df_std, df_test_std) : the standardised copies; ``df_test_std`` is
        ``None`` when no ``df_test`` was given.
    """
    mean  = df[cols].mean()
    std   = df[cols].std()
    # Guard against division by zero for zero-variance columns.
    scale = std.mask(std == 0, 1.0)

    df_std = df.copy()
    df_std[cols] = (df_std[cols] - mean) / scale
    if df_test is None:
        return df_std, None

    df_test_std = df_test.copy()
    df_test_std[cols] = (df_test_std[cols] - mean) / scale
    return df_std, df_test_std

In [None]:
# Directory holding the raw CSV files, relative to the notebook.
path = "../../../../study/bank/motodata/"
# Read the training and test tables from that directory.
df_train, df_test = (pd.read_csv(path + csv_name)
                     for csv_name in ("train.csv", "test.csv"))

In [None]:
# Preview the first rows of the raw training table.
df_train.head()

In [None]:
# Remove outlier rows (this step was not run when everyone's models were combined)
# Column name -> exact values whose rows are dropped from the training data.
dict_del = {"previous": [275],
            "balance" : [-6847, 102127]}
for col, vals in dict_del.items():
    del_idx = df_train.index[df_train[col].isin(vals)]
    df_train.drop(del_idx, inplace=True)
# Renumber the rows 0..n-1 after the deletions.
df_train.reset_index(drop=True, inplace=True)

In [None]:
# Hand-crafted variables
# Both frames go through the same feature function; each call returns a new frame.
df_train_add_var = create_explain_variable(df_train)
df_test_add_var  = create_explain_variable(df_test)
# Adds the "high_rate" score column to both frames (rates are measured on the training frame).
df_train_add_var, df_test_add_var = add_col_high_rate(df_train_add_var, df_test_add_var) 

In [None]:
# Drop the columns that are no longer needed
drop_cols = ["id","month","md"]
for frame in (df_train_add_var, df_test_add_var):
    frame.drop(drop_cols, axis=1, inplace=True)

In [None]:
# Standardisation: every int/float column except the target "y".
std_cols = df_train_add_var.select_dtypes(include=["int","float"]).columns
std_cols = std_cols.drop("y")
df_train_add_var, df_test_add_var = standardization(df_train_add_var, std_cols, df_test_add_var)
# Dummy encoding: train and test are stacked first so that both end up with
# the same indicator columns. The test rows have no "y", so it is NaN there.
dummies_cols = df_train_add_var.select_dtypes(include="object").columns
df_dummies   = pd.concat([df_train_add_var, df_test_add_var])
df_dummies   = dummies(df_dummies, dummies_cols)
# Split back on whether "y" is present. Explicit copies are taken so the
# assignments below do not operate on slices of df_dummies.
is_train = df_dummies.y.notnull()
df_train_add_var = df_dummies[is_train].copy()
df_train_add_var["y"] = df_train_add_var.y.astype(int)
df_test_add_var  = df_dummies[~is_train].drop("y", axis=1)

In [None]:
# Show the first rows of the prepared training and test frames.
display(df_train_add_var.head(), df_test_add_var.head())

In [None]:
# Build a pandas-profiling report of the prepared training frame; as the last
# expression of the cell it is rendered as the cell output.
pdp.ProfileReport(df_train_add_var)

In [None]:
# Stacking
class stacking():
    """Collect out-of-fold and test predictions of base classifiers.

    Each call to :meth:`fit` appends one entry to three parallel lists:

    stack_train  : frame with columns ``idx`` (row position in ``train``),
                   ``<name>`` (out-of-fold score) and ``y`` (target).
    stack_test   : frame with columns ``idx`` (0..n_test-1) and ``<name>``
                   (score from the model refitted on all training rows).
    stack_metric : frame with one row per fold plus an ``"all"`` row holding
                   the metric values.

    Parameters
        train  : DataFrame containing the features and the target column.
        test   : DataFrame containing the same feature columns, no target.
        metric : callable ``metric(y_true, y_score)``.
        y_col  : name of the target column in ``train``.
        seed   : random seed, used for the fold split only when shuffling.
    """

    def __init__(self, train, test, metric, y_col="y", seed=15):
        self.seed  = seed
        self.y_col = y_col
        self.train,       self.test,       self.metric       = train, test, metric
        self.stack_train, self.stack_test, self.stack_metric = [],    [],   []

    def _append_df(self, original, append):
        """Return ``original`` followed by the rows of ``append``, re-indexed 0..n-1."""
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to stack frames.
        return pd.concat([original, append], ignore_index=True)

    def _stack(self, train, test, metric):
        """Store the results of one fitted model in the three result lists."""
        self.stack_train.append(train)
        self.stack_test.append(test)
        self.stack_metric.append(metric)

    def calc_proba(self, proba):
        """Return ``1 - proba[:, 0]`` for a 2-D ``predict_proba`` output.

        Presumably this is the positive-class probability of a binary
        classifier (review note: confirm the class order of the estimator).
        """
        return 1 - proba[:,0]

    def fit(self, clf, cv=4, name="clf", shuffle=False):
        """Run stratified K-fold on ``clf`` and store its stacked predictions.

        Parameters
            clf     : estimator with ``fit`` and ``predict_proba``; it is
                      refitted in place for every fold and finally on all rows.
            cv      : number of folds.
            name    : label used for the prediction / metric column names.
            shuffle : shuffle rows before splitting. The default ``False``
                      keeps the original contiguous folds; ``self.seed`` is
                      only passed on when shuffling, because scikit-learn
                      rejects a ``random_state`` without ``shuffle=True``.
        """
        statime = datetime.now()
        x, y = self.train.drop(self.y_col, axis=1), self.train[self.y_col]
        skf  = StratifiedKFold(n_splits=cv, shuffle=shuffle,
                               random_state=self.seed if shuffle else None)

        # Per-fold frames are gathered in lists and concatenated once.
        fold_preds, fold_metrics = [], []
        for k, (train, valid) in enumerate(skf.split(x, y)):
            # Positional indexing for both x and y, so the split does not
            # depend on the frame having a 0..n-1 label index.
            x_tr, y_tr = x.iloc[train, :], y.iloc[train]
            x_va, y_va = x.iloc[valid, :], y.iloc[valid]
            clf.fit(x_tr, y_tr)
            train_pred = self.calc_proba(clf.predict_proba(x_tr))
            valid_pred = self.calc_proba(clf.predict_proba(x_va))
            fold_metrics.append(pd.DataFrame({"k"           : [k+1],
                                              "train_"+name : self.metric(y_tr, train_pred),
                                              "valid_"+name : self.metric(y_va, valid_pred)}))
            fold_preds.append(pd.DataFrame({"idx" : valid,
                                            name  : valid_pred,
                                            "y"   : y_va.values}))

        # Refit on every training row and score the test set.
        clf.fit(x, y)
        all_pred  = self.calc_proba(clf.predict_proba(x))
        test_pred = self.calc_proba(clf.predict_proba(self.test))
        fold_metrics.append(pd.DataFrame({"k"           : ["all"],
                                          "train_"+name : self.metric(y, all_pred)}))

        r_train  = pd.concat(fold_preds,   ignore_index=True)
        r_metric = pd.concat(fold_metrics, ignore_index=True)
        # Row positions of self.test; no dependency on a notebook global.
        t_pred   = pd.DataFrame({"idx" : np.arange(len(self.test)),
                                 name  : test_pred})
        self._stack(r_train, t_pred, r_metric)
        print("%s training end. time:%s" % (name, datetime.now()-statime))

In [None]:
# Stacking with k=4 cross-validation (hyper-parameters were chosen by GridSearch)
seed=15

# The seven XGBoost models share one baseline and differ in a single setting each.
xgb_defaults  = dict(learning_rate=0.01, max_depth=6, min_child_weight=4, gamma=3, n_estimators=1000, random_state=seed)
xgb_overrides = [{},
                 {"max_depth": 5},
                 {"max_depth": 7},
                 {"min_child_weight": 3},
                 {"min_child_weight": 5},
                 {"n_estimators": 2000},
                 {"learning_rate": 0.02}]

names       = ["gb%d" % n for n in range(1, len(xgb_overrides) + 1)] + ["ab", "rf", "et"]
classifiers = [xgb.XGBClassifier(**dict(xgb_defaults, **override)) for override in xgb_overrides]
classifiers += [AdaBoostClassifier(learning_rate=0.1, n_estimators=3000, random_state=seed),
                RandomForestClassifier(max_depth=24, min_samples_leaf=4, min_samples_split=5, n_estimators=1000, n_jobs=-1, random_state=seed),
                ExtraTreesClassifier(max_depth=24, min_samples_leaf=1, n_estimators=500, n_jobs=-1, random_state=seed)]

# Fit every base model and collect its stacked predictions.
s = stacking(df_train_add_var, df_test_add_var, roc_auc_score)
for name, clf in zip(names, classifiers):
    s.fit(clf, name=name)

In [None]:
# AUC of each model (per-model metric frames placed side by side)
#  xgb : gradient boosting
#  ab  : AdaBoost
#  rf  : random forest
#  et  : Extra-trees
pd.concat(s.stack_metric, axis=1)

In [None]:
import functools
# Join the per-model prediction frames into one wide frame per data set:
# training frames are matched on (idx, y), test frames on idx.
df_train_stack = functools.reduce(lambda left, right: left.merge(right, on=["idx","y"]), s.stack_train)
df_test_stack  = functools.reduce(lambda left, right: left.merge(right, on="idx"),       s.stack_test)
for label, frame in (("train shape :", df_train_stack),
                     ("test  shape :", df_test_stack)):
    print(label, frame.shape)
df_train_stack.head()

In [None]:
# Inputs of the meta model: the base-model score columns only.
y    = df_train_stack["y"]
x    = df_train_stack.drop(columns=["idx", "y"])
test = df_test_stack.drop(columns="idx")
# meta model (hyper-parameters were chosen by GridSearch)
meta_params = dict(gamma=10, learning_rate=0.1, max_depth=8, min_child_weight=10, n_estimators=1000, random_state=seed)
classifier  = xgb.XGBClassifier(**meta_params)
classifier.fit(x, y)
train_pred, test_pred = (s.calc_proba(classifier.predict_proba(features))
                         for features in (x, test))
# AUC on the same rows the meta model was fitted on.
print("train auc:", roc_auc_score(y, train_pred))

In [None]:
# Submission table: 1-based row number plus the meta-model score.
n_test_rows = df_test_stack.shape[0]
df_result = pd.DataFrame({"idx"    : np.arange(1, n_test_rows + 1),
                          "result" : test_pred})
# Written without header row and without the frame index.
path = "../../../../study/bank/submit/"
df_result.to_csv(path + "result_20180909_4.csv", header=None, index=False)

In [None]:
##########################################
# cross validation
# Seed shared by the estimator and train/test-split cells below.
seed  = 15

In [None]:
# Gradient boosting (XGBoost): estimator and hyper-parameter grid.
# The cells below rebind `classifier` / `parameters`; the grid-search cell
# uses whichever pair was assigned last.
classifier = xgb.XGBClassifier(random_state=seed)
parameters = {'n_estimators'     : [500,1000],
              'learning_rate'    : [0.1,0.3,0.5], 
              'max_depth'        : [6,8,10],
              'min_child_weight' : [4,6,10],
              'gamma'            : [0,3,10]}

In [None]:
# Extra-trees: estimator and hyper-parameter grid (rebinds `classifier` / `parameters`).
classifier = ExtraTreesClassifier(random_state=seed)
parameters = {'n_estimators'     : [500, 1000],
              'max_depth'        : [24],
              'min_samples_leaf' : [1, 3, 5]}

In [None]:
# Random forest: estimator and hyper-parameter grid (rebinds `classifier` / `parameters`).
classifier = RandomForestClassifier(random_state=seed)
parameters = {'n_estimators'     : [1000],
              'max_depth'        : [24],
              'min_samples_leaf' : [2,5,6,10],
              'min_samples_split': [2,10,15,20]}

In [None]:
# k-nearest neighbours: estimator and hyper-parameter grid (rebinds `classifier` / `parameters`).
classifier = KNeighborsClassifier()
parameters = {'n_neighbors':[62,63,64,65,66,67,68,69,70],
              "leaf_size"  :[1],
              "p"          :[1]}

In [None]:
# SVM: estimator and hyper-parameter grid (rebinds `classifier` / `parameters`).
# C and gamma are each searched over three log-spaced values from 1e-4 to 1e4.
# NOTE(review): SVC is created without probability=True, while a later cell
# calls cv.predict_proba — presumably that call fails for this estimator; confirm.
classifier = SVC(random_state=seed)
parameters = {'C'    :np.logspace(-4, 4, 3),
              'gamma':np.logspace(-4, 4, 3)}

In [None]:
# AdaBoost: estimator and hyper-parameter grid (rebinds `classifier` / `parameters`).
classifier = AdaBoostClassifier(random_state=seed)
parameters = {'n_estimators'    :[3000,5000],
              'learning_rate'   :[0.1,0.2,0.3]}

In [None]:
# Logistic regression: estimator and hyper-parameter grid (rebinds `classifier` / `parameters`).
classifier = LogisticRegression(random_state=seed)
parameters = {'C' : [0.1,1,10,20,30,40,50,60,70,80,90,100]}

In [None]:
# Grid search on the engineered features: hold out 20% of the rows, then
# tune on the remaining 80% with 4-fold CV scored by ROC AUC.
y = df_train_add_var["y"]
x = df_train_add_var.drop(columns="y")
train_X, test_X, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=seed)

cv = GridSearchCV(estimator=classifier, param_grid=parameters,
                  scoring="roc_auc", cv=4, n_jobs=-1, verbose=10)
cv.fit(train_X, train_y)

In [None]:
# Grid search on the stacked base-model scores: hold out 20% of the rows,
# then tune on the remaining 80% with 4-fold CV scored by ROC AUC.
y = df_train_stack["y"]
x = df_train_stack.drop(columns=["idx", "y"])
train_X, test_X, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=seed)

cv = GridSearchCV(estimator=classifier, param_grid=parameters,
                  scoring="roc_auc", cv=4, n_jobs=-1, verbose=10)
cv.fit(train_X, train_y)

In [None]:
# Score the held-out rows with the best estimator found by the grid search.
# `calc_proba` exists only as a method of the stacking class, so the same
# expression (1 - first probability column) is written out here.
pred = 1 - cv.predict_proba(test_X)[:, 0]
print("best model auc:", roc_auc_score(test_y, pred))
# Call get_params() so the cell shows the parameter dict rather than the bound method.
cv.best_estimator_.get_params()