In [None]:
import os
import copy
import numpy as np
import pandas as pd
import pandas_profiling as pdp
from IPython.core.display import display
from datetime import datetime
import warnings
%matplotlib inline
# Suppress every warning for the rest of the session (this includes deprecation notices).
warnings.filterwarnings('ignore')
# Raise the pandas display limits so wide / long frames are rendered with less truncation.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_colwidth", 2000)

In [None]:
import xgboost as xgb
from sklearn.svm import LinearSVC, libsvm, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Convert categorical columns into dummy (indicator) variables
def dummies(df, cols):
    """Replace the columns ``cols`` of ``df`` with dummy (indicator) columns.

    The index of ``df`` is discarded and replaced by a fresh 0..n-1 index.
    ``pd.get_dummies`` is called with ``drop_first=True``, so the first level
    of every encoded column gets no indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : list-like of column labels
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns followed by the indicator columns, joined
        on the new positional index.
    """
    base = df.reset_index(drop=True)
    encoded = pd.get_dummies(base[cols], drop_first=True)
    remaining = base.drop(cols, axis=1)
    return pd.merge(remaining, encoded, left_index=True, right_index=True)

# Standardize numeric columns (z-score)
def standardization(df, cols, df_test=None):
    """Standardize ``cols`` using the mean and standard deviation of ``df``.

    The statistics are computed on ``df`` only and are then applied to both
    ``df`` and, when given, ``df_test``, so the test frame is scaled with the
    training statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the mean / std are computed from; it is not modified.
    cols : list-like of column labels
        Numeric columns to standardize.
    df_test : pandas.DataFrame, optional
        Second frame to scale with the same statistics; it is not modified.

    Returns
    -------
    tuple
        ``(df_std, df_test_std)``; ``df_test_std`` is ``None`` when
        ``df_test`` is ``None``.
    """
    df_std = df.copy()
    df_test_std = None if df_test is None else df_test.copy()
    if len(cols) == 0:
        # Nothing to scale: hand back plain copies.
        return df_std, df_test_std

    mean = df[cols].mean()
    std = df[cols].std()
    # A constant column has std == 0; dividing by it would turn the column into
    # NaN / inf, so such columns are only centred (scaled by 1).
    scale = std.where(std != 0, 1.0)

    df_std[cols] = (df_std[cols] - mean) / scale
    if df_test_std is not None:
        df_test_std[cols] = (df_test_std[cols] - mean) / scale
    return df_std, df_test_std

In [None]:
# Functions used for stacking
def calc_proba(proba):
    """Return ``1 - proba[:, 0]`` for a 2-D probability array.

    ``proba`` is indexed as ``proba[:, 0]``, so it must be a 2-D array-like
    supporting that indexing (e.g. the output of ``predict_proba``).
    """
    first_column = proba[:, 0]
    return 1 - first_column

def stacking(df_train, df_test, clf, name, seed=15, cv=4):
    """Produce out-of-fold and test predictions of ``clf`` for stacking.

    ``df_train`` is split into ``cv`` stratified, shuffled folds. For every
    fold ``clf`` is fitted on the training part and its predictions on the
    validation part are collected. Finally ``clf`` is refitted on all of
    ``df_train`` and used to predict ``df_test``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data; must contain the label column ``"y"``.
    df_test : pandas.DataFrame
        Test features, passed to ``clf.predict_proba`` unchanged.
    clf : estimator
        Object providing ``fit`` and ``predict_proba``. It is refitted in
        place and ends up fitted on the full training data.
    name : str
        Name of the prediction column and suffix of the AUC columns.
    seed : int, default 15
        Random state used to shuffle the folds.
    cv : int, default 4
        Number of folds.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_pred, df_test_pred, df_auc)``:
        ``df_train_pred`` has columns ``idx`` (index label of the row in
        ``df_train``), ``name`` and ``y``; ``df_test_pred`` has ``idx`` and
        ``name``; ``df_auc`` has one row per fold plus an ``"all"`` row.
    """
    statime = datetime.now()
    x, y = df_train.drop("y", axis=1), df_train.y

    auc_records = []
    fold_preds = []
    # random_state only has an effect when shuffle=True; recent scikit-learn
    # versions raise a ValueError when it is set without shuffling.
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    for k, (train, valid) in enumerate(skf.split(x, y), start=1):
        # split() yields positional indices, so both x and y are sliced with iloc.
        x_train, y_train = x.iloc[train, :], y.iloc[train]
        x_valid, y_valid = x.iloc[valid, :], y.iloc[valid]
        clf.fit(x_train, y_train)
        train_pred = calc_proba(clf.predict_proba(x_train))
        valid_pred = calc_proba(clf.predict_proba(x_valid))
        auc_records.append({"k": k,
                            "train_" + name: roc_auc_score(y_train, train_pred),
                            "valid_" + name: roc_auc_score(y_valid, valid_pred)})
        fold_preds.append(pd.DataFrame({"idx": x.index[valid].values,
                                        name: valid_pred,
                                        "y": y_valid.values}))

    # Refit on the complete training data before predicting the test set.
    clf.fit(x, y)
    all_pred = calc_proba(clf.predict_proba(x))
    test_pred = calc_proba(clf.predict_proba(df_test))
    auc_records.append({"k": "all",
                        "train_" + name: roc_auc_score(y, all_pred)})

    # Build the result frames once instead of appending row by row.
    df_auc = pd.DataFrame(auc_records, columns=["k", "train_" + name, "valid_" + name])
    df_train_pred = pd.concat(fold_preds, ignore_index=True)
    df_test_pred = pd.DataFrame({"idx": df_test.index, name: test_pred})
    print("clf:%s time:%s end" % (name, datetime.now() - statime))
    return df_train_pred, df_test_pred, df_auc

In [None]:
# Read every CSV file in the data directory into dict_df, keyed by file name without extension.
# Files whose name contains "sample" are skipped.
path    = "../../../../Users/tenni/Documents/kaggle/HomeCredditDefaultRisk/data/"
dict_df = {}
for file in os.listdir(path):
    if ".csv" not in file or "sample" in file:
        continue
    filename = os.path.splitext(file)[0]
    # The column-description file is read as ISO-8859-1, everything else as UTF-8.
    enc = "ISO-8859-1" if filename == "HomeCredit_columns_description" else "utf-8"
    print(filename)
    dict_df[filename] = pd.read_csv(path + file, encoding=enc)

In [None]:
# Move the column-description table out of dict_df into its own variable and display it.
df_cols_description = dict_df.pop("HomeCredit_columns_description")
df_cols_description

In [None]:
# Show the first three rows of every loaded table.
for key in dict_df:
    df = dict_df[key]
    print(key)
    display(df.head(3))

In [None]:
# Show summary statistics of every loaded table.
for key in dict_df:
    df = dict_df[key]
    print(key)
    display(df.describe())

In [None]:
# Drop every column that contains a missing value.
# For a train table the surviving columns (minus TARGET) are also used to
# select the columns of the matching test table.
dict_df_droped_na = {}
for key, df in dict_df.items():
    if "test" in key:
        # Test tables are filled in together with their train counterpart.
        continue
    df_droped_na = df.dropna(axis=1)
    dict_df_droped_na[key] = df_droped_na
    if "train" in key:
        keep_cols = df_droped_na.columns.drop("TARGET")
        key_test  = key.replace("train", "test")
        dict_df_droped_na[key_test] = dict_df[key_test][keep_cols]

# Print the shape of each table before and after dropping columns.
for key, df in dict_df.items():
    print(key)
    print(df.shape, dict_df_droped_na[key].shape)

In [None]:
dict_df_edited = {}

# Standardization: scale the "int"/"float" columns of every table, leaving the
# ID columns and TARGET untouched. A test table is scaled together with its
# train table; tables with no column to scale are not copied to dict_df_edited.
for key, df in dict_df_droped_na.items():
    std_cols = [
        col
        for col in df.select_dtypes(include=["int", "float"]).columns
        if col not in ("SK_ID_CURR", "SK_ID_BUREAU", "SK_ID_PREV", "TARGET")
    ]
    if "test" in key or not std_cols:
        continue
    if "train" in key:
        key_test = key.replace("train", "test")
        dict_df_edited[key], dict_df_edited[key_test] = standardization(df, std_cols, dict_df_droped_na[key_test])
    else:
        dict_df_edited[key] = standardization(df, std_cols)[0]

# Dummy encoding: encode the object-dtype columns of every table.
for key, df in dict_df_edited.items():
    dummy_cols = df.select_dtypes(include="object").columns
    if "test" in key or len(dummy_cols) == 0:
        continue
    if "train" not in key:
        dict_df_edited[key] = dummies(df, dummy_cols)
        continue
    # Train and test are encoded as one frame so both get the same dummy columns;
    # rows are split back afterwards by whether TARGET is present.
    key_test   = key.replace("train", "test")
    df_dummy   = dummies(pd.concat([df, dict_df_edited[key_test]]), dummy_cols)
    has_target = df_dummy.TARGET.notnull()
    dict_df_edited[key]      = df_dummy[has_target].assign(TARGET=lambda d: d.TARGET.astype(int))
    dict_df_edited[key_test] = df_dummy[~has_target].drop("TARGET", axis=1)

In [None]:
# Shape and first five rows of the preprocessed application_train table.
print(dict_df_edited["application_train"].shape)
dict_df_edited["application_train"].head(5)

In [None]:
# Collect the columns to drop: columns whose sum is a numpy int64 and whose
# sum divided by the number of rows is below 0.01.
df = dict_df_edited['application_train']
drop_cols = []
for col in df:
    col_sum = df[col].sum()
    if type(col_sum) is np.int64 and col_sum / df.shape[0] < 0.01:
        drop_cols.append(col)

# Feature matrix / label, and a 60 % / 40 % split.
seed = 15
x = df.drop(drop_cols + ["SK_ID_CURR","TARGET"], axis=1)
y = df.TARGET
train_X, test_X, train_y, test_y = train_test_split(x, y,
                                                    test_size=0.4,
                                                    random_state=seed)

statime = datetime.now()
classifier = xgb.XGBClassifier(gamma=3, learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=1000, random_state=seed)
# Fit on the training split only. Fitting on the full x, y would put the rows of
# test_X into the training data and make any score computed on test_X meaningless.
classifier.fit(train_X, train_y, verbose=10)
# Elapsed training time (displayed as the cell output).
datetime.now() - statime

In [None]:
# ROC AUC of the classifier's predictions for test_X against test_y.
holdout_proba = classifier.predict_proba(test_X)
pred = calc_proba(holdout_proba)
print("best model auc:", roc_auc_score(test_y, pred))

In [None]:
# Remove the pruned columns and the ID column from application_test, then predict.
test = dict_df_edited['application_test'].drop(columns=drop_cols + ["SK_ID_CURR"])
test_proba = classifier.predict_proba(test)
test_pred = calc_proba(test_proba)

In [None]:
# Write the predictions to a CSV with one row per SK_ID_CURR and a TARGET column.
path      = "../../../../Users/tenni/Documents/kaggle/HomeCredditDefaultRisk/submit/"
df_result = pd.DataFrame({
    "SK_ID_CURR": dict_df_edited['application_test']["SK_ID_CURR"],
    "TARGET": test_pred,
})
df_result.to_csv(path + "result_20180705.csv", index=False)