<a href="https://colab.research.google.com/github/tomo77/MoneyPlanner/blob/main/home_credit_default_risk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import numpy as np
import pandas as pd
import re
import pickle
import gc

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

In [53]:
from google.colab import drive
drive.mount('/content/drive')
input_dir = "/content/drive/MyDrive/dev/input"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
application_train = pd.read_csv(f'{input_dir}/home-credit-default-risk/application_train.csv')
print(application_train.shape)
application_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
application_train["DAYS_EMPLOYED"] = application_train["DAYS_EMPLOYED"].replace(365243, np.nan)

# 総所得金額を世帯人数で割った値
application_train['INCOME_div_PERSON'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']

# 総所得金額を就労期間で割った値
application_train['INCOME_div_EMPLOYED'] = application_train['AMT_INCOME_TOTAL'] / application_train['DAYS_EMPLOYED']

# 外部スコアの平均など
application_train["EXT_SOURCE_mean"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_train["EXT_SOURCE_max"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis=1)
application_train["EXT_SOURCE_min"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis=1)
application_train["EXT_SOURCE_std"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_train["EXT_SOURCE_count"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)

# 就労期間を年齢で割った値
application_train['EMPLOYED_div_BIRTH'] = application_train['DAYS_EMPLOYED'] / application_train['DAYS_BIRTH']

# 年金支払い金額を所得金額で割った値
application_train['ANNUITY_div_INCOME'] = application_train['AMT_ANNUITY'] / application_train['AMT_INCOME_TOTAL']

# 年金支払い額を借入金で割った値
application_train['ANNUITY_div_CREDIT'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']

# メモリ削減

各カラムのデータに応じてデータ型を最適化する

In [56]:
def reduce_mem_usage(df):
  start_mem = df.memory_usage().sum() / 1024**2
  print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

  for col in df.columns:
    col_type = df[col].dtype

    if col_type != object:
      c_min = df[col].min()
      c_max = df[col].max()
      if str(col_type)[:3] == 'int':
        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
          df[col] = df[col].astype(np.int8)
        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
          df[col] = df[col].astype(np.int16)
        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
          df[col] = df[col].astype(np.int32)
        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
          df[col] = df[col].astype(np.int64)
      else:
        if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
          df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
          df[col] = df[col].astype(np.float32)
        else:
          df[col] = df[col].astype(np.float64)
    else:
      pass

  end_mem = df.memory_usage().sum() / 1024**2
  print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
  print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

  return df

In [57]:
application_train = reduce_mem_usage(application_train)

Memory usage of dataframe is 309.69 MB
Memory usage after optimization is: 99.71 MB
Decreased by 67.8%


# データセット作成

In [58]:
x_train = application_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = application_train["TARGET"]
id_train = application_train[["SK_ID_CURR"]]

# カテゴリ変数をcategory型に変換
for col in x_train.columns:
  if x_train[col].dtype == "object":
    x_train[col] = x_train[col].astype("category")


# バリデーション設計

In [59]:
print("mean: {:.4f}".format(y_train.mean()))
y_train.value_counts()
# 不均衡データになっている

mean: 0.0807


Unnamed: 0_level_0,count
TARGET,Unnamed: 1_level_1
0,282686
1,24825


In [60]:
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold,
              n_splits=5
              ):

  train_oof = np.zeros(len(input_x))
  metrics = []
  imp = pd.DataFrame()

  # Cross-Validation
  cv = list(StratifiedKFold(n_splits=len(list_nfold), shuffle=True, random_state=0).split(x_train, y_train))

  for nfold in list_nfold:
    print("-"*20, nfold, "-"*20)

    idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
    x_tr, y_tr, id_tr = x_train.loc[idx_tr, :], y_train[idx_tr], id_train.loc[idx_tr, :]
    x_va, y_va, id_va = x_train.loc[idx_va, :], y_train[idx_va], id_train.loc[idx_va, :]
    print(x_tr.shape, y_tr.shape, id_tr.shape)
    print(x_va.shape, y_va.shape, id_va.shape)

    # train
    model = lgb.LGBMClassifier(**params)
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_va, y_va)],
              callbacks=[lgb.early_stopping(stopping_rounds=100,
                                            verbose=True)])

    # モデルの保存
    fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
    with open(fname_lgb, "wb") as f:
      pickle.dump(model, f, protocol=4)

    # evaluate
    y_tr_pred = model.predict_proba(x_tr)[:, 1]
    y_va_pred = model.predict_proba(x_va)[:, 1]
    metric_tr = roc_auc_score(y_tr, y_tr_pred)
    metric_va = roc_auc_score(y_va, y_va_pred)
    metrics.append([nfold, metric_tr, metric_va])
    print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

    # oof
    train_oof[idx_va] = y_va_pred

    # imp
    _imp = pd.DataFrame({
        "col": input_x.columns,
        "imp": model.feature_importances_,
        "nfold": nfold
    })
    imp = pd.concat([imp, _imp])

  print("-"*20, "result", "-"*20)
  metrics = np.array(metrics)
  print(metrics)
  print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
      metrics[:, 1].mean(), metrics[:, 1].std(),
      metrics[:, 2].mean(), metrics[:, 2].std()
  ))
  print("[oof] {:.4f}".format(
      roc_auc_score(input_y, train_oof)
  ))

  # oof
  train_oof = pd.concat([
      input_id,
      pd.DataFrame({"pred": train_oof})
  ], axis=1)

  # importance
  imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
  imp.columns = ["col", "imp", "imp_std"]

  return train_oof, imp, metrics

In [61]:
params = {
  "boosting_type": 'gbdt',
  "objective": 'binary',
  "metric": 'auc',
  "learning_rate": 0.05,
  "num_leaves": 32,
  "n_estimators": 100000,
  "random_state": 123,
  "importance_type": "gain",
}

train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5)

-------------------- 0 --------------------
(246008, 130) (246008,) (246008, 1)
(61503, 130) (61503,) (61503, 1)
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.188421 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13692
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 126
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[436]	valid_0's auc: 0.766028
[auc] tr:0.8614, va:0.7660
-------------------- 1 --------------------
(246009, 130) (246009,) (246009, 1)
(61502, 130) (61502,) (61502, 1)
[LightGBM] [Info] Number of positive: 19860, number of neg

In [62]:
imp.sort_values("imp", ascending=False)[:100]

Unnamed: 0,col,imp,imp_std
44,EXT_SOURCE_mean,113496.802340,1275.265061
10,ANNUITY_div_CREDIT,23182.473565,1248.867620
112,ORGANIZATION_TYPE,21488.596948,1683.023976
41,EXT_SOURCE_3,11342.311082,1073.885576
24,DAYS_BIRTH,7640.224438,578.142948
...,...,...,...
118,REG_CITY_NOT_WORK_CITY,102.406912,48.672100
54,FLAG_DOCUMENT_16,100.494956,9.538030
89,LIVE_CITY_NOT_WORK_CITY,100.214056,40.176666
56,FLAG_DOCUMENT_18,85.704771,27.084422


# モデル推論

In [63]:
application_test = pd.read_csv(f'{input_dir}/home-credit-default-risk/application_test.csv')
application_test = reduce_mem_usage(application_test)

application_test["DAYS_EMPLOYED"] = application_test["DAYS_EMPLOYED"].replace(365243, np.nan)

# 総所得金額を世帯人数で割った値
application_test['INCOME_div_PERSON'] = application_test['AMT_INCOME_TOTAL'] / application_test['CNT_FAM_MEMBERS']

# 総所得金額を就労期間で割った値
application_test['INCOME_div_EMPLOYED'] = application_test['AMT_INCOME_TOTAL'] / application_test['DAYS_EMPLOYED']

# 外部スコアの平均など
application_test["EXT_SOURCE_mean"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_test["EXT_SOURCE_max"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis=1)
application_test["EXT_SOURCE_min"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis=1)
application_test["EXT_SOURCE_std"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_test["EXT_SOURCE_count"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)

# 就労期間を年齢で割った値
application_test['EMPLOYED_div_BIRTH'] = application_test['DAYS_EMPLOYED'] / application_test['DAYS_BIRTH']

# 年金支払い金額を所得金額で割った値
application_test['ANNUITY_div_INCOME'] = application_test['AMT_ANNUITY'] / application_test['AMT_INCOME_TOTAL']

# 年金支払い額を借入金で割った値
application_test['ANNUITY_div_CREDIT'] = application_test['AMT_ANNUITY'] / application_test['AMT_CREDIT']

x_test = application_test.drop(columns=["SK_ID_CURR"])
id_test = application_test[["SK_ID_CURR"]]

for col in x_test.columns:
  if x_test[col].dtype == "object":
    x_test[col] = x_test[col].astype("category")


Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 14.60 MB
Decreased by 67.6%


In [64]:
def predict_lgb(input_x,
                input_id,
                list_nfold=[0,1,2,3,4]):
  pred = np.zeros((len(input_x), len(list_nfold)))

  for nfold in list_nfold:
    print("-"*20, nfold, "-"*20)
    fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
    with open(fname_lgb, "rb") as f:
      model = pickle.load(f)
    pred[:, nfold] = model.predict_proba(input_x)[:, 1]

  pred = pd.concat([
      input_id,
      pd.DataFrame({"pred": pred.mean(axis=1)})
  ], axis=1)

  print()

  return pred

In [65]:
test_pred = predict_lgb(x_test, id_test, list_nfold=[0,1,2,3,4])
df_submit = test_pred.rename(columns={"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head())

# ファイル出力
df_submit.to_csv("submission_baseline.csv", index=None)

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.028528
1,100005,0.106274
2,100013,0.021244
3,100028,0.041968
4,100038,0.183107
