In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import sys
import gc
import os
HOME = os.path.expanduser('~')
import datetime
sys.path.append(f'{HOME}/kaggle/data_analysis/model')
from params_HC import params_lgb

sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from feature_manage import FeatureManage
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Global Variable
sys.path.append(f'../py')
from info_home_credit import hcdr_key_cols
key, target, ignore_list = hcdr_key_cols()
#========================================================================

prev_key = 'SK_ID_PREV'

acr = 'AMT_CREDIT'
aan = 'AMT_ANNUITY'
adp = 'AMT_DOWN_PAYMENT'
cpy = 'CNT_PAYMENT'
co_type = 'NAME_CONTRACT_TYPE'
dd = 'DAYS_DECISION'
amt_list = ["f000_app_"+acr, "f000_app_"+aan]

#========================================================================
# Data Load
base = utils.read_df_pkl('../input/base0*')[[key, target]].set_index(key)
manage = FeatureManage(key, target)
manage.set_base(base)
feat_key_list = [['f000_app'], ['f004_app']]
app_train, app_test = manage.feature_matrix(feat_key_list=feat_key_list)
#========================================================================

target = 'CNT_PAYMENT'
ignore_list.remove("TARGET")
ignore_list.append(target)
# applicationでの予測用データ
app = pd.concat([app_train, app_test], axis=0)
del app_train, app_test

prev = utils.read_df_pkl('../input/clean_prev*')[[key, acr, aan, cpy]]
df = prev.merge(app.drop(amt_list, axis=1), on=key, how='left')
del prev
gc.collect()

app.rename(columns={'f000_app_AMT_CREDIT':acr} , inplace=True)
app.rename(columns={'f000_app_AMT_ANNUITY':aan}, inplace=True)

100%|██████████| 3/3 [00:00<00:00, 78.41it/s]
100%|██████████| 3/3 [00:02<00:00,  1.36it/s]


In [2]:
import ml_utils
from sklearn.model_selection import StratifiedKFold

# The TARGET column arrived with the application features in the merge; it
# is not a feature for the CNT_PAYMENT regression. errors='ignore' keeps
# this cell re-runnable after the column has already been removed.
df.drop(columns=['TARGET'], errors='ignore', inplace=True)
df.reset_index(drop=True, inplace=True)

# Rows with a known CNT_PAYMENT are the training set; rows where it is
# missing are the ones to be predicted.
train = df[~df[cpy].isnull()]
test = df[df[cpy].isnull()]
Y = train[target]

model_type='lgb'
learning_rate = 0.03
# NOTE(review): early_stopping_rounds, num_boost_round and set_type are not
# passed to ml_utils.Regressor below - confirm whether they are meant to be.
early_stopping_rounds = 100
num_boost_round = 30000
metric = 'rmse'

seed = 1208
set_type = 'all'
n_fold = 5

# StratifiedKFold is given the regression target itself, so the distinct
# CNT_PAYMENT values act as the strata.
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
kfold = list(folds.split(train, Y))

params = params_lgb()
params['learning_rate'] = learning_rate
params['objective'] = 'regression'
params['metric'] = metric

#========================================================================
# Train & Prediction Start
feim_list = []
score_list = []
oof_pred = np.zeros(len(train))   # out-of-fold predictions for train rows
y_test = np.zeros(len(test))      # SUM of per-fold predictions for test rows
app_pred = np.zeros(len(app))     # SUM of per-fold predictions for app rows

use_cols = [col for col in train.columns if col not in ignore_list]
x_test = test[use_cols]
# Loop-invariant: slice the application feature matrix once, not per fold.
x_app = app[use_cols]
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

for num_fold, (trn_idx, val_idx) in enumerate(kfold):
    x_train, y_train = train[use_cols].iloc[trn_idx, :], Y.iloc[trn_idx]
    x_val, y_val = train[use_cols].iloc[val_idx, :], Y.iloc[val_idx]

    print(f"Fold{num_fold} | Train:{x_train.shape} | Valid:{x_val.shape}")

    score, tmp_oof, tmp_pred, feim, estimator = ml_utils.Regressor(
        model_type=model_type
        , x_train=x_train
        , y_train=y_train
        , x_val=x_val
        , y_val=y_val
        , x_test=x_test
        , params=params
        , seed=seed
        , get_score=metric
        , get_model=True
    )
    feim_list.append(feim.set_index('feature').rename(columns={'importance':f'imp_{num_fold}'}))

    print(f"Fold{num_fold} CV: {score}")
    score_list.append(score)
    oof_pred[val_idx] = tmp_oof
    y_test += tmp_pred

    # Predict on the application rows with this fold's model (accumulated
    # as a sum across folds, like y_test).
    app_pred += estimator.predict(x_app)

n_feature = len(x_train.columns)
del x_train, y_train, x_val, y_val, x_app
gc.collect()

Fold0 | Train:(922379, 694) | Valid:(230620, 694)
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 7.51517
[400]	valid_0's rmse: 6.16118
[600]	valid_0's rmse: 5.18176
[800]	valid_0's rmse: 4.67026
[1000]	valid_0's rmse: 4.18181
[1200]	valid_0's rmse: 3.83095
[1400]	valid_0's rmse: 3.68065
[1600]	valid_0's rmse: 3.56629
[1800]	valid_0's rmse: 3.45075
[2000]	valid_0's rmse: 3.3712
[2200]	valid_0's rmse: 3.34232
[2400]	valid_0's rmse: 3.31321
[2600]	valid_0's rmse: 3.28532
[2800]	valid_0's rmse: 3.26274
[3000]	valid_0's rmse: 3.24144
[3200]	valid_0's rmse: 3.22682
[3400]	valid_0's rmse: 3.21491
[3600]	valid_0's rmse: 3.20273
[3800]	valid_0's rmse: 3.18909
[4000]	valid_0's rmse: 3.17846
[4200]	valid_0's rmse: 3.17118
[4400]	valid_0's rmse: 3.15816
[4600]	valid_0's rmse: 3.14985
[4800]	valid_0's rmse: 3.14062
[5000]	valid_0's rmse: 3.1335
[5200]	valid_0's rmse: 3.12741
[5400]	valid_0's rmse: 3.12031
[5600]	valid_0's rmse: 3.11592
[5800]	valid_0's rmse: 3.

49

In [3]:
cv_score = np.mean(score_list)
print(f'''
#========================================================================
# Model: {model_type}
# CV   : {cv_score}
#========================================================================''')

# y_test holds the SUM of the per-fold test predictions. Average it into a
# new array instead of dividing in place, so that re-running this cell does
# not shrink the predictions a second time.
test_pred = y_test / (num_fold + 1)

pred_col = 'prediction'
stack_cols = [key, target, pred_col]

# Build the stacking frames with .assign on the selected columns rather than
# writing a new column into train / test, which are boolean-mask slices of
# df (that pattern triggers pandas' SettingWithCopyWarning).
train_stack = train[[key, target]].assign(**{pred_col: oof_pred})
test_stack = test[[key, target]].assign(**{pred_col: test_pred})

# NOTE(review): rows are ordered train-first, then test - not in the row
# order of the previous-application table. Confirm that consumers of the
# saved array expect this order.
df_stack = pd.concat([train_stack, test_stack], ignore_index=True, axis=0)
cpy_pred = df_stack[pred_col].values.astype('float32')

#========================================================================
# Saving
utils.to_pkl_gzip(obj=cpy_pred, path=f"../previous_pred_CNT_PAYMENT_CV{str(cv_score).replace('.','-')}")
#========================================================================


# Model: lgb
# CV   : 2.9225943085922026


In [11]:
# Work on a derived frame: renaming/filling df_stack in place made this cell
# fail with a KeyError on pred_col when run a second time.
pred_name = 'Pred_CNT_PAYMENT'
diff_name = 'diff_Pred_CNT_PAYMENT'
stack_feat = df_stack.rename(columns={pred_col: pred_name})

# Where the true CNT_PAYMENT is missing, substitute the prediction, so the
# difference computed below is 0 for those rows.
stack_feat[target] = stack_feat[target].fillna(stack_feat[pred_name])
stack_feat[diff_name] = stack_feat[pred_name] - stack_feat[target]

# Per-key aggregates of the prediction and of its difference from the
# (filled) target.
agg_methods = ['sum', 'mean', 'max', 'min', 'std', 'skew']
aggs = {}
aggs[diff_name] = list(agg_methods)
aggs[pred_name] = list(agg_methods)
df_agg = stack_feat.groupby(key).agg(aggs)
# Flatten the (column, method) MultiIndex in the same order agg() emits it.
new_cols = [f"{k}-{method}" for k in aggs.keys() for method in aggs[k]]
df_agg.columns = new_cols
# Left-join onto base (indexed by key) so every base row gets a feature row.
df_agg = base.join(df_agg)
prefix='f005_pre_'
utils.save_feature(df_feat=df_agg, is_train=2, target="TARGET", ignore_list=ignore_list+["TARGET"], prefix=prefix)

(307511,) | diff_Pred_CNT_PAYMENT-sum
(307511,) | diff_Pred_CNT_PAYMENT-mean
(307511,) | diff_Pred_CNT_PAYMENT-max
(307511,) | diff_Pred_CNT_PAYMENT-min
(307511,) | diff_Pred_CNT_PAYMENT-std
(307511,) | diff_Pred_CNT_PAYMENT-skew
(307511,) | Pred_CNT_PAYMENT-sum
(307511,) | Pred_CNT_PAYMENT-mean
(307511,) | Pred_CNT_PAYMENT-max
(307511,) | Pred_CNT_PAYMENT-min
(307511,) | Pred_CNT_PAYMENT-std
(307511,) | Pred_CNT_PAYMENT-skew
(48744,) | diff_Pred_CNT_PAYMENT-sum
(48744,) | diff_Pred_CNT_PAYMENT-mean
(48744,) | diff_Pred_CNT_PAYMENT-max
(48744,) | diff_Pred_CNT_PAYMENT-min
(48744,) | diff_Pred_CNT_PAYMENT-std
(48744,) | diff_Pred_CNT_PAYMENT-skew
(48744,) | Pred_CNT_PAYMENT-sum
(48744,) | Pred_CNT_PAYMENT-mean
(48744,) | Pred_CNT_PAYMENT-max
(48744,) | Pred_CNT_PAYMENT-min
(48744,) | Pred_CNT_PAYMENT-std
(48744,) | Pred_CNT_PAYMENT-skew


In [16]:
prefix = "f005_app_"

# app_pred is accumulated with += once per fold in the training loop and is
# never averaged there, so it holds the SUM of the fold predictions. Divide
# by the fold count (into a new array, so re-running this cell is safe) to
# get the mean predicted CNT_PAYMENT, consistent with how y_test is handled.
app_pred_mean = app_pred / (num_fold + 1)

base_train = base[~base["TARGET"].isnull()]
len_train = len(base_train)

# The split below is positional: it relies on app being train rows followed
# by test rows with the same total row count as base.
assert len(app_pred_mean) == len(base), (
    f"app predictions ({len(app_pred_mean)}) do not match base rows ({len(base)})"
)
app_train_pred = app_pred_mean[:len_train]
app_test_pred = app_pred_mean[len_train:]
utils.to_pkl_gzip(obj=app_train_pred, path=f'../features/train_{prefix}App_Pred_CNT_PAYMENT')
utils.to_pkl_gzip(obj=app_test_pred, path=f'../features/test_{prefix}App_Pred_CNT_PAYMENT')