In [1]:
import os
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
PATH = '/home/kai/data/kaggle/homecredit/'


In [2]:
# Read train and test
train = pd.read_pickle(PATH + 'inter/train_only_0.pkl')
test = pd.read_pickle(PATH + 'inter/test_only_0.pkl')
print(train.shape, test.shape)
'done'

(307511, 329) (48744, 328)


'done'

# Cut large piece and get new train&test, and large_pred piece

In [3]:
def large_new(df1, df2, prediction, threshold):
    print('begin cut large')
    pred = prediction.copy()
    train_df, test_df = df1.copy(), df2.copy()
    print('old shapes,' ,test_df.shape,train_df.shape)
    half_pred = pred[pred['TARGET'] > threshold]
    index_half = half_pred.index
    print('length of half',len(half_pred))
    test_half = test_df.iloc[index_half]
    test_half['TARGET'] = 1

    # drop index_half for test_df, add train
    test_df.drop(index_half, inplace = True)
    train_df = pd.concat([train_df,test_half])
    print('new shapes,',test_df.shape,train_df.shape)
    return train_df, test_df, half_pred

# Cut small piece and get new train& test, and small_pred piece

In [4]:
def small_new(df1, df2, prediction, threshold):
    print('begin cut small')
    pred = prediction.copy()
    train_df, test_df = df1.copy(), df2.copy()
    print('old shapes,',test_df.shape,train_df.shape)
    small_pred = pred[pred['TARGET'] < threshold]
    index_small = small_pred.index
    print('length of small',len(small_pred))
    test_small = test_df.iloc[index_small]
    test_small['TARGET'] = 0

    # drop index_small for test_df, add train
    test_df.drop(index_small, inplace = True)
    train_df = pd.concat([train_df,test_small])
    print('new shapes,',test_df.shape,train_df.shape)
    return train_df, test_df, small_pred

# Models

In [5]:
def model_1(train_df, test_df, threshold1, threshold2, n_splits):
    print('begin blended lightgbm')
    # Cross validation model
    stratified = False

    if stratified:
        folds = StratifiedKFold(n_splits, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits, shuffle=True, random_state=45)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    #     fold_importance_df = pd.DataFrame()
    #     fold_importance_df["feature"] = feats
    # #     fold_importance_df["importance"] = clf.feature_importances_
    #     fold_importance_df["fold"] = n_fold + 1
    #     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    
    test_df['TARGET'] = sub_preds
    pred =test_df[['SK_ID_CURR', 'TARGET']]
    len_small = len(pred[pred['TARGET']< threshold1])
    len_large = len(pred[pred['TARGET']> threshold2])
    print('length of small and large', len_small, len_large)
    return pred, len_small, len_large

def model_2(train_df, test_df, threshold1, threshold2, n_splits):
    pass

# Concate back to a submission

In [6]:
# col = 'SK_ID_CURR'
def concat_pred(cutout_pred, modified_pred):
    print('begin concat predictions')
    test_concat = pd.concat([modified_pred, cutout_pred], axis = 0)
    print(test_concat.shape)
    test = pd.read_csv(PATH + 'application_test.csv')
    test = test[['SK_ID_CURR']]
    pred = test.merge(test_concat, how = 'left', on = 'SK_ID_CURR')
    return pred


# Run

In [7]:
threshold1, threshold2 = 0.005, 0.5
n_splits = 5

pred1, small, large = model_1(train, test, threshold1, threshold2,n_splits)

print('begin cutting')
train_small, test_small, cut_small = small_new(train, test, pred1, threshold1)
train_large, test_large, cut_large = large_new(train, test, pred1, threshold2)


begin blended lightgbm
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.74607	valid_1's auc: 0.740495
[200]	training's auc: 0.765681	valid_1's auc: 0.754649
[300]	training's auc: 0.780493	valid_1's auc: 0.762655
[400]	training's auc: 0.791257	valid_1's auc: 0.766253
[500]	training's auc: 0.799451	valid_1's auc: 0.767723
[600]	training's auc: 0.80664	valid_1's auc: 0.768981
[700]	training's auc: 0.813334	valid_1's auc: 0.76967
[800]	training's auc: 0.819083	valid_1's auc: 0.769776
[900]	training's auc: 0.824678	valid_1's auc: 0.77012
[1000]	training's auc: 0.829886	valid_1's auc: 0.770198
[1100]	training's auc: 0.834757	valid_1's auc: 0.770211
[1200]	training's auc: 0.839347	valid_1's auc: 0.770411
[1300]	training's auc: 0.843902	valid_1's auc: 0.770243
[1400]	training's auc: 0.848712	valid_1's auc: 0.770065
Early stopping, best iteration is:
[1214]	training's auc: 0.839914	valid_1's auc: 0.770445
Fold  1 AUC : 0.770450
Training until validation sco

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


new shapes, (48713, 329) (307542, 329)
begin cut new
old shapes, (48744, 329) (307511, 329)
length of half 127
new shapes, (48617, 329) (307638, 329)


In [8]:
modified_small,_,_ =  model_1(train_small, test_small, threshold1, threshold2,n_splits)
pred_small = concat_pred(modified_small,cut_small)

begin blended lightgbm
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.747127	valid_1's auc: 0.739968
[200]	training's auc: 0.765884	valid_1's auc: 0.753647
[300]	training's auc: 0.780765	valid_1's auc: 0.761991
[400]	training's auc: 0.790972	valid_1's auc: 0.765447
[500]	training's auc: 0.799438	valid_1's auc: 0.76754
[600]	training's auc: 0.806608	valid_1's auc: 0.768489
[700]	training's auc: 0.813161	valid_1's auc: 0.769064
[800]	training's auc: 0.819272	valid_1's auc: 0.769408
[900]	training's auc: 0.824848	valid_1's auc: 0.769862
[1000]	training's auc: 0.830152	valid_1's auc: 0.76999
[1100]	training's auc: 0.835148	valid_1's auc: 0.769951
[1200]	training's auc: 0.839924	valid_1's auc: 0.770023
[1300]	training's auc: 0.844353	valid_1's auc: 0.770059
[1400]	training's auc: 0.848973	valid_1's auc: 0.76999
[1500]	training's auc: 0.853191	valid_1's auc: 0.769938
Early stopping, best iteration is:
[1315]	training's auc: 0.845082	valid_1's auc: 0.77

In [9]:
modified_large,_,_ =  model_1(train_large, test_large, threshold1, threshold2,n_splits)
pred_large = concat_pred(cut_large,modified_large)

begin blended lightgbm
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.747556	valid_1's auc: 0.745128
[200]	training's auc: 0.766574	valid_1's auc: 0.757431
[300]	training's auc: 0.781179	valid_1's auc: 0.765618
[400]	training's auc: 0.791989	valid_1's auc: 0.769787
[500]	training's auc: 0.80029	valid_1's auc: 0.771551
[600]	training's auc: 0.807244	valid_1's auc: 0.772518
[700]	training's auc: 0.813873	valid_1's auc: 0.772882
[800]	training's auc: 0.819943	valid_1's auc: 0.773139
[900]	training's auc: 0.825542	valid_1's auc: 0.773403
[1000]	training's auc: 0.830896	valid_1's auc: 0.773417
[1100]	training's auc: 0.835933	valid_1's auc: 0.773565
[1200]	training's auc: 0.840618	valid_1's auc: 0.773286
Early stopping, best iteration is:
[1081]	training's auc: 0.834966	valid_1's auc: 0.773616
Fold  1 AUC : 0.773554
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.748204	valid_1's auc: 0.734264
[200]	training's auc

In [10]:
blend_sub = pd.DataFrame()
blend_sub['SK_ID_CURR'] = pred_small['SK_ID_CURR']
blend_sub['TARGET'] = (pred_small['TARGET']+pred_large['TARGET'])/2

In [19]:
print(blend_sub.head())
blend_sub.to_csv(PATH + 'submission/622_blend_train.csv', index = False)

   SK_ID_CURR    TARGET
0      100001  0.031317
1      100005  0.096532
2      100013  0.011653
3      100028  0.035580
4      100038  0.139134


In [18]:
for i in [blend_sub, pred_small, pred_large,pred1]:
    print((i>0.5).sum())
    print((i<0.005).sum())

SK_ID_CURR    48744
TARGET          144
dtype: int64
SK_ID_CURR     0
TARGET        36
dtype: int64
SK_ID_CURR    48744
TARGET          122
dtype: int64
SK_ID_CURR     0
TARGET        40
dtype: int64
SK_ID_CURR    48744
TARGET          167
dtype: int64
SK_ID_CURR     0
TARGET        33
dtype: int64
SK_ID_CURR    48744
TARGET          127
dtype: int64
SK_ID_CURR     0
TARGET        31
dtype: int64
