In [1]:
import os
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
PATH = '/home/kai/data/kaggle/homecredit/'


In [2]:
# Read train and test
train = pd.read_pickle(PATH + 'train_factorized_std_with_linear.pkl')
test = pd.read_pickle(PATH + 'test_factorized_std_with_linear.pkl')

df = pd.read_pickle(PATH + 'inter/bureau_sup.pkl')

print(train.shape, test.shape, df.shape)


(307511, 1894) (48744, 1893) (356255, 64)


In [3]:
train = train.merge(df, on = 'SK_ID_CURR', how = 'left')
test = test.merge(df, on = 'SK_ID_CURR', how = 'left')

In [4]:
train.shape, test.shape

((307511, 1957), (48744, 1956))

# Cut large piece and get new train&test, and large_pred piece

In [6]:
def large_new(df1, df2, prediction, threshold):
    print('begin cut large')
    pred = prediction.copy()
    train_df, test_df = df1.copy(), df2.copy()
    print('old shapes,' ,test_df.shape,train_df.shape)
    half_pred = pred[pred['TARGET'] > threshold]
    index_half = half_pred.index
    print('length of half',len(half_pred))
    test_half = test_df.iloc[index_half]
    test_half['TARGET'] = 1

    # drop index_half for test_df, add train
    test_df.drop(index_half, inplace = True)
    train_df = pd.concat([train_df,test_half])
    print('new shapes,',test_df.shape,train_df.shape)
    return train_df, test_df, half_pred

# Cut small piece and get new train& test, and small_pred piece

In [7]:
def small_new(df1, df2, prediction, threshold):
    print('begin cut small')
    pred = prediction.copy()
    train_df, test_df = df1.copy(), df2.copy()
    print('old shapes,',test_df.shape,train_df.shape)
    small_pred = pred[pred['TARGET'] < threshold]
    index_small = small_pred.index
    print('length of small',len(small_pred))
    test_small = test_df.iloc[index_small]
    test_small['TARGET'] = 0

    # drop index_small for test_df, add train
    test_df.drop(index_small, inplace = True)
    train_df = pd.concat([train_df,test_small])
    print('new shapes,',test_df.shape,train_df.shape)
    return train_df, test_df, small_pred

# Models

In [17]:
def model_1(train_df, test_df, threshold1, threshold2, n_splits=5):
    print('begin blended lightgbm')
    # Cross validation model
    stratified = False

    if stratified:
        folds = StratifiedKFold(n_splits, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        
#         lgbm_params = {
#         'task': 'train',
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
#         'metric': 'auc',
#         'learning_rate': 0.05,
#         'num_iteration': 4000,
#         'num_threads': 16,
        
#         'num_leaves': int(round(44.368535336628419)),
#         'feature_fraction': 0.28231763168020257,
#         'bagging_fraction': 0.94901525271474951,
#         'max_depth': int(round(8.0430115561596267)),
#         'lambda_l1': 0.30680079516647751,
#         'lambda_l2': 0.079128660903201031,
#         'min_split_gain': 0.054005067457890979,
#         'min_child_weight': 98.172643147364937
#     }

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=12,
            n_estimators=10000,
            num_iteration= 4000,
            learning_rate=0.05,
            num_leaves=int(round(44.368535336628419)),
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=int(round(8.0430115561596267)),
            reg_alpha=0.079128660903201031,
            bagging_fraction= 0.94901525271474951,
            feature_fraction = 0.28231763168020257,
            reg_lambda=0.30680079516647751,
            min_split_gain=0.054005067457890979,
            min_child_weight=98.172643147364937,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    #     fold_importance_df = pd.DataFrame()
    #     fold_importance_df["feature"] = feats
    # #     fold_importance_df["importance"] = clf.feature_importances_
    #     fold_importance_df["fold"] = n_fold + 1
    #     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    
    test_df['TARGET'] = sub_preds
    pred =test_df[['SK_ID_CURR', 'TARGET']]
    len_small = len(pred[pred['TARGET']< threshold1])
    len_large = len(pred[pred['TARGET']> threshold2])
    print('length of small and large', len_small, len_large)
    return pred, len_small, len_large

def model_2(train_df, test_df,threshold1, threshold2, n_splits=5):
    print('begin cv')
    target = train_df['TARGET']
    ignore_cols = ['ORGANIZATION_TYPE', 'TARGET', 'SK_ID_CURR']
    features = [x for x in train_df.columns if x not in ignore_cols]
    train = train_df[features]
    test = test_df[features]

    from sklearn.model_selection import StratifiedKFold
    from lightgbm import LGBMClassifier
    from sklearn.metrics import roc_auc_score
    import lightgbm as lgbm

    lgbm_train = lgbm.Dataset(data=train,
                              label=target,

                              categorical_feature=[],#categorical_feats,
                              free_raw_data=False)

#     lgbm_params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
#               'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 4000, 'verbose': 0 ,
#               'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
#               'min_split_gain':.01, 'min_child_weight':1,'num_threads': 12,'feature_fraction': 0.2,
#                   'scale_pos_weight':1}
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'num_iteration': 4000,
        'num_threads': 8,
        
        'num_leaves': int(round(44.368535336628419)),
        'feature_fraction': 0.28231763168020257,
        'bagging_fraction': 0.94901525271474951,
        'max_depth': int(round(8.0430115561596267)),
        'lambda_l1': 0.30680079516647751,
        'lambda_l2': 0.079128660903201031,
        'min_split_gain': 0.054005067457890979,
        'min_child_weight': 98.172643147364937}


    cv_results = lgbm.cv(train_set=lgbm_train,
                         params=lgbm_params,
                         nfold=n_splits,
                         early_stopping_rounds=150,
                         verbose_eval=100,
                         metrics=['auc'])

    optimum_boost_rounds = np.argmax(cv_results['auc-mean'])
    print('Optimum boost rounds = {}'.format(optimum_boost_rounds))
    print('Best CV result = {}'.format(np.max(cv_results['auc-mean'])))

    clf = lgbm.train(train_set=lgbm_train,
                     params=lgbm_params,
                     num_boost_round=optimum_boost_rounds,
                    verbose_eval=100)

    """ Predict on test set and create submission """
    y_pred = clf.predict(test)
    out_df = pd.DataFrame({'SK_ID_CURR': test_df['SK_ID_CURR'], 'TARGET': y_pred})
    small_len = out_df[out_df['TARGET']< threshold1]
    large_len = out_df[out_df['TARGET']> threshold2]
    print('length of small and large,',len(small_len),len(large_len))
    return out_df, small_len, large_len

# Concate back to a submission

In [9]:
# col = 'SK_ID_CURR'
def concat_pred(cutout_pred, modified_pred):
    print('begin concat predictions')
    test_concat = pd.concat([modified_pred, cutout_pred], axis = 0)
    print(test_concat.shape)
    test = pd.read_csv(PATH + 'application_test.csv')
    test = test[['SK_ID_CURR']]
    pred = test.merge(test_concat, how = 'left', on = 'SK_ID_CURR')
    return pred


# Run

In [12]:
threshold1, threshold2 = 0.005, 0.5
n_splits = 5

pred1, small, large = model_1(train, test, threshold1, threshold2,n_splits)

print('begin cutting')
train_small, test_small, cut_small = small_new(train, test, pred1, threshold1)

begin blended lightgbm




Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.807592	valid_1's auc: 0.782037
[200]	training's auc: 0.83447	valid_1's auc: 0.791848
[300]	training's auc: 0.851726	valid_1's auc: 0.795577
[400]	training's auc: 0.86464	valid_1's auc: 0.797151
[500]	training's auc: 0.875936	valid_1's auc: 0.797159
[600]	training's auc: 0.885902	valid_1's auc: 0.797349
Early stopping, best iteration is:
[451]	training's auc: 0.870577	valid_1's auc: 0.797419
Fold  1 AUC : 0.797421




Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.807776	valid_1's auc: 0.781827
[200]	training's auc: 0.834545	valid_1's auc: 0.792588
[300]	training's auc: 0.851594	valid_1's auc: 0.796403
[400]	training's auc: 0.864678	valid_1's auc: 0.797187
[500]	training's auc: 0.875568	valid_1's auc: 0.797676
[600]	training's auc: 0.884697	valid_1's auc: 0.79788
[700]	training's auc: 0.893404	valid_1's auc: 0.798226
[800]	training's auc: 0.901749	valid_1's auc: 0.798273
[900]	training's auc: 0.908935	valid_1's auc: 0.798032
Early stopping, best iteration is:
[744]	training's auc: 0.89705	valid_1's auc: 0.798328
Fold  2 AUC : 0.798361




Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.809291	valid_1's auc: 0.775262
[200]	training's auc: 0.836173	valid_1's auc: 0.784845
[300]	training's auc: 0.853751	valid_1's auc: 0.788312
[400]	training's auc: 0.866425	valid_1's auc: 0.789047
[500]	training's auc: 0.876816	valid_1's auc: 0.789204
[600]	training's auc: 0.886764	valid_1's auc: 0.789543
[700]	training's auc: 0.895714	valid_1's auc: 0.789344
[800]	training's auc: 0.903334	valid_1's auc: 0.789289
Early stopping, best iteration is:
[647]	training's auc: 0.89108	valid_1's auc: 0.789561
Fold  3 AUC : 0.789561




Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.808881	valid_1's auc: 0.77792
[200]	training's auc: 0.836091	valid_1's auc: 0.787847
[300]	training's auc: 0.853733	valid_1's auc: 0.791741
[400]	training's auc: 0.866946	valid_1's auc: 0.793119
[500]	training's auc: 0.876878	valid_1's auc: 0.793676
[600]	training's auc: 0.887468	valid_1's auc: 0.793864
[700]	training's auc: 0.896204	valid_1's auc: 0.79398
[800]	training's auc: 0.904038	valid_1's auc: 0.794186
[900]	training's auc: 0.911904	valid_1's auc: 0.793942
[1000]	training's auc: 0.917972	valid_1's auc: 0.793595
Early stopping, best iteration is:
[804]	training's auc: 0.90445	valid_1's auc: 0.794247
Fold  4 AUC : 0.794258




Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.808541	valid_1's auc: 0.781717
[200]	training's auc: 0.835112	valid_1's auc: 0.790393
[300]	training's auc: 0.852776	valid_1's auc: 0.793796
[400]	training's auc: 0.866193	valid_1's auc: 0.794919
[500]	training's auc: 0.876853	valid_1's auc: 0.795126
[600]	training's auc: 0.886981	valid_1's auc: 0.79497
[700]	training's auc: 0.895022	valid_1's auc: 0.795024
Early stopping, best iteration is:
[560]	training's auc: 0.883468	valid_1's auc: 0.795292
Fold  5 AUC : 0.795289
Full AUC score 0.794952
length of small and large 467 202
begin cutting
begin cut small
old shapes, (48744, 1957) (307511, 1957)
length of small 467


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


new shapes, (48277, 1957) (307978, 1957)


In [None]:
# stratified = False, seed = 1001, bayesian optimization paras


In [15]:
x = pred1[pred1['TARGET']>0.5]
x

Unnamed: 0,SK_ID_CURR,TARGET
202,101363,0.512845
634,104410,0.660982
998,106854,0.565535
1291,109046,0.523270
1597,111345,0.500303
1610,111452,0.621281
1829,112917,0.574808
1935,113627,0.751126
1993,113999,0.511524
2035,114283,0.632261


In [14]:
pred1.to_csv(PATH+'/submission/06_28_pred1.csv', index = False)

# Percentage

In [18]:
k = 30
for i in range(k+1):
    tmp = i/k
    print('threshold,', tmp)
    print(1-(pred1['TARGET']<= tmp).sum()/len(pred1))
    

threshold, 0.0
1.0
threshold, 0.03333333333333333
0.5699368127359264
threshold, 0.06666666666666667
0.3252297718693583
threshold, 0.1
0.20933858526177584
threshold, 0.13333333333333333
0.14284835056622358
threshold, 0.16666666666666666
0.10046364680781228
threshold, 0.2
0.07323978335795178
threshold, 0.23333333333333334
0.05299113737075334
threshold, 0.26666666666666666
0.04027162317413424
threshold, 0.3
0.02997291974396854
threshold, 0.3333333333333333
0.022095027080256013
threshold, 0.36666666666666664
0.016802067946824262
threshold, 0.4
0.012350237978007494
threshold, 0.43333333333333335
0.00878056786476289
threshold, 0.4666666666666667
0.0062161496799606475
threshold, 0.5
0.0045749220416871905
threshold, 0.5333333333333333
0.0032619399310684916
threshold, 0.5666666666666667
0.0023182340390611733
threshold, 0.6
0.001374528147053966
threshold, 0.6333333333333333
0.000984736582964052
threshold, 0.6666666666666666
0.0006770064007878274
threshold, 0.7
0.00036927621861149174
threshold, 0

In [20]:
pred1[pred1['TARGET']>1/2]['TARGET'].max()
pred1.to_csv(PATH+'submission/06_25_2.csv', index = False)

# Train new with small

In [51]:
# n_splits = 3

print('start modified_small')
modified_small,_,_ =  model_2(train_small, test_ori, threshold1, threshold2,n_splits)

# pred_small = concat_pred(modified_small,cut_small)

start modified_small
begin cv




[100]	cv_agg's auc: 0.754762 + 0.00437548


KeyboardInterrupt: 

# Large

In [14]:
pred1 = pd.read_csv(PATH+'/submission/06_28_pred1.csv')

train_large, test_large, cut_large = large_new(train, test, pred1, 0.5)

begin cut large
old shapes, (48744, 1956) (307511, 1957)
length of half 202


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


new shapes, (48542, 1956) (307713, 1957)


In [None]:

print('start modified_large')
# test_ori = pd.read_pickle(PATH + 'test_0_with_linear.pkl')
# print('read ori test done')

test_ori = pd.read_pickle(PATH + 'test_factorized_std_with_linear.pkl')
test_ori = test_ori.merge(df, on = 'SK_ID_CURR', how = 'left')
print(test_ori.shape)

n_splits = 5
# modified_large,_,_ =  model_1(train_large, test_ori, threshold1, threshold2, n_splits)
modified_large_2,_,_ =  model_2(train_large, test_ori, 0.005, 0.5, 5)

start modified_large
(48744, 1956)
begin cv




[100]	cv_agg's auc: 0.780892 + 0.00277921
[200]	cv_agg's auc: 0.791241 + 0.00301825


In [20]:
modified_large_2.shape

NameError: name 'modified_large_2' is not defined

In [27]:
modified_large,_,_ =  model_1(train_large, test_large, test_ori, threshold1, threshold2, n_splits)

begin blended lightgbm
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.776383	valid_1's auc: 0.762149
[200]	training's auc: 0.793873	valid_1's auc: 0.774697
[300]	training's auc: 0.808924	valid_1's auc: 0.784543
[400]	training's auc: 0.819702	valid_1's auc: 0.789038
[500]	training's auc: 0.828517	valid_1's auc: 0.79196
[600]	training's auc: 0.836371	valid_1's auc: 0.794034
[700]	training's auc: 0.843331	valid_1's auc: 0.795198
[800]	training's auc: 0.84969	valid_1's auc: 0.796317
[900]	training's auc: 0.855507	valid_1's auc: 0.796782
[1000]	training's auc: 0.861077	valid_1's auc: 0.797453
[1100]	training's auc: 0.86631	valid_1's auc: 0.798057
[1200]	training's auc: 0.871352	valid_1's auc: 0.798438
[1300]	training's auc: 0.875819	valid_1's auc: 0.798694
[1400]	training's auc: 0.880168	valid_1's auc: 0.798866
[1500]	training's auc: 0.884395	valid_1's auc: 0.798976
[1600]	training's auc: 0.888517	valid_1's auc: 0.798971
[1700]	training's auc: 0.89233

In [53]:
pred_large = concat_pred(cut_large,modified_large)
pred_large.to_csv(PATH+'06_26_large.csv', index = False)
# modified_large.to_csv(PATH +'06_25_large2.csv',index = False)

begin concat predictions
(48744, 2)


In [11]:
blend_sub = pd.DataFrame()
blend_sub['SK_ID_CURR'] = pred_small['SK_ID_CURR']
blend_sub['TARGET'] = (pred_small['TARGET']+pred_large['TARGET'])/2

In [17]:
for i in [blend_sub, pred_small, pred_large,pred1]:
    print(i.head(1))
    print((i['TARGET']>0.5).sum())
    print((i['TARGET']<0.005).sum())

   SK_ID_CURR    TARGET
0      100001  0.036878
247
357
   SK_ID_CURR    TARGET
0      100001  0.036896
213
382
   SK_ID_CURR   TARGET
0      100001  0.03686
297
337
   SK_ID_CURR    TARGET
0      100001  0.036144
222
362


In [20]:
blend_sub.to_csv(PATH+ 'submission/0622_blend_2.csv', index = False)

In [21]:
pred_large.to_csv(PATH+ 'submission/0622_blend_3.csv', index = False)