In [1]:
import os
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
PATH = '/home/kai/data/kaggle/homecredit/'


In [2]:
# Read train and test
train = pd.read_pickle(PATH + 'train_factorized_1_with_linear.pkl')
test = pd.read_pickle(PATH + 'test_factorized_1_with_linear.pkl')
print(train.shape, test.shape)
'done'

# train_ori = pd.read_pickle(PATH+'train_0.pkl')
# print(train_ori.shape)

(307511, 1918) (48744, 1917)


'done'

# Cut large piece and get new train&test, and large_pred piece

In [3]:
def large_new(df1, df2, prediction, threshold):
    print('begin cut large')
    pred = prediction.copy()
    train_df, test_df = df1.copy(), df2.copy()
    print('old shapes,' ,test_df.shape,train_df.shape)
    half_pred = pred[pred['TARGET'] > threshold]
    index_half = half_pred.index
    print('length of half',len(half_pred))
    test_half = test_df.iloc[index_half]
    test_half['TARGET'] = 1

    # drop index_half for test_df, add train
    test_df.drop(index_half, inplace = True)
    train_df = pd.concat([train_df,test_half])
    print('new shapes,',test_df.shape,train_df.shape)
    return train_df, test_df, half_pred

# Cut small piece and get new train& test, and small_pred piece

In [4]:
def small_new(df1, df2, prediction, threshold):
    print('begin cut small')
    pred = prediction.copy()
    train_df, test_df = df1.copy(), df2.copy()
    print('old shapes,',test_df.shape,train_df.shape)
    small_pred = pred[pred['TARGET'] < threshold]
    index_small = small_pred.index
    print('length of small',len(small_pred))
    test_small = test_df.iloc[index_small]
    test_small['TARGET'] = 0

    # drop index_small for test_df, add train
    test_df.drop(index_small, inplace = True)
    train_df = pd.concat([train_df,test_small])
    print('new shapes,',test_df.shape,train_df.shape)
    return train_df, test_df, small_pred

# Models

In [5]:
def model_1(train_df, test_df, threshold1, threshold2, n_splits):
    print('begin blended lightgbm')
    # Cross validation model
    stratified = True

    if stratified:
        folds = StratifiedKFold(n_splits, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits, shuffle=True, random_state=45)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=8,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=44,
            feature_fraction=0.28231763168020257,
            bagging_fraction=0.94901525271474951,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            lambda_l1=0.30680079516647751,
            lambda_l2=0.079128660903201031,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.054005067457890979,
            min_child_weight=98.172643147364937,
            silent=-1,
            verbose=-1, )
#         clf = LGBMClassifier(
#             nthread=16,
#             n_estimators=10000,
#             learning_rate=0.02,
#             num_leaves=34,
#             colsample_bytree=0.9497036,
#             subsample=0.8715623,
#             max_depth=8,
#             reg_alpha=0.041545473,
#             reg_lambda=0.0735294,
#             min_split_gain=0.0222415,
#             min_child_weight=39.3259775,
#             silent=-1,
#             verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    #     fold_importance_df = pd.DataFrame()
    #     fold_importance_df["feature"] = feats
    # #     fold_importance_df["importance"] = clf.feature_importances_
    #     fold_importance_df["fold"] = n_fold + 1
    #     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    
    test_df['TARGET'] = sub_preds
    pred =test_df[['SK_ID_CURR', 'TARGET']]
    len_small = len(pred[pred['TARGET']< threshold1])
    len_large = len(pred[pred['TARGET']> threshold2])
    print('length of small and large', len_small, len_large)
    return pred, len_small, len_large

def model_2(train_df, test_df, threshold1, threshold2, n_splits):
    print('begin cv')
    target = train_df['TARGET']
    ignore_cols = ['ORGANIZATION_TYPE', 'TARGET', 'SK_ID_CURR']
    features = [x for x in train_df.columns if x not in ignore_cols]
    train = train_df[features]
    test = test_df[features]

    from sklearn.model_selection import StratifiedKFold
    from lightgbm import LGBMClassifier
    from sklearn.metrics import roc_auc_score
    import lightgbm as lgbm

    lgbm_train = lgbm.Dataset(data=train,
                              label=target,

                              categorical_feature=[],#categorical_feats,
                              free_raw_data=False)

    lgbm_params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
              'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 4000, 'verbose': 0 ,
              'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
              'min_split_gain':.01, 'min_child_weight':1,'num_threads': 12,'feature_fraction': 0.1,
                  'scale_pos_weight':5}

    cv_results = lgbm.cv(train_set=lgbm_train,
                         params=lgbm_params,
                         nfold=n_splits,
                         early_stopping_rounds=150,
                         verbose_eval=100,
                         metrics=['auc'])

    optimum_boost_rounds = np.argmax(cv_results['auc-mean'])
    print('Optimum boost rounds = {}'.format(optimum_boost_rounds))
    print('Best CV result = {}'.format(np.max(cv_results['auc-mean'])))

    clf = lgbm.train(train_set=lgbm_train,
                     params=lgbm_params,
                     num_boost_round=optimum_boost_rounds)

    """ Predict on test set and create submission """
    y_pred = clf.predict(test)
    out_df = pd.DataFrame({'SK_ID_CURR': test_df['SK_ID_CURR'], 'TARGET': y_pred})
    small_len = out_df[out_df['TARGET']< threshold1]
    large_len = out_df[out_df['TARGET']> threshold2]
    print('length of small and large,',len(small_len),len(large_len))
    return out_df, small_len, large_len

# Concate back to a submission

In [6]:
# col = 'SK_ID_CURR'
def concat_pred(cutout_pred, modified_pred):
    print('begin concat predictions')
    test_concat = pd.concat([modified_pred, cutout_pred], axis = 0)
    print(test_concat.shape)
    test = pd.read_csv(PATH + 'application_test.csv')
    test = test[['SK_ID_CURR']]
    pred = test.merge(test_concat, how = 'left', on = 'SK_ID_CURR')
    return pred


# Run

In [None]:
threshold1, threshold2 = 0.005, 0.5
n_splits = 5

pred1, small, large = model_1(train, test, threshold1, threshold2,n_splits)

print('begin cutting')
train_small, test_small, cut_small = small_new(train, test, pred1, threshold1)

begin blended lightgbm
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.782127	valid_1's auc: 0.761802
[200]	training's auc: 0.797856	valid_1's auc: 0.773246
[300]	training's auc: 0.810309	valid_1's auc: 0.779136
[400]	training's auc: 0.820814	valid_1's auc: 0.782878
[500]	training's auc: 0.829389	valid_1's auc: 0.784785


In [None]:
# begin blended lightgbm
# Training until validation scores don't improve for 200 rounds.
# [100]	training's auc: 0.760995	valid_1's auc: 0.747655
# [200]	training's auc: 0.787239	valid_1's auc: 0.767821
# [300]	training's auc: 0.804904	valid_1's auc: 0.777963
# [400]	training's auc: 0.816896	valid_1's auc: 0.78245
# [500]	training's auc: 0.826391	valid_1's auc: 0.784581
# [600]	training's auc: 0.834454	valid_1's auc: 0.786036
# [700]	training's auc: 0.841425	valid_1's auc: 0.78699
# [800]	training's auc: 0.848295	valid_1's auc: 0.7877
# [900]	training's auc: 0.854413	valid_1's auc: 0.787975
# [1000]	training's auc: 0.860486	valid_1's auc: 0.788071
# [1100]	training's auc: 0.866441	valid_1's auc: 0.788217
# [1200]	training's auc: 0.871591	valid_1's auc: 0.78807
# [1300]	training's auc: 0.876439	valid_1's auc: 0.787861
# Early stopping, best iteration is:
# [1149]	training's auc: 0.869002	valid_1's auc: 0.788274
# Fold  1 AUC : 0.788295
# Training until validation scores don't improve for 200 rounds.
# [100]	training's auc: 0.761636	valid_1's auc: 0.749166
# [200]	training's auc: 0.787094	valid_1's auc: 0.766459
# [300]	training's auc: 0.804966	valid_1's auc: 0.776223
# [400]	training's auc: 0.816918	valid_1's auc: 0.780878
# [500]	training's auc: 0.826485	valid_1's auc: 0.782966
# [600]	training's auc: 0.834523	valid_1's auc: 0.784121
# [700]	training's auc: 0.84163	valid_1's auc: 0.784766
# [800]	training's auc: 0.84846	valid_1's auc: 0.78513
# [900]	training's auc: 0.854675	valid_1's auc: 0.785663
# [1000]	training's auc: 0.860514	valid_1's auc: 0.786093
# [1100]	training's auc: 0.866371	valid_1's auc: 0.786266
# [1200]	training's auc: 0.87197	valid_1's auc: 0.786449
# [1300]	training's auc: 0.876999	valid_1's auc: 0.7865
# [1400]	training's auc: 0.882129	valid_1's auc: 0.78662
# [1500]	training's auc: 0.886662	valid_1's auc: 0.786443
# Early stopping, best iteration is:
# [1376]	training's auc: 0.881037	valid_1's auc: 0.786672
# Fold  2 AUC : 0.786678
# Training until validation scores don't improve for 200 rounds.
# [100]	training's auc: 0.760659	valid_1's auc: 0.747171
# [200]	training's auc: 0.786241	valid_1's auc: 0.766948
# [300]	training's auc: 0.804015	valid_1's auc: 0.77869
# [400]	training's auc: 0.816011	valid_1's auc: 0.784033
# [500]	training's auc: 0.825183	valid_1's auc: 0.78673
# [600]	training's auc: 0.833243	valid_1's auc: 0.788508
# [700]	training's auc: 0.840292	valid_1's auc: 0.789757
# [800]	training's auc: 0.847093	valid_1's auc: 0.790439
# [900]	training's auc: 0.85335	valid_1's auc: 0.79105
# [1000]	training's auc: 0.859572	valid_1's auc: 0.791406
# [1100]	training's auc: 0.8651	valid_1's auc: 0.791723
# [1200]	training's auc: 0.870299	valid_1's auc: 0.791952
# [1300]	training's auc: 0.875536	valid_1's auc: 0.792333
# [1400]	training's auc: 0.880218	valid_1's auc: 0.792424
# [1500]	training's auc: 0.88484	valid_1's auc: 0.792505
# [1600]	training's auc: 0.889312	valid_1's auc: 0.792511
# [1700]	training's auc: 0.893781	valid_1's auc: 0.792586
# [1800]	training's auc: 0.897868	valid_1's auc: 0.792656
# [1900]	training's auc: 0.902011	valid_1's auc: 0.792562
# Early stopping, best iteration is:
# [1769]	training's auc: 0.896448	valid_1's auc: 0.79271
# Fold  3 AUC : 0.792695
# Training until validation scores don't improve for 200 rounds.
# [100]	training's auc: 0.760525	valid_1's auc: 0.7427
# [200]	training's auc: 0.787447	valid_1's auc: 0.763296
# [300]	training's auc: 0.805483	valid_1's auc: 0.773621
# [400]	training's auc: 0.817332	valid_1's auc: 0.777943
# [500]	training's auc: 0.826812	valid_1's auc: 0.780438
# [600]	training's auc: 0.835014	valid_1's auc: 0.781843
# [700]	training's auc: 0.842207	valid_1's auc: 0.782736
# [800]	training's auc: 0.84887	valid_1's auc: 0.783554
# [900]	training's auc: 0.855065	valid_1's auc: 0.783935
# [1000]	training's auc: 0.861097	valid_1's auc: 0.784151
# [1100]	training's auc: 0.866699	valid_1's auc: 0.784371
# [1200]	training's auc: 0.872043	valid_1's auc: 0.784373
# [1300]	training's auc: 0.877109	valid_1's auc: 0.784312
# Early stopping, best iteration is:
# [1156]	training's auc: 0.869768	valid_1's auc: 0.784438
# Fold  4 AUC : 0.784448
# Training until validation scores don't improve for 200 rounds.
# [100]	training's auc: 0.759691	valid_1's auc: 0.753795
# [200]	training's auc: 0.786579	valid_1's auc: 0.772909
# [300]	training's auc: 0.804432	valid_1's auc: 0.782423
# [400]	training's auc: 0.816241	valid_1's auc: 0.78653
# [500]	training's auc: 0.825496	valid_1's auc: 0.788426
# [600]	training's auc: 0.833668	valid_1's auc: 0.789596
# [700]	training's auc: 0.840896	valid_1's auc: 0.79026
# [800]	training's auc: 0.847615	valid_1's auc: 0.790726
# [900]	training's auc: 0.854026	valid_1's auc: 0.790958
# [1000]	training's auc: 0.859974	valid_1's auc: 0.790957
# [1100]	training's auc: 0.865571	valid_1's auc: 0.791336
# [1200]	training's auc: 0.870813	valid_1's auc: 0.791357
# [1300]	training's auc: 0.875808	valid_1's auc: 0.791205
# Early stopping, best iteration is:
# [1153]	training's auc: 0.868402	valid_1's auc: 0.791415
# Fold  5 AUC : 0.791413
# Full AUC score 0.788704
# length of small and large 138 168
# begin cutting
# begin cut small
# old shapes, (48744, 1918) (307511, 1918)
# length of small 138
# /home/kai/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: 
# A value is trying to be set on a copy of a slice from a DataFrame.
# Try using .loc[row_indexer,col_indexer] = value instead

# See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
#   # Remove the CWD from sys.path while we load stuff.
# new shapes, (48606, 1918) (307649, 1918)

# Percentage

In [None]:
k = 30
for i in range(k+1):
    tmp = i/k
    print('threshold,', tmp)
    print((pred1[pred1['TARGET']<= tmp]).sum()/len(pred1))
    

# Train new with small

In [8]:
# n_splits = 3
print('start modified_small')
modified_small,_,_ =  model_2(train_small, test_small, threshold1, threshold2,n_splits)

pred_small = concat_pred(modified_small,cut_small)

start modified_small


NameError: name 'train_small' is not defined

# Large

In [None]:
train_large, test_large, cut_large = large_new(train, test, pred1, threshold2)
print('start modified_large')
modified_large,_,_ =  model_2(train_large, test_large, threshold1, threshold2,n_splits)
pred_large = concat_pred(cut_large,modified_large)

begin cut large
old shapes, (48744, 1793) (307511, 1794)
length of half 3779


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


new shapes, (44965, 1793) (311290, 1794)
start modified_large
begin cv




[100]	cv_agg's auc: 0.778437 + 0.00384728
[200]	cv_agg's auc: 0.7882 + 0.00367632
[300]	cv_agg's auc: 0.795218 + 0.00376976
[400]	cv_agg's auc: 0.800156 + 0.00372864
[500]	cv_agg's auc: 0.803605 + 0.00380218
[600]	cv_agg's auc: 0.806298 + 0.00386017
[700]	cv_agg's auc: 0.808319 + 0.00385315
[800]	cv_agg's auc: 0.810175 + 0.00386175
[900]	cv_agg's auc: 0.811516 + 0.00387726
[1000]	cv_agg's auc: 0.812944 + 0.00389397
[1100]	cv_agg's auc: 0.813961 + 0.00395794
[1200]	cv_agg's auc: 0.81485 + 0.00398087
[1300]	cv_agg's auc: 0.815577 + 0.00401451
[1400]	cv_agg's auc: 0.816304 + 0.0040666
[1500]	cv_agg's auc: 0.816805 + 0.00410597
[1600]	cv_agg's auc: 0.817236 + 0.00415379
[1700]	cv_agg's auc: 0.817663 + 0.00421838
[1800]	cv_agg's auc: 0.818019 + 0.0042507
[1900]	cv_agg's auc: 0.818342 + 0.00424498
[2000]	cv_agg's auc: 0.818605 + 0.00425817
[2100]	cv_agg's auc: 0.818839 + 0.00428616
[2200]	cv_agg's auc: 0.819015 + 0.00428035
[2300]	cv_agg's auc: 0.81918 + 0.00425663
[2400]	cv_agg's auc: 0.819

In [11]:
blend_sub = pd.DataFrame()
blend_sub['SK_ID_CURR'] = pred_small['SK_ID_CURR']
blend_sub['TARGET'] = (pred_small['TARGET']+pred_large['TARGET'])/2

In [17]:
for i in [blend_sub, pred_small, pred_large,pred1]:
    print(i.head(1))
    print((i['TARGET']>0.5).sum())
    print((i['TARGET']<0.005).sum())

   SK_ID_CURR    TARGET
0      100001  0.036878
247
357
   SK_ID_CURR    TARGET
0      100001  0.036896
213
382
   SK_ID_CURR   TARGET
0      100001  0.03686
297
337
   SK_ID_CURR    TARGET
0      100001  0.036144
222
362


In [20]:
blend_sub.to_csv(PATH+ 'submission/0622_blend_2.csv', index = False)

In [21]:
pred_large.to_csv(PATH+ 'submission/0622_blend_3.csv', index = False)