In [1]:
import os
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
PATH = '/home/kai/data/kaggle/homecredit/'


In [2]:
# Read train and test
train = pd.read_pickle(PATH + 'train_0.pkl')
test = pd.read_pickle(PATH + 'test_0.pkl')
print(train.shape, test.shape)
'done'

(307511, 1754) (48744, 1753)


'done'

# Cut large piece and get new train&test, and large_pred piece

In [3]:
def large_new(df1, df2, prediction, threshold):
    print('begin cut large')
    pred = prediction.copy()
    train_df, test_df = df1.copy(), df2.copy()
    print('old shapes,' ,test_df.shape,train_df.shape)
    half_pred = pred[pred['TARGET'] > threshold]
    index_half = half_pred.index
    print('length of half',len(half_pred))
    test_half = test_df.iloc[index_half]
    test_half['TARGET'] = 1

    # drop index_half for test_df, add train
    test_df.drop(index_half, inplace = True)
    train_df = pd.concat([train_df,test_half])
    print('new shapes,',test_df.shape,train_df.shape)
    return train_df, test_df, half_pred

# Cut small piece and get new train& test, and small_pred piece

In [4]:
def small_new(df1, df2, prediction, threshold):
    print('begin cut small')
    pred = prediction.copy()
    train_df, test_df = df1.copy(), df2.copy()
    print('old shapes,',test_df.shape,train_df.shape)
    small_pred = pred[pred['TARGET'] < threshold]
    index_small = small_pred.index
    print('length of small',len(small_pred))
    test_small = test_df.iloc[index_small]
    test_small['TARGET'] = 0

    # drop index_small for test_df, add train
    test_df.drop(index_small, inplace = True)
    train_df = pd.concat([train_df,test_small])
    print('new shapes,',test_df.shape,train_df.shape)
    return train_df, test_df, small_pred

# Models

In [5]:
def model_1(train_df, test_df, threshold1, threshold2, n_splits):
    print('begin blended lightgbm')
    # Cross validation model
    stratified = True

    if stratified:
        folds = StratifiedKFold(n_splits, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits, shuffle=True, random_state=45)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=16,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    #     fold_importance_df = pd.DataFrame()
    #     fold_importance_df["feature"] = feats
    # #     fold_importance_df["importance"] = clf.feature_importances_
    #     fold_importance_df["fold"] = n_fold + 1
    #     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    
    test_df['TARGET'] = sub_preds
    pred =test_df[['SK_ID_CURR', 'TARGET']]
    len_small = len(pred[pred['TARGET']< threshold1])
    len_large = len(pred[pred['TARGET']> threshold2])
    print('length of small and large', len_small, len_large)
    return pred, len_small, len_large

def model_2(train_df, test_df, threshold1, threshold2, n_splits):
    print('begin cv')
    target = train_df['TARGET']
    ignore_cols = ['ORGANIZATION_TYPE', 'TARGET', 'SK_ID_CURR']
    features = [x for x in train_df.columns if x not in ignore_cols]
    train = train_df[features]
    test = test_df[features]

    from sklearn.model_selection import StratifiedKFold
    from lightgbm import LGBMClassifier
    from sklearn.metrics import roc_auc_score
    import lightgbm as lgbm

    lgbm_train = lgbm.Dataset(data=train,
                              label=target,

                              categorical_feature=[],#categorical_feats,
                              free_raw_data=False)

    lgbm_params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
              'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 4000, 'verbose': 0 ,
              'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
              'min_split_gain':.01, 'min_child_weight':1,'num_threads': 12,'feature_fraction': 0.95}

    cv_results = lgbm.cv(train_set=lgbm_train,
                         params=lgbm_params,
                         nfold=n_splits,
                         early_stopping_rounds=150,
                         verbose_eval=100,
                         metrics=['auc'])

    optimum_boost_rounds = np.argmax(cv_results['auc-mean'])
    print('Optimum boost rounds = {}'.format(optimum_boost_rounds))
    print('Best CV result = {}'.format(np.max(cv_results['auc-mean'])))

    clf = lgbm.train(train_set=lgbm_train,
                     params=lgbm_params,
                     num_boost_round=optimum_boost_rounds)

    """ Predict on test set and create submission """
    y_pred = clf.predict(test)
    out_df = pd.DataFrame({'SK_ID_CURR': test_cleaned['SK_ID_CURR'], 'TARGET': y_pred})
    small_len = out_df[out_df['TARGET']< threshold1]
    large_len = out_df[out_df['TARGET']> threshold2]
    print('length of small and large,' len(small_len),len(large_len))
    return out_df, small_len, large_len

# Concate back to a submission

In [6]:
# col = 'SK_ID_CURR'
def concat_pred(cutout_pred, modified_pred):
    print('begin concat predictions')
    test_concat = pd.concat([modified_pred, cutout_pred], axis = 0)
    print(test_concat.shape)
    test = pd.read_csv(PATH + 'application_test.csv')
    test = test[['SK_ID_CURR']]
    pred = test.merge(test_concat, how = 'left', on = 'SK_ID_CURR')
    return pred


# Run

In [7]:
threshold1, threshold2 = 0.005, 0.5
splits = 5

pred1, small, large = model_1(train, test, threshold1, threshold2,n_splits)

print('begin cutting')
train_small, test_small, cut_small = small_new(train, test, pred1, threshold1)



begin blended lightgbm
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.760721	valid_1's auc: 0.751877
[200]	training's auc: 0.788426	valid_1's auc: 0.772518
[300]	training's auc: 0.808035	valid_1's auc: 0.784701
[400]	training's auc: 0.821158	valid_1's auc: 0.789974
[500]	training's auc: 0.831417	valid_1's auc: 0.793318
[600]	training's auc: 0.840338	valid_1's auc: 0.795182
[700]	training's auc: 0.848099	valid_1's auc: 0.796448
[800]	training's auc: 0.855077	valid_1's auc: 0.797361
[900]	training's auc: 0.861648	valid_1's auc: 0.797891
[1000]	training's auc: 0.867748	valid_1's auc: 0.79819
[1100]	training's auc: 0.873578	valid_1's auc: 0.798436
[1200]	training's auc: 0.879083	valid_1's auc: 0.798666
[1300]	training's auc: 0.884268	valid_1's auc: 0.798874
[1400]	training's auc: 0.889303	valid_1's auc: 0.798999
[1500]	training's auc: 0.894	valid_1's auc: 0.799004
[1600]	training's auc: 0.898582	valid_1's auc: 0.799006
[1700]	training's auc: 0.902972

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


new shapes, (48382, 1754) (307873, 1754)
begin cut new
old shapes, (48744, 1754) (307511, 1754)
length of half 222
new shapes, (48522, 1754) (307733, 1754)


In [9]:
n_splits = 5
print('start modified_small')
modified_small,_,_ =  model_1(train_small, test_small, threshold1, threshold2,n_splits)

pred_small = concat_pred(modified_small,cut_small)

begin blended lightgbm
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.761199	valid_1's auc: 0.75245
[200]	training's auc: 0.788297	valid_1's auc: 0.771412
[300]	training's auc: 0.808161	valid_1's auc: 0.783667
[400]	training's auc: 0.821293	valid_1's auc: 0.788847
[500]	training's auc: 0.831885	valid_1's auc: 0.791875
[600]	training's auc: 0.840844	valid_1's auc: 0.793543
[700]	training's auc: 0.84884	valid_1's auc: 0.794455
[800]	training's auc: 0.855803	valid_1's auc: 0.795087
[900]	training's auc: 0.86229	valid_1's auc: 0.79561
[1000]	training's auc: 0.868445	valid_1's auc: 0.796019
[1100]	training's auc: 0.8741	valid_1's auc: 0.796268
[1200]	training's auc: 0.879683	valid_1's auc: 0.796514
[1300]	training's auc: 0.884739	valid_1's auc: 0.796538
[1400]	training's auc: 0.88974	valid_1's auc: 0.796397
[1500]	training's auc: 0.894348	valid_1's auc: 0.796291
Early stopping, best iteration is:
[1318]	training's auc: 0.885608	valid_1's auc: 0.796585

In [10]:
train_large, test_large, cut_large = large_new(train, test, pred1, threshold2)
print('start modified_large')
modified_large,_,_ =  model_1(train_large, test_large, threshold1, threshold2,n_splits)
pred_large = concat_pred(cut_large,modified_large)

begin cut new
old shapes, (48744, 1754) (307511, 1754)
length of half 222


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


new shapes, (48522, 1754) (307733, 1754)
start modified_large
begin blended lightgbm
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.762495	valid_1's auc: 0.748653
[200]	training's auc: 0.790586	valid_1's auc: 0.770404
[300]	training's auc: 0.810257	valid_1's auc: 0.782054
[400]	training's auc: 0.823106	valid_1's auc: 0.787786
[500]	training's auc: 0.833135	valid_1's auc: 0.790999
[600]	training's auc: 0.841783	valid_1's auc: 0.793037
[700]	training's auc: 0.849172	valid_1's auc: 0.794452
[800]	training's auc: 0.856186	valid_1's auc: 0.795297
[900]	training's auc: 0.862603	valid_1's auc: 0.795978
[1000]	training's auc: 0.868766	valid_1's auc: 0.796487
[1100]	training's auc: 0.874655	valid_1's auc: 0.797129
[1200]	training's auc: 0.87995	valid_1's auc: 0.79743
[1300]	training's auc: 0.884913	valid_1's auc: 0.797756
[1400]	training's auc: 0.889788	valid_1's auc: 0.797969
[1500]	training's auc: 0.894403	valid_1's auc: 0.798008
[1600]	training's auc: 

In [14]:
blend_sub = pd.DataFrame()
blend_sub['SK_ID_CURR'] = pred_small['SK_ID_CURR']
blend_sub['TARGET'] = (pred_small['TARGET']+pred_large['TARGET'])/2