In [1]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
sys.path.append('../LIB/')
from env import ENV
from sklearn.preprocessing import normalize
from tqdm import tqdm
import pickle
from sklearn.preprocessing.data import QuantileTransformer
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
import gc

# Run switches: print_to_file controls the stdout-to-logfile redirection set up
# further down; test_run is not referenced in the visible part of the notebook.
print_to_file = False 
test_run = False 

# Load the pre-built train/test feature matrices from the pickle paths held in ENV.
train = pd.read_pickle(ENV.lightgbm_train_764.value)
print('train shape is: {}'.format(train.shape))
test = pd.read_pickle(ENV.lightgbm_test_764.value)
print('test shape is: {}'.format(test.shape))
# Feature-set identifier (not referenced in the visible part of the notebook).
fe_id = 'comb_764'

train shape is: (307511, 764)
test shape is: (48744, 763)


In [2]:
# Cast the ID column to int in both frames; it is the merge key for the extra
# feature tables joined in later cells.
train['SK_ID_CURR'] = train['SK_ID_CURR'].astype(int)
test['SK_ID_CURR'] = test['SK_ID_CURR'].astype(int)
# Labels as a plain NumPy array.
targets = train.TARGET.values

In [3]:
# Keep references to the ID columns of both frames.
train_id = train['SK_ID_CURR']
test_id = test['SK_ID_CURR']

# Main functions

In [4]:
from sklearn.model_selection import train_test_split

def get_time(timezone='America/New_York', time_format='%Y-%m-%d %H:%M:%S'):
    """Return the current time in ``timezone`` as a formatted string.

    Parameters
    ----------
    timezone : str
        Time-zone name passed to ``dateutil.tz.gettz``.
    time_format : str
        ``strftime`` pattern used for the result.

    Returns
    -------
    str
        The current moment, converted to ``timezone`` and formatted.
    """
    from datetime import datetime
    from dateutil import tz

    # Ask for an aware UTC timestamp directly. datetime.utcnow() returns a
    # naive value that then has to be tagged by hand, and it is deprecated
    # since Python 3.12.
    now_utc = datetime.now(tz.tzutc())

    # Convert to the requested zone and format.
    return now_utc.astimezone(tz.gettz(timezone)).strftime(time_format)

import sys, time
class Logger(object):
    """Minimal ``sys.stdout`` replacement that appends messages to a log file.

    Every ``write`` call is prefixed with a ``[timestamp]`` obtained from
    ``get_time()``. Nothing is echoed to the terminal; when ``logtofile`` is
    False the message is discarded.
    """

    def __init__(self, logtofile=True, logfilename='log'):
        """Remember the original stdout and derive the log file name.

        The file name is ``<logfilename>_<unix-seconds>.log``, fixed at
        construction time.
        """
        self.terminal = sys.stdout
        self.logfile = "{}_{}.log".format(logfilename, int(time.time()))
        self.logtofile = logtofile

    def write(self, message):
        """Append ``message`` to the log file, prefixed with a timestamp."""
        if self.logtofile:
            # The context manager closes the handle even if the write fails,
            # so a failed write cannot leak an open file descriptor.
            with open(self.logfile, "a") as log:
                log.write('[' + get_time() + '] ' + message)

    def flush(self):
        """No-op; present because file-like stdout objects must offer flush()."""
        # Each write opens and closes the file, so there is nothing to flush.
        pass


def divert_printout_to_file():
    """Replace ``sys.stdout`` with a ``Logger`` writing to a 'logfile_<ts>.log' file."""
    file_logger = Logger(logfilename='logfile')
    sys.stdout = file_logger

# Redirect all subsequent print output to the log file only when the
# print_to_file switch is on.
if print_to_file:
    divert_printout_to_file()  # note: comment this to use pdb

import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took, in whole seconds.

    The message is printed only when the block finishes without raising.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))

# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, train_df, test_df, holdout, num_folds, submission_file_name, fe_img_name, stratified = False, debug= False, colsample=0.67, max_depth=8, num_leaves=31, min_child_samples=20, subsample=0.7, reg_lambda=0.3, lr=0.04, seed=1001, verbose=100, rounds=None):
    """Train LightGBM with K-fold CV and score out-of-fold, holdout and test rows.

    Parameters
    ----------
    df : DataFrame or None
        Optional combined frame. When given, rows with a non-null ``TARGET``
        become the training set and rows with a null ``TARGET`` the test set,
        replacing whatever was passed as ``train_df`` / ``test_df``.
    train_df, test_df : DataFrame
        Training rows (with ``TARGET``) and rows to predict; used when ``df``
        is None. Both must contain ``SK_ID_CURR``.
    holdout : DataFrame
        Labelled rows (with ``TARGET``) scored by every fold model.
    num_folds : int
        Number of CV splits.
    submission_file_name : str
        CSV path for the averaged test predictions. The out-of-fold
        predictions are pickled to ``'<submission_file_name>_oof.pkl'``.
    fe_img_name : str
        Currently unused (the importance plot call is disabled).
    stratified : bool
        Use ``StratifiedKFold`` instead of ``KFold``.
    debug : bool
        When True, skip writing predictions into ``test_df`` and to CSV.
    colsample, max_depth, num_leaves, min_child_samples, subsample, reg_lambda, lr
        Forwarded to ``LGBMClassifier``.
    seed : int
        ``random_state`` of the fold splitter (not passed to the classifier).
    verbose
        Forwarded to ``LGBMClassifier.fit``.
    rounds : int or None
        When set, train exactly this many trees without early stopping;
        otherwise train up to 30000 trees with early stopping after 200 rounds.

    Returns
    -------
    tuple
        ``(feature_importance_df, feature_importance_gain_df, holdout_roc,
        holdout_mean, full_te_mean, full_tr_mean, oof_preds, test_df)`` where
        the two importance frames hold the per-feature mean over folds, sorted
        descending.

    Side effects
    ------------
    Unless ``debug`` is True, a ``TARGET`` column holding the fold-averaged
    predictions is assigned on ``test_df``. When ``test_df`` was supplied by
    the caller this modifies the caller's frame in place; later notebook cells
    rely on that.
    """
    # Divide in training/validation and test data
    if df is not None:
        train_df = df[df['TARGET'].notnull()]
        # Copy so that the TARGET assignment at the end does not write into a
        # slice of df.
        test_df = df[df['TARGET'].isnull()].copy()
        print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
        del df
        gc.collect()

    # Report only after the frames are final: when df is given, train_df and
    # test_df may have been passed as None.
    print(train_df.shape, test_df.shape, holdout.shape)
    print('MEAN: train({}) vs holdout({}): '.format(len(train_df), len(holdout)), train_df['TARGET'].mean(), holdout['TARGET'].mean())

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=seed)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    holdout_final_preds = np.zeros(holdout.shape[0])
    fold_importances = []       # one frame per fold, concatenated once after the loop
    fold_gain_importances = []
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    train_scores = []
    holdout_scores = []
    scores = []
    actual_y = []
    pred_y = []
    diff_val_holdout = []
    oof_ids = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
        oof_ids.extend(train_df['SK_ID_CURR'].iloc[valid_idx])
        clf = LGBMClassifier(
            nthread=18,
            n_estimators=30000,
            learning_rate=lr,
            num_leaves=num_leaves,
            colsample_bytree=colsample, # 0.67
            subsample=subsample,
            subsample_freq=0, ## disable subsampling
            max_depth=max_depth,
            reg_alpha=0.65,
            reg_lambda=reg_lambda,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            min_child_samples=min_child_samples,
            silent=-1,
            verbose=-1, )
        if rounds is not None:
            # Fixed number of trees, no early stopping.
            clf.n_estimators = rounds
            clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric= 'auc', verbose=verbose)
            oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
            sub_preds += clf.predict_proba(test_df[feats])[:, 1] / folds.n_splits
            holdout_preds = clf.predict_proba(holdout[feats])[:, 1]
        else:
            # Early stopping on the validation fold; predict with the best iteration.
            clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric= 'auc', verbose=verbose, early_stopping_rounds= 200)
            oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
            sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
            holdout_preds = clf.predict_proba(holdout[feats], num_iteration=clf.best_iteration_)[:, 1]

        holdout_final_preds += holdout_preds / folds.n_splits
        score = roc_auc_score(valid_y, oof_preds[valid_idx])
        train_score = clf.best_score_['training']['auc']
        holdout_score = roc_auc_score(holdout['TARGET'], holdout_preds)
        diff = abs(score - holdout_score)
        actual_y.extend(list(valid_y))
        pred_y.extend(list(oof_preds[valid_idx]))
        best_rounds = rounds if rounds is not None else clf.best_iteration_
        print('Fold %2d [%5d] AUC : ho: %.6f / te: %.6f / tr: %.6f (diff: %.6f)' % (n_fold + 1, best_rounds, holdout_score, score,  train_score, diff))
        scores.append(score)
        train_scores.append(train_score)
        holdout_scores.append(holdout_score)
        diff_val_holdout.append(diff)

        # Default importance as exposed by the sklearn wrapper.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)

        # Gain-based importance from the underlying booster.
        fold_importance_gain_df = pd.DataFrame()
        fold_importance_gain_df["feature"] = feats
        fold_importance_gain_df["importance"] = clf.booster_.feature_importance(importance_type='gain')
        fold_importance_gain_df["fold"] = n_fold + 1
        fold_gain_importances.append(fold_importance_gain_df)

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Single concatenation instead of growing a frame inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)
    feature_importance_gain_df = pd.concat(fold_gain_importances, axis=0)

    holdout_roc = roc_auc_score(holdout['TARGET'], holdout_final_preds)
    holdout_mean = np.mean(holdout_scores)
    full_te_mean = np.mean(scores)
    full_tr_mean = np.mean(train_scores)

    # Persist out-of-fold predictions, in fold order, next to their labels and IDs.
    predsAndActual = pd.DataFrame()
    predsAndActual['preds'] = pred_y
    predsAndActual['label'] = actual_y
    predsAndActual['SK_ID_CURR'] = oof_ids
    predsAndActual.to_pickle('{}_oof.pkl'.format(submission_file_name))

    print('Full HO score %.6f' % holdout_roc)
    print('FULL HO mean {:.6f}, std {:.6f}'.format(holdout_mean, np.std(holdout_scores)))
    print('FULL TE mean {:.6f}, std {:.6f}'.format(full_te_mean, np.std(scores)))
    print('FULL TR mean {:.6f}, std {:.6f}'.format(full_tr_mean, np.std(train_scores)))
    print('FULL DIFF mean {:.6f}, std {:.6f}'.format(np.mean(diff_val_holdout), np.std(diff_val_holdout)))

    # Write submission file
    if not debug:
        # In-place on a caller-supplied test_df (see "Side effects" above).
        test_df['TARGET'] = sub_preds
        print(submission_file_name)
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)

    # Collapse the per-fold importances to a per-feature mean, most important first.
    feature_importance_df = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).reset_index()
    feature_importance_gain_df = feature_importance_gain_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).reset_index()
    return feature_importance_df, feature_importance_gain_df,holdout_roc,holdout_mean,full_te_mean,full_tr_mean,oof_preds,test_df

# Display/plot feature importance
def display_importances(feature_importance_df_, fe_img_name):
    """Plot the 40 features with the highest mean importance and save the figure.

    ``feature_importance_df_`` needs ``feature`` and ``importance`` columns;
    the figure is written to ``fe_img_name + '.png'``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index
    is_top = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    ordered = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=ordered)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig(fe_img_name+'.png')


def convert_and_save_imp_df(fe_imp_df, dumpfilename):
    """Average importances per feature and pickle the result to ``dumpfilename``.

    Parameters
    ----------
    fe_imp_df : DataFrame
        Must contain ``feature`` and ``importance`` columns (one row per
        feature per fold).
    dumpfilename : str
        Path of the pickle file to write.

    The saved frame has one row per feature, sorted by mean importance in
    descending order.
    """
    fe_imp_df_mean = (
        fe_imp_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
        .reset_index()
    )
    # Context manager guarantees the file is flushed and closed; the bare
    # open() previously left the handle to the garbage collector.
    with open(dumpfilename, 'wb') as dump_file:
        pickle.dump(fe_imp_df_mean, dump_file)

In [5]:
def runlgb(train, test, holdout):
    """Run ``kfold_lightgbm`` with the hyper-parameter setting configured below.

    The candidate lists form a grid, but the function returns from inside the
    loop, so only the FIRST combination is ever trained. With single-element
    lists (the current configuration) that is the whole grid.

    Parameters
    ----------
    train, test, holdout : DataFrame
        Passed through to ``kfold_lightgbm`` as ``train_df``, ``test_df`` and
        ``holdout``.

    Returns
    -------
    The return value of ``kfold_lightgbm`` for the first combination, or None
    if any candidate list is empty.
    """
    from itertools import product

    colsamples = [0.07]#[0.1,0.15,0.2]#[0.03,0.04,0.05,0.06,0.07,0.08]
    seeds = [20]#[300,4000,50000,600000,7000000,80000000,523445,31275479] # 20
    depth = [5]
    leaves = [16]
    min_child_sam = [20]#, 800]
    subsamples = [1]#0.8, 0.7, 0.6, 0.5, 0.4] # was 1
    reg_lambdas = [0.5]
    lrs2 = [0.05]
    nfolds = 5
    rounds = [None] #[1000]#, 1300, 1600, 1900, 2200, 2500]

    # Same iteration order as the former nested loops: seeds outermost,
    # rounds innermost.
    grid = product(seeds, colsamples, depth, leaves, min_child_sam,
                   subsamples, reg_lambdas, lrs2, rounds)
    for seed, colsample, d, l, mcs, subsample, reg_lambda, lr, r in grid:
        # Output name depends only on column count, learning rate and fold count.
        filename = 'fe_936_col{}_lr{}_n{}'.format(len(train.columns), lr, nfolds)
        print('#############################################')
        print(colsample, seed, d, l, mcs, subsample, reg_lambda, lr, 'nfolds:', nfolds)
        print('#############################################')
        with timer("Run LightGBM with kfold"):
            return kfold_lightgbm(None, train, test, holdout, nfolds, filename+'.csv', filename, colsample=colsample, verbose=None, max_depth=d, num_leaves=l, min_child_samples=mcs, subsample=subsample, reg_lambda=reg_lambda, lr=lr, seed=seed, stratified=True, rounds=r,debug=False)
    #                                         

# Add Wei features

In [6]:
# Load the extra hand-crafted feature table keyed by SK_ID_CURR.
extra_feature_wei = pd.read_pickle('../../data/add_features/install_preapp_hand_fe.pkl')

# Disabled: restrict the table to a hand-picked subset of columns.
# sure_add_features_wei = ['Wei_Remaning_CNT_Payment_TOTAL',
#  'Wei_TOTAl_NEEDPAY_INCOME_RATIO',
#  'Wei_Remaining_AMT_Payment_TOTAL',
#  'Wei_Normed_CNT_LATE_PAYMENT_LAST','SK_ID_CURR']
# extra_feature_wei = extra_feature_wei[sure_add_features_wei]

print(extra_feature_wei.shape)
print(extra_feature_wei.columns)

# Left-join onto train and test so every original row is kept. This cell
# reassigns train/test, so running it twice joins the same columns again.
train = train.merge(extra_feature_wei, how='left', left_on='SK_ID_CURR',right_on='SK_ID_CURR')
print(train.shape)

test = test.merge(extra_feature_wei, how='left', left_on='SK_ID_CURR',right_on='SK_ID_CURR')
print(test.shape)

(339587, 19)
Index(['SK_ID_CURR', 'Wei_Normed_CNT_LATE_PAYMENT_MEAN',
       'Wei_Normed_CNT_LATE_PAYMENT_MAX', 'Wei_Normed_CNT_LESS_PAYMENT_MEAN',
       'Wei_Normed_CNT_LESS_PAYMENT_MAX', 'Wei_Install_Payment_Rate_MEAN',
       'Wei_Install_Payment_Rate_MIN', 'Wei_CNT_installment_per_version_STD',
       'Wei_Remaining_AMT_Payment_TOTAL', 'Wei_CNT_NOT_TERMINATION',
       'Wei_Normed_CNT_LATE_PAYMENT_LAST', 'Wei_Normed_CNT_LESS_PAYMENT_LAST',
       'Wei_Install_Payment_Rate_LAST', 'Wei_CNT_installment_per_version_LAST',
       'Wei_Remaining_AMT_Payment_LAST', 'Wei_Remaning_CNT_Payment_TOTAL',
       'Wei_IF_TERMINATION_LAST', 'Wei_Remaing_Payment_Ratio_CURR',
       'Wei_TOTAl_NEEDPAY_INCOME_RATIO'],
      dtype='object')
(307511, 782)
(48744, 781)


# Add Shiyi features

In [7]:
# Load a second extra feature table; it is joined on SK_ID_CURR below.
extra_feature_shiyi = pd.read_pickle('../../data/add_features/shiyi/shiyifeature1.pkl')



# Left-join onto train and test so every original row is kept. This cell
# reassigns train/test, so running it twice joins the same columns again.
train = train.merge(extra_feature_shiyi, how='left', left_on='SK_ID_CURR',right_on='SK_ID_CURR')
print(train.shape)

test = test.merge(extra_feature_shiyi, how='left', left_on='SK_ID_CURR',right_on='SK_ID_CURR')
print(test.shape)

(307511, 790)
(48744, 789)


In [7]:
# Disabled: load a previously saved list of columns to drop.
# sure_drop = pickle.load(open('../../data/add_features/dropping0824_list','rb'))
# With the load disabled, no extra columns are scheduled for dropping.
sure_drop = []

In [8]:
# Columns that are always removed, in addition to anything listed in sure_drop.
drop_columns = ['NAME_EDUCATION_TYPE_CODE_GENDER_AMT_CREDIT_mean_abs_diff',
                'inst_DAYS_INSTALMENT_std']

# De-duplicate the combined list, then drop from both frames.
# NOTE(review): the recorded shapes below (762 / 761 columns) are lower than
# the 790 / 789 printed by the previous cell minus these two columns, which
# suggests the saved outputs come from a run with a non-empty sure_drop — confirm.
drop_columns = list(set(sure_drop + drop_columns))
train = train.drop(drop_columns,axis=1)
print(train.shape)
test = test.drop(drop_columns,axis=1)
print(test.shape)

(307511, 762)
(48744, 761)


In [9]:
# Work on independent copies so later in-place changes (e.g. the TARGET column
# that kfold_lightgbm assigns on its test frame) do not touch train/test.
train_drop = train.copy()
test_drop = test.copy()

In [10]:
# Split off a tiny holdout: test_size=1/10000 of the rows (31 rows in the
# recorded output), with a fixed random_state for repeatability.
train_df, holdout = train_test_split(train_drop, test_size=1/10000, random_state=99)
print('MEAN: train({}) vs holdout({}): '.format(len(train_df), len(holdout)), train_df['TARGET'].mean(), holdout['TARGET'].mean())
print(train_df.shape, test_drop.shape, holdout.shape)

MEAN: train(307480) vs holdout(31):  0.08072720176922077 0.0967741935483871
(307480, 762) (48744, 761) (31, 762)


# Experiment

In [11]:
# First training run on the original labels.
# NOTE(review): train_drop (not train_df from the split above) is passed, so the
# holdout rows are also part of the training frame and the "ho" scores are not
# out-of-sample — confirm this is intended.
feature_importance_df, feature_importance_gain_df,holdout_roc,holdout_mean,full_te_mean,full_tr_mean,oof_preds,test_preds = runlgb(train_drop, test_drop, holdout)

#############################################
0.07 20 5 16 20 1 0.5 0.05 nfolds: 5
#############################################
(307511, 762) (48744, 761) (31, 762)
MEAN: train(307511) vs holdout(31):  0.08072881945686496 0.0967741935483871
Fold  1 [ 1145] AUC : ho: 0.773810 / te: 0.793847 / tr: 0.863795 (diff: 0.020038)
Fold  2 [ 1115] AUC : ho: 0.738095 / te: 0.795462 / tr: 0.862026 (diff: 0.057366)
Fold  3 [ 1117] AUC : ho: 0.750000 / te: 0.797585 / tr: 0.862372 (diff: 0.047585)
Fold  4 [ 1283] AUC : ho: 0.773810 / te: 0.795941 / tr: 0.869706 (diff: 0.022131)
Fold  5 [ 1084] AUC : ho: 0.714286 / te: 0.798190 / tr: 0.860893 (diff: 0.083904)
Full HO score 0.761905
FULL HO mean 0.750000, std 0.022588
FULL TE mean 0.796205, std 0.001551
FULL TR mean 0.863759, std 0.003115
FULL DIFF mean 0.046205, std 0.023715
fe_936_col762_lr0.05_n5.csv
Run LightGBM with kfold - done in 144s


In [13]:
def large_new(train_ori,test_preds,th=0.5,label='TARGET',random_state=19):
    """Move confidently-predicted test rows into the training set as positives.

    Test rows whose ``label`` value is strictly greater than ``th`` are
    relabelled to 1, appended to ``train_ori`` and the result is shuffled with
    ``random_state``. Those rows are removed from the test frame by index label.

    Returns
    -------
    (train_new, test_new, large_test_ori)
        The augmented, shuffled training frame; the remaining test rows; and
        the selected test rows with their original (un-relabelled) ``label``
        values. Neither input frame is modified.
    """
    print('original train shape is:{}. Test shape is: {}'.format(train_ori.shape, test_preds.shape))

    above_threshold = test_preds[label] > th
    pseudo_rows = test_preds[above_threshold].copy()
    print('In test, the prediction greater than {} is selected. The shape is: {}'.format(th, pseudo_rows.shape))

    # Snapshot before relabelling so the caller gets the original predictions back.
    selected_with_preds = pseudo_rows.copy()
    pseudo_rows[label] = 1

    augmented = pd.concat([train_ori, pseudo_rows])
    train_new = augmented.sample(frac=1, random_state=random_state)
    test_new = test_preds.drop(pseudo_rows.index)

    print('new train shape is: {}'.format(train_new.shape))
    print('new test shape is: {}'.format(test_new.shape))

    return train_new, test_new, selected_with_preds

In [14]:
# Pseudo-label test rows predicted above 0.6. test_drop carries a TARGET column
# here only because kfold_lightgbm assigned its predictions on the frame in
# place during the previous run, so this cell must run after that one.
train_new,test_new,large_test = large_new(train_drop,test_drop,th=0.6)

original train shape is:(307511, 762). Test shape is: (48744, 762)
In test, the prediction greater than 0.6 is selected. The shape is: (62, 762)
new train shape is: (307573, 762)
new test shape is: (48682, 762)


In [15]:
# Second run on the pseudo-label-augmented training set; overwrites the result
# variables of the first run. The output file name depends only on column
# count, learning rate and fold count, so the CSV / OOF pickle of the first run
# are overwritten as well (same name in both recorded outputs).
feature_importance_df, feature_importance_gain_df,holdout_roc,holdout_mean,full_te_mean,full_tr_mean,oof_preds,test_preds = runlgb(train_new, test_new, holdout)

#############################################
0.07 20 5 16 20 1 0.5 0.05 nfolds: 5
#############################################
(307573, 762) (48682, 762) (31, 762)
MEAN: train(307573) vs holdout(31):  0.08091412445175616 0.0967741935483871
Fold  1 [ 1071] AUC : ho: 0.773810 / te: 0.792599 / tr: 0.861578 (diff: 0.018789)
Fold  2 [ 1520] AUC : ho: 0.761905 / te: 0.795077 / tr: 0.879233 (diff: 0.033172)
Fold  3 [ 1135] AUC : ho: 0.738095 / te: 0.800065 / tr: 0.861707 (diff: 0.061970)
Fold  4 [ 1039] AUC : ho: 0.726190 / te: 0.797378 / tr: 0.858906 (diff: 0.071188)
Fold  5 [ 1294] AUC : ho: 0.773810 / te: 0.798421 / tr: 0.870835 (diff: 0.024611)
Full HO score 0.761905
FULL HO mean 0.754762, std 0.019343
FULL TE mean 0.796708, std 0.002615
FULL TR mean 0.866452, std 0.007560
FULL DIFF mean 0.041946, std 0.020831
fe_936_col762_lr0.05_n5.csv
Run LightGBM with kfold - done in 147s


In [16]:
# Rebuild a submission covering every test ID: test_new holds the second-run
# predictions (assigned in place by kfold_lightgbm), large_test holds the
# first-run predictions of the rows that were moved into the training set.
# The left join on test keeps the original test row order.
test_sub = test[['SK_ID_CURR']].merge(pd.concat([test_new,large_test])[['SK_ID_CURR','TARGET']],how='left',left_on='SK_ID_CURR',right_on='SK_ID_CURR')
print(test_sub.shape)

(48744, 2)


In [17]:
# Write the combined submission without the index column.
test_sub.to_csv('best_psudo_th0.6.csv',index=False)

In [53]:
# Count submission rows whose TARGET is exactly 1 — presumably a sanity check
# that the hard pseudo-labels did not end up in the submission (0 in the
# recorded output).
(test_sub['TARGET'] == 1).sum()

0