In [1]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
sys.path.append('../LIB/')
from env import ENV
from sklearn.preprocessing import normalize
from tqdm import tqdm
import pickle
from sklearn.preprocessing.data import QuantileTransformer
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
import gc
from sklearn.cluster import KMeans

print_to_file = False 
test_run = False 

train = pd.read_pickle(ENV.lightgbm_train_764.value)
print('train shape is: {}'.format(train.shape))
test = pd.read_pickle(ENV.lightgbm_test_764.value)
print('test shape is: {}'.format(test.shape))
fe_id = 'comb_764'


train shape is: (307511, 764)
test shape is: (48744, 763)


In [2]:
train_id = train['SK_ID_CURR']
test_id = test['SK_ID_CURR']

train['SK_ID_CURR'] = train['SK_ID_CURR'].astype(int)
test['SK_ID_CURR'] = test['SK_ID_CURR'].astype(int)
targets = train.TARGET.values

In [3]:
def scan_nan_portion(df):
    portions = []
    columns = []
    for col in df.columns:
        columns.append(col)
        portions.append(np.sum(df[col].isnull())/len(df))
    return pd.Series(data=portions, index=columns)

from sklearn.model_selection import train_test_split

def get_time(timezone='America/New_York', time_format='%Y-%m-%d %H:%M:%S'):
    from datetime import datetime
    from dateutil import tz

    # METHOD 1: Hardcode zones:
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz(timezone)

    utc = datetime.utcnow()

    # Tell the datetime object that it's in UTC time zone since 
    # datetime objects are 'naive' by default
    utc = utc.replace(tzinfo=from_zone)

    # Convert time zone
    est = utc.astimezone(to_zone)

    return est.strftime(time_format)

import sys, time
class Logger(object):
    def __init__(self, logtofile=True, logfilename='log'):
        self.terminal = sys.stdout
        self.logfile = "{}_{}.log".format(logfilename, int(time.time()))
        self.logtofile = logtofile

    def write(self, message):
        #         self.terminal.write(message)
        if self.logtofile:
            self.log = open(self.logfile, "a")
            self.log.write('[' + get_time() + '] ' + message)
            self.log.close()

    def flush(self):
        # this flush method is needed for python 3 compatibility.
        # this handles the flush command by doing nothing.
        # you might want to specify some extra behavior here.
        pass


def divert_printout_to_file():
    sys.stdout = Logger(logfilename='logfile')

if print_to_file:
    divert_printout_to_file()  # note: comment this to use pdb

import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, train_df, test_df, holdout, num_folds, submission_file_name, fe_img_name, stratified = False, debug= False, colsample=0.67, max_depth=8, num_leaves=31, min_child_samples=20, subsample=0.7, reg_lambda=0.3, lr=0.04, seed=1001, verbose=100, rounds=None):
    print(train_df.shape, test_df.shape, holdout.shape)
    print('MEAN: train({}) vs holdout({}): '.format(len(train_df), len(holdout)), train_df['TARGET'].mean(), holdout['TARGET'].mean())
    # Divide in training/validation and test data
    if df is not None:
        train_df = df[df['TARGET'].notnull()]
        test_df = df[df['TARGET'].isnull()]
        print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
        del df
        gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=seed)
        
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    holdout_final_preds = np.zeros(holdout.shape[0])
    feature_importance_df = pd.DataFrame()
    feature_importance_gain_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    train_scores = []
    holdout_scores = []
    scores = []
    diff_val_holdout = []
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
#         print('valid index : ',list(valid_idx)[:5])
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
#         print('MEAN: train({}) vs valid({}): '.format(len(train_y), len(valid_y)), np.mean(train_y), np.mean(valid_y))

        clf = LGBMClassifier(
            nthread=18,
            n_estimators=30000,
            learning_rate=lr,
            num_leaves=num_leaves,
            colsample_bytree=colsample, # 0.67
            subsample=subsample,
            subsample_freq=0, ## disable subsampling
            max_depth=max_depth,
            reg_alpha=0.65,
            reg_lambda=reg_lambda,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            min_child_samples=min_child_samples,
            silent=-1,
            verbose=-1, )
        if rounds is not None:
            clf.n_estimators = rounds
            clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
                eval_metric= 'auc', verbose=verbose)
            oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
            sub_preds += clf.predict_proba(test_df[feats])[:, 1] / folds.n_splits
            holdout_preds = clf.predict_proba(holdout[feats])[:, 1] 
        else:
            clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
                eval_metric= 'auc', verbose=verbose, early_stopping_rounds= 200)
            oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
            sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
            holdout_preds = clf.predict_proba(holdout[feats], num_iteration=clf.best_iteration_)[:, 1] 
            
        holdout_final_preds += holdout_preds / folds.n_splits
        score = roc_auc_score(valid_y, oof_preds[valid_idx])
        train_score = clf.best_score_['training']['auc']
        holdout_score = roc_auc_score(holdout['TARGET'], holdout_preds)
        diff = abs(score - holdout_score)
        best_rounds = rounds if rounds is not None else clf.best_iteration_
        print('Fold %2d [%5d] AUC : ho: %.6f / te: %.6f / tr: %.6f (diff: %.6f)' % (n_fold + 1, best_rounds, holdout_score, score,  train_score, diff))
        scores.append(score)
        train_scores.append(train_score)
        holdout_scores.append(holdout_score)
        diff_val_holdout.append(diff)
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        fold_importance_gain_df = pd.DataFrame()
        fold_importance_gain_df["feature"] = feats
        fold_importance_gain_df["importance"] = clf.booster_.feature_importance(importance_type='gain')
        fold_importance_gain_df["fold"] = n_fold + 1
        feature_importance_gain_df = pd.concat([feature_importance_gain_df, fold_importance_gain_df], axis=0)
        
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    holdout_roc = roc_auc_score(holdout['TARGET'], holdout_final_preds)
    holdout_mean = np.mean(holdout_scores)
    full_te_mean = np.mean(scores)
    full_tr_mean = np.mean(train_scores)
#     print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    print('Full HO score %.6f' % holdout_roc)
    print('FULL HO mean {:.6f}, std {:.6f}'.format(holdout_mean, np.std(holdout_scores)))
    print('FULL TE mean {:.6f}, std {:.6f}'.format(full_te_mean, np.std(scores)))
    print('FULL TR mean {:.6f}, std {:.6f}'.format(full_tr_mean, np.std(train_scores)))
    print('FULL DIFF mean {:.6f}, std {:.6f}'.format(np.mean(diff_val_holdout), np.std(diff_val_holdout)))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
#     if not print_to_file:
#         display_importances(feature_importance_df, fe_img_name)
    feature_importance_df = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).reset_index()
    feature_importance_gain_df = feature_importance_gain_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).reset_index()
    return feature_importance_df, feature_importance_gain_df,holdout_roc,holdout_mean,full_te_mean,full_tr_mean,oof_preds 

# Display/plot feature importance
def display_importances(feature_importance_df_, fe_img_name):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig(fe_img_name+'.png')


def convert_and_save_imp_df(fe_imp_df, dumpfilename):
    fe_imp_df_mean = fe_imp_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).reset_index()
    pickle.dump(fe_imp_df_mean, open(dumpfilename,'wb'))
    
    
def runlgb(train, test, holdout):
    colsamples = [0.07]#[0.1,0.15,0.2]#[0.03,0.04,0.05,0.06,0.07,0.08]
    seeds = [20]#[300,4000,50000,600000,7000000,80000000,523445,31275479] # 20
    depth = [5]
    leaves = [16]
    min_child_sam = [20]#, 800]
    subsamples = [1]#0.8, 0.7, 0.6, 0.5, 0.4] # was 1
    reg_lambdas = [0.5]
    # lrs = lrs.tolist()
    lrs2 = [0.1]
    nfolds = 4 
    rounds = [None] #[1000]#, 1300, 1600, 1900, 2200, 2500]
    for seed in seeds:
        for colsample in colsamples:
            for d in depth:
                for l in leaves:
                    for mcs in min_child_sam:
                        for subsample in subsamples:
                            for reg_lambda in reg_lambdas:
                                for lr in lrs2:
                                    for r in rounds:
                                        filename = 'fe_936_col{}_lr{}_n{}'.format(len(train.columns), lr, nfolds)
                                        print('#############################################')
                                        print(colsample, seed, d, l, mcs, subsample, reg_lambda, lr, 'nfolds:', nfolds)
                                        print('#############################################')
                                        numfeats = len(train.columns)
                                        with timer("Run LightGBM with kfold"):
                                            return kfold_lightgbm(None, train, test, holdout, nfolds, filename+'.csv', filename, colsample=colsample, verbose=None, max_depth=d, num_leaves=l, min_child_samples=mcs, subsample=subsample, reg_lambda=reg_lambda, lr=lr, seed=seed, stratified=True, rounds=r,debug=True)
    #                                         

In [4]:
# X = pd.concat([train.drop('TARGET',axis=1),test])
# print(X.shape)

# X_coff = X.drop('SK_ID_CURR',axis=1)
# coff = X_coff.corr()
# coff.to_pickle('../../data/add_features/coff_764.pkl')

In [5]:
coff = pd.read_pickle(ENV.coff_764.value)
coff_matrix = coff.values
columns_index = coff.columns.values
report = pd.read_pickle(ENV.drop_column_report.value)

In [12]:
col_name,thred_hroc,thred_hmean,thred_fulltemean,threa_tr_mean= list(report[report.drop_column=='nodrop'].values[0])

### roc get increased number

In [25]:
sum(report.holdout_roc > thred_hroc)

208

### holdout_mean increased number

In [26]:
sum(report.holdout_mean > thred_hmean)

122

### full test mean get increased number

In [27]:
sum(report.full_te_mean > thred_fulltemean)

47

### All get increased number

In [28]:
sum((report.full_te_mean > thred_fulltemean) & (report.holdout_mean > thred_hmean) & (report.holdout_roc > thred_hroc))

18

# all increased columns 

In [33]:
drop_list = list(report[(report.full_te_mean > thred_fulltemean) & (report.holdout_mean > thred_hmean) & (report.holdout_roc > thred_hroc)]['drop_column'].values)
print(len(drop_list))

18


In [7]:
kmeans = KMeans(n_clusters=600, random_state=0).fit(coff_matrix)
labels = kmeans.labels_.copy()

# Process original

In [9]:
train_df, holdout = train_test_split(train, test_size=1/7, random_state=42)
print('MEAN: train({}) vs holdout({}): '.format(len(train_df), len(holdout)), train_df['TARGET'].mean(), holdout['TARGET'].mean())
print(train_df.shape, test.shape, holdout.shape)

MEAN: train(263580) vs holdout(43931):  0.0807572653463844 0.08055814800482575
(263580, 764) (48744, 763) (43931, 764)


In [10]:
feature_importance_df, feature_importance_gain_df,holdout_roc,holdout_mean,full_te_mean,full_tr_mean,oof_preds = runlgb(train_df, test, holdout)

#############################################
0.07 20 5 16 20 1 0.5 0.1 nfolds: 4
#############################################
(263580, 764) (48744, 763) (43931, 764)
MEAN: train(263580) vs holdout(43931):  0.0807572653463844 0.08055814800482575
Fold  1 [  528] AUC : ho: 0.792890 / te: 0.793603 / tr: 0.867796 (diff: 0.000713)
Fold  2 [  434] AUC : ho: 0.793506 / te: 0.791989 / tr: 0.857608 (diff: 0.001517)
Fold  3 [  561] AUC : ho: 0.792675 / te: 0.794414 / tr: 0.870024 (diff: 0.001739)
Fold  4 [  415] AUC : ho: 0.793564 / te: 0.788659 / tr: 0.855359 (diff: 0.004905)
Full HO score 0.797207
FULL HO mean 0.793159, std 0.000385
FULL TE mean 0.792166, std 0.002205
FULL TR mean 0.862697, std 0.006313
FULL DIFF mean 0.002219, std 0.001597
Run LightGBM with kfold - done in 89s


# Process remove all increased

In [35]:
train_act_columns = list(set(train.columns) - set(drop_list))
test_act_columns = list(set(test.columns) - set(drop_list))
train_act = train[train_act_columns].copy()
test_act = test[test_act_columns].copy()
print(train_act.shape)
print(test_act.shape)

(307511, 746)
(48744, 745)


In [36]:
train_df, holdout = train_test_split(train_act, test_size=1/7, random_state=42)
print('MEAN: train({}) vs holdout({}): '.format(len(train_df), len(holdout)), train_df['TARGET'].mean(), holdout['TARGET'].mean())
print(train_df.shape, test.shape, holdout.shape)

MEAN: train(263580) vs holdout(43931):  0.0807572653463844 0.08055814800482575
(263580, 746) (48744, 763) (43931, 746)


In [38]:
feature_importance_df, feature_importance_gain_df,holdout_roc,holdout_mean,full_te_mean,full_tr_mean,oof_preds = runlgb(train_df, test_act, holdout)

#############################################
0.07 20 5 16 20 1 0.5 0.1 nfolds: 4
#############################################
(263580, 746) (48744, 745) (43931, 746)
MEAN: train(263580) vs holdout(43931):  0.0807572653463844 0.08055814800482575
Fold  1 [  515] AUC : ho: 0.791892 / te: 0.792562 / tr: 0.865798 (diff: 0.000670)
Fold  2 [  555] AUC : ho: 0.792064 / te: 0.791566 / tr: 0.868780 (diff: 0.000499)
Fold  3 [  635] AUC : ho: 0.792874 / te: 0.792257 / tr: 0.877403 (diff: 0.000617)
Fold  4 [  530] AUC : ho: 0.794579 / te: 0.789682 / tr: 0.867789 (diff: 0.004897)
Full HO score 0.797592
FULL HO mean 0.792852, std 0.001064
FULL TE mean 0.791517, std 0.001119
FULL TR mean 0.869943, std 0.004439
FULL DIFF mean 0.001671, std 0.001864
Run LightGBM with kfold - done in 72s


# Process V1

In [8]:
selected_columns = []
for index in tqdm(range(len(labels))):
    filtered_array = (labels==labels[index])
    if filtered_array.sum() == 1:
        selected_columns.append(columns_index[index])
    else:
        columns = columns_index[filtered_array]
        na_result = scan_nan_portion(X[columns])
        na_result=na_result.sort_values()
        selected_columns.append(na_result.index.values[0])
selected_columns = list(set(selected_columns))
print(len(selected_columns))

  0%|          | 0/762 [00:00<?, ?it/s]


NameError: name 'X' is not defined

600

In [100]:
test_select = test[selected_columns].copy()
selected_columns.append('TARGET')
train_select = train[selected_columns].copy()

In [101]:
train_df, holdout = train_test_split(train_select, test_size=1/7, random_state=42)
print('MEAN: train({}) vs holdout({}): '.format(len(train_df), len(holdout)), train_df['TARGET'].mean(), holdout['TARGET'].mean())
print(train_df.shape, test.shape, holdout.shape)

MEAN: train(263580) vs holdout(43931):  0.0807572653463844 0.08055814800482575
(263580, 601) (48744, 763) (43931, 601)


In [102]:
feature_importance_df, feature_importance_gain_df,holdout_roc,holdout_mean,full_te_mean,full_tr_mean,oof_preds = runlgb(train_df, test_select, holdout)

#############################################
0.07 20 5 16 20 1 0.5 0.1 nfolds: 4
#############################################
(263580, 601) (48744, 600) (43931, 601)
MEAN: train(263580) vs holdout(43931):  0.0807572653463844 0.08055814800482575
Fold  1 [  490] AUC : ho: 0.793089 / te: 0.792989 / tr: 0.860782 (diff: 0.000100)
Fold  2 [  480] AUC : ho: 0.792482 / te: 0.791004 / tr: 0.860266 (diff: 0.001478)
Fold  3 [  720] AUC : ho: 0.793195 / te: 0.793937 / tr: 0.882264 (diff: 0.000743)
Fold  4 [  367] AUC : ho: 0.793329 / te: 0.787599 / tr: 0.848219 (diff: 0.005731)
Full HO score 0.797428
FULL HO mean 0.793024, std 0.000324
FULL TE mean 0.791382, std 0.002427
FULL TR mean 0.862883, std 0.012267
FULL DIFF mean 0.002013, std 0.002201
Run LightGBM with kfold - done in 75s


In [95]:
feature_importance_df, feature_importance_gain_df,holdout_roc,holdout_mean,full_te_mean,full_tr_mean,oof_preds = runlgb(train_df, test, holdout)

#############################################
0.07 20 5 16 20 1 0.5 0.1 nfolds: 4
#############################################
(263580, 764) (48744, 763) (43931, 764)
MEAN: train(263580) vs holdout(43931):  0.0807572653463844 0.08055814800482575
Fold  1 [  528] AUC : ho: 0.792890 / te: 0.793603 / tr: 0.867796 (diff: 0.000713)
Fold  2 [  434] AUC : ho: 0.793506 / te: 0.791989 / tr: 0.857608 (diff: 0.001517)
Fold  3 [  561] AUC : ho: 0.792675 / te: 0.794414 / tr: 0.870024 (diff: 0.001739)
Fold  4 [  415] AUC : ho: 0.793564 / te: 0.788659 / tr: 0.855359 (diff: 0.004905)
Full HO score 0.797207
FULL HO mean 0.793159, std 0.000385
FULL TE mean 0.792166, std 0.002205
FULL TR mean 0.862697, std 0.006313
FULL DIFF mean 0.002219, std 0.001597
Run LightGBM with kfold - done in 87s
