# Utils methods

In [1]:
import sys

# This script makes it possible to describe a model in one argument.
#
# A target string is a '_'-separated list of key=value options that get_opt()
# parses.  The 'params' option holds a comma-separated list of hyper-parameter
# values; get_params() drops the first entry and maps the rest onto parameter
# names by position.

# Run description for the LightGBM model.
lgb_target = 'model=LGBM_feat=lgbmBest_categoricalThreVal=10000_validation=subm_params=-,gbdt,0.45,0.04,188,7,auc,20,5,0,20,76,binary,0,0,32.0,1.0,200000,1,0'
# Run description for the Keras model.
keras_target = 'BatchNormalization=on_sameNDenseAsEmb=off_model=keras_feat=kerasBest_validation=team_params=-,20000,1000,1,0.2,100,2,0.001,0.0001,0.001,100,2,3'
# Active run description; '' means "not resolved yet" (get_target() fills it in).
target = lgb_target#''

def get_opt(name,default=None):
    """Return the value of option ``name`` from the run description ``target``.

    ``target`` is a string of ``key=value`` fields separated by single
    underscores.  A double underscore (``__``) inside a field stands for one
    literal underscore.

    Args:
        name: Option key to look up.
        default: Value returned when the option is absent or no target is
            set.  Its type also selects the conversion applied to a found
            value: ``int`` default -> ``int(value)``, ``float`` default ->
            ``float(value)``, anything else -> the raw string.

    Returns:
        The converted option value, or ``default`` when not found.

    Raises:
        ValueError: If a field has no ``=`` or a value cannot be converted to
            the numeric type implied by ``default``.
    """
    global target
    if target == '':
        target = get_target()
    if target == '':
        return default
    # Protect escaped underscores with a character that cannot appear in an
    # option string, split on the remaining separators, then restore them.
    # (Replacing with '' here and un-replacing '' later would insert an
    # underscore between every character of the value.)
    placeholder = '\x00'
    for fld in target.replace('__', placeholder).split('_'):
        if fld == '':
            continue
        # Split on the first '=' only so values may themselves contain '='.
        key, val = fld.split('=', 1)
        if key.replace(placeholder, '_') != name:
            continue
        if isinstance(default, int):
            return int(val)
        if isinstance(default, float):
            return float(val)
        return val.replace(placeholder, '_')
    return default

def get_target():
    """Return the active run description, resolving it on first use.

    Resolution order when the module-level ``target`` is still ``''``:
    the first command-line argument if one was given, otherwise the
    module-level ``default_target`` if such a name exists, otherwise ``''``.
    The resolved value is stored back into ``target``.

    Returns:
        The run-description string (possibly ``''`` when nothing is
        configured; ``get_opt`` treats that as "use defaults").
    """
    global target
    if target != '':
        return target
    if len(sys.argv) > 1:
        target = sys.argv[1]
    else:
        # ``default_target`` is optional: fall back to '' instead of raising
        # NameError when the module does not define it.
        target = globals().get('default_target', '')
    return target

def reset_target():
    """Forget the current run description and resolve it again.

    Clears the module-level ``target`` and lets ``get_target`` re-derive it.

    Returns:
        The freshly resolved run-description string.
    """
    global target
    target = ''
    # A single call both resolves and returns the value; the former extra
    # call only filled an unused local.
    return get_target()

def set_target(tgt):
    """Install ``tgt`` as the module-level run description ``target``."""
    globals()['target'] = tgt


# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import random as rn
from sklearn.metrics import roc_auc_score
import sys
import os
# from lib_util import get_target,get_opt
import lightgbm as lgb
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, concatenate, BatchNormalization, SpatialDropout1D
from keras.callbacks import Callback
from keras.initializers import RandomUniform
from keras.models import Model
from keras.optimizers import Adam
import gc

def get_params(params_str):
    """Turn the comma-separated ``params`` option into a name -> value dict.

    The parameter names are chosen from the ``model`` option (``keras`` or
    anything containing ``LGBM``).  The first comma-separated entry of
    ``params_str`` is discarded; the remaining entries are paired with the
    names by position.

    Args:
        params_str: Value of the ``params`` option, e.g. ``'-,gbdt,0.45,...'``.

    Returns:
        dict mapping parameter name to its value as a *string* (callers
        convert to numbers where needed).

    Exits:
        Terminates the process with status 1 when the model option is not
        recognised or the number of values does not match the name list.
    """
    model = get_opt('model')
    if model == 'keras':
        names = ['batch_size', 'dense_cate', 'dense_nume_n_layers', 'drop', 'emb_cate', 'epochs_for_lr', 'lr', 'lr_fin', 'lr_init', 'max_epochs', 'n_layers', 'patience']
    elif model is not None and 'LGBM' in model:
        names = ['boosting_type','colsample_bytree','learning_rate','max_bin','max_depth','metric','min_child_samples','min_child_weight','min_split_gain','nthread','num_leaves','objective','reg_alpha','reg_lambda','scale_pos_weight','subsample','subsample_for_bin','subsample_freq','verbose']
    else:
        print("no valid target")
        sys.exit(1)
    # Drop the leading entry of the list.
    pvals = params_str.split(',')[1:]
    if len(pvals) != len(names):
        print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!ERR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print('params: count is not fit',len(pvals), len(names))
        print('params_str:',params_str)
        print('names:',names)
        print('param_values:',pvals)
        # This is an error path: report a non-zero exit status.
        sys.exit(1)
    return dict(zip(names, pvals))

def LGBM(X_tr,X_va,X_te,predictors,cat_feats,seed=2018):
    """Train the LightGBM model described by the ``params`` option.

    Args:
        X_tr, X_va, X_te: Train / validation / test frames handed through to
            ``LGBM_helper``.
        predictors: Feature column names.
        cat_feats: Categorical feature column names.
        seed: Random seed forwarded to ``LGBM_helper``.

    Returns:
        Whatever ``LGBM_helper`` returns, or ``None`` when no ``params``
        option is configured.
    """
    params_str = get_opt('params')
    if params_str is None:
        return None
    params = get_params(params_str)
    # Forward the caller's seed (it used to be pinned to 2018 here).
    return LGBM_helper(X_tr,X_va,X_te,predictors,cat_feats,params,seed=seed)

def Keras(X_tr,X_va,X_te,predictors,cat_feats,seed=2018):
    """Train the Keras model described by the ``params`` option.

    Args:
        X_tr, X_va, X_te: Train / validation / test frames handed through to
            ``Keras0_helper``.
        predictors: Feature column names.
        cat_feats: Categorical feature column names.
        seed: Random seed forwarded to ``Keras0_helper``.

    Returns:
        Whatever ``Keras0_helper`` returns, or ``None`` when no ``params``
        option is configured.
    """
    params_str = get_opt('params')
    if params_str is None:
        return None
    params = get_params(params_str)
    # Forward the caller's seed (it used to be pinned to 2018 here).
    return Keras0_helper(X_tr,X_va,X_te,predictors,cat_feats,params,seed=seed)

def LGBM_helper(_X_tr,_X_va,_X_te,predictors,cat_feats,params,seed=2018):
    """Train a LightGBM booster, score the test frame and report the metric.

    Args:
        _X_tr, _X_va, _X_te: Train / validation / test frames.  Each must
            contain the ``predictors`` columns and an ``is_attributed`` label
            column.  ``_X_te`` gains a ``pred`` column with the predictions.
        predictors: Feature column names used for training.
        cat_feats: Column names passed to LightGBM as categorical features.
        params: LightGBM parameters as produced by ``get_params`` (string
            values).  The dict is copied, so the caller's object is untouched.
        seed: Seed applied to NumPy, ``random`` and LightGBM's seed options.

    Returns:
        The value of ``params['metric']`` on the validation set at the
        booster's best iteration.
    """
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(seed)
    rn.seed(seed)
    X_tr = _X_tr[predictors]
    X_va = _X_va[predictors]
    X_te = _X_te[predictors]
    y_tr = _X_tr['is_attributed']
    y_va = _X_va['is_attributed']
    y_te = _X_te['is_attributed']
    # Work on a private copy so the seed / type fix-ups below do not leak
    # into the caller's dict.
    params = dict(params)
    params['feature_fraction_seed'] = seed
    params['bagging_seed'] = seed
    params['drop_seed'] = seed
    params['data_random_seed'] = seed
    params['num_leaves'] = int(params['num_leaves'])
    params['subsample_for_bin'] = int(params['subsample_for_bin'])
    # max_depth is derived from num_leaves; the value supplied in ``params``
    # is overwritten.
    params['max_depth'] = int(np.log2(params['num_leaves'])+1.2)
    params['max_bin'] = int(params['max_bin'])
    print('*'*50)
    for k,v in sorted(params.items()):
        print(k,':',v)
    columns = X_tr.columns

    # The former ``DataFrame.drop(predictors, axis=1)`` calls were removed:
    # their result was discarded, so they only built and threw away a copy.
    print('start for lgvalid')
    lgvalid = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_feats)
    del _X_va, X_va, y_va
    gc.collect()

    print('start for lgtrain')
    lgtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_feats)
    del _X_tr, X_tr, y_tr
    gc.collect()

    evals_results = {}
    if get_opt('trainCheck','-') == 'on':
         valid_names=['train','valid']
         valid_sets=[lgtrain, lgvalid]
    else:
         valid_names=['valid']
         valid_sets=[lgvalid]
    if get_opt('testCheck','-') == 'on':
         valid_names.append('test')
         lgtest = lgb.Dataset(X_te, label=y_te, categorical_feature=cat_feats)
         valid_sets.append(lgtest)

    print('start training')
    bst = lgb.train(params,
                     lgtrain,
                     valid_sets=valid_sets,
                     valid_names=valid_names,
                     evals_result=evals_results,
                     num_boost_round=2000,
                     early_stopping_rounds=100,
                     verbose_eval=10,
                     )

    # Report feature importance twice: default importance, then by gain.
    importance = bst.feature_importance()
    print('importance (count)')
    tuples = sorted(zip(columns, importance), key=lambda x: x[1],reverse=True)
    for col, val in tuples:
        print(val,"\t",col)

    importance = bst.feature_importance(importance_type='gain')
    print('importance (gain)')
    tuples = sorted(zip(columns, importance), key=lambda x: x[1],reverse=True)
    for col, val in tuples:
        print(val,"\t",col)

    n_estimators = bst.best_iteration
    metric = params['metric']
    # best_iteration is used as a 1-based round number, hence the -1.
    auc = evals_results['valid'][metric][n_estimators-1]
    _X_te['pred'] = bst.predict(X_te)

    return auc

class EarlyStopping(Callback):
    """Keras callback that early-stops on ROC AUC and checkpoints every epoch.

    After each epoch the callback computes ROC AUC on the validation data
    (and on the training / test data when supplied), prints the scores,
    saves the model weights to ``<model_file>.<epoch>`` and stops training
    once the monitored score has not improved by more than ``min_delta`` for
    ``patience`` consecutive epochs.  The monitored score is the test AUC
    when the ``testCheck`` option is ``'on'``, otherwise the validation AUC.

    Args:
        training_data: Optional ``(inputs, labels)`` pair; falsy to skip.
        validation_data: ``(inputs, labels)`` pair.  Required in practice:
            the default ``False`` cannot be indexed.
        testing_data: Optional ``(inputs, labels)`` pair; falsy to skip.
        min_delta: Minimum improvement that counts as progress.
        patience: Number of non-improving epochs tolerated before stopping.
        model_file: Path prefix for per-epoch weight files; falsy to skip
            saving.
        verbose: When > 0, print a message at the end of early-stopped runs.

    Attributes read by callers: ``best_epoch`` (epoch index with the best
    monitored score).
    """
    def __init__(self,training_data=False,validation_data=False, testing_data=False, min_delta=0, patience=0, model_file=None, verbose=0):
        super(EarlyStopping, self).__init__()
        self.best_epoch = 0
        self.patience = patience
        self.verbose = verbose
        self.min_delta = min_delta
        self.wait = 0
        self.stopped_epoch = 0
        # Higher AUC is better.
        self.monitor_op = np.greater
        if training_data:
            self.x_tr = training_data[0]
            self.y_tr = training_data[1]
        else:
            self.x_tr = False
            self.y_tr = False
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]
        if testing_data:
            self.x_te = testing_data[0]
            self.y_te = testing_data[1]
        else:
            self.x_te = False
            self.y_te = False
        self.model_file = model_file
    def on_train_begin(self, logs=None):
        """Reset the early-stopping bookkeeping at the start of training."""
        self.wait = 0
        self.best_epoch = 0
        self.stopped_epoch = 0
        # ``np.inf`` is the spelling that survives NumPy 2.0 (``np.Inf`` was
        # removed there).
        self.best = -np.inf
    def on_train_end(self, logs=None):
        """Announce the best epoch when training was stopped early."""
        if self.stopped_epoch > 0 and self.verbose > 0:
            print('Epoch ',self.best_epoch,': EarlyStopping')
    def on_epoch_end(self, epoch, logs=None):
        """Score the model, checkpoint its weights and update the stop state."""

        if self.x_tr:
            y_pred = self.model.predict(self.x_tr,batch_size=100000)
            roc_tr = roc_auc_score(self.y_tr, y_pred)
        else:
            roc_tr = 0

        y_hat_val=self.model.predict(self.x_val,batch_size=100000)
        roc_val = roc_auc_score(self.y_val, y_hat_val)

        if self.x_te:
            y_hat_te=self.model.predict(self.x_te,batch_size=100000)
            roc_te = roc_auc_score(self.y_te, y_hat_te)
        else:
            roc_te = 0
        print('roc-auc: %s - roc-auc_val: %s - roc-auc_test: %s' % (str(round(roc_tr,6)),str(round(roc_val,6)), str(round(roc_te,6))),end=100*' '+'\n')

        # Weights of every epoch are kept so the caller can reload the best
        # one (and its neighbours) afterwards.
        if self.model_file:
            print("saving",self.model_file+'.'+str(epoch))
            self.model.save_weights(self.model_file+'.'+str(epoch))
        if(self.x_val):
            if get_opt('testCheck','-') == 'on': 
                current = roc_te
            else:
                current = roc_val
            if self.monitor_op(current - self.min_delta, self.best):
                self.best = current
                self.best_epoch = epoch
                self.wait = 0
            else:
                self.wait += 1
                if self.wait >= self.patience:
                    self.stopped_epoch = epoch
                    self.model.stop_training = True

def Keras0_helper(_X_tr,_X_va,_X_te,predictors,cat_feats,params,seed=2018):
    """Build, train and apply the embedding + dense Keras network.

    Each categorical feature gets its own ``Embedding`` input; all numerical
    features share one dense input branch.  Training is monitored by the
    ``EarlyStopping`` callback defined in this file, the weights of the best
    epoch are reloaded, and predictions are written to the ``pred`` column
    of ``_X_te`` and ``_X_va``.  When the ``avgEpoch`` option is positive,
    predictions of neighbouring epochs are blended in with weight 0.5 each.

    Args:
        _X_tr, _X_va, _X_te: Train / validation / test frames containing the
            ``predictors`` columns and an ``is_attributed`` label column.
        predictors: Feature column names.
        cat_feats: Names of the categorical (embedded) features.
        params: Hyper-parameters from ``get_params`` (string values).
        seed: Seed for NumPy, ``random`` and the layer initializers.

    Returns:
        ROC AUC of the final validation predictions, or ``0`` when the model
        cannot be built (no features, a single embedding, or an embedding
        width of 1).
    """
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(seed)
    rn.seed(seed)
    X_tr = _X_tr[predictors]
    X_va = _X_va[predictors]
    X_te = _X_te[predictors]
    y_tr = _X_tr['is_attributed']
    y_va = _X_va['is_attributed']
    y_te = _X_te['is_attributed']
    print('*************params**************')
    for f in sorted(params): print(f+":",params[f])
    # ``params`` values arrive as strings; convert each one once up front.
    batch_size = int(params['batch_size'])
    epochs_for_lr = float(params['epochs_for_lr'])
    max_epochs = int(params['max_epochs'])
    emb_cate = int(params['emb_cate'])
    dense_cate = int(params['dense_cate'])
    dense_nume_n_layers = int(params['dense_nume_n_layers'])
    drop = float(params['drop'])
    lr= float(params['lr'])
    lr_init = float(params['lr_init'])
    lr_fin = float(params['lr_fin'])
    n_layers = int(params['n_layers'])
    patience = int(params['patience'])
    train_dict = {}
    valid_dict = {}
    test_dict = {}
    input_list = []
    emb_list = []
    numerical_feats = []
    tot_emb_n = 0
    for col in X_tr:
        if col not in cat_feats:
            numerical_feats.append(col)
    # --- categorical branch: one embedding per feature --------------------
    if len(cat_feats) > 0:
        for col in cat_feats:
            train_dict[col] = np.array(X_tr[col])
            valid_dict[col] = np.array(X_va[col])
            test_dict[col] = np.array(X_te[col])
            inpt = Input(shape=[1], name = col)
            input_list.append(inpt)
            # Vocabulary size: largest code seen in any split, plus one.
            max_val = np.max([X_tr[col].max(), X_va[col].max(), X_te[col].max()])+1
            emb_n = np.min([emb_cate, max_val])
            if get_opt('fixEmb','on') == 'on':
                emb_n = emb_cate
            tot_emb_n += emb_n
            if emb_n == 1:
                print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!Warinig!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! emb_1 = 1")
                return 0
            print('Embedding size:',max_val, emb_cate, X_tr[col].max(), X_va[col].max(), X_te[col].max(), emb_n,col)
            embd = Embedding(max_val, emb_n)(inpt)
            emb_list.append(embd)
        if len(emb_list) == 1:
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!Warinig!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! emb_list = 1")
            return 0
        fe = concatenate(emb_list)
        s_dout = SpatialDropout1D(drop)(fe)
        x1 = Flatten()(s_dout)

    if get_opt('sameNDenseAsEmb','-') == 'on':
        dense_cate = tot_emb_n
    # --- numerical branch: shared dense stack -----------------------------
    if len(numerical_feats) > 0:
        train_dict['numerical'] = X_tr[numerical_feats].values
        valid_dict['numerical'] = X_va[numerical_feats].values
        test_dict['numerical'] = X_te[numerical_feats].values
        inpt = Input((len(numerical_feats),),name='numerical')
        input_list.append(inpt)
        x2 = inpt
        for n in range(dense_nume_n_layers):
            x2 = Dense(dense_cate,activation='relu',kernel_initializer=RandomUniform(seed=seed))(x2)
            if get_opt('numeDropout','on') != 'off':
                x2 = Dropout(drop)(x2)
            if get_opt('NumeBatchNormalization','on') != 'off':
                x2 = BatchNormalization()(x2)

    if len(numerical_feats) > 0 and len(cat_feats) > 0:
        x = concatenate([x1, x2])
    elif len(numerical_feats) > 0:
        x =  x2
    elif len(cat_feats) > 0:
        x =  x1
    else:
        return 0 # for small data test

    # --- shared head -------------------------------------------------------
    for n in range(n_layers):
        x = Dense(dense_cate,activation='relu',kernel_initializer=RandomUniform(seed=seed))(x)
        if get_opt('lastDropout','on') != 'off':
            x = Dropout(drop)(x)
        if get_opt('BatchNormalization','off') == 'on' or get_opt('LastBatchNormalization','off') == 'on':
            x = BatchNormalization()(x)
    outp = Dense(1,activation='sigmoid',kernel_initializer=RandomUniform(seed=seed))(x)
    model = Model(inputs=input_list, outputs=outp)
    if get_opt('optimizer','expo') == 'adam':
        optimizer = Adam(lr=lr)
    elif get_opt('optimizer','expo') == 'nadam':
        # Nadam is not imported at module level; import it here so this
        # branch no longer fails with NameError.
        from keras.optimizers import Nadam
        optimizer = Nadam(lr=lr)
    else:
        # Default: Adam with a per-step decay derived from lr_init / lr_fin.
        exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
        steps = int(len(X_tr) / batch_size) * epochs_for_lr
        # NOTE(review): these constants override the lr_init / lr_fin values
        # parsed from ``params`` above -- confirm whether that is intended.
        lr_init, lr_fin = 0.001, 0.0001
        lr_decay = exp_decay(lr_init, lr_fin, steps)
        optimizer = Adam(lr=lr, decay=lr_decay)
    model.compile(loss='binary_crossentropy',optimizer=optimizer)
    model.summary()
    #from keras.utils import plot_model
    #plot_model(model, to_file='model.png')

    # Per-process weight file prefix; the callback appends '.<epoch>'.
    model_file = '../work/weights.'+str(os.getpid())+'.hdf5'
    if get_opt('trainCheck','-') == 'on': 
        training_data=(train_dict, y_tr)
    else:
        training_data=False
    if get_opt('testCheck','-') == 'on':
        testing_data=(test_dict, y_te)
    else:
        testing_data=False
    aucEarlyStopping = EarlyStopping(
        training_data=training_data,
        validation_data=(valid_dict,y_va),
        testing_data=testing_data,
        patience=patience,
        model_file=model_file,
        verbose=1)
    model.fit(train_dict,
        y_tr,
        validation_data=[valid_dict, y_va],
        batch_size=batch_size,
        epochs=max_epochs,
        shuffle=True,
        verbose=1,
        callbacks=[aucEarlyStopping])
    best_epoch = aucEarlyStopping.best_epoch
    print('loading',model_file+'.'+str(best_epoch))
    model.load_weights(model_file+'.'+str(best_epoch))
    _X_te['pred'] = model.predict(test_dict, batch_size=batch_size, verbose=1)[:,0]
    _X_va['pred'] = model.predict(valid_dict, batch_size=batch_size, verbose=1)[:,0]
    if get_opt('avgEpoch',0) > 0:
        # Blend in epochs around the best one, each with weight 0.5; ``added``
        # tracks the total weight for the final normalisation.
        added = 1
        for i in range(min(get_opt('avgEpoch',0),patience)):
            best_epoch = aucEarlyStopping.best_epoch + (i+1)
            if best_epoch >= max_epochs:
                continue
            print('loading',model_file+'.'+str(best_epoch))
            model.load_weights(model_file+'.'+str(best_epoch))
            _X_te['pred'] += model.predict(test_dict, batch_size=batch_size, verbose=1)[:,0]*0.5
            _X_va['pred'] += model.predict(valid_dict, batch_size=batch_size, verbose=1)[:,0]*0.5
            added += 0.5
            best_epoch = aucEarlyStopping.best_epoch - (i+1)
            if best_epoch < 0:
                continue
            print('loading',model_file+'.'+str(best_epoch))
            model.load_weights(model_file+'.'+str(best_epoch))
            _X_te['pred'] += model.predict(test_dict, batch_size=batch_size, verbose=1)[:,0]*0.5
            _X_va['pred'] += model.predict(valid_dict, batch_size=batch_size, verbose=1)[:,0]*0.5
            added += 0.5
        _X_te['pred'] /= added
        _X_va['pred'] /= added

    # Remove every per-epoch weight file written by the callback.
    os.system('rm -f '+model_file+'.*')
    auc = roc_auc_score(y_va, _X_va.pred)
    return auc

def Predict(X_tr,X_va,X_te,predictors,cat_feats,seed=2018):
    """Dispatch training / prediction to the model named by the ``model`` option.

    Args:
        X_tr, X_va, X_te: Train / validation / test frames.
        predictors: Feature column names.
        cat_feats: Categorical feature column names.
        seed: Random seed forwarded to the selected trainer.

    Returns:
        The score returned by ``LGBM`` (model option contains ``'LGBM'``) or
        ``Keras`` (model option contains ``'keras'``).

    Exits:
        Terminates the process with status 1 when the model option is
        missing or names neither model.
    """
    model = get_opt('model')
    # The leftover ``pdb.set_trace()`` breakpoint was removed: it suspended
    # every run waiting for debugger input.
    if model is not None and 'LGBM' in model:
        return LGBM(X_tr,X_va,X_te,predictors,cat_feats,seed=seed)
    if model is not None and 'keras' in model:
        return Keras(X_tr,X_va,X_te,predictors,cat_feats,seed=seed)
    print("no valid model")
    sys.exit(1)


#-*- coding: utf-8 -*-
from __future__ import print_function
import pandas as pd
import numpy as np
import sys
import pickle
import os
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.special import logit
# from lib_util import get_target,get_opt,set_target,reset_target
import shutil
import pdb

# Resolve the run description once, when this cell / module is executed.
target = get_target()

# 'nrows' option: -1 (the default) means "no row limit", which is passed
# around as None.
nrows = get_opt('nrows', -1)
nrows = None if nrows == -1 else nrows

# Directory layout used by the loaders below.
path, work, csv_dir = '../input/', '../work/', '../csv/'

# Column dtypes handed to read_csv for the base data files.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# wrapper of pd.read_csv with cache
def read_csv(csv_file,df_len=None,nrows=None,usecols=None,dtype=None):
    """Load a CSV file as a DataFrame, with a pickle cache next to it.

    When ``nrows`` is ``None`` and ``<csv_file minus its last 4 chars>.pkl``
    exists, the pickle is loaded instead of the CSV.  After a full CSV read
    the converted frame is written to that pickle (via a per-process
    temporary name followed by a move) so later calls can reuse it.

    Args:
        csv_file: Path of the CSV file (expected to end in ``.csv``).
        df_len: Expected number of rows, or ``None`` to skip the check.
        nrows: Maximum number of rows to read; ``None`` reads everything and
            enables the cache.
        usecols: Columns to keep in the returned frame, or ``None`` for all.
        dtype: Column -> dtype mapping (or a single dtype) applied after
            reading.  Mapping keys that are not columns of the file are
            ignored.  When falsy, every column is cast to the dtype chosen
            by ``get_type_with_fld_check``.

    Returns:
        The loaded (and possibly truncated / column-filtered) DataFrame.

    Exits:
        Terminates the process with status 1 when the row count does not
        match ``df_len``.
    """
    pkl_file = csv_file[:-4] + '.pkl'
    if os.path.isfile(pkl_file) and nrows is None:
        # NOTE: pickle.load can execute arbitrary code; only cache files
        # written by this function itself should ever be placed here.
        with open(pkl_file, 'rb') as pk:
            print("loading",pkl_file)
            df = pickle.load(pk)
        if df_len is not None and len(df) != df_len:
            print('ERROR!!!!!!!!!!!!!!!!!!!!!!!',pkl_file,'is broken',len(df),df_len)
            sys.exit(1)
    else:
        print("loading",csv_file)
        df = pd.read_csv(csv_file, nrows=nrows)
        # Files whose path contains 'next' are stored as absolute values.
        if 'next' in csv_file:
            df = np.absolute(df)
        if dtype:
            # Cast once for the whole frame (not once per column) and only
            # for the columns this file actually has.
            if isinstance(dtype, dict):
                dtype = {col: typ for col, typ in dtype.items() if col in df.columns}
            df = df.astype(dtype)
        else:
            for ptn in df:
                df[ptn] = df[ptn].astype(get_type_with_fld_check(df,ptn))
        if nrows is None and (df_len is None or len(df) == df_len):
            print("saving cache file",pkl_file)
            # Write under a per-process name first, then move into place so a
            # concurrent reader never sees a half-written cache.
            with open(pkl_file+str(os.getpid()), 'wb') as pk:
                pickle.dump(df,pk,protocol=4)
            shutil.move(pkl_file+str(os.getpid()), pkl_file)
        if nrows is None and df_len and len(df) != df_len:
            print('ERROR!!!!!!!!!!!!!!!!!!!!!!!',csv_file,'is broken')
            sys.exit(1)
    if usecols is not None:
        df = df[usecols]
    if nrows is not None:
        df = df[:nrows]
    gc.collect()
    if df_len is not None and len(df) != df_len:
        print('ERROR!!!!!!!!!!!!!!!!!!!!!!!',csv_file,'line is not same',df_len,len(df))
        sys.exit(1)
    return df

def get_type_with_fld_check(df,ptn):
    """Choose a storage dtype name for column ``ptn`` of ``df``.

    Columns whose name contains one of the ratio-like markers are stored as
    ``float16``; every other column gets the narrowest unsigned integer type
    whose range exceeds the column maximum.

    Args:
        df: Frame holding the column.
        ptn: Column name.

    Returns:
        One of ``'float16'``, ``'uint8'``, ``'uint16'``, ``'uint32'``.
    """
    peak = df[ptn].max()
    float_markers = ('cumratio', 'mean_', 'Ratio', 'CVR', 'WOE')
    if any(marker in ptn for marker in float_markers):
        return 'float16'
    # Smallest unsigned type first; anything at or above 65536 is uint32.
    for bound, type_name in ((256, 'uint8'), (65536, 'uint16')):
        if peak < bound:
            return type_name
    return 'uint32'

def get_type(df,ptn):
    """Return the narrowest unsigned integer dtype name for column ``ptn``.

    Args:
        df: Frame holding the column.
        ptn: Column name.

    Returns:
        ``'uint8'`` when the column maximum is below 256, ``'uint16'`` when
        it is below 65536, otherwise ``'uint32'``.
    """
    peak = df[ptn].max()
    if peak < 256:
        return 'uint8'
    return 'uint16' if peak < 65536 else 'uint32'


def read_data_ph1():
    """Assemble the train and test feature frames for the configured run.

    The ``feat`` option selects a list of numerical and categorical feature
    names.  The base columns are read from ``train_base.csv`` /
    ``test_supplement_base.csv`` in the work directory; every other feature
    is read from its own ``train_<name>.csv`` / ``test_supplement_<name>.csv``
    file.  Categorical features (named ``cat_<source column>``) are label
    encoded over train and test together, after a log2 bucketing when their
    maximum exceeds the ``categoricalThreVal`` option.  For the Keras model
    the numerical features are additionally log2-transformed (except the
    ratio-like ones) and standardised.

    Returns:
        ``(train_df, test_df, numerical_patterns, cat_patterns)``.

    Exits:
        Terminates the process with status 1 when the ``feat`` option is not
        one of the known feature sets.
    """
    feat_opt = get_opt('feat','none')

    if 'lgbmBest' == feat_opt:
        numerical_patterns = ['WOEBnd_ip_nextClickLeakDayFlt', 'WOEBnd_app_nextClickLeakDayFlt', 'WOEBnd_device_nextClickLeakDayFlt', 'WOEBnd_os_nextClickLeakDayFlt', 'WOEBnd_channel_nextClickLeakDayFlt', 'WOEBnd_ip_app_nextClickLeakDayFlt', 'WOEBnd_ip_device_nextClickLeakDayFlt', 'WOEBnd_ip_os_nextClickLeakDayFlt', 'WOEBnd_ip_channel_nextClickLeakDayFlt', 'WOEBnd_app_device_nextClickLeakDayFlt', 'WOEBnd_app_os_nextClickLeakDayFlt', 'WOEBnd_app_channel_nextClickLeakDayFlt', 'WOEBnd_device_os_nextClickLeakDayFlt', 'WOEBnd_device_channel_nextClickLeakDayFlt', 'WOEBnd_os_channel_nextClickLeakDayFlt', 'WOEBnd_ip', 'WOEBnd_app', 'WOEBnd_device', 'WOEBnd_os', 'WOEBnd_channel', 'WOEBnd_ip_app', 'WOEBnd_ip_device', 'WOEBnd_ip_os', 'WOEBnd_ip_channel', 'WOEBnd_app_device', 'WOEBnd_app_os', 'WOEBnd_app_channel', 'WOEBnd_ip_app_device', 'WOEBnd_ip_app_os', 'WOEBnd_ip_app_channel', 'WOEBnd_ip_device_os', 'WOEBnd_ip_device_channel', 'WOEBnd_ip_os_channel', 'WOEBnd_app_device_os', 'WOEBnd_app_device_channel', 'WOEBnd_app_os_channel', 'WOEBnd_ip_app_device_os', 'WOEBnd_ip_app_device_channel', 'WOEBnd_ip_app_os_channel', 'WOEBnd_ip_device_os_channel', 'WOEBnd_app_device_os_channel', 'countRatio_ip_machine', 'countRatio_ip_channel', 'countRatio_machine_ip', 'countRatio_app_channel', 'countRatio_channel_app', 'uniqueCount_day_ip_os', 'uniqueCount_day_ip_device', 'uniqueCountRatio_day_ip_channel', 'uniqueCount_day_ip_machine', 'uniqueCount_day_ip_app',  'uniqueCount_machine_app', 'uniqueCount_machine_channel', 'uniqueCount_machine_ip', 'nextClickLeakDay', 'nextNextClickLeakDay', 'dayhourminute10count_ip_device_os', 'dayhourminute10count_ip_channel', 'dayhourminute10count_app_os_channel', 'cumratio_ip_day', 'cumcount_ip_day', 'count_ip_os', 'count_ip_device_os_day_hourminute10', 'count_ip_app_os_channel_day', 'count_ip_app_os_channel', 'count_ip_app_device_os_day_hour', 'count_ip_app_device_day', 'count_ip_app_device_channel_day', 'count_ip', 'count_device_os_day_hourminute10', 
'count_app_os_channel_day_hour', 'count_app_device_day_hour', 'count_app_device_channel_day_hour', 'recumcount_app_device_os_day', 'var_ip_device_hour', 'count_app_day_hourminute']
        cat_patterns = ['cat_os', 'cat_hour', 'cat_device', 'cat_dayhourcount_ip', 'cat_com1_ip', 'cat_channel', 'cat_app']
    elif 'kerasBest' == feat_opt:
        numerical_patterns = ['uniqueCountRatio_day_ip_machine', 'uniqueCountRatio_day_ip_app', 'uniqueCountRatio_day_ip_channel', 'uniqueCount_day_ip_machine', 'uniqueCount_day_ip_app', 'uniqueCount_day_ip_channel', 'uniqueCount_machine_app', 'uniqueCount_machine_channel', 'uniqueCount_machine_ip', 'nextClickLeakDay', 'dayhourcount_ip', 'count_ip', 'count_ip_app_device_os_day_hour', 'count_app_channel', 'cumcount_ip_app_device_os_day_hour', 'count_device_os_day_hourminute10', 'count_app_device_day_hour', 'dayhourminute10count_ip']
        cat_patterns = ['cat_nextClickLeakDay', 'cat_nextNextClickLeakDay', 'cat_app', 'cat_device', 'cat_os', 'cat_count_ip', 'cat_count_app_channel', 'cat_hour', 'cat_dayhourcount_ip']
    else:
        print('ERR: no valid feat !!!!!!!!!!!!!!!!')
        sys.exit(1)

    print("start reading feature for",feat_opt)

    # all cache
    # ``tgt`` is the key of the (currently disabled) whole-frame cache below.
    tgt = 'model=' + get_opt('model','none')
    tgt += '_nrows=' + get_opt('nrows','0') 
    tgt += '_feat=' + get_opt('feat','0') 
    tgt += '_categoricalThreVal=' + get_opt('categoricalThreVal','1000') 
    tgt += '_offlineADD=' + get_opt('offlineADD','off') 
    tgt += '_sample=' + get_opt('sample','0.0') 
    tgt += '_noTestSample=' + get_opt('noTestSample','off') 
    tgt += '_noLogDev=' + get_opt('noLogDev','off') 
    tgt += '_smallTest=' + get_opt('smallTest','off') 
    tgt += '_ver=3'
#     tr_pkl_file = '../work/train_' + tgt + '.pkl'
#     te_pkl_file = '../work/test_supplement_' + tgt + '.pkl'
#     if os.path.isfile(tr_pkl_file) == True and os.path.isfile(te_pkl_file) == True:
#         with open(tr_pkl_file, 'rb') as pk:
#             print("loading",tr_pkl_file)
#             train_df = pickle.load(pk)
#         with open(te_pkl_file, 'rb') as pk:
#             print("loading",te_pkl_file)
#             test_df = pickle.load(pk)
#         gc.collect()
#         return train_df, test_df, numerical_patterns, cat_patterns

    # reading base data
    train_df = read_csv(work+"train_base.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel','day','hour','is_attributed'],nrows=nrows)
    test_df = read_csv(work+"test_supplement_base.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel','day','hour'],nrows=nrows)
    # The test frame gets a constant label column of zeros.
    test_df['is_attributed'] = 0
    
    #reading categorical data
    n = 0
    for ptn in cat_patterns:
        n+=1
        print('start categorical convert for',ptn,n,'/',len(cat_patterns))
        if ptn in train_df.columns:
            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! warning cat ptn is in train_df.columns !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            print(ptn,train_df.columns)
        # Strip the 'cat_' prefix to get the name of the source column / file.
        org_ptn = ptn[4:]
        if org_ptn in train_df.columns:
            _train_df = train_df[[org_ptn]]
            _test_df = test_df[[org_ptn]]
        else:
            _train_df = read_csv(work + 'train_' + org_ptn + '.csv', nrows=nrows, df_len=len(train_df))
            _test_df = read_csv(work + 'test_supplement_' + org_ptn + '.csv', nrows=nrows, df_len=len(test_df))
            
            
        _train_df = _train_df.rename(columns={org_ptn: ptn})
        _test_df = _test_df.rename(columns={org_ptn: ptn})
        
        len_train = len(_train_df)
        # Encode train and test together so both share one label space.
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        _df = pd.concat([_train_df, _test_df])
        thre_val = get_opt('categoricalThreVal',1000)
        max_val = _df[ptn].max()
        if 'cat_device' == ptn and get_opt('noLogDev','-') == 'on':
            _df[ptn] = LabelEncoder().fit_transform(_df[ptn])
        elif thre_val > 0 and max_val > thre_val:
            # High-cardinality feature: bucket the values before encoding.
            if 'cumratio' in ptn:
                # Was ``df[ptn]``: ``df`` does not exist in this function.
                fixed_vals = (10000*_df[ptn]).astype('uint16')
            else:
#                 pdb.set_trace()
                fixed_vals = (np.log2(_df[ptn]+1)*thre_val/100).astype('uint16')
            _df[ptn] = LabelEncoder().fit_transform(fixed_vals)
            print('logged for',ptn,max_val,fixed_vals.max(), _df[ptn].max())
        else:
#             pass
            _df[ptn] = LabelEncoder().fit_transform(_df[ptn])
        _df[ptn] = _df[ptn].astype(get_type(_df,ptn))

        train_df[ptn] = _df[:len_train]
        test_df[ptn] = _df[len_train:]
        gc.collect()
             
    # reading numerical data
    n = 0
    for ptn in numerical_patterns:
        n+=1
        print('start for',ptn,n,'/',len(numerical_patterns))
        if ptn in train_df.columns: continue
        train_df[ptn] = read_csv(work + 'train_' + ptn + '.csv', nrows=nrows, df_len=len(train_df))
        test_df[ptn] = read_csv(work + 'test_supplement_' + ptn + '.csv', nrows=nrows, df_len=len(test_df))

    # numerical data conversion
    if get_opt('model','-') == 'keras':
        for ptn in numerical_patterns:
            print('start for numerical convert',ptn)
            # Scale over train and test together (pd.concat instead of the
            # removed DataFrame.append).
            all_df = pd.concat([train_df[[ptn]], test_df[[ptn]]])
            if 'cumratio' in ptn or 'CVRTgt' in ptn or 'WOETgt' in ptn:
                pass
            else:
                all_df = np.log2(all_df+1)
            all_df = StandardScaler().fit_transform(all_df).astype('float16')
            train_df[ptn] = all_df[:len(train_df)]
            test_df[ptn] = all_df[len(train_df):]

#     # saving cache
#     print("saving",tr_pkl_file)
#     with open(tr_pkl_file+str(os.getpid()), 'wb') as pk:
#         pickle.dump(train_df,pk,protocol=4)
#     shutil.move(tr_pkl_file+str(os.getpid()), tr_pkl_file)
#     print("saving",te_pkl_file)
#     with open(te_pkl_file+str(os.getpid()), 'wb') as pk:
#         pickle.dump(test_df,pk,protocol=4)
#     shutil.move(te_pkl_file+str(os.getpid()), te_pkl_file)
#     print('saved cache file')

    gc.collect()
    return train_df, test_df, numerical_patterns, cat_patterns


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# training / prediction

In [2]:
# Resolve the active run description and echo it before any work starts.
target=get_target()
print('start for',target)

start for model=LGBM_feat=lgbmBest_categoricalThreVal=10000_validation=subm_params=-,gbdt,0.45,0.04,188,7,auc,20,5,0,20,76,binary,0,0,32.0,1.0,200000,1,0


In [3]:
%%time
#pdb.set_trace()
train_df, test_df, numerical_patterns, cat_patterns = read_data_ph1()
predictors = numerical_patterns + cat_patterns
categorical = cat_patterns

start reading feature for lgbmBest
loading ../work/train_base.csv
loading ../work/test_supplement_base.csv
start categorical convert for cat_os 1 / 7
start categorical convert for cat_hour 2 / 7
start categorical convert for cat_device 3 / 7
start categorical convert for cat_dayhourcount_ip 4 / 7
loading ../work/train_dayhourcount_ip.csv
loading ../work/test_supplement_dayhourcount_ip.csv
start categorical convert for cat_com1_ip 5 / 7
loading ../work/train_com1_ip.csv
loading ../work/test_supplement_com1_ip.csv
logged for cat_com1_ip 87804 1642 2
start categorical convert for cat_channel 6 / 7
start categorical convert for cat_app 7 / 7
start for WOEBnd_ip_nextClickLeakDayFlt 1 / 76
loading ../work/train_WOEBnd_ip_nextClickLeakDayFlt.csv
loading ../work/test_supplement_WOEBnd_ip_nextClickLeakDayFlt.csv
start for WOEBnd_app_nextClickLeakDayFlt 2 / 76
loading ../work/train_WOEBnd_app_nextClickLeakDayFlt.csv
loading ../work/test_supplement_WOEBnd_app_nextClickLeakDayFlt.csv
start for WOE

start for uniqueCount_day_ip_machine 50 / 76
loading ../work/train_uniqueCount_day_ip_machine.csv
loading ../work/test_supplement_uniqueCount_day_ip_machine.csv
start for uniqueCount_day_ip_app 51 / 76
loading ../work/train_uniqueCount_day_ip_app.csv
loading ../work/test_supplement_uniqueCount_day_ip_app.csv
start for uniqueCount_machine_app 52 / 76
loading ../work/train_uniqueCount_machine_app.csv
loading ../work/test_supplement_uniqueCount_machine_app.csv
start for uniqueCount_machine_channel 53 / 76
loading ../work/train_uniqueCount_machine_channel.csv
loading ../work/test_supplement_uniqueCount_machine_channel.csv
start for uniqueCount_machine_ip 54 / 76
loading ../work/train_uniqueCount_machine_ip.csv
loading ../work/test_supplement_uniqueCount_machine_ip.csv
start for nextClickLeakDay 55 / 76
loading ../work/train_nextClickLeakDay.csv
loading ../work/test_supplement_nextClickLeakDay.csv
start for nextNextClickLeakDay 56 / 76
loading ../work/train_nextNextClickLeakDay.csv
loading 

In [4]:
# Sanity check: total number of predictor columns and how many are categorical.
len(predictors), len(categorical)

(83, 7)

In [5]:
# is_val = (train_df['day'] == 9) & ((train_df['hour'] == 13) |(train_df['hour'] == 17) |(train_df['hour'] == 21))
# val_df = train_df[is_val]
# train_df = train_df[~is_val]

In [None]:
# Hold out the last 250000 rows of the training frame for validation.
# Both slices are evaluated against the original frame before either name
# is rebound, so the split is identical to slicing in two statements.
val_df, train_df = train_df[-250000:], train_df[:-250000]
# Display the resulting shapes as the cell output.
val_df.shape, train_df.shape

In [None]:
# Run the model. The seed is read from the run's option string and falls
# back to 2018 when no 'seed' option is present.
# NOTE(review): Predict is defined outside this cell; presumably it also
# stores test predictions in test_df['pred'], which a later cell reads.
auc = Predict(
    train_df,
    val_df,
    test_df,
    predictors,
    categorical,
    seed=get_opt('seed', 2018)
)
print('validation auc:', auc)

In [11]:


# Keep only the prediction column, renamed to the submission column name.
test_df = test_df[['pred']].rename(columns={'pred': 'is_attributed'})

# Lookup tables used to translate row ids into submission click ids.
# NOTE(review): mapping is assumed to carry both 'old_click_id' and
# 'click_id' columns, since both are used as join keys below.
mapping = read_csv('../input/mapping.csv')
click_id = read_csv('../input/sample_submission.csv', usecols=['click_id'])

# Turn the row index into an 'index' column and join it against the old ids.
test_df = test_df.reset_index().merge(
    mapping, how='left', left_on='index', right_on='old_click_id')
# Left-join onto the sample submission's click ids so that every requested
# click_id appears in the output.
test_df = click_id.merge(test_df, how='left', on='click_id')

# The output file name embeds the full option string of the run.
outfile = '../csv/pred_test_' + target + '.csv'
print('writing to', outfile)
test_df[['click_id', 'is_attributed']].to_csv(outfile, index=False)

loading ../input/mapping.pkl
loading ../input/sample_submission.pkl
writing to ../csv/pred_test_model=LGBM_feat=lgbmBest_categoricalThreVal=10000_validation=subm_params=-,gbdt,0.45,0.04,188,7,auc,20,5,0,20,76,binary,0,0,32.0,1.0,200000,1,0.csv


In [10]:
# Display the validation AUC returned by Predict.
auc

0.9852901220779562

In [7]:
# Print the index range, column list and dtypes of the validation frame.
val_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10115468 entries, 148740843 to 178434048
Data columns (total 91 columns):
ip                                           uint32
app                                          uint16
device                                       uint16
os                                           uint16
channel                                      uint16
day                                          int64
hour                                         int64
is_attributed                                uint8
WOEBnd_ip_nextClickLeakDayFlt                float16
WOEBnd_app_nextClickLeakDayFlt               float16
WOEBnd_device_nextClickLeakDayFlt            float16
WOEBnd_os_nextClickLeakDayFlt                float16
WOEBnd_channel_nextClickLeakDayFlt           float16
WOEBnd_ip_app_nextClickLeakDayFlt            float16
WOEBnd_ip_device_nextClickLeakDayFlt         float16
WOEBnd_ip_os_nextClickLeakDayFlt             float16
WOEBnd_ip_channel_nextClickLeakDayFlt  

In [None]:
**************************************************
bagging_seed : 2018
boosting_type : gbdt
colsample_bytree : 0.45
data_random_seed : 2018
drop_seed : 2018
feature_fraction_seed : 2018
learning_rate : 0.04
max_bin : 188
max_depth : 7
metric : auc
min_child_samples : 20
min_child_weight : 5
min_split_gain : 0
nthread : 20
num_leaves : 76
objective : binary
reg_alpha : 0
reg_lambda : 0
scale_pos_weight : 32.0
subsample : 1.0
subsample_for_bin : 200000
subsample_freq : 1
verbose : 0
start for lgvalid
start for lgtrain
start training
/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/lightgbm/basic.py:1036: UserWarning: Using categorical_feature in Dataset.
  warnings.warn('Using categorical_feature in Dataset.')
/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/lightgbm/basic.py:681: UserWarning: categorical_feature in param dict is overrided.
  warnings.warn('categorical_feature in param dict is overrided.')
Training until validation scores don't improve for 100 rounds.
[10]	valid's auc: 0.976439
[20]	valid's auc: 0.977661
[30]	valid's auc: 0.978294
[40]	valid's auc: 0.978978
[50]	valid's auc: 0.979365
[60]	valid's auc: 0.980071
[70]	valid's auc: 0.980501
[80]	valid's auc: 0.980976
[90]	valid's auc: 0.981433
[100]	valid's auc: 0.981916
[110]	valid's auc: 0.982365
[120]	valid's auc: 0.982697
[140]	valid's auc: 0.983176
[150]	valid's auc: 0.983445
[160]	valid's auc: 0.983669
[170]	valid's auc: 0.983835
[180]	valid's auc: 0.983965
[190]	valid's auc: 0.984083
[200]	valid's auc: 0.984191
[210]	valid's auc: 0.984321
[220]	valid's auc: 0.984415
[230]	valid's auc: 0.984487
[240]	valid's auc: 0.984561
[250]	valid's auc: 0.984613
[260]	valid's auc: 0.984665
[270]	valid's auc: 0.984715
[280]	valid's auc: 0.984778
[290]	valid's auc: 0.984809
[300]	valid's auc: 0.984838
[310]	valid's auc: 0.98488
[320]	valid's auc: 0.984915
[330]	valid's auc: 0.984946
[340]	valid's auc: 0.984977
[350]	valid's auc: 0.984989
[360]	valid's auc: 0.985011
[370]	valid's auc: 0.985032
[380]	valid's auc: 0.985056
[390]	valid's auc: 0.98507
[400]	valid's auc: 0.985092
[410]	valid's auc: 0.985107
[420]	valid's auc: 0.985115
[430]	valid's auc: 0.985124
[440]	valid's auc: 0.985131
[450]	valid's auc: 0.985134
[460]	valid's auc: 0.985141
[470]	valid's auc: 0.985141
[480]	valid's auc: 0.985158
[490]	valid's auc: 0.985164
[500]	valid's auc: 0.985174
[510]	valid's auc: 0.985183
[520]	valid's auc: 0.985195
[530]	valid's auc: 0.98521
[540]	valid's auc: 0.985218
[550]	valid's auc: 0.985223
[560]	valid's auc: 0.985227
[570]	valid's auc: 0.985232
[580]	valid's auc: 0.985233
[590]	valid's auc: 0.985242
[600]	valid's auc: 0.985246
[610]	valid's auc: 0.985255
[620]	valid's auc: 0.985256
[630]	valid's auc: 0.985261
[640]	valid's auc: 0.985263
[650]	valid's auc: 0.985263
[660]	valid's auc: 0.985261
[670]	valid's auc: 0.985261
[680]	valid's auc: 0.985266
[690]	valid's auc: 0.985266
[700]	valid's auc: 0.985267
[710]	valid's auc: 0.98527
[720]	valid's auc: 0.985273
[730]	valid's auc: 0.985276
[740]	valid's auc: 0.985274
[750]	valid's auc: 0.985276
[760]	valid's auc: 0.985277
[770]	valid's auc: 0.985275
[780]	valid's auc: 0.985273
[790]	valid's auc: 0.98528
[800]	valid's auc: 0.98528
[810]	valid's auc: 0.985281
[820]	valid's auc: 0.985276
[830]	valid's auc: 0.985275
[840]	valid's auc: 0.985283
[850]	valid's auc: 0.985283
[860]	valid's auc: 0.985279
[870]	valid's auc: 0.98528
[880]	valid's auc: 0.985285
[890]	valid's auc: 0.98529
[900]	valid's auc: 0.985288
[910]	valid's auc: 0.985285
[920]	valid's auc: 0.985283
[930]	valid's auc: 0.985285
[940]	valid's auc: 0.985283
[950]	valid's auc: 0.985273
[960]	valid's auc: 0.985272
[970]	valid's auc: 0.985273
[980]	valid's auc: 0.985279
Early stopping, best iteration is:
[889]	valid's auc: 0.98529
importance (count)
7662 	 cat_channel
6053 	 cat_dayhourcount_ip
4186 	 cat_os
4076 	 cat_hour
4043 	 cat_app
2305 	 nextClickLeakDay
1156 	 cumratio_ip_day
1138 	 recumcount_app_device_os_day
1109 	 var_ip_device_hour
1070 	 uniqueCount_day_ip_machine
1012 	 count_ip_device_os_day_hourminute10
1007 	 uniqueCount_day_ip_os
979 	 uniqueCount_day_ip_app
949 	 count_ip_os
935 	 dayhourminute10count_ip_device_os
891 	 count_ip
890 	 nextNextClickLeakDay
832 	 cumcount_ip_day
761 	 count_ip_app_device_day
746 	 count_app_day_hourminute
717 	 WOEBnd_app_os_nextClickLeakDayFlt
715 	 count_device_os_day_hourminute10
689 	 uniqueCountRatio_day_ip_channel
681 	 countRatio_ip_channel
652 	 uniqueCount_day_ip_device
648 	 countRatio_ip_machine
626 	 WOEBnd_app_channel_nextClickLeakDayFlt
618 	 count_app_device_channel_day_hour
617 	 WOEBnd_ip
609 	 WOEBnd_ip_device
598 	 dayhourminute10count_ip_channel
586 	 WOEBnd_os_channel_nextClickLeakDayFlt
578 	 count_app_device_day_hour
572 	 count_app_os_channel_day_hour
564 	 countRatio_machine_ip
558 	 WOEBnd_app_nextClickLeakDayFlt
558 	 WOEBnd_ip_device_nextClickLeakDayFlt
536 	 WOEBnd_ip_nextClickLeakDayFlt
516 	 count_ip_app_device_os_day_hour
501 	 WOEBnd_app_os_channel
475 	 WOEBnd_app_channel
475 	 WOEBnd_ip_app_device
470 	 WOEBnd_app_os
470 	 count_ip_app_device_channel_day
458 	 WOEBnd_os_nextClickLeakDayFlt
457 	 WOEBnd_app
454 	 count_ip_app_os_channel
452 	 WOEBnd_channel_nextClickLeakDayFlt
440 	 WOEBnd_app_device_channel
434 	 WOEBnd_app_device_os_channel
416 	 WOEBnd_app_device_os
415 	 WOEBnd_app_device_nextClickLeakDayFlt
407 	 WOEBnd_channel
402 	 WOEBnd_device_os_nextClickLeakDayFlt
396 	 WOEBnd_ip_app
385 	 count_ip_app_os_channel_day
373 	 WOEBnd_ip_app_nextClickLeakDayFlt
363 	 WOEBnd_ip_os_nextClickLeakDayFlt
356 	 WOEBnd_app_device
356 	 countRatio_app_channel
332 	 WOEBnd_ip_os
318 	 WOEBnd_ip_device_os
313 	 dayhourminute10count_app_os_channel
294 	 WOEBnd_device_channel_nextClickLeakDayFlt
283 	 WOEBnd_os
246 	 cat_device
234 	 WOEBnd_device_nextClickLeakDayFlt
227 	 countRatio_channel_app
185 	 uniqueCount_machine_ip
184 	 WOEBnd_ip_app_device_os
171 	 WOEBnd_ip_app_channel
170 	 uniqueCount_machine_app
166 	 WOEBnd_ip_app_device_channel
159 	 WOEBnd_ip_app_os
156 	 uniqueCount_machine_channel
147 	 WOEBnd_device
144 	 WOEBnd_ip_channel_nextClickLeakDayFlt
126 	 WOEBnd_ip_channel
115 	 WOEBnd_ip_device_channel
101 	 WOEBnd_ip_device_os_channel
86 	 WOEBnd_ip_os_channel
73 	 WOEBnd_ip_app_os_channel
20 	 cat_com1_ip
importance (gain)
310909115.1367215 	 WOEBnd_app_channel_nextClickLeakDayFlt
116189675.08218685 	 WOEBnd_app_channel
74585079.33311379 	 WOEBnd_app_device_channel
65346948.51699972 	 WOEBnd_app_os_nextClickLeakDayFlt
52930113.69488859 	 WOEBnd_app_os_channel
34738707.1083705 	 WOEBnd_app_nextClickLeakDayFlt
13248923.494836092 	 WOEBnd_app_device_nextClickLeakDayFlt
11820401.508634567 	 cat_channel
10859121.123201381 	 uniqueCount_day_ip_app
10806922.325395584 	 nextClickLeakDay
8295971.22974968 	 cat_app
8119549.615546957 	 WOEBnd_os_channel_nextClickLeakDayFlt
7742286.443136215 	 cat_dayhourcount_ip
6167366.172456443 	 count_ip
5279370.588954449 	 cat_hour
4964512.492777765 	 WOEBnd_app_device_os_channel
4775688.893976212 	 cat_os
3952056.84449701 	 uniqueCount_day_ip_machine
3902632.14283338 	 uniqueCount_day_ip_os
2870196.691186905 	 WOEBnd_app_device_os
2601322.2771796584 	 WOEBnd_app_os
2512163.12083447 	 count_ip_os
2142401.274064959 	 WOEBnd_app
2080720.4628853558 	 cumcount_ip_day
2012502.033362314 	 dayhourminute10count_ip_device_os
1969777.27425766 	 WOEBnd_ip_app_device
1958352.8883017013 	 cumratio_ip_day
1777085.3216042519 	 WOEBnd_ip_app_nextClickLeakDayFlt
1757405.878881462 	 recumcount_app_device_os_day
1720464.7791444212 	 uniqueCount_day_ip_device
1592370.3896488547 	 count_ip_device_os_day_hourminute10
1310657.6935391873 	 WOEBnd_device_os_nextClickLeakDayFlt
1182321.2738022804 	 WOEBnd_device_channel_nextClickLeakDayFlt
1153360.1602563858 	 nextNextClickLeakDay
1121229.8367853165 	 WOEBnd_app_device
1077885.5237140656 	 var_ip_device_hour
1032646.6714598294 	 WOEBnd_channel_nextClickLeakDayFlt
991875.3891591795 	 WOEBnd_ip_device_nextClickLeakDayFlt
948495.0860916078 	 uniqueCountRatio_day_ip_channel
906906.5093252361 	 countRatio_machine_ip
864391.4197893143 	 WOEBnd_ip_app
861099.666806221 	 uniqueCount_machine_ip
833559.8858332038 	 count_device_os_day_hourminute10
770556.3390529752 	 WOEBnd_os_nextClickLeakDayFlt
724418.9525639177 	 countRatio_ip_machine
634177.162967205 	 count_ip_app_device_day
613470.6909191906 	 WOEBnd_ip_nextClickLeakDayFlt
511534.1028394699 	 count_ip_app_device_channel_day
507560.0583263412 	 countRatio_ip_channel
500883.5614546463 	 WOEBnd_ip_os_nextClickLeakDayFlt
487522.6605615616 	 WOEBnd_ip_device
483755.38690439984 	 count_app_os_channel_day_hour
467635.90353414044 	 count_app_device_channel_day_hour
463408.1527768709 	 count_app_device_day_hour
452831.9241409302 	 cat_device
450700.4908838272 	 uniqueCount_machine_app
441651.1567156911 	 WOEBnd_ip
431864.92791201174 	 WOEBnd_device_nextClickLeakDayFlt
406568.6708442494 	 dayhourminute10count_ip_channel
374913.27003860474 	 WOEBnd_ip_app_device_os
370983.0273208618 	 count_app_day_hourminute
354809.19077014923 	 WOEBnd_ip_device_os
354620.4433208108 	 count_ip_app_device_os_day_hour
339179.5741372297 	 WOEBnd_channel
321153.5785684213 	 countRatio_app_channel
304542.94038391113 	 WOEBnd_ip_os
287665.46991825104 	 count_ip_app_os_channel
256743.59909152985 	 count_ip_app_os_channel_day
252504.0149960518 	 uniqueCount_machine_channel
252411.30738449097 	 WOEBnd_ip_app_os
240897.25969219208 	 WOEBnd_ip_channel_nextClickLeakDayFlt
239004.0766961202 	 dayhourminute10count_app_os_channel
209945.51604175568 	 WOEBnd_ip_app_device_channel
157689.6334280548 	 WOEBnd_os
154800.7994966507 	 countRatio_channel_app
149291.7297153473 	 WOEBnd_ip_app_channel
92971.41224992275 	 WOEBnd_ip_channel
92793.21971416473 	 WOEBnd_device
80946.17392452061 	 WOEBnd_ip_device_channel
34718.61844649911 	 WOEBnd_ip_device_os_channel
24498.98580223322 	 WOEBnd_ip_os_channel
20669.156454086304 	 WOEBnd_ip_app_os_channel
5074.541618347168 	 cat_com1_ip
validation auc: 0.9852901220779562