# Utility functions

In [10]:
import sys

# this script makes it possible to describe a model in one argument.

# Example run descriptions: underscore-separated ``key=value`` options.
lgb_target = (
    'model=LGBM'
    '_feat=lgbmBest'
    '_categoricalThreVal=10000'
    '_validation=subm'
    '_params=-,gbdt,0.45,0.04,188,7,auc,20,5,0,70,76,binary,0,0,32.0,1.0,200000,1,0'
)
keras_target = (
    'BatchNormalization=on'
    '_sameNDenseAsEmb=off'
    '_model=keras'
    '_feat=kerasBest'
    '_validation=team'
    '_params=-,20000,1000,1,0.2,100,2,0.001,0.0001,0.001,100,2,3'
)
# Active description; when it is '' get_opt() asks get_target() for one.
target = keras_target

def get_opt(name,default=None):
    """Look up option ``name`` in the module-level ``target`` description.

    ``target`` is a string of ``key=value`` fields separated by single
    underscores. A double underscore inside a field stands for one literal
    underscore.

    Args:
        name: option key to look up.
        default: value returned when the option is absent. Its type also
            selects the conversion applied to a found value: ``int`` ->
            ``int(val)``, ``float`` -> ``float(val)``, anything else -> the
            raw string.

    Returns:
        The converted option value, or ``default`` when ``target`` is empty
        or does not contain ``name``.

    Raises:
        ValueError: if a non-empty field has no ``=`` separator, or a value
            cannot be converted to the type of ``default``.
    """
    global target
    if target == '':
        target = get_target()
    if target == '':
        return default
    # Protect escaped underscores ('__') with a sentinel before splitting on
    # '_', then turn the sentinel back into a single '_' afterwards.
    sentinel = '\x00'
    for fld in target.replace('__', sentinel).split('_'):
        if fld == '':
            continue
        # Split on the first '=' only so values may themselves contain '='.
        key, sep, val = fld.partition('=')
        if not sep:
            raise ValueError('malformed option field (expected key=value): %r' % fld)
        if key.replace(sentinel, '_') != name:
            continue
        if isinstance(default, int):
            return int(val)
        if isinstance(default, float):
            return float(val)
        return val.replace(sentinel, '_')
    return default

def get_target():
    """Return the active option string, resolving it on first use.

    Resolution order when the module-level ``target`` is empty:
    1. the first command-line argument (``sys.argv[1]``), if present;
    2. a module-level ``default_target``, if one has been defined;
    3. the empty string.

    The resolved value is stored back into the module-level ``target``.
    """
    global target
    if target != '':
        return target
    if len(sys.argv) > 1:
        target = sys.argv[1]
    else:
        # ``default_target`` is optional; fall back to '' instead of raising
        # NameError when it has not been defined.
        target = globals().get('default_target', '')
    return target

def reset_target():
    """Clear the cached option string and resolve it again.

    Returns:
        The freshly resolved target as returned by ``get_target()``.
    """
    global target
    target = ''
    # A single call is enough: get_target() stores and returns the result.
    return get_target()

def set_target(tgt):
    """Replace the module-level option string ``target`` with ``tgt``."""
    globals()['target'] = tgt


# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import random as rn
from sklearn.metrics import roc_auc_score
import sys
import os
# from lib_util import get_target,get_opt
import lightgbm as lgb
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, concatenate, BatchNormalization, SpatialDropout1D
from keras.callbacks import Callback
from keras.initializers import RandomUniform
from keras.models import Model
from keras.optimizers import Adam
import gc

def get_params(params_str):
    """Turn the comma-separated ``params`` option into a name -> value dict.

    The first comma-separated item is a placeholder and is dropped; the
    remaining items are paired positionally with the parameter names of the
    model selected by the ``model`` option. Values are left as strings.

    Args:
        params_str: raw value of the ``params`` option.

    Returns:
        dict mapping parameter name to its string value.

    Exits the process with status 1 when the model is unknown or the number
    of values does not match the number of names.
    """
    # A missing ``model`` option yields None; treat it as "no valid target".
    model = get_opt('model') or ''
    if model == 'keras':
        names = ['batch_size', 'dense_cate', 'dense_nume_n_layers', 'drop', 'emb_cate', 'epochs_for_lr', 'lr', 'lr_fin', 'lr_init', 'max_epochs', 'n_layers', 'patience']
    elif 'LGBM' in model:
        names = ['boosting_type','colsample_bytree','learning_rate','max_bin','max_depth','metric','min_child_samples','min_child_weight','min_split_gain','nthread','num_leaves','objective','reg_alpha','reg_lambda','scale_pos_weight','subsample','subsample_for_bin','subsample_freq','verbose']
    else:
        print("no valid target")
        sys.exit(1)
    # Drop the leading placeholder item.
    pvals = params_str.split(',')[1:]
    if len(pvals) != len(names):
        print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!ERR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print('params: count is not fit',len(pvals), len(names))
        print('params_str:',params_str)
        print('names:',names)
        print('param_values:',pvals)
        # This is an error path, so report a non-zero exit status.
        sys.exit(1)
    return dict(zip(names, pvals))

def LGBM(X_tr,X_va,X_te,predictors,cat_feats,seed=2018):
    """Train LightGBM with the hyper-parameters from the ``params`` option.

    Returns:
        The value returned by ``LGBM_helper``, or ``None`` when no ``params``
        option is configured.
    """
    params_str = get_opt('params')
    if params_str is None:
        return None
    params = get_params(params_str)
    # Forward the caller's seed instead of a hard-coded 2018.
    return LGBM_helper(X_tr,X_va,X_te,predictors,cat_feats,params,seed=seed)

def Keras(X_tr,X_va,X_te,predictors,cat_feats,seed=2018):
    """Train the Keras model with the hyper-parameters from the ``params`` option.

    Returns:
        The value returned by ``Keras0_helper``, or ``None`` when no
        ``params`` option is configured.
    """
    params_str = get_opt('params')
    if params_str is None:
        return None
    params = get_params(params_str)
    # Forward the caller's seed instead of a hard-coded 2018.
    return Keras0_helper(X_tr,X_va,X_te,predictors,cat_feats,params,seed=seed)

def LGBM_helper(_X_tr,_X_va,_X_te,predictors,cat_feats,params,seed=2018):
    """Train a LightGBM booster and write test predictions into ``_X_te['pred']``.

    Args:
        _X_tr, _X_va, _X_te: train / validation / test frames; each must
            contain the ``predictors`` columns and ``is_attributed``.
        predictors: feature column names.
        cat_feats: names passed to LightGBM as ``categorical_feature``.
        params: LightGBM parameter dict (values may be strings); it is
            modified in place with seeds and integer-converted entries.
        seed: seed for numpy, ``random`` and the LightGBM seed parameters.

    Returns:
        The ``valid`` metric recorded at the best iteration.
    """
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(seed)
    rn.seed(seed)
    X_tr = _X_tr[predictors]
    X_va = _X_va[predictors]
    X_te = _X_te[predictors]
    y_tr = _X_tr['is_attributed']
    y_va = _X_va['is_attributed']
    y_te = _X_te['is_attributed']
    params['feature_fraction_seed'] = seed
    params['bagging_seed'] = seed
    params['drop_seed'] = seed
    params['data_random_seed'] = seed
    params['num_leaves'] = int(params['num_leaves'])
    params['subsample_for_bin'] = int(params['subsample_for_bin'])
    # max_depth is derived from num_leaves; the value from the params string
    # is overwritten here.
    params['max_depth'] = int(np.log2(params['num_leaves'])+1.2)
    params['max_bin'] = int(params['max_bin'])
    print('*'*50)
    for k,v in sorted(params.items()):
        print(k,':',v)
    columns = X_tr.columns

    print('start for lgvalid')
    lgvalid = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_feats)
    # The former non-inplace ``drop`` call here only built and discarded a
    # copy of the frame; dropping the local references is what frees memory.
    del _X_va, X_va, y_va
    gc.collect()

    print('start for lgtrain')
    lgtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_feats)
    del _X_tr, X_tr, y_tr
    gc.collect()

    evals_results = {}
    if get_opt('trainCheck','-') == 'on':
        valid_names = ['train','valid']
        valid_sets = [lgtrain, lgvalid]
    else:
        valid_names = ['valid']
        valid_sets = [lgvalid]
    if get_opt('testCheck','-') == 'on':
        valid_names.append('test')
        lgtest = lgb.Dataset(X_te, label=y_te, categorical_feature=cat_feats)
        valid_sets.append(lgtest)

    print('start training')
    bst = lgb.train(params,
                     lgtrain,
                     valid_sets=valid_sets,
                     valid_names=valid_names,
                     evals_result=evals_results,
                     num_boost_round=2000,
                     early_stopping_rounds=100,
                     verbose_eval=10,
                     )

    # Report feature importance by split count, then by gain.
    for label, kwargs in (('importance (count)', {}),
                          ('importance (gain)', {'importance_type': 'gain'})):
        importance = bst.feature_importance(**kwargs)
        print(label)
        tuples = sorted(zip(columns, importance), key=lambda x: x[1],reverse=True)
        for col, val in tuples:
            print(val,"\t",col)

    n_estimators = bst.best_iteration
    metric = params['metric']
    auc = evals_results['valid'][metric][n_estimators-1]
    _X_te['pred'] = bst.predict(X_te)

    return auc

class EarlyStopping(Callback):
    """Keras callback: per-epoch ROC-AUC reporting, checkpointing and early stop.

    After every epoch it computes ROC-AUC on the supplied train / validation /
    test data (0 is reported for a split that was not supplied), optionally
    saves the weights to ``model_file + '.' + str(epoch)``, and stops training
    once the monitored AUC has not improved by more than ``min_delta`` for
    ``patience`` consecutive epochs. The monitored AUC is the test AUC when
    the ``testCheck`` option is ``on`` and test data was supplied, otherwise
    the validation AUC.

    Args:
        training_data: ``(x, y)`` or ``False`` to skip the train AUC.
        validation_data: ``(x, y)`` or ``False``; without it no early
            stopping is performed.
        testing_data: ``(x, y)`` or ``False`` to skip the test AUC.
        min_delta: minimum improvement that resets the patience counter.
        patience: number of non-improving epochs tolerated before stopping.
        model_file: checkpoint path prefix, or ``None`` to disable saving.
        verbose: when > 0, print a message at the end of an early-stopped run.
    """

    def __init__(self,training_data=False,validation_data=False, testing_data=False, min_delta=0, patience=0, model_file=None, verbose=0):
        super(EarlyStopping, self).__init__()
        self.best_epoch = 0
        self.patience = patience
        self.verbose = verbose
        self.min_delta = min_delta
        self.wait = 0
        self.stopped_epoch = 0
        self.monitor_op = np.greater
        # np.inf is the spelling available in every NumPy release
        # (the np.Inf alias was removed in NumPy 2.0).
        self.best = -np.inf
        self.x_tr, self.y_tr = self._unpack(training_data)
        self.x_val, self.y_val = self._unpack(validation_data)
        self.x_te, self.y_te = self._unpack(testing_data)
        self.model_file = model_file

    @staticmethod
    def _unpack(data):
        """Return ``(x, y)`` from a data pair, or ``(False, False)`` if absent."""
        if data:
            return data[0], data[1]
        return False, False

    def _auc(self, x, y):
        """ROC-AUC of the current model on ``(x, y)``; 0 when ``x`` is absent."""
        # Compare against the False sentinel instead of relying on the
        # truthiness of the data container itself.
        if x is False:
            return 0
        return roc_auc_score(y, self.model.predict(x,batch_size=100000))

    def on_train_begin(self, logs=None):
        self.wait = 0
        self.best_epoch = 0
        self.stopped_epoch = 0
        self.best = -np.inf

    def on_train_end(self, logs=None):
        if self.stopped_epoch > 0 and self.verbose > 0:
            print('Epoch ',self.best_epoch,': EarlyStopping')

    def on_epoch_end(self, epoch, logs=None):
        roc_tr = self._auc(self.x_tr, self.y_tr)
        roc_val = self._auc(self.x_val, self.y_val)
        roc_te = self._auc(self.x_te, self.y_te)
        print('roc-auc: %s - roc-auc_val: %s - roc-auc_test: %s' % (str(round(roc_tr,6)),str(round(roc_val,6)), str(round(roc_te,6))),end=100*' '+'\n')

        if self.model_file:
            print("saving",self.model_file+'.'+str(epoch))
            self.model.save_weights(self.model_file+'.'+str(epoch))
        if self.x_val is False:
            return
        # Monitor the test AUC only when it was actually computed.
        if get_opt('testCheck','-') == 'on' and self.x_te is not False:
            current = roc_te
        else:
            current = roc_val
        if self.monitor_op(current - self.min_delta, self.best):
            self.best = current
            self.best_epoch = epoch
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self.model.stop_training = True

def Keras0_helper(_X_tr,_X_va,_X_te,predictors,cat_feats,params,seed=2018):
    """Build, train and apply the embedding + dense Keras model.

    Each categorical feature gets its own embedding; numerical features go
    through ``dense_nume_n_layers`` dense layers; both parts are concatenated
    and followed by ``n_layers`` dense layers and a sigmoid output. Training
    uses the AUC-based ``EarlyStopping`` callback, after which the weights of
    the best epoch are reloaded and predictions are written to
    ``_X_te['pred']`` and ``_X_va['pred']``. When the ``avgEpoch`` option is
    > 0, predictions of neighbouring epochs are blended in with weight 0.5.

    Args:
        _X_tr, _X_va, _X_te: train / validation / test frames containing the
            ``predictors`` columns and ``is_attributed``.
        predictors: feature column names.
        cat_feats: predictor names to embed; the rest are numerical.
        params: dict of string hyper-parameters as built by ``get_params``.
        seed: seed for numpy, ``random`` and the kernel initialisers.

    Returns:
        Validation ROC-AUC of the final predictions, or 0 when the model
        cannot be built (no features, exactly one embedded feature, or an
        embedding width of 1).
    """
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(seed)
    rn.seed(seed)
    X_tr = _X_tr[predictors]
    X_va = _X_va[predictors]
    X_te = _X_te[predictors]
    y_tr = _X_tr['is_attributed']
    y_va = _X_va['is_attributed']
    y_te = _X_te['is_attributed']
    print('*************params**************')
    for f in sorted(params): print(f+":",params[f])
    batch_size = int(params['batch_size'])
    epochs_for_lr = float(params['epochs_for_lr'])
    max_epochs = int(params['max_epochs'])
    emb_cate = int(params['emb_cate'])
    dense_cate = int(params['dense_cate'])
    dense_nume_n_layers = int(params['dense_nume_n_layers'])
    drop = float(params['drop'])
    lr= float(params['lr'])
    lr_init = float(params['lr_init'])
    lr_fin = float(params['lr_fin'])
    n_layers = int(params['n_layers'])
    patience = int(params['patience'])
    train_dict = {}
    valid_dict = {}
    test_dict = {}
    input_list = []
    emb_list = []
    tot_emb_n = 0
    numerical_feats = [col for col in X_tr if col not in cat_feats]
    if len(cat_feats) > 0:
        for col in cat_feats:
            train_dict[col] = np.array(X_tr[col])
            valid_dict[col] = np.array(X_va[col])
            test_dict[col] = np.array(X_te[col])
            inpt = Input(shape=[1], name = col)
            input_list.append(inpt)
            # Vocabulary size: largest code over all three splits, plus one.
            max_val = np.max([X_tr[col].max(), X_va[col].max(), X_te[col].max()])+1
            emb_n = np.min([emb_cate, max_val])
            if get_opt('fixEmb','on') == 'on':
                emb_n = emb_cate
            tot_emb_n += emb_n
            if emb_n == 1:
                print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!Warinig!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! emb_1 = 1")
                return 0
            print('Embedding size:',max_val, emb_cate, X_tr[col].max(), X_va[col].max(), X_te[col].max(), emb_n,col)
            embd = Embedding(max_val, emb_n)(inpt)
            emb_list.append(embd)
        if len(emb_list) == 1:
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!Warinig!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! emb_list = 1")
            return 0
        fe = concatenate(emb_list)
        s_dout = SpatialDropout1D(drop)(fe)
        x1 = Flatten()(s_dout)

    if get_opt('sameNDenseAsEmb','-') == 'on':
        dense_cate = tot_emb_n
    if len(numerical_feats) > 0:
        train_dict['numerical'] = X_tr[numerical_feats].values
        valid_dict['numerical'] = X_va[numerical_feats].values
        test_dict['numerical'] = X_te[numerical_feats].values
        inpt = Input((len(numerical_feats),),name='numerical')
        input_list.append(inpt)
        x2 = inpt
        for n in range(dense_nume_n_layers):
            x2 = Dense(dense_cate,activation='relu',kernel_initializer=RandomUniform(seed=seed))(x2)
            if get_opt('numeDropout','on') != 'off':
                x2 = Dropout(drop)(x2)
            if get_opt('NumeBatchNormalization','on') != 'off':
                x2 = BatchNormalization()(x2)

    if len(numerical_feats) > 0 and len(cat_feats) > 0:
        x = concatenate([x1, x2])
    elif len(numerical_feats) > 0:
        x =  x2
    elif len(cat_feats) > 0:
        x =  x1
    else:
        return 0 # for small data test

    for n in range(n_layers):
        x = Dense(dense_cate,activation='relu',kernel_initializer=RandomUniform(seed=seed))(x)
        if get_opt('lastDropout','on') != 'off':
            x = Dropout(drop)(x)
        if get_opt('BatchNormalization','off') == 'on' or get_opt('LastBatchNormalization','off') == 'on':
            x = BatchNormalization()(x)
    outp = Dense(1,activation='sigmoid',kernel_initializer=RandomUniform(seed=seed))(x)
    model = Model(inputs=input_list, outputs=outp)
    optimizer_name = get_opt('optimizer','expo')
    if optimizer_name == 'adam':
        optimizer = Adam(lr=lr)
    elif optimizer_name == 'nadam':
        # Nadam is not among the file-level imports, so import it here.
        from keras.optimizers import Nadam
        optimizer = Nadam(lr=lr)
    else:
        # Decay rate derived from the configured lr_init / lr_fin (these used
        # to be overwritten by the constants 0.001 / 0.0001 at this point).
        steps = int(len(X_tr) / batch_size) * epochs_for_lr
        lr_decay = (lr_init/lr_fin)**(1/(steps-1)) - 1
        optimizer = Adam(lr=lr, decay=lr_decay)
    model.compile(loss='binary_crossentropy',optimizer=optimizer)
    model.summary()
    #from keras.utils import plot_model
    #plot_model(model, to_file='model.png')

    # One checkpoint per epoch is written to model_file + '.<epoch>'.
    model_file = '../work/weights.'+str(os.getpid())+'.hdf5'
    if get_opt('trainCheck','-') == 'on':
        training_data=(train_dict, y_tr)
    else:
        training_data=False
    if get_opt('testCheck','-') == 'on':
        testing_data=(test_dict, y_te)
    else:
        testing_data=False
    aucEarlyStopping = EarlyStopping(
        training_data=training_data,
        validation_data=(valid_dict,y_va),
        testing_data=testing_data,
        patience=patience,
        model_file=model_file,
        verbose=1)
    model.fit(train_dict,
        y_tr,
        # Keras documents validation_data as a tuple (x_val, y_val).
        validation_data=(valid_dict, y_va),
        batch_size=batch_size,
        epochs=max_epochs,
        shuffle=True,
        verbose=1,
        callbacks=[aucEarlyStopping])
    best_epoch = aucEarlyStopping.best_epoch
    print('loading',model_file+'.'+str(best_epoch))
    model.load_weights(model_file+'.'+str(best_epoch))
    _X_te['pred'] = model.predict(test_dict, batch_size=batch_size, verbose=1)[:,0]
    _X_va['pred'] = model.predict(valid_dict, batch_size=batch_size, verbose=1)[:,0]
    n_avg = get_opt('avgEpoch',0)
    if n_avg > 0:
        # Weighted average: best epoch has weight 1, each neighbour 0.5.
        added = 1
        for i in range(min(n_avg,patience)):
            # Later and earlier neighbours are checked independently, so a
            # missing later epoch no longer skips the earlier one too.
            for neighbor in (best_epoch + (i+1), best_epoch - (i+1)):
                if neighbor >= max_epochs or neighbor < 0:
                    continue
                print('loading',model_file+'.'+str(neighbor))
                model.load_weights(model_file+'.'+str(neighbor))
                _X_te['pred'] += model.predict(test_dict, batch_size=batch_size, verbose=1)[:,0]*0.5
                _X_va['pred'] += model.predict(valid_dict, batch_size=batch_size, verbose=1)[:,0]*0.5
                added += 0.5
        _X_te['pred'] /= added
        _X_va['pred'] /= added

    # Remove this process's per-epoch checkpoints without shelling out.
    import glob
    for checkpoint in glob.glob(model_file+'.*'):
        os.remove(checkpoint)
    auc = roc_auc_score(y_va, _X_va.pred)
    return auc

def Predict(X_tr,X_va,X_te,predictors,cat_feats,seed=2018):
    """Dispatch training/prediction to the model named by the ``model`` option.

    Returns:
        Whatever ``LGBM`` or ``Keras`` returns for the given data.

    Exits the process with status 1 when the ``model`` option names neither
    an LGBM nor a keras model (including when it is missing).
    """
    # A missing option yields None; treat it like an unknown model.
    model = get_opt('model') or ''
    if 'LGBM' in model:
        # Forward the caller's seed instead of a hard-coded 2018.
        return LGBM(X_tr,X_va,X_te,predictors,cat_feats,seed=seed)
    if 'keras' in model:
        return Keras(X_tr,X_va,X_te,predictors,cat_feats,seed=seed)
    print("no valid model")
    sys.exit(1)


#-*- coding: utf-8 -*-
from __future__ import print_function
import pandas as pd
import numpy as np
import sys
import pickle
import os
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.special import logit
# from lib_util import get_target,get_opt,set_target,reset_target
import shutil
import pdb

# Module configuration, resolved once from the active option string.
target = get_target()
nrows = get_opt('nrows', -1)
# -1 means "no row limit", which pandas expresses as None.
nrows = None if nrows == -1 else nrows
path, work, csv_dir = '../input/', '../work/', '../csv/'
# Column dtypes applied to the base click-log columns.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# wrapper of pd.read_csv with cache
def read_csv(csv_file,df_len=None,nrows=None,usecols=None,dtype=None):
    """Read a CSV into a DataFrame, using a sibling ``.pkl`` file as cache.

    The cache (``csv_file`` with its last four characters replaced by
    ``.pkl``) is used and written only for full reads (``nrows`` is None).

    Args:
        csv_file: path of the CSV file.
        df_len: expected row count; the process exits with status 1 when the
            loaded frame has a different length.
        nrows: read at most this many rows (bypasses the cache).
        usecols: columns to keep in the returned frame.
        dtype: dtype, or column -> dtype mapping, applied after reading.
            Mapping entries for columns the file does not contain are
            ignored. Without it every column is narrowed via
            ``get_type_with_fld_check``.

    Returns:
        The loaded (and possibly column/row-restricted) DataFrame.
    """
    pkl_file = csv_file[:-4] + '.pkl'
    if os.path.isfile(pkl_file) and nrows is None:
        with open(pkl_file, 'rb') as pk:
            print("loading",pkl_file)
            # NOTE: unpickling can execute arbitrary code; this is only
            # acceptable for cache files this function wrote itself.
            df = pickle.load(pk)
        if df_len is not None and len(df) != df_len:
            print('ERROR!!!!!!!!!!!!!!!!!!!!!!!',pkl_file,'is broken',len(df),df_len)
            sys.exit(1)
    else:
        print("loading",csv_file)
        df = pd.read_csv(csv_file, nrows=nrows)
        if 'next' in csv_file:
            df = np.absolute(df)
        if dtype:
            # Convert once for the whole frame (not once per column), and
            # skip mapping entries for columns this file does not have.
            if isinstance(dtype, dict):
                wanted = {col: typ for col, typ in dtype.items() if col in df.columns}
            else:
                wanted = dtype
            df = df.astype(wanted)
        else:
            for ptn in df:
                df[ptn] = df[ptn].astype(get_type_with_fld_check(df,ptn))
        if nrows is None and (df_len is None or len(df) == df_len):
            print("saving cache file",pkl_file)
            # Write to a per-process temp name, then move into place so a
            # concurrent reader never sees a partially written cache.
            tmp_file = pkl_file+str(os.getpid())
            with open(tmp_file, 'wb') as pk:
                pickle.dump(df,pk,protocol=4)
            shutil.move(tmp_file, pkl_file)
        if nrows is None and df_len and len(df) != df_len:
            print('ERROR!!!!!!!!!!!!!!!!!!!!!!!',csv_file,'is broken')
            sys.exit(1)
    if usecols is not None:
        df = df[usecols]
    if nrows is not None:
        df = df[:nrows]
    gc.collect()
    if df_len is not None and len(df) != df_len:
        print('ERROR!!!!!!!!!!!!!!!!!!!!!!!',csv_file,'line is not same',df_len,len(df))
        sys.exit(1)
    return df

def get_type_with_fld_check(df,ptn):
    """Choose a compact dtype name for column ``ptn`` of ``df``.

    Columns whose name contains one of the ratio/statistic markers are
    stored as ``float16``; every other column gets the narrowest of
    ``uint8`` / ``uint16`` / ``uint32`` that fits its maximum value.
    """
    peak = df[ptn].max()
    float_markers = ('cumratio', 'mean_', 'Ratio', 'CVR', 'WOE')
    if any(marker in ptn for marker in float_markers):
        return 'float16'
    if peak < 256:
        return 'uint8'
    return 'uint16' if peak < 65536 else 'uint32'

def get_type(df,ptn):
    """Return the narrowest unsigned dtype name that holds ``df[ptn].max()``."""
    peak = df[ptn].max()
    # (exclusive upper bound, dtype name), narrowest first.
    for limit, name in ((256, 'uint8'), (65536, 'uint16')):
        if peak < limit:
            return name
    return 'uint32'


def read_data_ph1():
    """Load the train / test feature frames selected by the ``feat`` option.

    Steps:
    1. pick the numerical and categorical feature names for the feature set;
    2. return the pickled frames from ``../work`` if both caches exist;
    3. otherwise read the base CSVs plus one CSV per numerical feature;
    4. build every ``cat_<name>`` column by label-encoding ``<name>``
       (values above the ``categoricalThreVal`` threshold are first bucketed
       on a log2 scale);
    5. for the keras model, log2-transform and standardise numerical columns;
    6. write both frames to the cache.

    Returns:
        ``(train_df, test_df, numerical_patterns, cat_patterns)``.

    Exits the process with status 1 when ``feat`` names no known feature set.
    """
    feat_opt = get_opt('feat','none')

    if 'lgbmBest' == feat_opt:
        numerical_patterns = ['WOEBnd_ip_nextClickLeakDayFlt', 'WOEBnd_app_nextClickLeakDayFlt', 'WOEBnd_device_nextClickLeakDayFlt', 'WOEBnd_os_nextClickLeakDayFlt', 'WOEBnd_channel_nextClickLeakDayFlt', 'WOEBnd_ip_app_nextClickLeakDayFlt', 'WOEBnd_ip_device_nextClickLeakDayFlt', 'WOEBnd_ip_os_nextClickLeakDayFlt', 'WOEBnd_ip_channel_nextClickLeakDayFlt', 'WOEBnd_app_device_nextClickLeakDayFlt', 'WOEBnd_app_os_nextClickLeakDayFlt', 'WOEBnd_app_channel_nextClickLeakDayFlt', 'WOEBnd_device_os_nextClickLeakDayFlt', 'WOEBnd_device_channel_nextClickLeakDayFlt', 'WOEBnd_os_channel_nextClickLeakDayFlt', 'WOEBnd_ip', 'WOEBnd_app', 'WOEBnd_device', 'WOEBnd_os', 'WOEBnd_channel', 'WOEBnd_ip_app', 'WOEBnd_ip_device', 'WOEBnd_ip_os', 'WOEBnd_ip_channel', 'WOEBnd_app_device', 'WOEBnd_app_os', 'WOEBnd_app_channel', 'WOEBnd_ip_app_device', 'WOEBnd_ip_app_os', 'WOEBnd_ip_app_channel', 'WOEBnd_ip_device_os', 'WOEBnd_ip_device_channel', 'WOEBnd_ip_os_channel', 'WOEBnd_app_device_os', 'WOEBnd_app_device_channel', 'WOEBnd_app_os_channel', 'WOEBnd_ip_app_device_os', 'WOEBnd_ip_app_device_channel', 'WOEBnd_ip_app_os_channel', 'WOEBnd_ip_device_os_channel', 'WOEBnd_app_device_os_channel', 'countRatio_ip_machine', 'countRatio_ip_channel', 'countRatio_machine_ip', 'countRatio_app_channel', 'countRatio_channel_app', 'uniqueCount_day_ip_os', 'uniqueCount_day_ip_device', 'uniqueCountRatio_day_ip_channel', 'uniqueCount_day_ip_machine', 'uniqueCount_day_ip_app',  'uniqueCount_machine_app', 'uniqueCount_machine_channel', 'uniqueCount_machine_ip', 'nextClickLeakDay', 'nextNextClickLeakDay', 'dayhourminute10count_ip_device_os', 'dayhourminute10count_ip_channel', 'dayhourminute10count_app_os_channel', 'cumratio_ip_day', 'cumcount_ip_day', 'count_ip_os', 'count_ip_device_os_day_hourminute10', 'count_ip_app_os_channel_day', 'count_ip_app_os_channel', 'count_ip_app_device_os_day_hour', 'count_ip_app_device_day', 'count_ip_app_device_channel_day', 'count_ip', 'count_device_os_day_hourminute10',
'count_app_os_channel_day_hour', 'count_app_device_day_hour', 'count_app_device_channel_day_hour', 'recumcount_app_device_os_day', 'var_ip_device_hour', 'count_app_day_hourminute']
        cat_patterns = ['cat_os', 'cat_hour', 'cat_device', 'cat_dayhourcount_ip', 'cat_com1_ip', 'cat_channel', 'cat_app']
    elif 'kerasBest' == feat_opt:
        numerical_patterns = ['uniqueCountRatio_day_ip_machine', 'uniqueCountRatio_day_ip_app', 'uniqueCountRatio_day_ip_channel', 'uniqueCount_day_ip_machine', 'uniqueCount_day_ip_app', 'uniqueCount_day_ip_channel', 'uniqueCount_machine_app', 'uniqueCount_machine_channel', 'uniqueCount_machine_ip', 'nextClickLeakDay', 'dayhourcount_ip', 'count_ip', 'count_ip_app_device_os_day_hour', 'count_app_channel', 'cumcount_ip_app_device_os_day_hour', 'count_device_os_day_hourminute10', 'count_app_device_day_hour', 'dayhourminute10count_ip']
        cat_patterns = ['cat_nextClickLeakDay', 'cat_nextNextClickLeakDay', 'cat_app', 'cat_device', 'cat_os', 'cat_count_ip', 'cat_count_app_channel', 'cat_hour', 'cat_dayhourcount_ip']
    else:
        print('ERR: no valid feat !!!!!!!!!!!!!!!!')
        sys.exit(1)

    print("start reading feature for",feat_opt)

    # all cache: the cache file name encodes every option that affects the data
    tgt = 'model=' + get_opt('model','none')
    tgt += '_nrows=' + get_opt('nrows','0')
    tgt += '_feat=' + get_opt('feat','0')
    tgt += '_categoricalThreVal=' + get_opt('categoricalThreVal','1000')
    tgt += '_offlineADD=' + get_opt('offlineADD','off')
    tgt += '_sample=' + get_opt('sample','0.0')
    tgt += '_noTestSample=' + get_opt('noTestSample','off')
    tgt += '_noLogDev=' + get_opt('noLogDev','off')
    tgt += '_smallTest=' + get_opt('smallTest','off')
    tgt += '_ver=3'
    tr_pkl_file = '../work/train_' + tgt + '.pkl'
    te_pkl_file = '../work/test_supplement_' + tgt + '.pkl'
    if os.path.isfile(tr_pkl_file) and os.path.isfile(te_pkl_file):
        # NOTE: unpickling can execute arbitrary code; only acceptable for
        # cache files written by this function.
        with open(tr_pkl_file, 'rb') as pk:
            print("loading",tr_pkl_file)
            train_df = pickle.load(pk)
        with open(te_pkl_file, 'rb') as pk:
            print("loading",te_pkl_file)
            test_df = pickle.load(pk)
        gc.collect()
        return train_df, test_df, numerical_patterns, cat_patterns

    # reading base data
    train_df = read_csv(work+"train_base.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel','day','hour','is_attributed'],nrows=nrows)
    test_df = read_csv(work+"test_supplement_base.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel','day','hour'],nrows=nrows)
    test_df['is_attributed'] = 0

    # reading numerical data
    for n, ptn in enumerate(numerical_patterns, 1):
        print('start for',ptn,n,'/',len(numerical_patterns))
        if ptn in train_df.columns: continue
        train_df[ptn] = read_csv(work + 'train_' + ptn + '.csv', nrows=nrows, df_len=len(train_df))
        test_df[ptn] = read_csv(work + 'test_supplement_' + ptn + '.csv', nrows=nrows, df_len=len(test_df))

    #reading categorical data
    thre_val = get_opt('categoricalThreVal',1000)
    no_log_dev = get_opt('noLogDev','-') == 'on'
    for n, ptn in enumerate(cat_patterns, 1):
        print('start categorical convert for',ptn,n,'/',len(cat_patterns))
        if ptn in train_df.columns:
            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! warning cat ptn is in train_df.columns !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            print(ptn,train_df.columns)
        # 'cat_<name>' is derived from the raw feature '<name>'.
        org_ptn = ptn[4:]
        if org_ptn in train_df.columns:
            _train_df = train_df[[org_ptn]]
            _test_df = test_df[[org_ptn]]
        else:
            _train_df = read_csv(work + 'train_' + org_ptn + '.csv', nrows=nrows, df_len=len(train_df))
            _test_df = read_csv(work + 'test_supplement_' + org_ptn + '.csv', nrows=nrows, df_len=len(test_df))
        _train_df = _train_df.rename(columns={org_ptn: ptn})
        _test_df = _test_df.rename(columns={org_ptn: ptn})

        # Encode train and test together so both share one code space.
        len_train = len(_train_df)
        _df = pd.concat([_train_df, _test_df])
        max_val = _df[ptn].max()
        if 'cat_device' == ptn and no_log_dev:
            _df[ptn] = LabelEncoder().fit_transform(_df[ptn])
        elif thre_val > 0 and max_val > thre_val:
            # Too many distinct values: bucket them before label-encoding.
            if 'cumratio' in ptn:
                fixed_vals = (10000*_df[ptn]).astype('uint16')
            else:
                fixed_vals = (np.log2(_df[ptn]+1)*thre_val/100).astype('uint16')
            _df[ptn] = LabelEncoder().fit_transform(fixed_vals)
            print('logged for',ptn,max_val,fixed_vals.max(), _df[ptn].max())
        else:
            _df[ptn] = LabelEncoder().fit_transform(_df[ptn])
        _df[ptn] = _df[ptn].astype(get_type(_df,ptn))

        # Split back positionally: the first len_train rows are the train rows.
        encoded = _df[ptn].values
        train_df[ptn] = encoded[:len_train]
        test_df[ptn] = encoded[len_train:]
        gc.collect()

    # numerical data conversion (only the keras model needs scaled inputs)
    if get_opt('model','-') == 'keras':
        for ptn in numerical_patterns:
            print('start for numerical convert',ptn)
            all_df = pd.concat([train_df[[ptn]], test_df[[ptn]]])
            if 'cumratio' in ptn or 'CVRTgt' in ptn or 'WOETgt' in ptn:
                pass
            else:
                all_df = np.log2(all_df+1)
            all_df = StandardScaler().fit_transform(all_df).astype('float16')
            train_df[ptn] = all_df[:len(train_df)]
            test_df[ptn] = all_df[len(train_df):]

    # saving cache: write to a per-process temp name, then move into place
    for pkl_file, frame in ((tr_pkl_file, train_df), (te_pkl_file, test_df)):
        print("saving",pkl_file)
        tmp_file = pkl_file+str(os.getpid())
        with open(tmp_file, 'wb') as pk:
            pickle.dump(frame,pk,protocol=4)
        shutil.move(tmp_file, pkl_file)
    print('saved cache file')

    gc.collect()
    return train_df, test_df, numerical_patterns, cat_patterns


# training / prediction

In [2]:
# Resolve the active run description and echo it.
target=get_target()
print('start for',target)

start for BatchNormalization=on_sameNDenseAsEmb=off_model=keras_feat=kerasBest_validation=team_params=-,20000,1000,1,0.2,100,2,0.001,0.0001,0.001,100,2,3


In [5]:
%%time
# Load the train/test feature frames (from cache when available) and build
# the predictor list: numerical features first, then categorical ones.
train_df, test_df, numerical_patterns, cat_patterns = read_data_ph1()
predictors = numerical_patterns + cat_patterns
categorical = cat_patterns

start reading feature for kerasBest
loading ../work/train_model=keras_nrows=0_feat=kerasBest_categoricalThreVal=1000_offlineADD=off_sample=0.0_noTestSample=off_noLogDev=off_smallTest=off_ver=3.pkl
loading ../work/test_supplement_model=keras_nrows=0_feat=kerasBest_categoricalThreVal=1000_offlineADD=off_sample=0.0_noTestSample=off_noLogDev=off_smallTest=off_ver=3.pkl
CPU times: user 4.73 s, sys: 12 s, total: 16.7 s
Wall time: 16.7 s


In [7]:
predictors

['uniqueCountRatio_day_ip_machine',
 'uniqueCountRatio_day_ip_app',
 'uniqueCountRatio_day_ip_channel',
 'uniqueCount_day_ip_machine',
 'uniqueCount_day_ip_app',
 'uniqueCount_day_ip_channel',
 'uniqueCount_machine_app',
 'uniqueCount_machine_channel',
 'uniqueCount_machine_ip',
 'nextClickLeakDay',
 'dayhourcount_ip',
 'count_ip',
 'count_ip_app_device_os_day_hour',
 'count_app_channel',
 'cumcount_ip_app_device_os_day_hour',
 'count_device_os_day_hourminute10',
 'count_app_device_day_hour',
 'dayhourminute10count_ip',
 'cat_nextClickLeakDay',
 'cat_nextNextClickLeakDay',
 'cat_app',
 'cat_device',
 'cat_os',
 'cat_count_ip',
 'cat_count_app_channel',
 'cat_hour',
 'cat_dayhourcount_ip']

In [6]:
train_df.shape

(184903890, 35)

In [8]:
# Hold out the rows of day 9 at hours 13, 17 and 21 as the validation set;
# everything else stays in the training set.
is_val = train_df['day'].eq(9) & train_df['hour'].isin([13, 17, 21])
val_df = train_df[is_val]
train_df = train_df[~is_val]

In [8]:
train_df.shape

(174788422, 35)

In [9]:
val_df.shape

(10115468, 35)

In [12]:
train_df.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'day', 'hour', 'is_attributed',
       'uniqueCountRatio_day_ip_machine', 'uniqueCountRatio_day_ip_app',
       'uniqueCountRatio_day_ip_channel', 'uniqueCount_day_ip_machine',
       'uniqueCount_day_ip_app', 'uniqueCount_day_ip_channel',
       'uniqueCount_machine_app', 'uniqueCount_machine_channel',
       'uniqueCount_machine_ip', 'nextClickLeakDay', 'dayhourcount_ip',
       'count_ip', 'count_ip_app_device_os_day_hour', 'count_app_channel',
       'cumcount_ip_app_device_os_day_hour',
       'count_device_os_day_hourminute10', 'count_app_device_day_hour',
       'dayhourminute10count_ip', 'cat_nextClickLeakDay',
       'cat_nextNextClickLeakDay', 'cat_app', 'cat_device', 'cat_os',
       'cat_count_ip', 'cat_count_app_channel', 'cat_hour',
       'cat_dayhourcount_ip'],
      dtype='object')

In [None]:
# Train the configured model on all predictors and report the validation AUC.
# The seed comes from the `seed` option (default 2018).
auc = Predict(train_df,val_df,test_df,predictors,categorical,seed=get_opt('seed',2018))
print('validation auc:',auc)

*************params**************
batch_size: 20000
dense_cate: 1000
dense_nume_n_layers: 1
drop: 0.2
emb_cate: 100
epochs_for_lr: 2
lr: 0.001
lr_fin: 0.0001
lr_init: 0.001
max_epochs: 100
n_layers: 2
patience: 3
Embedding size: 139 100 138 138 138 100 cat_nextClickLeakDay
Embedding size: 139 100 138 138 138 100 cat_nextNextClickLeakDay
Embedding size: 769 100 768 768 535 100 cat_app
Embedding size: 96 100 95 95 90 100 cat_device
Embedding size: 957 100 956 954 606 100 cat_os
Embedding size: 168 100 167 167 167 100 cat_count_ip
Embedding size: 193 100 192 192 192 100 cat_count_app_channel
Embedding size: 24 100 23 21 23 100 cat_hour
Embedding size: 98 100 97 97 97 100 cat_dayhourcount_ip
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
cat_nextClickLeakDay (InputLaye (None, 1)            0                                            
______

Train on 174788422 samples, validate on 10115468 samples
Epoch 1/100
saving ../work/weights.929.hdf5.0
Epoch 2/100
 26420000/174788422 [===>..........................] - ETA: 17:54 - loss: 0.0053

In [None]:


# Build the submission file from the test predictions.
# Keep only the prediction column, renamed to the submission column name.
test_df = test_df[['pred']].rename(columns={'pred': 'is_attributed'})
mapping = read_csv('../input/mapping.csv')
click_id = read_csv('../input/sample_submission.csv',usecols=['click_id'])
# Join on the row index via mapping's `old_click_id`
# (presumably mapping also carries `click_id` -- the next merge needs it; confirm).
test_df = test_df.reset_index().merge(mapping, left_on='index', right_on='old_click_id', how='left')
# Left-merge onto the sample submission so its click_id rows and order are kept.
test_df = click_id.merge(test_df,on='click_id',how='left')
outfile = '../csv/pred_test_'+target+'.csv'
print('writing to',outfile)
test_df[['click_id','is_attributed']].to_csv(outfile,index=False)

In [7]:
auc

NameError: name 'auc' is not defined

In [17]:
# Variant run: use the same columns as both predictors and categorical
# features, so the model sees embedded inputs only.
# NOTE(review): `cat_cols` is not defined in the visible cells -- confirm where it is set.
auc = Predict(train_df,val_df,test_df,predictors=cat_cols,cat_feats=cat_cols,seed=get_opt('seed',2018))
print('validation auc:',auc)

*************params**************
batch_size: 20000
dense_cate: 1000
dense_nume_n_layers: 1
drop: 0.2
emb_cate: 100
epochs_for_lr: 2
lr: 0.001
lr_fin: 0.0001
lr_init: 0.001
max_epochs: 100
n_layers: 2
patience: 3
Embedding size: 212620 100 212619 211839 126388 100 ip
Embedding size: 537 100 536 64 259 100 app
Embedding size: 608 100 607 607 109 100 os
Embedding size: 499 100 498 489 497 100 channel
Embedding size: 3033 100 3032 3032 1473 100 device
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ip (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
app (InputLayer)                (None, 1)            0                                            
____________________________________________________