In [1]:
__file__=''
import os,sys
import pandas as pd
sys.path.append(os.path.join(os.path.dirname(__file__),'../LIB/'))
sys.path.append(os.path.join(os.path.dirname(__file__),'../../../../automl/automl_libs/'))
from env import FILE
from sklearn.metrics import log_loss,roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer


from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU,CuDNNGRU,Flatten,BatchNormalization
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.utils import shuffle
import pickle
from sklearn.model_selection import KFold, StratifiedKFold
import numpy as np
import gc
import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Load Data

In [2]:
train = pd.read_pickle(FILE.train_ori.value)
test = pd.read_pickle(FILE.test_ori.value)
train_doc = pd.read_pickle('../../data/fe_doc/train_doc.pkl')
test_doc = pd.read_pickle('../../data/fe_doc/test_doc.pkl')


holdout_index = pickle.load(open(FILE.holdout_index.value,'rb'))
train_index = pickle.load(open(FILE.train_index.value,'rb'))
train_cv = train_doc.loc[train_index].copy()
holdout = train_doc.loc[holdout_index].copy()

test_matrix = pickle.load(open(FILE.countVectorize_format.value.format('test'),'rb'))

In [3]:
# df = pd.DataFrame()
# for fold in range(5):
#     index_name = FILE.Vectorize_index_format.value.format('val_fold_{}'.format(fold))
#     index = pickle.load(open(index_name,'rb'))
#     label_name = FILE.Vectorize_label_format.value.format('val_fold_{}'.format(fold))
#     label = pickle.load(open(label_name,'rb'))
#     dfc = pd.DataFrame({'idx':index,'label':label})
#     df = pd.concat([df,dfc])
# label_ho = pickle.load(open(FILE.Vectorize_label_format.value.format('holdout'),'rb'))
# dfc = pd.DataFrame({'idx':holdout_index,'label':label_ho})
# df = pd.concat([df,dfc])
# df = df.sort_values('idx')

# Define NN model

In [4]:
param_list = []
for layer1 in [4096,5120,6144]:
    for layer2 in [None,1024,2048]:
        for layer3 in [None]:
            if layer2 is None and layer3 is not None:
                continue
            if layer2 is not None and layer3 is not None:
                if layer2 < layer3:
                    continue
            param = {'input_shape':test_matrix.shape[1],
                     'layer1':layer1,
                     'layer2':layer2,
                     'layer3':layer3}
            param_list.append(param)
print(len(param_list))

train_batch = 2048
test_batch = 20000




def get_nn_model(param):
    layer1 = param.get('layer1')
    layer2 = param.get('layer2')
    layer3 = param.get('layer3')
    input_shape = param.get('input_shape')
    
    sequence_input = Input(shape=(input_shape, ))
    x = Dense(layer1, activation='relu')(sequence_input)
    x = Dropout(0.5)(x)
    
    if layer2 is not None:
        x = Dense(layer2, activation='relu')(x)
        x = Dropout(0.5)(x)
    
    if layer2 is not None and layer3 is not None:
        x = Dense(layer3, activation='relu')(x)
        x = Dropout(0.2)(x)
        
    preds = Dense(1, activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',optimizer=Adam(),metrics=['accuracy'])    
    return model

def train_each_epoch(x,y,batch_size,model):
    x,y = shuffle(x,y)
    model.fit(x, y, 
              batch_size=batch_size, 
              epochs=1,
              verbose=1,
              )
    return model

def get_file_name(param,vectorize='countVectorize'):
    layer1 = param.get('layer1')
    layer2 = param.get('layer2')
    layer3 = param.get('layer3')
    return '{}_layer1_{}_layer2_{}_layer3_{}.pkl'.format(vectorize,layer1,layer2,layer3)

def save_report(params,report_path):
    """
    params. dataframe report params.
    """
    try:
        saved_report = pd.read_csv(report_path)
    except:
        saved_report = pd.DataFrame()
    new_report = pd.DataFrame({'fileName':params['file_Name'],
                               'cv_mean':params['cv_mean'],
                               'holdout':params['holdout']})
    saved_report = pd.concat([saved_report,new_report]).reset_index(drop=True)
    saved_report.to_csv(report_path)
    print('saved report to path {}'.format(report_path))
    
def save_preds(file_name,preds_params,path='../../data/nn/{}/{}'):
    """
    param: nn layer info.
    preds_params:key should be train/test/holdout and values should be coresponding predict dataframe
    """
    for key in preds_params:
        saving_path = path.format(key,file_name)
        preds_params[key].to_pickle(saving_path)
        print('saving preds {} done!'.format(key))
        
def train_each_fold(param,fold,mode='countVectorize',tolerance=2):
    if mode != 'tfidf':
        train_name = FILE.countVectorize_format.value.format('train_fold_{}'.format(fold))
        val_name = FILE.countVectorize_format.value.format('val_fold_{}'.format(fold))
        holdout_name = FILE.countVectorize_format.value.format('holdout')
        test_name = FILE.countVectorize_format.value.format('test')
        
    else:
        train_name = FILE.tfidfVectorize_format.value.format('train_fold_{}'.format(fold))
        val_name = FILE.tfidfVectorize_format.value.format('val_fold_{}'.format(fold))
        holdout_name = FILE.tfidfVectorize_format.value.format('holdout')
        test_name = FILE.tfidfVectorize_format.value.format('test')
        
    y_train_name = FILE.Vectorize_label_format.value.format('train_fold_{}'.format(fold))
    y_val_name = FILE.Vectorize_label_format.value.format('val_fold_{}'.format(fold))
    y_holdout_name = FILE.Vectorize_label_format.value.format('holdout')
        
    x_train = pickle.load(open(train_name,'rb'))
    x_val = pickle.load(open(val_name,'rb'))
    x_holdout = pickle.load(open(holdout_name,'rb'))
    x_test = pickle.load(open(test_name,'rb'))
    y_train =   pickle.load(open(y_train_name,'rb'))
    y_val =   pickle.load(open(y_val_name,'rb'))
    y_holdout =   pickle.load(open(y_holdout_name,'rb'))
    tol = 0
    model = get_nn_model(param)
    best_loss = None
    while True:
        model = train_each_epoch(x_train,y_train,train_batch,model)
        val_pred = model.predict(x_val,test_batch,verbose=1)
        score = log_loss(y_val,val_pred)
        print('current validation loss: {}'.format(score))
        
        if best_loss is None:
            tol = 0
            best_loss = score
            holdout_preds = model.predict(x_holdout,test_batch,verbose=1)
            holdout_score = log_loss(y_holdout,holdout_preds)
            print('holdout loss is: {}'.format(holdout_score))
            test_preds = model.predict(x_test,test_batch,verbose=1)
            continue
        if score < best_loss:
            tol = 0
            best_loss = score
            holdout_preds = model.predict(x_holdout,test_batch,verbose=1)
            holdout_score = log_loss(y_holdout,holdout_preds)
            print('holdout loss is: {}'.format(holdout_score))
            test_preds = model.predict(x_test,test_batch,verbose=1)
        else:
            tol += 1
            if tol == tolerance:
                break
        print('best validation loss: {}'.format(best_loss))
    return best_loss,val_pred,holdout_preds,test_preds

def train_5_fold(train,test,param,vectorize='countVectorize',report_path='../../data/nn/report.csv'):
    train_oof = train[['instance_id']].copy()
    test_oof = test[['instance_id']].copy()
    train_oof['predicted_score'] = np.nan
    file_name = get_file_name(param,vectorize)
    cv_list = []
    holdout_list = []
    test_list = []
    try:
        saved_report = pd.read_csv(report_path)
        if file_name in saved_report['fileName'].values:
            return None
    except:
        print('no saved report found. create a new one')
        saved_report = pd.DataFrame()
    for fold in range(7):
        print('start fold {}...'.format(fold))
        score,val_pred,holdout_preds,test_preds =  train_each_fold(param,fold,mode=vectorize,tolerance=2)
        cv_list.append(score)
        holdout_list.append(holdout_preds)
        test_list.append(test_preds)
        index_name = FILE.Vectorize_index_format.value.format('val_fold_{}'.format(fold))
        val_index = pickle.load(open(index_name,'rb'))
        train_oof.loc[val_index,'predicted_score'] = val_pred
        gc.collect()
        time.sleep(5)
        
    
    holdout = np.mean(holdout_list,axis=0)
    test = np.mean(test_list,axis=0)
    cv_mean = np.mean(cv_list)
    y_holdout_name = FILE.Vectorize_label_format.value.format('holdout')
    y_holdout =   pickle.load(open(y_holdout_name,'rb'))
    holdout_index = pickle.load(open(FILE.holdout_index.value,'rb'))
    train_oof.loc[holdout_index,'predicted_score'] = holdout
    test_oof['predicted_score'] = test
    holdout_df =  train_oof.loc[holdout_index].copy()
    
    holdout_score = log_loss(y_holdout,holdout)
    print('cv mean: {}, overall holdout: {}'.format(cv_mean,holdout_score))
    
    
    new_report = pd.DataFrame({'fileName':[file_name],
                               'cv_mean':[cv_mean],
                               'holdout':[holdout_score]})
   
    saved_report = pd.concat([saved_report,new_report],sort=False).reset_index(drop=True)
    saved_report.to_csv(report_path,index=False)
    print('saved report to path {}'.format(report_path))
    
    preds_param = {'train':train_oof,'test':test_oof,'holdout':holdout_df}
    
    save_preds(file_name,preds_param)
    
    
    
    

9


# Train

In [5]:
for param in param_list:
    print(param)
    train_5_fold(train,test,param,vectorize='countVectorize',report_path='../../data/nn/report.csv')
    gc.collect()

{'layer1': 4096, 'layer2': None, 'layer3': None, 'input_shape': 25422}
start fold 0...
Epoch 1/1
current validation loss: 0.4114878781143221
holdout loss is: 0.4276503681903065
Epoch 1/1
current validation loss: 0.4117469063793765
best validation loss: 0.4114878781143221
Epoch 1/1
current validation loss: 0.411922317119572
start fold 1...
Epoch 1/1
current validation loss: 0.4216130411413319
holdout loss is: 0.42865379437322065
Epoch 1/1
current validation loss: 0.4195374221109574
holdout loss is: 0.42666087326434576
best validation loss: 0.4195374221109574
Epoch 1/1
current validation loss: 0.4209444733239301
best validation loss: 0.4195374221109574
Epoch 1/1
current validation loss: 0.42206245578140494
start fold 2...
Epoch 1/1
current validation loss: 0.42117108699985795
holdout loss is: 0.4284812923067745
Epoch 1/1
current validation loss: 0.420893091711846
holdout loss is: 0.4273435647152684
best validation loss: 0.420893091711846
Epoch 1/1
current validation loss: 0.4209158553466

ResourceExhaustedError: OOM when allocating tensor with shape[25422,4096]
	 [[Node: training_4/Adam/mul_2 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](training_4/Adam/sub_2, training_4/Adam/gradients/dense_9/MatMul_grad/MatMul_1)]]

Caused by op 'training_4/Adam/mul_2', defined at:
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/asyncio/base_events.py", line 1425, in _run_once
    handle._run()
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/asyncio/events.py", line 127, in _run
    self._callback(*self._args)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tornado/ioloop.py", line 759, in _run_callback
    ret = callback()
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-441ae2bafe6d>", line 3, in <module>
    train_5_fold(train,test,param,vectorize='countVectorize',report_path='../../data/nn/report.csv')
  File "<ipython-input-4-bf54e6dd9e61>", line 158, in train_5_fold
    score,val_pred,holdout_preds,test_preds =  train_each_fold(param,fold,mode=vectorize,tolerance=2)
  File "<ipython-input-4-bf54e6dd9e61>", line 114, in train_each_fold
    model = train_each_epoch(x_train,y_train,train_batch,model)
  File "<ipython-input-4-bf54e6dd9e61>", line 51, in train_each_epoch
    verbose=1,
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/keras/engine/training.py", line 1008, in fit
    self._make_train_function()
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/keras/engine/training.py", line 498, in _make_train_function
    loss=self.total_loss)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/keras/optimizers.py", line 491, in get_updates
    m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py", line 894, in binary_op_wrapper
    return func(x, y, name=name)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py", line 1117, in _mul_dispatch
    return gen_math_ops._mul(x, y, name=name)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tensorflow/python/ops/gen_math_ops.py", line 2726, in _mul
    "Mul", x=x, y=y, name=name)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/kai/anaconda3/envs/tf_gpu/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[25422,4096]
	 [[Node: training_4/Adam/mul_2 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](training_4/Adam/sub_2, training_4/Adam/gradients/dense_9/MatMul_grad/MatMul_1)]]


In [None]:
report_path='../../data/nn/report.csv'
saved_report = pd.read_csv(report_path)
saved_report1 = pd.read_csv(report_path)

In [None]:
pd.concat([saved_report,saved_report1]).reset_index(drop=True)

In [None]:
new_report = pd.DataFrame({'fileName':['123.csv'],
                               'cv_mean':[0.23232],
                               'holdout':[0.12121]})

In [None]:
pd.concat([saved_report,new_report]).reset_index(drop=True)