- performance evaluation
- date: 2020-08-07
- maintainer: YZK

In [28]:
# jupyter nbconvert --to script mbuilder.ipynb

In [4]:
import argparse
import logging
import math
import os
import re
import sys
from collections import deque, Counter
from datetime import datetime, timedelta

from fbprophet import Prophet
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.layers import Layer, Dense, Input, LSTM
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, LearningRateScheduler, ModelCheckpoint, ReduceLROnPlateau, TensorBoard



# from keras.models import Sequential, Model
# from keras.layers import Layer, Dense, Input, LSTM
# from keras.optimizers import SGD
# from keras import initializers, regularizers, constraints
# from keras.callbacks import CSVLogger, EarlyStopping, LearningRateScheduler, ModelCheckpoint, ReduceLROnPlateau, TensorBoard


In [5]:
def lstmbuilder(units, input_shape, loss, optimizer):
    '''
        Assemble and compile a minimal sequence model: one LSTM layer
        followed by a single-unit Dense output.

        units: number of LSTM units
        input_shape: a tuple (timesteps, nfeatures)
        loss: loss passed through to compile()
        optimizer: optimizer passed through to compile()

        Returns the compiled Sequential model.
    '''
    # Layers are listed in stacking order and appended one by one.
    stack = [
        LSTM(units, input_shape=input_shape),
        Dense(1),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss=loss, optimizer=optimizer)
    return model


# Smoke check: build a 10-unit LSTM for (timesteps=10, nfeatures=3) input,
# compiled with MAE loss and a default SGD optimizer, and print its summary.
lstmbuilder(10, (10, 3), 'mae', SGD()).summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 10)                560       
_________________________________________________________________
dense (Dense)                (None, 1)                 11        
Total params: 571
Trainable params: 571
Non-trainable params: 0
_________________________________________________________________


In [3]:
class Attention(Layer):
    """
    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The feature dimension is inferred from the output shape of the RNN; the
    number of time steps must be given explicitly as `step_dim`.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim=timesteps))

    tensorflow & keras reference:
        https://www.tensorflow.org/guide/keras/custom_layers_and_models
        https://www.tensorflow.org/guide/keras/masking_and_padding
        https://www.tensorflow.org/api_docs/python/tf/keras/layers/Masking
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        :param step_dim: number of time steps of the incoming sequence.
        :param W_regularizer: regularizer (or identifier) for the attention weight vector.
        :param b_regularizer: regularizer (or identifier) for the per-step bias.
        :param W_constraint: constraint (or identifier) for the attention weight vector.
        :param b_constraint: constraint (or identifier) for the per-step bias.
        :param bias: whether to add a learned per-step bias before the tanh.
        :param kwargs: forwarded to the base `Layer` constructor (e.g. `name`).
        """
        # Initialise the base Layer first: its constructor sets up bookkeeping
        # attributes (including the masking flag), so anything assigned before
        # this call could be overwritten.
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # filled in by build() once the input shape is known

    def build(self, input_shape):
        '''
            deferring weight creation until the shape of the inputs is known
            input_shape[-1] is the number of features if len(input_shape) == 3
        '''

        assert len(input_shape) == 3

        # `shape` must be passed by keyword: the first positional parameter of
        # tf.keras' add_weight is `name`, which is also supplied below.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)

        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per time step.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        '''
            Collapse the time axis of `x` into an attention-weighted sum.

            The __call__() method of the layer automatically runs build()
            the first time it is called, so the weights exist by the time
            this method executes.

            x: tensor of shape (samples, steps, features)
            mask: optional (samples, steps) mask; masked steps get zero weight
            returns: tensor of shape (samples, features)
        '''

        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every time step: flatten to (samples*steps, features), project
        # onto W, then fold back to (samples, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Broadcast the per-step weights over the feature axis and sum over time.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        # Read the feature size from the input shape so the answer is correct
        # even before build() has populated self.features_dim.
        return input_shape[0], input_shape[-1]

    def get_config(self):
        '''
            Return the constructor arguments so the layer can be re-created
            from its config (model saving / cloning).
        '''
        config = super(Attention, self).get_config()
        config.update({
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        })
        return config

In [6]:
class NNBuilder():
    """
    Factory for the recurrent models used in this notebook.

    On construction it makes sure the model and checkpoint directories exist
    and prepares a shared list of training callbacks and an SGD optimizer.
    The builder methods return `[model, callbacks, optimizer]`; the model is
    returned uncompiled so the caller chooses the loss.
    """

    def __init__(self, modeld="model", ckptd="ckpt"):
        """
        modeld: directory where ModelCheckpoint files are written
        ckptd: directory for TensorBoard logs and the CSV training log
        """
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(modeld, exist_ok=True)
        os.makedirs(ckptd, exist_ok=True)

        self.modeld = modeld
        self.ckptd = ckptd
        self.callbacks = self._callbacks(modeld, ckptd)
        self.optimizer = self._optimizer()

    def TPALSTM(self):
        """
        NOTE(review): this method references many names that are neither
        defined nor imported in the visible part of this notebook
        (Embedding, Dropout, BatchNormalization, concatenate, nb_words,
        EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, num_lstm,
        rate_drop_lstm, rate_drop_dense, num_dense, act, re_weight, STAMP,
        data_*_train/val, labels_*, weight_val, test_data_*, test_ids).
        Unless they are provided elsewhere, calling it raises NameError.
        The body builds, trains and predicts with a two-input shared
        Embedding+LSTM binary classifier and writes a CSV; it looks like a
        pasted placeholder rather than a finished model -- confirm before use.
        """
        embedding_layer = Embedding(nb_words, EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)

        lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

        sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences_1 = embedding_layer(sequence_1_input)
        x1 = lstm_layer(embedded_sequences_1)

        sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences_2 = embedding_layer(sequence_2_input)
        y1 = lstm_layer(embedded_sequences_2)

        merged = concatenate([x1, y1])
        merged = Dropout(rate_drop_dense)(merged)
        merged = BatchNormalization()(merged)

        merged = Dense(num_dense, activation=act)(merged)
        merged = Dropout(rate_drop_dense)(merged)
        merged = BatchNormalization()(merged)

        preds = Dense(1, activation='sigmoid')(merged)

        ########################################
        ## add class weight
        ########################################
        if re_weight:
            class_weight = {0: 1.309028344, 1: 0.472001959}
        else:
            class_weight = None

        ########################################
        ## train the model
        ########################################
        model = Model(inputs=[sequence_1_input, sequence_2_input], \
                outputs=preds)
        model.compile(loss='binary_crossentropy',
                optimizer='nadam',
                metrics=['acc'])
        #model.summary()
        print(STAMP)

        early_stopping =EarlyStopping(monitor='val_loss', patience=3)
        bst_model_path = STAMP + '.h5'
        model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

        hist = model.fit([data_1_train, data_2_train], labels_train, \
                validation_data=([data_1_val, data_2_val], labels_val, weight_val), \
                epochs=200, batch_size=2048, shuffle=True, \
                class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

        model.load_weights(bst_model_path)
        bst_val_score = min(hist.history['val_loss'])

        ########################################
        ## make the submission
        ########################################
        print('Start making the submission before fine-tuning')

        preds = model.predict([test_data_1, test_data_2], batch_size=8192, verbose=1)
        preds += model.predict([test_data_2, test_data_1], batch_size=8192, verbose=1)
        preds /= 2

        submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
        submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)


    def stackedLSTM(self, shape, cells, target="regression"):
        '''
            Build an (uncompiled) stack of LSTM layers followed by a Dense head.

            shape: (timesteps, nfeatures) of one input sample
            cells: an int (single layer) or a non-empty list of ints, one per LSTM layer
            target: "regression" -> sigmoid Dense head; anything else -> softmax head
            return [model, callbacks, optimizer]
            raises ValueError if `cells` is an empty list
        '''

        timesteps = shape[0]
        nfeatures = shape[1]

        units = cells if isinstance(cells, list) else [cells]
        if not units:
            raise ValueError("cells must be an int or a non-empty list of ints")

        model = Sequential()

        last = len(units) - 1
        for idx, n_units in enumerate(units):
            layer_kwargs = {"name": "lstm_{}".format(idx)}
            if idx == 0:
                # Only the first layer needs to know the input shape.
                layer_kwargs["input_shape"] = (timesteps, nfeatures)
            if idx < last:
                # Every layer except the last feeds a full sequence to the next LSTM.
                layer_kwargs["return_sequences"] = True
            model.add(LSTM(n_units, **layer_kwargs))

        if target == "regression":
            model.add(Dense(nfeatures, activation='sigmoid', name="dense"))  # for regression
        else:
            model.add(Dense(nfeatures, activation='softmax', name="dense"))  # for classification

        return [model, self.callbacks, self.optimizer]

    def LSTMbasicAttention(self, shape, cells):
        '''
            Build an (uncompiled) stack of sequence-returning LSTM layers,
            pooled over time by a single Attention layer, then a Dense head.

            shape = (timestep, feature)
            cells: an int (single layer) or a list of ints, one per LSTM layer
            return [model, callbacks, optimizer]
        '''

        nfeatures = shape[1]

        inputs = Input(shape, name="input")  # return a tensor

        units = cells if isinstance(cells, list) else [cells]

        x = inputs
        for idx, unit in enumerate(units):
            x = LSTM(unit, return_sequences=True, name="LSTM_{}".format(idx))(x)

        # Attention collapses the time axis, so it must be applied once after
        # the whole LSTM stack; applying it per layer would hand a 2D tensor
        # to the next LSTM.
        x = Attention(shape[0])(x)

        outputs = Dense(nfeatures)(x)

        model = Model(inputs=inputs, outputs=outputs)

        return [model, self.callbacks, self.optimizer]

    @staticmethod
    def _callbacks(modeld, ckptd, mmonitor="val_loss", emonitor="loss", lmonitor="val_loss", name="ckpt"):
        '''
            Create the training callbacks.

            modeld: directory for checkpoint files
            ckptd: directory for TensorBoard logs and the CSV log
            mmonitor / emonitor / lmonitor: metrics watched by ModelCheckpoint,
                EarlyStopping and ReduceLROnPlateau respectively
            name: filename prefix for the checkpoint and CSV log
            return [checkpointer, earlystopper, reduceLR, tensorboard, csvlogger]
        '''

        timestamp = datetime.now().strftime("%Y%m%d%H%M")

        # Left as literal placeholders here; Keras fills them in per epoch.
        name_ = "{epoch:04d}_{loss:.3f}_{val_loss:.3f}"
        checkpointer = ModelCheckpoint(filepath=os.path.join(modeld, "{0}_{1}_{2}.hdf5".format(name, name_, timestamp)),
                                       verbose=1,
                                       save_best_only=True,
                                       monitor=mmonitor)

        earlystopper = EarlyStopping(monitor=emonitor, patience=10)

        reduceLR = ReduceLROnPlateau(monitor=lmonitor, factor=0.5, patience=20, min_lr=0.00001)

        tb = TensorBoard(log_dir=ckptd)

        csvlogger = CSVLogger(os.path.join(ckptd, "{}_{}.log".format(name, timestamp)), append=False, separator=",")

        return [checkpointer, earlystopper, reduceLR, tb, csvlogger]

    @staticmethod
    def _optimizer(lr=1e-2):
        '''
            SGD with Nesterov momentum 0.9.

            lr: learning rate
        '''
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer = SGD(learning_rate=lr, momentum=0.9, nesterov=True)

        return optimizer

In [13]:
def main(X_train, y_train, epochs, batch_size, modeld="model", ckptd="ckpt"):
    '''
        Train a single-layer (60-cell) stacked LSTM with MAE loss and save
        the loss curves to <ckptd>/trainingHistory.png.

        X_train: array indexed as (samples, timesteps, nfeatures)
        y_train: training targets passed straight to fit()
        epochs, batch_size: forwarded to fit()
        modeld: directory for model checkpoints (forwarded to NNBuilder)
        ckptd: directory for logs and the training-history plot
        returns: the History object produced by fit()
    '''

    timesteps = X_train.shape[1]
    nfeatures = X_train.shape[2]

    builder = NNBuilder(modeld=modeld, ckptd=ckptd)
    model, callbacks_, optimizer_ = builder.stackedLSTM(shape=(timesteps, nfeatures), cells=60)
    model.compile(loss="mae", optimizer=optimizer_)

    # The last 10% of the samples are held out as the validation set.
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        callbacks=callbacks_, validation_split=0.1, verbose=2, shuffle=True)

    fig, ax = plt.subplots()
    ax.plot(history.history['loss'], label='train')
    # val_loss comes from the validation split, not from a separate test set.
    ax.plot(history.history['val_loss'], label='validation')
    ax.set_xlabel("epoch")
    ax.set_ylabel("loss (mae)")
    ax.legend(fontsize=14)
    # Save next to the other training artefacts instead of a hard-coded path.
    fig.savefig(os.path.join(ckptd, "trainingHistory.png"))
    plt.close(fig)

    return history

In [14]:
if __name__ == "__main__":
    # Build and compile a 60-cell network for (6 timesteps, 4 features)
    # input just to print its architecture.
    stackedLSTM, callbacks_, optimizer_ = NNBuilder().stackedLSTM(shape=[6, 4], cells=60)
    stackedLSTM.compile(optimizer=optimizer_, loss="mae")
    stackedLSTM.summary()

    # Random placeholder data: 1000 samples of 6 timesteps x 4 features,
    # with one 4-value target per sample.
    X_train = np.random.random_sample(size=(1000, 6, 4))
    y_train = np.random.random_sample(size=(1000, 4))

    # Short demonstration run: 3 epochs, batch size 30.
    main(X_train, y_train, epochs=3, batch_size=30)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_0 (LSTM)                (None, 60)                15600     
_________________________________________________________________
dense (Dense)                (None, 4)                 244       
Total params: 15,844
Trainable params: 15,844
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2000

Epoch 00001: val_loss improved from inf to 0.24996, saving model to model/ckpt_0001_0.250_0.250_202008071719.hdf5
18000/18000 - 29s - loss: 0.2501 - val_loss: 0.2500
Epoch 2/2000

Epoch 00002: val_loss improved from 0.24996 to 0.24988, saving model to model/ckpt_0002_0.250_0.250_202008071719.hdf5
18000/18000 - 28s - loss: 0.2501 - val_loss: 0.2499
Epoch 3/2000

Epoch 00003: val_loss did not improve from 0.24988
18000/18000 - 29s - loss: 0.2501 - val_loss: 0.2499
Epoch 4/2000

Epoch 00004: val_

KeyboardInterrupt: 