In [1]:
!pip install keras==2.2.4

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple


In [5]:
pip install --upgrade keras_applications

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Requirement already up-to-date: keras_applications in /opt/conda/envs/tensorflow_py3/lib/python3.6/site-packages (1.0.8)
Note: you may need to restart the kernel to use updated packages.


In [1]:
from keras.models import Model
from keras.optimizers import SGD,Adam,RMSprop
from keras.layers import *
import os
from keras import backend as K
from keras.engine.topology import Layer
import h5py
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd
import numpy as np
import keras
import tensorflow as tf

# from gensim.models import Word2Vec

Using TensorFlow backend.


In [2]:
class Embedding(Layer):

    def __init__(self, vocab_size, model_dim, **kwargs):
        self._vocab_size = vocab_size
        self._model_dim = model_dim
        super(Embedding, self).__init__(**kwargs)

    def build(self, input_shape):
        self.embeddings = self.add_weight(
            shape=(self._vocab_size, self._model_dim),
            initializer='glorot_uniform',
            name="embeddings")
        super(Embedding, self).build(input_shape)

    def call(self, inputs):
        if K.dtype(inputs) != 'int32':
            inputs = K.cast(inputs, 'int32')
        embeddings = K.gather(self.embeddings, inputs)
        embeddings *= self._model_dim ** 0.5 # Scale
        return embeddings

    def compute_output_shape(self, input_shape):

        return input_shape + (self._model_dim,)
    
class Add(Layer):

    def __init__(self, **kwargs):
        super(Add, self).__init__(**kwargs)

    def call(self, inputs):
        input_a, input_b = inputs
        return input_a + input_b

    def compute_output_shape(self, input_shape):
        return input_shape[0]
    
class ScaledDotProductAttention(Layer):

    def __init__(self, masking=True, future=False, dropout_rate=0., **kwargs):
        self._masking = masking
        self._future = future
        self._dropout_rate = dropout_rate
        self._masking_num = -2**32+1
        super(ScaledDotProductAttention, self).__init__(**kwargs)

    def mask(self, inputs, masks):
        masks = K.cast(masks, 'float32')
        masks = K.tile(masks, [K.shape(inputs)[0] // K.shape(masks)[0], 1])
        masks = K.expand_dims(masks, 1)
        outputs = inputs + masks * self._masking_num
        return outputs
    
    def future_mask(self, inputs):
        diag_vals = tf.ones_like(inputs[0, :, :])
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  
        future_masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])
        paddings = tf.ones_like(future_masks) * self._masking_num
        outputs = tf.where(tf.equal(future_masks, 0), paddings, inputs)
        return outputs

    def call(self, inputs):
        if self._masking:
            assert len(inputs) == 4, "inputs should be set [queries, keys, values, masks]."
            queries, keys, values, masks = inputs
        else:
            assert len(inputs) == 3, "inputs should be set [queries, keys, values]."
            queries, keys, values = inputs

        if K.dtype(queries) != 'float32':  queries = K.cast(queries, 'float32')
        if K.dtype(keys) != 'float32':  keys = K.cast(keys, 'float32')
        if K.dtype(values) != 'float32':  values = K.cast(values, 'float32')

        matmul = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1])) # MatMul
        scaled_matmul = matmul / int(queries.shape[-1]) ** 0.5  # Scale
        if self._masking:
            scaled_matmul = self.mask(scaled_matmul, masks) # Mask(opt.)

        if self._future:
            scaled_matmul = self.future_mask(scaled_matmul)

        softmax_out = K.softmax(scaled_matmul) # SoftMax
        # Dropout
        out = K.dropout(softmax_out, self._dropout_rate)
        
        outputs = K.batch_dot(out, values)

        return outputs

    def compute_output_shape(self, input_shape):
        return input_shape

    
class PositionEncoding(Layer):

    def __init__(self, model_dim, **kwargs):
        self._model_dim = model_dim
        super(PositionEncoding, self).__init__(**kwargs)

    def call(self, inputs):
        seq_length = inputs.shape[1]
        position_encodings = np.zeros((seq_length, self._model_dim))
        for pos in range(seq_length):
            for i in range(self._model_dim):
                position_encodings[pos, i] = pos / np.power(10000, (i-i%2) / self._model_dim)
        position_encodings[:, 0::2] = np.sin(position_encodings[:, 0::2]) # 2i
        position_encodings[:, 1::2] = np.cos(position_encodings[:, 1::2]) # 2i+1
        position_encodings = K.cast(position_encodings, 'float32')
        return position_encodings

    def compute_output_shape(self, input_shape):
        return input_shape

class MultiHeadAttention(Layer):

    def __init__(self, n_heads, head_dim, dropout_rate=.1, masking=True, future=False, trainable=True, **kwargs):
        self._n_heads = n_heads
        self._head_dim = head_dim
        self._dropout_rate = dropout_rate
        self._masking = masking
        self._future = future
        self._trainable = trainable
        super(MultiHeadAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        self._weights_queries = self.add_weight(
            shape=(input_shape[0][-1], self._n_heads * self._head_dim),
            initializer='glorot_uniform',
            trainable=self._trainable,
            name='weights_queries')
        self._weights_keys = self.add_weight(
            shape=(input_shape[1][-1], self._n_heads * self._head_dim),
            initializer='glorot_uniform',
            trainable=self._trainable,
            name='weights_keys')
        self._weights_values = self.add_weight(
            shape=(input_shape[2][-1], self._n_heads * self._head_dim),
            initializer='glorot_uniform',
            trainable=self._trainable,
            name='weights_values')
        super(MultiHeadAttention, self).build(input_shape)


    def call(self, inputs):
        if self._masking:
            assert len(inputs) == 4, "inputs should be set [queries, keys, values, masks]."
            queries, keys, values, masks = inputs
        else:
            assert len(inputs) == 3, "inputs should be set [queries, keys, values]."
            queries, keys, values = inputs
        
        queries_linear = K.dot(queries, self._weights_queries) 
        keys_linear = K.dot(keys, self._weights_keys)
        values_linear = K.dot(values, self._weights_values)

        queries_multi_heads = tf.concat(tf.split(queries_linear, self._n_heads, axis=2), axis=0)
        keys_multi_heads = tf.concat(tf.split(keys_linear, self._n_heads, axis=2), axis=0)
        values_multi_heads = tf.concat(tf.split(values_linear, self._n_heads, axis=2), axis=0)
        
        if self._masking:
            att_inputs = [queries_multi_heads, keys_multi_heads, values_multi_heads, masks]
        else:
            att_inputs = [queries_multi_heads, keys_multi_heads, values_multi_heads]
            
        attention = ScaledDotProductAttention(
            masking=self._masking, future=self._future, dropout_rate=self._dropout_rate)
        att_out = attention(att_inputs)

        outputs = tf.concat(tf.split(att_out, self._n_heads, axis=0), axis=2)
        
        return outputs

    def compute_output_shape(self, input_shape):
        return input_shape  

In [3]:
class PositionWiseFeedForward(Layer):
    
    def __init__(self, model_dim, inner_dim, trainable=True, **kwargs):
        self._model_dim = model_dim
        self._inner_dim = inner_dim
        self._trainable = trainable
        super(PositionWiseFeedForward, self).__init__(**kwargs)

    def build(self, input_shape):
        self.weights_inner = self.add_weight(
            shape=(input_shape[-1], self._inner_dim),
            initializer='glorot_uniform',
            trainable=self._trainable,
            name="weights_inner")
        self.weights_out = self.add_weight(
            shape=(self._inner_dim, self._model_dim),
            initializer='glorot_uniform',
            trainable=self._trainable,
            name="weights_out")
        self.bais_inner = self.add_weight(
            shape=(self._inner_dim,),
            initializer='uniform',
            trainable=self._trainable,
            name="bais_inner")
        self.bais_out = self.add_weight(
            shape=(self._model_dim,),
            initializer='uniform',
            trainable=self._trainable,
            name="bais_out")
        super(PositionWiseFeedForward, self).build(input_shape)

    def call(self, inputs):
        if K.dtype(inputs) != 'float32':
            inputs = K.cast(inputs, 'float32')
        inner_out = K.relu(K.dot(inputs, self.weights_inner) + self.bais_inner)
        outputs = K.dot(inner_out, self.weights_out) + self.bais_out
        return outputs

    def compute_output_shape(self, input_shape):
        return self._model_dim

    
class LayerNormalization(Layer):

    def __init__(self, epsilon=1e-8, **kwargs):
        self._epsilon = epsilon
        super(LayerNormalization, self).__init__(**kwargs)

    def build(self, input_shape):
        self.beta = self.add_weight(
            shape=(input_shape[-1],),
            initializer='zero',
            name='beta')
        self.gamma = self.add_weight(
            shape=(input_shape[-1],),
            initializer='one',
            name='gamma')
        super(LayerNormalization, self).build(input_shape)

    def call(self, inputs):
        mean, variance = tf.nn.moments(inputs, [-1], keepdims=True)
        normalized = (inputs - mean) / ((variance + self._epsilon) ** 0.5)
        outputs = self.gamma * normalized + self.beta
        return outputs

    def compute_output_shape(self, input_shape):
        return input_shape
    
class Transformer(Layer):

    def __init__(self, vocab_size, model_dim, 
            n_heads=8, encoder_stack=6, decoder_stack=6, feed_forward_size=2048, dropout_rate=0.1, **kwargs):
        self._vocab_size = vocab_size
        self._model_dim = model_dim
        self._n_heads = n_heads
        self._encoder_stack = encoder_stack
        self._decoder_stack = decoder_stack
        self._feed_forward_size = feed_forward_size
        self._dropout_rate = dropout_rate
        super(Transformer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.embeddings = self.add_weight(
            shape=(self._vocab_size, self._model_dim),
            initializer='glorot_uniform',
            trainable=True,
            name="embeddings")
        super(Transformer, self).build(input_shape)


    def encoder(self, inputs):
        if K.dtype(inputs) != 'int32':
            inputs = K.cast(inputs, 'int32')

        masks = K.equal(inputs, 0)
        # Embeddings
        embeddings = K.gather(self.embeddings, inputs)
        embeddings *= self._model_dim ** 0.5 # Scale
        # Position Encodings
        position_encodings = PositionEncoding(self._model_dim)(embeddings)
        # Embedings + Postion-encodings
        encodings = embeddings + position_encodings
        # Dropout
        encodings = K.dropout(encodings, self._dropout_rate)

        for i in range(self._encoder_stack):
            # Multi-head-Attention
            attention = MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
            attention_input = [encodings, encodings, encodings, masks]
            attention_out = attention(attention_input)
            # Add & Norm
            attention_out += encodings
            attention_out = LayerNormalization()(attention_out)
            # Feed-Forward
            ff = PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
            ff_out = ff(attention_out)
            # Add & Norm
            ff_out += attention_out
            encodings = LayerNormalization()(ff_out)

        return encodings, masks


    def decoder(self, inputs):
        decoder_inputs, encoder_encodings, encoder_masks = inputs
        if K.dtype(decoder_inputs) != 'int32':
            decoder_inputs = K.cast(decoder_inputs, 'int32')

        decoder_masks = K.equal(decoder_inputs, 0)
        # Embeddings
        embeddings = K.gather(self.embeddings, decoder_inputs)
        embeddings *= self._model_dim ** 0.5 # Scale
        # Position Encodings
        position_encodings = PositionEncoding(self._model_dim)(embeddings)
        # Embedings + Postion-encodings
        encodings = embeddings + position_encodings
        # Dropout
        encodings = K.dropout(encodings, self._dropout_rate)
        
        for i in range(self._decoder_stack):
            # Masked-Multi-head-Attention
            masked_attention = MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads, future=True)
            masked_attention_input = [encodings, encodings, encodings, decoder_masks]
            masked_attention_out = masked_attention(masked_attention_input)
            # Add & Norm
            masked_attention_out += encodings
            masked_attention_out = LayerNormalization()(masked_attention_out)

            # Multi-head-Attention
            attention = MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
            attention_input = [masked_attention_out, encoder_encodings, encoder_encodings, encoder_masks]
            attention_out = attention(attention_input)
            # Add & Norm
            attention_out += masked_attention_out
            attention_out = LayerNormalization()(attention_out)

            # Feed-Forward
            ff = PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
            ff_out = ff(attention_out)
            # Add & Norm
            ff_out += attention_out
            encodings = LayerNormalization()(ff_out)

        # Pre-Softmax 与 Embeddings 共享参数
        linear_projection = K.dot(encodings, K.transpose(self.embeddings))
        outputs = K.softmax(linear_projection)
        return outputs

    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs
        encoder_encodings, encoder_masks = self.encoder(encoder_inputs)
        encoder_outputs = self.decoder([decoder_inputs, encoder_encodings, encoder_masks])
        return encoder_outputs

    def compute_output_shape(self, input_shape):
        return  (input_shape[0][0], input_shape[0][1], self._vocab_size)
    


In [4]:
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input
# from tensorflow.keras.utils import plot_model

vocab_size = 5000
max_seq_len = 256 
model_dim = 512

vocab_size = 32
max_seq_len = 256 
model_dim = 5000

encoder_inputs = Input(shape=(max_seq_len,), name='encoder_inputs')
decoder_inputs = Input(shape=(max_seq_len,), name='decoder_inputs')
tran_outputs = Transformer(vocab_size, model_dim)([encoder_inputs, decoder_inputs])

outputs = Bidirectional(LSTM(16,activation='softsign',return_sequences=False))(tran_outputs)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)

model.summary()




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, 256)          0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, 256)          0                                            
__________________________________________________________________________________________________
transformer_1 (Transformer)     (None, 256, 5000)    2560000     encoder_inputs[0][0]             
                                                                 decoder_inputs[0][0]          

In [8]:
def buid_model():
    
    vocab_size = 5000
    max_seq_len = 200 
    model_dim = 512
    
    S_inputs = Input(shape=(200,),name='main_input',dtype='int32')
#     with h5py.File('../../get_w2v_feat/w2v1_pre_ad/embeddings_matrix_ad_size_32_w100_count_0.h5','r') as f:
#         embeddings_matrix = np.array(f.get('embeddings_matrix'))
    

#     EMBEDDING_DIM = 32 #词向量维度
#     MAX_SEQUENCE_LENGTH = 200
#     embeddings = Embedding(input_dim = len(embeddings_matrix), # 字典长度
#                                 output_dim = EMBEDDING_DIM, # 词向量 长度（100）
#                                 weights=[embeddings_matrix], # 重点：预训练的词向量系数
#                                 input_length=MAX_SEQUENCE_LENGTH, # 每句话的 最大长度（必须padding） 
#                                 trainable=False, # 是否在 训练的过程中 更新词向量
#                                 mask_zero = True)(S_inputs)
    
#     embeddings = np.random.randint(0,model_dim,(2000,200))
    outputs = Transformer(vocab_size, model_dim)([S_inputs, S_inputs])
#     outputs = Bidirectional(LSTM(32,activation='softsign',return_sequences=False))(outputs)
    outputs = GlobalMaxPool1D()(outputs)
    outputs = Dense(10, activation='softmax', name='main_output')(outputs)
    model = keras.models.Model(inputs=[S_inputs], outputs=outputs)
    
#     O_seq = Bidirectional(LSTM(32,activation='softsign',return_sequences=False))(embeddings)
#     O_seq = Attention()(O_seq)
#     O_seq = Bidirectional(LSTM(16,activation='softsign',return_sequences=False))(O_seq)
#     O_seq = GlobalAveragePooling1D()(O_seq)
#     O_seq = GlobalMaxPool1D()(O_seq)
#     outputs = Dense(10, activation='softmax', name='main_output')(O_seq)

    # 定义一个具有两个输入输出的模型
#     model = keras.models.Model(inputs=[S_inputs],#,auxiliary_input],
#                                outputs=[outputs])  # 这里的输入输出顺序与fit时一致就好
    
#     model.layers[1].trainable = False
    
#     opt = RMSprop(lr=0.01,  clipnorm=1.0)
    opt = Adam(lr=0.01)
    model.compile(optimizer=opt,
                  sample_weight_mode='None',#"temporal",
                  loss={'main_output': 'categorical_crossentropy'},
                 metrics=['accuracy'])
    print(model.summary())
    return model

#     print(model.summary())


In [9]:
def data_load():
    print('loading data ... \n')

    with h5py.File('../../get_w2v_feat/w2v1_pre_ad/word_train_ad_w2v.h5', 'r') as f:
        data = np.array(f.get('word_data'))

    label = pd.read_csv('../../train_preliminary/user.csv').sort_values(by=['user_id'])

    train_x, test_x, train_y, test_y = train_test_split(data, label, test_size=0.2, random_state=2020)

    train_y_age = train_y['age'].values - 1
    train_y_age = keras.utils.np_utils.to_categorical(train_y_age, num_classes=10)
    train_y_gender = train_y['gender'].values - 1

    test_y_age = test_y['age'].values - 1
    test_y_age = keras.utils.np_utils.to_categorical(test_y_age, num_classes=10)
    test_y_gender = test_y['gender'].values - 1

    print('get data ... \n')

    return train_x, test_x, train_y_age, train_y_gender,test_y_age,test_y_gender

In [10]:
model = buid_model()
print('lstm model geted...\n')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 200)          0                                            
__________________________________________________________________________________________________
transformer_3 (Transformer)     (None, 200, 5000)    2560000     main_input[0][0]                 
                                                                 main_input[0][0]                 
__________________________________________________________________________________________________
global_max_pooling1d_1 (GlobalM (None, 5000)         0           transformer_3[0][0]              
__________________________________________________________________________________________________
main_output (Dense)             (None, 10)           50010       global_max_pooling1d_1[0][0]     
Total para

In [13]:
train_x, test_x, train_y_age, train_y_gender,test_y_age,test_y_gender = data_load()

loading data ... 

get data ... 



In [None]:
def get_filename_for_saving(save_dir):
    return os.path.join(save_dir,
                        "self_lstm_attention_max_dense_ad_age_adm_{val_loss:.3f}-{val_acc:.3f}-{epoch:03d}-{loss:.3f}-{acc:.3f}.hdf5")

print('lstm model fit...\n')
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=get_filename_for_saving(''),
    save_best_only=False)
stopping = keras.callbacks.EarlyStopping(patience=8)
reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.1, patience=2, min_lr=0.0001)

# with h5py.File('../train_data_weight.h5', 'r') as f:
#         weight = np.array(f.get('weight'))
# train_w, test_w= train_test_split(weight, test_size=0.2, random_state=2020)

# train_w = np.squeeze(train_w)
# t_w = 10/np.log(train_w)

test_x = train_x = np.random.randint(0,16,(2000,200))
train_y_age = np.random.randint(0,10,(2000,1))
train_y_age = test_y_age = keras.utils.np_utils.to_categorical(train_y_age, num_classes=10)


lstm model fit...


Train on 2000 samples, validate on 2000 samples
Epoch 1/100


In [None]:
model.fit({'main_input': train_x },#,'aux_input': train_x_sta},
          {'main_output': train_y_age},
          epochs=100,
          batch_size=256,
          validation_data=({'main_input': test_x},#,'aux_input': test_x_sta},
                           {'main_output': test_y_age}),
          callbacks=[checkpointer, reduce_lr, stopping])

In [2]:
!pip install keras-transformer

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting keras-transformer
  Downloading http://mirrors.tencentyun.com/pypi/packages/8a/2b/c465241bd3f37a3699246827ff4ad7974c6edeaa69cf9cdcff2fd1d3ba46/keras-transformer-0.37.0.tar.gz (11 kB)
Collecting keras-pos-embd>=0.11.0
  Downloading http://mirrors.tencentyun.com/pypi/packages/09/70/b63ed8fc660da2bb6ae29b9895401c628da5740c048c190b5d7107cadd02/keras-pos-embd-0.11.0.tar.gz (5.9 kB)
Collecting keras-multi-head>=0.27.0
  Downloading http://mirrors.tencentyun.com/pypi/packages/e6/32/45adf2549450aca7867deccfa04af80a0ab1ca139af44b16bc669e0e09cd/keras-multi-head-0.27.0.tar.gz (14 kB)
Collecting keras-layer-normalization>=0.14.0
  Downloading http://mirrors.tencentyun.com/pypi/packages/a4/0e/d1078df0494bac9ce1a67954e5380b6e7569668f0f3b50a9531c62c1fc4a/keras-layer-normalization-0.14.0.tar.gz (4.3 kB)
Collecting keras-position-wise-feed-forward>=0.6.0
  Downloading http://mirrors.tencentyun.com/pypi/packages/e3/59/f0faa1037c033

In [4]:
import numpy as np
from keras_transformer import get_model

# Build a small toy token dictionary
tokens = 'all work and no play makes jack a dull boy'.split(' ')
token_dict = {
    '<PAD>': 0,
    '<START>': 1,
    '<END>': 2,
}
for token in tokens:
    if token not in token_dict:
        token_dict[token] = len(token_dict)

In [5]:
token_dict

{'<PAD>': 0,
 '<START>': 1,
 '<END>': 2,
 'all': 3,
 'work': 4,
 'and': 5,
 'no': 6,
 'play': 7,
 'makes': 8,
 'jack': 9,
 'a': 10,
 'dull': 11,
 'boy': 12}

In [6]:
# Generate toy data
encoder_inputs_no_padding = []
encoder_inputs, decoder_inputs, decoder_outputs = [], [], []
for i in range(1, len(tokens) - 1):
    encode_tokens, decode_tokens = tokens[:i], tokens[i:]
    encode_tokens = ['<START>'] + encode_tokens + ['<END>'] + ['<PAD>'] * (len(tokens) - len(encode_tokens))
    output_tokens = decode_tokens + ['<END>', '<PAD>'] + ['<PAD>'] * (len(tokens) - len(decode_tokens))
    decode_tokens = ['<START>'] + decode_tokens + ['<END>'] + ['<PAD>'] * (len(tokens) - len(decode_tokens))
    encode_tokens = list(map(lambda x: token_dict[x], encode_tokens))
    decode_tokens = list(map(lambda x: token_dict[x], decode_tokens))
    output_tokens = list(map(lambda x: [token_dict[x]], output_tokens))
    encoder_inputs_no_padding.append(encode_tokens[:i + 2])
    encoder_inputs.append(encode_tokens)
    decoder_inputs.append(decode_tokens)
    decoder_outputs.append(output_tokens)

In [7]:
encoder_inputs

[[1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 3, 4, 5, 2, 0, 0, 0, 0, 0, 0, 0],
 [1, 3, 4, 5, 6, 2, 0, 0, 0, 0, 0, 0],
 [1, 3, 4, 5, 6, 7, 2, 0, 0, 0, 0, 0],
 [1, 3, 4, 5, 6, 7, 8, 2, 0, 0, 0, 0],
 [1, 3, 4, 5, 6, 7, 8, 9, 2, 0, 0, 0],
 [1, 3, 4, 5, 6, 7, 8, 9, 10, 2, 0, 0]]

In [8]:
decoder_inputs

[[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 2, 0],
 [1, 5, 6, 7, 8, 9, 10, 11, 12, 2, 0, 0],
 [1, 6, 7, 8, 9, 10, 11, 12, 2, 0, 0, 0],
 [1, 7, 8, 9, 10, 11, 12, 2, 0, 0, 0, 0],
 [1, 8, 9, 10, 11, 12, 2, 0, 0, 0, 0, 0],
 [1, 9, 10, 11, 12, 2, 0, 0, 0, 0, 0, 0],
 [1, 10, 11, 12, 2, 0, 0, 0, 0, 0, 0, 0],
 [1, 11, 12, 2, 0, 0, 0, 0, 0, 0, 0, 0]]

In [21]:
S_inputs = Input(shape=(200,),name='main_input',dtype='int32')

# Build the model
model = get_model(
    token_num=len(token_dict),
    embed_dim=30,
    encoder_num=3,
    decoder_num=2,
    head_num=3,
    hidden_dim=120,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=0.05,
    embed_weights=np.random.random((13, 30)),
)
tran_out = model(x=[np.asarray(encoder_inputs * 1000), np.asarray(decoder_inputs * 1000)],
    y=np.asarray(decoder_outputs * 1000))
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)
model.summary()

# Train the model
model.fit(
    x=[np.asarray(encoder_inputs * 1000), np.asarray(decoder_inputs * 1000)],
    y=np.asarray(decoder_outputs * 1000),
    epochs=5,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Token-Embedding (EmbeddingRet)  [(None, None, 30), ( 390         Encoder-Input[0][0]              
                                                                 Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Embedding (TrigPosEmbed (None, None, 30)     0           Token-Embedding[0][0]            
__________

ValueError: Error when checking target: expected Decoder-Output to have 3 dimensions, but got array with shape (8000, 1)

In [24]:
!pip install transformers

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting transformers
  Downloading http://mirrors.tencentyun.com/pypi/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674 kB)
[K     |████████████████████████████████| 674 kB 799 kB/s eta 0:00:01
[?25hCollecting tokenizers==0.7.0
  Downloading http://mirrors.tencentyun.com/pypi/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 1.6 MB/s eta 0:00:01     |██████████████▉                 | 1.7 MB 1.6 MB/s eta 0:00:02     |█████████████████████████▉      | 3.0 MB 1.6 MB/s eta 0:00:01
Collecting regex!=2019.12.17
  Downloading http://mirrors.tencentyun.com/pypi/packages/1a/a1/6d8fdf4a20ffbbf2bd6003dff47a0628b9e6a4b840c421b0dec27da9376e/regex-2020.6.8-cp36-cp36m-manylinux2010_x86_64.whl (660 kB)
[K     |█████████████████████████

In [25]:
import tensorflow as tf
import tensorflow_datasets
from transformers import *

# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/mrpc')

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)

# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

ModuleNotFoundError: No module named 'tensorflow_datasets'

In [None]:


# Train and evaluate using tf.keras.Model.fit()
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)

# Load the TensorFlow model in PyTorch for inspection
model.save_pretrained('./save/')
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)

# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His findings were compatible with this research."
sentence_2 = "His findings were not compatible with this research."
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')

pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()

print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")