In [1]:
import tensorflow as tf

In [2]:
import tensorflow_hub as hub

In [15]:
import tokenization as tokenizer #from https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1

In [70]:
max_seq_length = 128  # Your choice here.

input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")

In [7]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1",
                            trainable=True)

In [93]:
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [65]:
import gc
gc.collect()

239

In [13]:
import numpy as np

In [17]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenizer.FullTokenizer(vocab_file, do_lower_case)

In [18]:
texts = ['I love you', 'I want to go home']

In [19]:
text = tokenizer.tokenize(texts[1])
print(text)

['I', 'want', 'to', 'go', 'home']


In [20]:
input_sequence = ['[CLS]'] + text + ['[SEP]']
print(input_sequence)

['[CLS]', 'I', 'want', 'to', 'go', 'home', '[SEP]']


In [21]:
input_sequence[:100-3]

['[CLS]', 'I', 'want', 'to', 'go', 'home', '[SEP]']

In [22]:
tokens = tokenizer.convert_tokens_to_ids(input_sequence)
print(tokens)

[101, 146, 1328, 1106, 1301, 1313, 102]


In [23]:
max_seq_length = 128

In [24]:
pad_len = max_seq_length - len(input_sequence)

In [25]:
tokens += [0] * pad_len

In [26]:
pad_masks = [1] * len(input_sequence) + [0] * pad_len

In [36]:
segment_ids = list(range(0, max_seq_length))

In [37]:
print(tokens, '\n', pad_masks, '\n', segment_ids)

[101, 146, 1328, 1106, 1301, 1313, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
 [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 5

In [94]:
out = sequence_output[:, 0]

In [95]:
out

<tf.Tensor 'strided_slice_12:0' shape=(None, 768) dtype=float32>

In [96]:
out = tf.keras.layers.Dense(1, activation='sigmoid')(out)

In [97]:
model = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

In [98]:
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 108310273   input_word_ids[0][0]             
                                                                 input_mask[0][0]          

In [102]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.0001),  # Optimizer
              # Loss function to minimize
              loss=tf.keras.losses.binary_crossentropy,
              # List of metrics to monitor
              metrics=['accuracy'])

In [None]:
    def build_model(self):
        
        input_word_ids = Input(shape=(self.max_seq_length,), dtype=tf.int32, name='input_word_ids')
        input_mask = Input(shape=(self.max_seq_length,), dtype=tf.int32, name='input_mask')
        segment_ids = Input(shape=(self.max_seq_length,), dtype=tf.int32, name='segment_ids')    
        
        pooled_output, sequence_output = self.bert_layer([input_word_ids, input_mask, segment_ids])   
        clf_output = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(clf_output)
        
        model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
        optimizer = SGD(learning_rate=self.lr, momentum=0.8)
        model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        
        return model


In [None]:
    def encode(self, texts):
                
        all_tokens = []
        all_masks = []
        all_segments = []

        for text in texts:
            text = self.tokenizer.tokenize(text)
            text = text[:self.max_seq_length - 2]
            input_sequence = ['[CLS]'] + text + ['[SEP]']
            pad_len = self.max_seq_length - len(input_sequence)

            tokens = self.tokenizer.convert_tokens_to_ids(input_sequence)
            tokens += [0] * pad_len
            pad_masks = [1] * len(input_sequence) + [0] * pad_len
            segment_ids = [0] * self.max_seq_length

            all_tokens.append(tokens)
            all_masks.append(pad_masks)
            all_segments.append(segment_ids)

        return np.array(all_tokens), np.array(all_masks), np.array(all_segments)
