In [None]:
# !pip install tensorflow==2.0
# !pip install tensorflow_hub
# !pip install bert-for-tf2
# !pip install sentencepiece

In [1]:
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, LSTM, Embedding, Flatten, Dense, LeakyReLU, Input
from tensorflow.keras.layers import BatchNormalization, Activation, ZeroPadding2D
from tensorflow.keras.layers import UpSampling2D, Conv2D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
import pickle, sys
import numpy as np
import tensorflow_hub as hub
import bert
FullTokenizer = bert.bert_tokenization.FullTokenizer
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Flatten, Dense, LeakyReLU, Input
from tensorflow.python.keras import backend 
import pickle

2.0.0


In [2]:
# Helper functions for BERT model
def get_masks(tokens, max_seq_length):
    """Build the padding mask for a token sequence.

    Returns a list of length ``max_seq_length`` holding 1 for every real
    token followed by 0 for every padding position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_tokens
    mask.extend([0] * n_padding)
    return mask

def get_segments(tokens, max_seq_length):
    """Build segment ids for a token sequence.

    Tokens up to and including the first "[SEP]" get id 0; every token
    after it gets id 1. The result is right-padded with 0 up to
    ``max_seq_length``.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_padding = max_seq_length - len(tokens)
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    segment_ids = []
    past_first_sep = False
    for tok in tokens:
        # The separator itself still belongs to the segment it closes.
        segment_ids.append(1 if past_first_sep else 0)
        if tok == "[SEP]":
            past_first_sep = True
    segment_ids.extend([0] * n_padding)
    return segment_ids

def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids, right-padded with 0.

    Args:
        tokens: list of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)`` that
            returns a list of ids.
        max_seq_length: length of the returned list.

    Returns:
        A list of ``max_seq_length`` ids: the converted tokens followed by
        0 for every padding position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``. This
            mirrors ``get_masks``/``get_segments``; without the check the
            padding count goes negative and an over-length list would be
            returned silently.
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids + [0] * (max_seq_length - len(token_ids))

In [3]:
# Initializations: shared constants, the pretrained BERT layer and its tokenizer.
# Fixed sequence length: used as the Input shape of the model and as the
# padding target passed to get_ids / get_masks / get_segments.
max_seq_length = 128
# NOTE(review): not referenced in the visible cells; presumably the hidden
# size of the "H-768" model named in bert_url -- confirm before relying on it.
embed_length = 768
bert_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
# NOTE(review): the visible cells only call model.predict, so trainable=True
# looks unnecessary here -- confirm no later cell fine-tunes the layer.
bert_layer = hub.KerasLayer(bert_url, trainable=True)
# Read the vocabulary path and casing flag from the loaded module itself so
# the tokenizer is configured from the same artefact as the layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [4]:
# Build the BERT embedder: a Keras model mapping (ids, mask, segments) to one
# vector per input sequence.
# Three int32 inputs, each of length max_seq_length, in the order the BERT
# layer is called with below.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
# pooled_output is not used by this notebook's visible cells; only
# sequence_output feeds the model output.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
# Keep only position 0 along the sequence axis -- the slot where the callers
# below place the "[CLS]" token.
clf_output = sequence_output[:, 0, :]
model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=clf_output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [5]:
# Smoke-test the embedder on a simple sentence and display the output shape.
s = "This is a nice sentence."
stokens = tokenizer.tokenize(s)
# Wrap the word pieces in the [CLS] / [SEP] markers before building the inputs.
stokens = ["[CLS]"] + stokens + ["[SEP]"]
input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)
# Each input is wrapped in a list to form a batch of one sequence.
model.predict([[input_ids],[input_masks],[input_segments]]).shape

(1, 768)

In [6]:
def embed_row(text):
    """Embed a single text with the BERT embedder.

    The text is tokenized, truncated so that it fits into ``max_seq_length``
    together with the "[CLS]" and "[SEP]" markers, converted to the three
    model inputs and passed through ``model`` as a batch of one.

    Args:
        text: the string to embed.

    Returns:
        The result of ``model.predict`` for the single-sequence batch.
    """
    # Reserve two positions for [CLS] and [SEP]. Truncating here keeps one
    # over-long text from raising IndexError in the padding helpers and
    # aborting a long-running embedding loop.
    max_word_pieces = max(max_seq_length - 2, 0)
    stokens = tokenizer.tokenize(text)[:max_word_pieces]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)
    # Each input is wrapped in a list to form a batch of one sequence.
    return model.predict([[input_ids],[input_masks],[input_segments]])

In [7]:
# Embed the whole data set, reading the CSV in chunks so that only one chunk
# of raw text is held in memory at a time.
chunksize = 50
# Set to a small integer to embed only the first few chunks while debugging;
# None processes the whole file.
max_chunks = None
# Collect per-chunk arrays and stack them once at the end: stacking onto the
# growing result inside the loop would re-copy every earlier row per chunk.
embedded_chunks = []
n_embedded = 0
for i, chunk in enumerate(pd.read_csv("train.csv", usecols=['text'], chunksize=chunksize)):
    if max_chunks is not None and i >= max_chunks:
        break
    # One embed_row result per text, stacked row-wise, then given a trailing
    # axis of size 1.
    X_temp = np.vstack(chunk.text.apply(embed_row).values)
    X_temp = np.expand_dims(X_temp, axis=2)
    embedded_chunks.append(X_temp)
    n_embedded += X_temp.shape[0]
    # Progress: the shape the accumulated result has so far.
    print((n_embedded,) + X_temp.shape[1:])
# X stays None when no chunk was read, as before.
X = np.vstack(embedded_chunks) if embedded_chunks else None

(50, 768, 1)
(100, 768, 1)
(150, 768, 1)
(200, 768, 1)
(250, 768, 1)
(300, 768, 1)
(350, 768, 1)
(400, 768, 1)
(450, 768, 1)
(500, 768, 1)
(550, 768, 1)
(600, 768, 1)
(650, 768, 1)
(700, 768, 1)
(750, 768, 1)
(800, 768, 1)
(850, 768, 1)
(900, 768, 1)
(950, 768, 1)
(1000, 768, 1)
(1050, 768, 1)
(1100, 768, 1)
(1150, 768, 1)
(1200, 768, 1)
(1250, 768, 1)
(1300, 768, 1)
(1350, 768, 1)
(1400, 768, 1)
(1450, 768, 1)
(1500, 768, 1)
(1550, 768, 1)
(1600, 768, 1)
(1650, 768, 1)
(1700, 768, 1)
(1750, 768, 1)
(1800, 768, 1)
(1850, 768, 1)
(1900, 768, 1)
(1950, 768, 1)
(2000, 768, 1)
(2050, 768, 1)
(2100, 768, 1)
(2150, 768, 1)
(2200, 768, 1)
(2250, 768, 1)
(2300, 768, 1)
(2350, 768, 1)
(2400, 768, 1)
(2450, 768, 1)
(2500, 768, 1)
(2550, 768, 1)
(2600, 768, 1)
(2650, 768, 1)
(2700, 768, 1)
(2750, 768, 1)
(2800, 768, 1)
(2850, 768, 1)
(2900, 768, 1)
(2950, 768, 1)
(3000, 768, 1)
(3050, 768, 1)
(3100, 768, 1)
(3150, 768, 1)
(3200, 768, 1)
(3250, 768, 1)
(3300, 768, 1)
(3350, 768, 1)
(3400, 768, 1)


In [8]:
# Persist the embedded array X to 'X.p' in the working directory using the
# highest pickle protocol available to this interpreter.
with open('X.p', 'wb') as handle:
    pickle.dump(X, handle, protocol=pickle.HIGHEST_PROTOCOL)