In [None]:
import numpy as np

import tensorflow as tf

!pip install -q tensorflow-hub
!pip install -q tensorflow-datasets
import tensorflow_hub as hub
import tensorflow_datasets as tfds

# Report the runtime environment so results can be tied to library versions.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
# An empty device list is falsy, which selects the "NOT AVAILABLE" message.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

In [None]:
# Split the training set into 60% and 40%, so we'll end up with 15,000 examples
# for training, 10,000 examples for validation and 25,000 examples for testing.
# as_supervised=True makes each dataset yield (review_text, label) pairs, which
# is the structure the later cells unpack and feed to fit()/evaluate().
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews", 
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True)

In [None]:
def preprocess(X_batch, n_words=500, max_chars=None):
    """Lower-case, clean and tokenize a batch of reviews into a fixed-width word tensor.

    Args:
        X_batch: 1-D string tensor holding one raw review per element.
        n_words: number of word slots per review in the output.
        max_chars: optional cap on the number of bytes of each raw review to
            process. Leave as None (default) to tokenize the whole review;
            set it only as a speed-up when very long reviews are expected.

    Returns:
        A string tensor of shape [batch_size, n_words]. Reviews with fewer
        than n_words words are right-padded with b"<pad>"; longer reviews
        are truncated to their first n_words words.
    """
    # Target shape [batch_size, n_words]: keep the dynamic batch dimension
    # and force the second dimension to n_words.
    shape = tf.shape(X_batch) * tf.constant([1, 0]) + tf.constant([0, n_words])
    Z = X_batch
    # The raw string used to be cut to n_words *characters* here, which left
    # far fewer than n_words words and filled most slots with padding.
    # Character truncation is now opt-in and independent of the word count;
    # word-level truncation is done by to_tensor(shape=...) below.
    if max_chars is not None:
        Z = tf.strings.substr(Z, 0, max_chars)
    Z = tf.strings.lower(Z)
    # Replace HTML line breaks, then every non-letter, with a space.
    Z = tf.strings.regex_replace(Z, b"<br\\s*/?>", b" ")
    Z = tf.strings.regex_replace(Z, b"[^a-z]", b" ")
    # Whitespace split yields a ragged tensor (one variable-length row per review).
    Z = tf.strings.split(Z)
    return Z.to_tensor(shape=shape, default_value=b"<pad>")

# Two short sample reviews used to try out the helpers on a tiny input.
X_example = tf.constant(["It's a great, great movie! I loved it.", "It was terrible, run away!!!"])

In [None]:
from collections import Counter

def get_vocabulary(data_sample, max_size=1000):
    """Build a frequency-ranked vocabulary from a sample of raw reviews.

    Args:
        data_sample: batch of raw review strings accepted by `preprocess`.
        max_size: maximum number of real words to keep (padding excluded).

    Returns:
        A list of byte strings: b"<pad>" at index 0, followed by up to
        `max_size` words ordered from most to least frequent.
    """
    token_rows = preprocess(data_sample).numpy()
    # Count every token except the padding marker, row by row.
    frequencies = Counter(
        token
        for row in token_rows
        for token in row
        if token != b"<pad>"
    )
    ranked_tokens = [token for token, _ in frequencies.most_common(max_size)]
    return [b"<pad>"] + ranked_tokens

# Sanity check on the two sample reviews: b"<pad>" comes first, then the
# words ordered by how often they occur.
get_vocabulary(X_example)

In [None]:
class TextVectorization(tf.keras.layers.Layer):
    """Keras layer that maps raw review strings to fixed-length integer word ids.

    The layer must be fitted with `adapt()` before use: that builds the
    vocabulary and the lookup table. Words outside the vocabulary are hashed
    into `n_oov_buckets` extra ids placed after the vocabulary ids.

    Note: `get_config()` stores only the two constructor arguments; the
    vocabulary and lookup table are not part of the config, so a layer
    rebuilt from its config has to be adapted again.
    """

    def __init__(self, max_vocabulary_size=1000, n_oov_buckets=100, dtype=tf.string, **kwargs):
        """Store the vocabulary limits; the vocabulary itself is built by `adapt()`.

        Args:
            max_vocabulary_size: maximum number of words kept (padding excluded).
            n_oov_buckets: number of hash buckets for out-of-vocabulary words.
            dtype: layer dtype, string by default since the inputs are raw text.
            **kwargs: forwarded to `tf.keras.layers.Layer`.
        """
        super().__init__(dtype=dtype, **kwargs)
        self.max_vocabulary_size = max_vocabulary_size
        self.n_oov_buckets = n_oov_buckets
        # Filled in by adapt(); None until then so call() can fail with a clear message.
        self.vocab = None
        self.table = None

    def adapt(self, data_sample):
        """Build the vocabulary and the word -> id lookup table from `data_sample`."""
        self.vocab = get_vocabulary(data_sample, self.max_vocabulary_size)
        words = tf.constant(self.vocab)
        # Ids follow the vocabulary order, so b"<pad>" (index 0) maps to id 0.
        word_ids = tf.range(len(self.vocab), dtype=tf.int64)
        vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
        self.table = tf.lookup.StaticVocabularyTable(vocab_init, self.n_oov_buckets)

    def call(self, inputs, input_length):
        """Convert a batch of raw strings into a [batch, input_length] tensor of word ids.

        Args:
            inputs: 1-D string tensor of raw reviews.
            input_length: number of word slots per review.

        Raises:
            RuntimeError: if `adapt()` has not been called yet.
        """
        if self.table is None:
            raise RuntimeError(
                "TextVectorization.adapt() must be called before the layer is used.")
        preprocessed_inputs = preprocess(inputs, n_words=input_length)
        return self.table.lookup(preprocessed_inputs)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({
            'max_vocabulary_size': self.max_vocabulary_size,
            'n_oov_buckets': self.n_oov_buckets
        })
        return config

In [None]:
# Cap on the number of vocabulary words and number of hash buckets reserved
# for out-of-vocabulary words.
max_vocabulary_size = 100000
n_oov_buckets = 10
text_vectorization = TextVectorization(max_vocabulary_size, n_oov_buckets,
                                       input_shape=[])

# Take the first batch of up to 15,000 examples (the size of the training
# split) and build the vocabulary from its review texts.
train_examples_batch, train_labels_batch = next(iter(train_data.batch(15000)))
text_vectorization.adapt(train_examples_batch)

In [None]:
# Preview the head of the vocabulary rather than dumping every entry: the
# list can hold up to max_vocabulary_size + 1 items, which would flood the
# cell output and bloat the saved notebook.
text_vectorization.vocab[:20]

In [None]:
# Hyperparameters of the text CNN.
input_length = 600       # word slots per review fed to the network
embed_size = 128         # embedding dimension
filter_sizes = '1,2,3'   # comma-separated convolution window sizes (in words)
num_filters = 1500       # feature maps per window size
# Embedding rows: vocabulary entries (including <pad>) plus the OOV buckets.
vocab_size = len(text_vectorization.vocab) + n_oov_buckets

# Named `text_input` so Python's built-in `input` is not shadowed for the
# rest of the notebook.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
vectorized = text_vectorization(text_input, input_length)

embed_initer = tf.keras.initializers.RandomUniform(minval=-1, maxval=1)
embed = tf.keras.layers.Embedding(vocab_size, embed_size,
                               embeddings_initializer=embed_initer,
                               input_length=input_length,
                               name='embedding')(vectorized)
# Add a trailing channel axis so Conv2D sees one single-channel "image" of
# shape (input_length, embed_size, 1) per review.
embed = tf.keras.layers.Reshape((input_length, embed_size, 1), name='add_channel')(embed)
pool_outputs = []
for filter_size in map(int, filter_sizes.split(',')):
    # Each kernel spans `filter_size` consecutive words and the full embedding width.
    filter_shape = (filter_size, embed_size)
    conv = tf.keras.layers.Conv2D(num_filters, filter_shape, strides=(1, 1), padding='valid',
                               data_format='channels_last', activation='relu',
                               kernel_initializer='glorot_normal',
                               bias_initializer=tf.keras.initializers.constant(0.1),
                               name='convolution_{:d}'.format(filter_size))(embed)
    # With 'valid' padding the conv output has input_length - filter_size + 1
    # rows; pooling over all of them is max-over-time pooling.
    max_pool_shape = (input_length - filter_size + 1, 1)
    pool = tf.keras.layers.MaxPool2D(pool_size=max_pool_shape,
                                  strides=(1, 1), padding='valid',
                                  data_format='channels_last',
                                  name='max_pooling_{:d}'.format(filter_size))(conv)
    pool_outputs.append(pool)
# Join the pooled features of all window sizes along the channel axis.
pool_outputs = tf.keras.layers.concatenate(pool_outputs, axis=-1, name='concatenate')
pool_outputs = tf.keras.layers.Flatten(data_format='channels_last', name='flatten')(pool_outputs)
pool_outputs = tf.keras.layers.Dropout(0.4, name='dropout1')(pool_outputs)
# NOTE(review): dense1 has no activation, so dense1 -> dense2 is a stack of
# two linear layers; confirm whether activation='relu' was intended here.
dense = tf.keras.layers.Dense(256, name='dense1')(pool_outputs)
dense = tf.keras.layers.Dropout(0.4, name='dropout2')(dense)
# Single output unit without activation: the model emits raw logits.
outputs = tf.keras.layers.Dense(1, name='dense2')(dense)
model = tf.keras.models.Model(inputs=[text_input], outputs=[outputs])
model.summary()

In [None]:
# Adam with an explicit learning rate. The loss uses from_logits=True because
# the model's last Dense layer has no activation and outputs raw logits.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt,
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
# Checkpoint callback: stores weights only, and only when the monitored
# quantity improves (Keras' documented default monitor is val_loss).
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("text_cnn_no_pretraining", save_weights_only=True, save_best_only=True)

In [None]:
# Train for 30 epochs on shuffled batches of 128, validating on the held-out
# 40% of the training set and checkpointing through checkpoint_cb.
# NOTE(review): the shuffle buffer (10000) is smaller than the 15,000-example
# training split, so the shuffle is only approximate; confirm this is intended.
history = model.fit(train_data.shuffle(10000).batch(128),
                    epochs=30,
                    validation_data=validation_data.batch(128),
                    callbacks=[checkpoint_cb],
                    verbose=1)

In [None]:
# Restore the weights written by the checkpoint callback during training
# (the callback was created with save_best_only=True).
model.load_weights("text_cnn_no_pretraining")

In [None]:
# Score the restored model on the test split.
results = model.evaluate(test_data.batch(32), verbose=2)

# Pair each metric name with its value and print it to three decimals.
for metric_name, metric_value in zip(model.metrics_names, results):
    print("%s: %.3f" % (metric_name, metric_value))

In [None]:
# Export the whole model, text-vectorization layer included, as a SavedModel.
# NOTE(review): this directory name is the same string as the checkpoint
# prefix passed to ModelCheckpoint / load_weights; presumably the two coexist
# because checkpoint files carry suffixes, but a distinct name would be clearer.
tf.saved_model.save(model, "text_cnn_no_pretraining")

In [None]:
# Reload the exported model and score a few hand-written reviews.
saved_model = tf.saved_model.load("text_cnn_no_pretraining")
y_pred = saved_model(tf.constant([
    "this is a terrible movie.",
    "this is a good movie.",
    "very interesting movie",
    "i wouldn't watch this movie.",
    "i recommend this movie.",
]))
# The network ends in a Dense(1) layer without activation, so these values
# are raw logits rather than probabilities.
y_pred