In [None]:
import os
import re
import shutil
import string

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras import layers
from tensorflow.keras import losses

## TensorFlow for text classification

Load data

In [None]:
# Fetch the IMDB reviews dataset as (text, label) pairs, together with its
# metadata object.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Display the structure (shape and dtype) of a single dataset element.
train_dataset.element_spec

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteWNNVUS/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteWNNVUS/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteWNNVUS/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In TensorFlow, the "buffer_size" parameter is often used in the context of data loading and shuffling. This parameter is commonly associated with methods like shuffle in TensorFlow datasets or the shuffle method of a tf.data.Dataset. The buffer size determines the number of elements from the dataset that should be loaded into a buffer before shuffling.

Here's how it typically works:

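1. The dataset reads elements from the data source in their original order.
2. The buffer is first filled with the first `buffer_size` elements of the dataset (they are not chosen at random).
3. When an element is requested from the dataset, it is sampled uniformly at random from this buffer.
4. The sampled element is immediately replaced by the next element from the dataset, so the buffer stays full until the source is exhausted. A perfectly uniform shuffle therefore requires `buffer_size` to be at least the size of the dataset.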
The purpose of using a buffer is to introduce randomness into the order of elements during training, especially when the dataset doesn't fit entirely in memory. Shuffling is essential for preventing the model from learning patterns based on the order of data samples.

In [None]:
# Peek at one raw (review, label) pair before any batching is applied.
for example, label in train_dataset.take(1):
  review_bytes = example.numpy()
  review_label = label.numpy()
  print('text: ', review_bytes)
  print('label: ', review_label)

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [None]:
# Number of elements held in the shuffle buffer of the training pipeline.
BUFFER_SIZE = 10_000
# Number of reviews grouped into each batch.
BATCH_SIZE = 64

In [None]:
# Build the batched pipelines from the untouched splits in `dataset` rather
# than from `train_dataset` / `test_dataset` themselves. Rebinding a name from
# its own previous value makes the cell non-idempotent: running it a second
# time would batch the already-batched data again.
# Only the training split is shuffled; both splits are batched and prefetched.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Show the first three reviews and labels of one batch.
for example, label in train_dataset.take(1):
  first_texts = example.numpy()[:3]
  first_labels = label.numpy()[:3]
  print('texts: ', first_texts)
  print()
  print('labels: ', first_labels)

texts:  [b'"The Woman in Black" is easily one of the creepiest British ghost stories ever made.A young solicitor,after arriving in a small town to handle a dead client\'s estate,is haunted by a mysterious woman dressed all in black.The film is loaded with extremely eerie atmosphere and the frights are calculated for and deliver the maximum effect possible.The action keeps the viewer deeply involved and the finale is quite disturbing.The acting is excellent and the tension is almost unbearable at times.So if you want to see a truly creepy horror film give this one a look.I dare anyone to watch "The Woman in Black" alone at night with the lights off.Highly recommended.10 out of 10.'
 b'Well what I can say about this movie is that it\'s great to see so many Asian faces. What I didn\'t like about the film was that it was full of stereotypes of what typical racial characters would do in their role. The Asian girl without confidence who has to play someone else to get ahead, the white guy in

TextVectorization (encoder) layer



standardize data

In [None]:
def custom_standardization(input_data):
  """Normalize raw review text before tokenization.

  Applies three steps in order: lowercase the text, replace every literal
  '<br />' tag with a single space, and delete every character found in
  `string.punctuation`.

  Args:
    input_data: String tensor of raw text.

  Returns:
    String tensor with the cleaned text.
  """
  # Character class matching any single ASCII punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  text = tf.strings.regex_replace(text, punctuation_pattern, '')
  return text

In [None]:
# Maximum vocabulary size and the fixed number of token ids per review.
VOCAB_SIZE = 1000
sequence_length = 250

# Text -> integer-id encoder using the custom standardization function.
encoder = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Learn the vocabulary from the training text only (labels are dropped).
train_text = train_dataset.map(lambda text, label: text)
encoder.adapt(train_text)

In [None]:
def vectorize_text(text, label):
  """Encode `text` with the fitted `encoder`, passing `label` through.

  Args:
    text: String tensor to encode; a trailing axis is appended before the
      encoder is applied.
    label: Value returned unchanged alongside the encoded text.

  Returns:
    A `(token_ids, label)` tuple.
  """
  expanded_text = tf.expand_dims(text, axis=-1)
  token_ids = encoder(expanded_text)
  return token_ids, label

Here is a demo of how to use TensorFlow's `TextVectorization` layer to vectorize the input sentences. Each token (by default, a whitespace-separated word) is represented as an integer. Given a TensorFlow string (byte string), the `TextVectorization` layer returns a tensor containing an array of integers with a specified length (`sequence_length`). The size of the learned vocabulary is defined by `max_tokens` (`VOCAB_SIZE` here); only the most frequent words are kept. Depending on the original length of the input sentence, padding or truncation is applied.

In [None]:
# Retrieve one batch (up to BATCH_SIZE reviews and labels) from the dataset.
text_batch, label_batch = next(iter(train_dataset))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", first_label)

# Vectorize once and reuse the result for both prints below.
vectorized_review = vectorize_text(first_review, first_label)
print("Vectorized review", vectorized_review)
# vectorized_review[0] holds the encoded text with a leading axis of size 1,
# so [0][0] is the row of token ids; its length is the number of token ids,
# not an embedding dimension.
print("Length of word embedding vector", len(vectorized_review[0][0]))

Review tf.Tensor(b'This barely watchable film was a bit of an ordeal to sit through. None of the segments are good, but at least the first one was mildly amusing, and the middle one was somewhat imaginative. The final one was just plain brutal, and after sitting through two weak comedic shorts, the third one was truly painful to watch. Even by the low standards of a National Lampoon movie, this one seemed especially boring and joyless.', shape=(), dtype=string)
Label tf.Tensor(0, shape=(), dtype=int64)
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[ 11,   1,   1,  19,  13,   4, 221,   5,  33,   1,   6, 856, 140,
        590,   5,   2,   1,  23,  49,  18,  30, 214,   2,  85,  28,  13,
          1,   1,   3,   2, 755,  28,  13, 623,   1,   2, 464,  28,  13,
         40,   1,   1,   3, 100,   1, 140, 104, 807,   1,   1,   2, 845,
         28,  13, 356,   1,   6, 103,  53,  32,   2, 485,   1,   5,   4,
          1,   1,  17,  11,  28, 449, 260, 351,   3,   1,   

In [None]:
# Fetch the vocabulary once instead of rebuilding the list for every lookup.
vocabulary = encoder.get_vocabulary()
print("987 ---> ",vocabulary[987])
print(" 313 ---> ",vocabulary[313])
print('Vocabulary size: {}'.format(len(vocabulary)))

987 --->  secret
 313 --->  idea
Vocabulary size: 1000


Model building

In [None]:
# Architecture: raw strings -> token ids -> embeddings -> BiLSTM -> dense head.
bidirectionallstm = tf.keras.Sequential()
bidirectionallstm.add(encoder)
bidirectionallstm.add(
    layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        # Each token id is mapped to a 256-dimensional vector.
        output_dim=256,
        # Mask the padding id 0 so variable-length reviews are handled.
        mask_zero=True,
    )
)
bidirectionallstm.add(layers.Bidirectional(layers.LSTM(256)))
bidirectionallstm.add(layers.Dense(256, activation='relu'))
bidirectionallstm.add(layers.Dense(64, activation='relu'))
# Single output unit with no activation: the model emits a raw logit.
bidirectionallstm.add(layers.Dense(1))

In [None]:
# The final layer has no activation, so the loss is told to expect logits.
bidirectionallstm.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train for three epochs, evaluating on the test split after each epoch.
bilstmhistory = bidirectionallstm.fit(
    x=train_dataset,
    validation_data=test_dataset,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Raw model outputs (logits, since the last Dense layer has no activation)
# for every review in the test split.
bidirectionallstm.predict(x=test_dataset)



array([[-1.6519271],
       [ 1.8352096],
       [-2.9763334],
       ...,
       [-3.8325205],
       [ 2.2626734],
       [ 1.6434814]], dtype=float32)

In [None]:
# Score a single hand-written review. The model's first layer is the text
# encoder, so the raw string is passed in directly. `np` comes from the
# notebook's top import cell.
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = bidirectionallstm.predict(np.array([sample_text]))



In [None]:
# Raw logit for sample_text: the model ends in Dense(1) with no activation and
# is trained with from_logits=True, so this is not a probability (apply
# tf.sigmoid to convert). Presumably label 1 means a positive review — confirm
# against the dataset's label names.
predictions

array([[1.0776657]], dtype=float32)