In [1]:
import numpy as numpy
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

In [2]:
# Load the IMDb reviews that Keras ships already encoded as word IDs.
train_data, test_data = keras.datasets.imdb.load_data()
X_train, y_train = train_data
X_test, y_test = test_data

In [3]:
# Peek at the first ten word IDs of the first training review.
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

#### DATA is PREPROCESSED 

Where are the movie reviews? Well, as you can see, the dataset is already preprocessed for you: X_train consists of a list of reviews, each of which is represented as a NumPy array of integers, where each integer represents a word. All punctuation was removed, and then words were converted to lowercase, split by spaces, and finally indexed by frequency (so low integers correspond to frequent words). The integers 0, 1, and 2 are special: they represent the padding token, the start-of-sequence (SOS) token, and unknown words, respectively. If you want to visualize a review, you can decode it like this:

In [4]:
# Build the reverse mapping from integer IDs to words. Word indices are
# shifted by 3 because IDs 0, 1 and 2 are taken by the special tokens.
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {index + 3: word for word, index in word_index.items()}
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

# Decode the first ten tokens of the first training review.
" ".join(id_to_word[token_id] for token_id in X_train[0][:10])

'<sos> this film was just brilliant casting location scenery story'

#### In a real project, you will have to preprocess the text yourself. You could do that with the Keras `Tokenizer` class; here we use TensorFlow string operations instead.

In [5]:
import tensorflow_datasets as tfds

# Load the raw-text IMDb reviews as (text, label) pairs, together with the
# dataset metadata.
datasets, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)
train_split = info.splits["train"]
train_size = train_split.num_examples

### PREPROCESS THE RAW REVIEWS

In [6]:
# Display the dataset metadata (features, splits, citation).
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=0.1.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [7]:
def preprocess(X_batch, y_batch):
    """Clean a batch of raw reviews and split them into padded word tokens.

    Args:
        X_batch: string tensor holding one raw review per element.
        y_batch: labels for the batch; passed through untouched.

    Returns:
        A ``(tokens, y_batch)`` tuple where ``tokens`` is a dense string
        tensor with one row per review, right-padded with ``b"<pad>"``.
    """
    # Keep only the start of each review (300 units; `tf.strings.substr`
    # counts in bytes by default).
    truncated = tf.strings.substr(X_batch, 0, 300)
    # Replace HTML line breaks with a space.
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
    # Replace everything except letters and apostrophes with a space.
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    # Splitting on whitespace gives a ragged tensor (reviews differ in length).
    tokens = tf.strings.split(letters_only)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch
    

It starts by truncating the reviews, keeping only the first 300 characters of each: this will speed up training, and it won’t impact performance too much because you can generally tell whether a review is positive or not in the first sentence or two. Then it uses regular expressions to replace <br /> tags with spaces, and to replace any characters other than letters and quotes with spaces. For example, the text "Well, I can't<br />" will become "Well I can't". Finally, the preprocess() function splits the reviews by the spaces, which returns a ragged tensor, and it converts this ragged tensor to a dense tensor, padding all reviews with the padding token "<pad>" so that they all have the same length.



In [8]:
from collections import Counter
# Count how often each token (including the "<pad>" filler) appears in the
# preprocessed training reviews.
vocabulary = Counter()
preprocessed_batches = datasets["train"].batch(32).map(preprocess)
for X_batch, y_batch in preprocessed_batches:
    for review in X_batch:
        vocabulary.update(review.numpy().tolist())

In [9]:
# Show the three most frequent tokens with their counts.
vocabulary.most_common(3)

[(b'<pad>', 214869), (b'the', 61137), (b'a', 38564)]

##### Let's keep only the 10,000 most common words

In [10]:
vocab_size = 10000
# Keep only the `vocab_size` most frequent tokens, most frequent first.
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

Now we need to add a preprocessing step to replace each word with its ID (i.e., its index in the vocabulary). Just like we did in Chapter 13, we will create a lookup table for this, using 1,000 out-of-vocabulary (oov) buckets:

In [11]:
# Build a lookup table that maps each vocabulary word to its index in
# `truncated_vocabulary`; any other word falls into one of the
# out-of-vocabulary buckets, whose IDs follow the vocabulary IDs.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

Any word that is not among the 10,000 most common words is mapped to one of the 1,000 out-of-vocabulary (OOV) buckets, so it gets an ID of 10,000 or above:

In [12]:
# Sanity check: look up the IDs for a short sample sentence.
sample_words = b"This movie was faaaaaantastic".split()
table.lookup(tf.constant([sample_words]))

<tf.Tensor: id=101863, shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

#### Create the final training set

encode these words using a simple encode_words() function that uses the table we just built, and finally prefetch the next batch:

In [13]:
def encode_words(X_batch, y_batch):
    """Replace every word token in the batch with its ID from `table`.

    Args:
        X_batch: string tensor of word tokens.
        y_batch: labels for the batch; passed through untouched.

    Returns:
        A ``(ids, y_batch)`` tuple where ``ids`` is the result of looking
        up ``X_batch`` in the module-level vocabulary ``table``.
    """
    encoded = table.lookup(X_batch)
    return encoded, y_batch

# Input pipeline: batch the raw reviews, clean and tokenize them, turn the
# tokens into IDs, and prefetch one batch ahead.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [14]:
# Inspect the element shapes/types of the pipeline after preprocessing only.
print(datasets["train"].batch(32).map(preprocess))

<MapDataset shapes: ((None, None), (None,)), types: (tf.string, tf.int64)>


In [15]:
# Inspect the element shapes/types of the final, ID-encoded training pipeline.
print(train_set)

<PrefetchDataset shapes: ((None, None), (None,)), types: (tf.int64, tf.int64)>


### TRAIN THE MODEL 

##### CAN RUN ON THIS PC GPU 

In [16]:
import os

# TF_FORCE_GPU_ALLOW_GROWTH is read when TensorFlow first initializes the
# GPUs. Earlier cells of this notebook have already executed TensorFlow ops,
# so on an already-running kernel the variable alone arrives too late. It is
# still set here so a fresh process that runs this first picks it up.
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

# Also request memory growth through the runtime API, and say so clearly if
# the GPUs were initialized before this cell ran (instead of failing later
# during training).
for gpu in tf.config.experimental.list_physical_devices("GPU"):
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the device has already been initialized.
        print("Could not enable memory growth for {}: {}".format(gpu.name, err))
        print("Restart the kernel and run this cell right after the imports, "
              "before any other TensorFlow code.")

In [17]:
embed_size = 128

# Sentiment classifier: word embeddings -> two stacked GRU layers -> a single
# sigmoid unit producing the probability of the positive class.
model = keras.models.Sequential()
model.add(
    keras.layers.Embedding(
        # One embedding row per vocabulary word plus one per OOV bucket.
        input_dim=vocab_size + num_oov_buckets,
        output_dim=embed_size,
        input_shape=[None],
    )
)
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(keras.layers.GRU(units=128, return_sequences=True))
model.add(keras.layers.GRU(units=128))
model.add(keras.layers.Dense(units=1, activation="sigmoid"))

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(train_set, epochs=5)

Epoch 1/5
      1/Unknown - 4s 4s/step

CancelledError:  [_Derived_]RecvAsync is cancelled.
	 [[{{node Reshape_11/_38}}]] [Op:__inference_distributed_function_107992]

Function call stack:
distributed_function
