<a href="https://colab.research.google.com/github/temmyzeus/Tensorflow-Courses/blob/master/DeepLearning.AI%20TensorFlow%20Developer%20Professional%20Certificate/Natural%20Language%20Processing%20with%20Tensorflow/IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, GlobalAveragePooling1D
print('Tensorflow Version: ', tf.__version__)

Tensorflow Version:  2.6.0


In [2]:
imdb, info = tfds.load(
    name='imdb_reviews',
    with_info=True,
    as_supervised=True
)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteJDEWZQ/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteJDEWZQ/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteJDEWZQ/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
train_sentences = []
train_labels = []

test_sentences = []
test_labels = []

for sent, label in imdb.get('train'):
    sent = sent.numpy().decode()
    label = label.numpy()
    train_sentences.append(sent)
    train_labels.append(label)


for sent, label in imdb.get('test'):
    sent = sent.numpy().decode()
    label = label.numpy()
    test_sentences.append(sent)
    test_labels.append(label)

In [11]:
oov_token: str = '<UNK>'
max_length: int = 120
padding_type: str = 'post'
trunc_type: str = 'post'
vocab_size: str = 10000

In [12]:
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_token
)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
train_sequences = tokenizer.texts_to_sequences(train_sentences)

test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [13]:
n: int = 4
text = train_sentences[n]
print('Original Sentence', text)
print('Sequences', tokenizer.texts_to_sequences([text]))

Original Sentence As others have mentioned, all the women that go nude in this film are mostly absolutely gorgeous. The plot very ably shows the hypocrisy of the female libido. When men are around they want to be pursued, but when no "men" are around, they become the pursuers of a 14 year old boy. And the boy becomes a man really fast (we should all be so lucky at this age!). He then gets up the courage to pursue his true love.
Sequences [[15, 406, 26, 1044, 30, 2, 370, 13, 138, 2513, 9, 12, 20, 24, 666, 425, 1485, 2, 112, 53, 1, 285, 2, 1, 5, 2, 667, 1, 52, 347, 24, 185, 34, 179, 6, 28, 6986, 19, 52, 55, 347, 24, 185, 34, 411, 2, 1, 5, 4, 2426, 289, 152, 428, 3, 2, 428, 458, 4, 130, 64, 700, 73, 142, 30, 28, 36, 2040, 31, 12, 556, 27, 93, 212, 54, 2, 3147, 6, 6628, 25, 281, 117]]


In [14]:
train_padded = pad_sequences(
    sequences=train_sequences,
    maxlen=max_length,
    padding='post',
    truncating='post'
)

test_padded = pad_sequences(
    sequences=test_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type
)

In [20]:
# Define out Model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=16, input_length=max_length),
    Flatten(),
    Dense(units=6,activation='relu'),
    Dense(units=1, activation='sigmoid')
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 11526     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Define out Model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=16, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(units=6,activation='relu'),
    Dense(units=1, activation='sigmoid')
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.compile(
    optimizer='rmsprop',
    loss='binarycrossentropy',
    metrics=['accuracy']
)

In [33]:
train_labels = np.array(train_labels).reshape((-1,1))
test_labels = np.array(test_labels).reshape((-1,1))

In [34]:
model.fit(
    x=train_padded, 
    y=train_labels,
    epochs=20,
    validation_data=(test_padded, test_labels))

Epoch 1/20


ValueError: ignored

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]])