In [None]:
#Using Word Embeddings to Represent Texts

In [None]:
#Embeddings In Practice

In [None]:
 #Getting the Imdb Reviews Data

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
max_features = 20000
sequence_length = 200

(train_data, train_labels), (val_data, val_labels) = keras.datasets.imdb.load_data(num_words=max_features)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
train_data[:2]

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 82

In [4]:
train_labels[:10]

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0])

In [None]:
# Preparing the Dataset

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_data = pad_sequences(train_data, maxlen=sequence_length)
val_data = pad_sequences(val_data, maxlen=sequence_length)

In [6]:
#Build, Compile and Train the Model

In [7]:
# embedding layer: represent 100 words into 5 dimensions

embedding_layer = tf.keras.layers.Embedding(100,5)

embed_integers = embedding_layer(tf.constant([1,2,3,4]))

embed_integers.numpy()

array([[ 0.03816209, -0.04987466,  0.02779404, -0.02791326,  0.00371612],
       [ 0.03418124, -0.0476001 , -0.03166645, -0.04195561,  0.02020099],
       [ 0.036716  , -0.03167267,  0.01113405, -0.0091457 , -0.04507789],
       [ 0.03266636, -0.03325208, -0.012862  , -0.0273773 ,  0.0255034 ]],
      dtype=float32)

In [8]:
# Create a model

embedding_dim = 16

model = tf.keras.Sequential([
       tf.keras.layers.Embedding(max_features, embedding_dim, input_length=sequence_length),
       tf.keras.layers.GlobalAveragePooling1D(),
       tf.keras.layers.Dense(6, activation='relu'),
       tf.keras.layers.Dropout(0.5),
       tf.keras.layers.Dense(1, activation='sigmoid')
])

In [9]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [10]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 16)           320000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 6)                 102       
                                                                 
 dropout (Dropout)           (None, 6)                 0         
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 320,109
Trainable params: 320,109
Non-trainable params: 0
__________________________________________________

In [12]:
history = model.fit(train_data, 
                    train_labels,
                    validation_data = (val_data, val_labels),
                    epochs = 5,
                    callbacks=[tensorboard_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
 #Visualizing the Model Results

In [13]:
%load_ext tensorboard
%tensorboard --logdir logs

In [14]:
#Testing the Model on New texts

In [15]:
negative_review = ["This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. This must simply be their worst role in history"]

# Convert the test sentence to tokens and sequences

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(negative_review)

neg_sequences = tokenizer.texts_to_sequences(negative_review)

# Pad the sequence to have the size of the sequences used for model training

neg_sequences  = pad_sequences(neg_sequences  , maxlen = sequence_length)


# Predict the review
model.predict(neg_sequences)

array([[0.4237007]], dtype=float32)

In [16]:
positive_review = ["Very beautiful and awesome movie. This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. "]

# Convert the test sentence to tokens 
tokenizer_2 = Tokenizer(num_words=20000)
tokenizer_2.fit_on_texts(positive_review)

pos_sequences = tokenizer_2.texts_to_sequences(positive_review)
pos_sequences = pad_sequences(pos_sequences, maxlen = sequence_length)

model.predict(pos_sequences)

array([[0.3525746]], dtype=float32)