In [1]:
#Build a sentiment classification model to distinguish between positive and negative movie reviews.
#We will train it on the IMDB review dataset and visualize the embeddings generated after training.

import tensorflow as tf
import tensorflow_datasets as tfds


In [None]:
# Install this package if running on your local machine
# !pip install -q tensorflow-datasets

In [2]:
# Load the IMDB reviews dataset; as_supervised=True yields (text, label) pairs
# and with_info=True additionally returns the dataset metadata object.
imdb, info = tfds.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteRBENVW/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteRBENVW/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteRBENVW/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [4]:
# Show the splits returned by tfds.load and the element spec of each one
print(imdb)

{Split('train'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, Split('test'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, Split('unsupervised'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}


In [5]:
# Peek at the first two (review text, label) pairs of the training split
for review_and_label in imdb['train'].take(2):
  print(review_and_label)

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on

In [6]:
import numpy as np

def _split_text_and_labels(dataset):
  """Iterate once over (text, label) tensor pairs and return two Python lists.

  Each text tensor is converted to bytes with .numpy() and decoded as UTF-8;
  each label tensor is converted with .numpy().
  """
  sentences, labels = [], []
  for text_tensor, label_tensor in dataset:
    sentences.append(text_tensor.numpy().decode('utf8'))
    labels.append(label_tensor.numpy())
  return sentences, labels


train_data, test_data = imdb['train'], imdb['test']

# Materialise both splits as plain Python lists so they can be fed to the tokenizer
train_sentences, train_labels = _split_text_and_labels(train_data)
test_sentences, test_labels = _split_text_and_labels(test_data)

# Keras expects array-like labels, so convert the label lists to numpy arrays
training_labels_final = np.array(train_labels)
testing_labels_final = np.array(test_labels)

In [7]:
# Parameters used to tokenize the sentences, pad them to a uniform length
# and size the embedding layer.

# --- Tokenizer ---
vocab_size = 10000     # passed as num_words to the Tokenizer and as the Embedding input size
oov_token = '<OOV>'    # placeholder token handed to the Tokenizer for out-of-vocabulary words

# --- Padding ---
max_length = 120       # every sequence is padded / truncated to this many tokens
trunc_type = 'post'    # passed as `truncating` to pad_sequences

# --- Model ---
embedding_dim = 16     # size of the vector learned for each word

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Initialize the tokenizer with the vocabulary limit and the out-of-vocabulary token
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
# Build the word index from the TRAINING sentences only, so no test vocabulary leaks in
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# Encode the training sentences as integer sequences and pad/truncate them to max_length
sequences = tokenizer.texts_to_sequences(train_sentences)
padded = pad_sequences(sequences, maxlen= max_length, truncating=trunc_type)
# Encode and pad the test sentences with the same tokenizer.
# NOTE: this must pad `test_sequences`; padding `sequences` here would silently
# validate the model on training inputs paired with test labels.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen= max_length, truncating=trunc_type)



In [12]:
# Build and compile the sentiment classification model.
# The first layer is an Embedding layer: each word index in the vocabulary is represented by a
# vector of embedding_dim trainable weights. The idea is that, during training, words that tend
# to appear in positive reviews converge towards similar vectors, while words that tend to appear
# in negative reviews cluster together elsewhere.
# The embedded sequence is then flattened and passed through a small dense hidden layer; the
# final single sigmoid unit produces the binary prediction.

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           160000    
                                                                 
 flatten (Flatten)           (None, 1920)              0         
                                                                 
 dense (Dense)               (None, 6)                 11526     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 171533 (670.05 KB)
Trainable params: 171533 (670.05 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
# Train the model on the padded training sequences and evaluate on the
# padded test sequences after every epoch.
num_epochs = 10
model.fit(
    x=padded,
    y=training_labels_final,
    epochs=num_epochs,
    validation_data=(test_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b3e50748b80>

In [18]:
# Visualize word embeddings.
# After training, the weights learned by the embedding layer can be inspected to see which
# words ended up clustered together. The TensorFlow Embedding Projector
# (https://projector.tensorflow.org/) can reduce the 16-dimensional vectors defined earlier
# to fewer components so they can be plotted.
# First, fetch the weights from the embedding layer (the first layer of the model):
embedding_layer = model.layers[0]
embedding_weights = embedding_layer.get_weights()[0]
# Expected shape: (vocab_size, embedding_dim)
print(embedding_weights.shape)

(10000, 16)


In [19]:
# Index -> word lookup built by the tokenizer; used to label each embedding row with its word
reverse_word_index = tokenizer.index_word

In [None]:
#need to generate two files:
# vecs.tsv -- contains the vector weights for each word in the vocabulary
# meta.tsv -- contains the words in the vocabulary

In [21]:
import io

# Export the trained embeddings for the Embedding Projector:
#   vecs.tsv -- one tab-separated embedding vector per line
#   meta.tsv -- the matching word on the same line number
# Index 0 is skipped because the Keras Tokenizer never assigns it to a word.

# Do not index past the words the tokenizer actually learned: with a small corpus
# the word index can hold fewer than vocab_size - 1 entries, and looking up a
# missing index would raise KeyError.
last_index = min(vocab_size, len(reverse_word_index) + 1)

# Context managers guarantee both files are flushed and closed even if the loop raises
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, last_index):
    # word associated with the current index
    word_name = reverse_word_index[word_num]
    # embedding row learned for the current index
    word_embedding = embedding_weights[word_num]
    # write the word name
    out_m.write(word_name + "\n")
    # write the word embedding as tab-separated values
    out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

In [22]:
# Offer the exported files for download when running inside Google Colab;
# outside Colab the import fails and this cell deliberately does nothing.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>