# Viewing Embedding using TensorBoard

We met embeddings in the discussion of text classification. This notebook shows how to view a trained embedding in the TensorBoard Embedding Projector.

Make sure you have completed the Python notebooks on working with text before attempting to run this notebook.

In [1]:
import os
import tempfile

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorboard.plugins import projector

In [2]:
# TensorBoard does not shut down cleanly on its own. It keeps bookkeeping files
# in a ".tensorboard-info" folder inside the temp directory, and a later
# `%tensorboard` call reuses the instance (and the previous data set) recorded there.
# Deleting those files makes the next launch start fresh.
#
# The folder is located through tempfile.gettempdir(), which resolves the temp
# directory for the current user and OS (on Windows typically
# C:\Users\<user>\AppData\Local\Temp), so no user name has to be edited by hand.

tensorboard_info_dir = os.path.join(tempfile.gettempdir(), ".tensorboard-info")

# The folder only exists once TensorBoard has been started at least once.
if os.path.isdir(tensorboard_info_dir):
    for info_file in os.listdir(tensorboard_info_dir):
        os.remove(os.path.join(tensorboard_info_dir, info_file))

## Delete those logfiles first!

In [3]:
# Batching parameters shared by the training and test pipelines.
SHUFFLE_BUFFER = 1000
BATCH_SIZE = 10
PADDED_SHAPES = ((None,), ())

# Load the train and test splits of the subword-encoded IMDB reviews as
# (text, label) pairs, together with the dataset metadata.
imdb_splits, info = tfds.load(
    "imdb_reviews/subwords8k",
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True,
    as_supervised=True,
)
train_data, test_data = imdb_splits

# The text feature carries the encoder that maps subwords to integer ids.
encoder = info.features["text"].encoder


def _shuffle_and_pad(dataset):
    """Shuffle `dataset`, then batch it with each review padded to the batch's longest."""
    shuffled = dataset.shuffle(SHUFFLE_BUFFER)
    return shuffled.padded_batch(BATCH_SIZE, padded_shapes=PADDED_SHAPES)


train_batches = _shuffle_and_pad(train_data)
test_batches = _shuffle_and_pad(test_data)

# Pull one batch so its contents can be inspected.
train_batch, train_labels = next(iter(train_batches))



In [4]:
# Peek at the first 20 entries of the subword vocabulary.
# NOTE(review): the trailing underscore looks like the encoder's marker for a
# subword that ends a word (i.e. is followed by a space) - confirm against the
# TFDS SubwordTextEncoder documentation.
encoder.subwords[:20]

['the_',
 ', ',
 '. ',
 'a_',
 'and_',
 'of_',
 'to_',
 's_',
 'is_',
 'br',
 'in_',
 'I_',
 'that_',
 'this_',
 'it_',
 ' /><',
 ' />',
 'was_',
 'The_',
 'as_']

## A quick text classifier using embedding in a Neural Net

In [5]:
from tensorflow.keras.layers import Embedding


# Length of the vector learned for every vocabulary entry.
embedding_dim = 16

# One embedding row per id the encoder can produce.
embedding = Embedding(encoder.vocab_size, embedding_dim)

# Assemble the classifier one layer at a time.
model = tf.keras.Sequential()
model.add(embedding)  # The embedding layer should be the first layer in a model.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(16, activation="relu"))
model.add(tf.keras.layers.Dense(1))

# The final Dense layer has no activation, so the loss is told to expect logits.
logit_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
model.compile(optimizer="adam", loss=logit_loss, metrics=["accuracy"])


In [6]:
# A single epoch keeps the run short; raise `epochs` if training is fast on your machine.
# Validation uses only 20 batches of the test set per epoch.
fit_options = dict(
    epochs=1,
    validation_data=test_batches,
    validation_steps=20,
)
history = model.fit(train_batches, **fit_options)



In [11]:
# Set up a logs directory, so TensorBoard knows where to look for files.
# On a Windows machine this will be at c:/logs/imdb-example.
#
# NOTE(review): run this cell before TensorBoard is started on this directory.
# The "Access is denied" rename failure seen when saving the checkpoint on
# Windows presumably comes from a running TensorBoard holding the existing
# checkpoint files open - confirm.

log_dir = '/logs/imdb-example/'
os.makedirs(log_dir, exist_ok=True)

# Save the labels, one per line, in the same order as the embedding rows.
# The file is written as UTF-8 so non-ASCII subwords stay readable instead of
# turning into byte escapes.
with open(os.path.join(log_dir, 'metadata.tsv'), "w", encoding="utf-8") as f:
    for subword in encoder.subwords:
        # Drop the trailing underscore the encoder appends to a subword.
        label = subword[:-1] if subword.endswith("_") else subword
        # A raw tab or newline would break the one-label-per-line layout.
        label = label.replace("\t", "\\t").replace("\r", "\\r").replace("\n", "\\n")
        # Never write an empty label; fall back to the raw subword.
        f.write("{}\n".format(label or subword))
    # Fill in the rest of the labels with "unknown" so the number of lines
    # equals the number of rows saved below (vocab_size - 1).
    for unknown in range(1, encoder.vocab_size - len(encoder.subwords)):
        f.write("unknown #{}\n".format(unknown))


# Save the weights we want to analyze as a variable. The first row of the
# embedding matrix (id 0) has no entry in the metadata, so it is removed here.
weights = tf.Variable(model.layers[0].get_weights()[0][1:])
# Create a checkpoint from embedding, the filename and key are the
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up config. A separate name is used for the config entry so the Keras
# layer called `embedding` is not overwritten.
config = projector.ProjectorConfig()
embedding_config = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding_config.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding_config.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

UnknownError: Failed to rename: /logs/imdb-example/embedding.ckpt-1_temp/part-00000-of-00001.data-00000-of-00001 to: /logs/imdb-example/embedding.ckpt-1.data-00000-of-00001 : Access is denied.
; Input/output error [Op:MergeV2Checkpoints]

In [8]:
%load_ext tensorboard

In [10]:
# If TensorBoard does not see the logs the first time, try restarting it by running this cell again.

# Check that the logdir shown in the TensorBoard window below matches the --logdir value passed to %tensorboard.
# If it does not, exit Jupyter, shut down the Jupyter server, delete all the log files, restart Jupyter and run the notebook again.

Enable the 3D labels mode in the Embedding Projector to see the specific words.

%tensorboard --logdir /logs/imdb-example/

Reusing TensorBoard on port 6006 (pid 16096), started 0:00:04 ago. (Use '!kill 16096' to kill it.)

In [12]:
# TensorBoard does not shut down cleanly on its own. It keeps bookkeeping files
# in a ".tensorboard-info" folder inside the temp directory, and a later
# `%tensorboard` call reuses the instance (and the previous data set) recorded there.
# Deleting those files makes the next launch start fresh.
#
# The folder is located through tempfile.gettempdir(), which resolves the temp
# directory for the current user and OS (on Windows typically
# C:\Users\<user>\AppData\Local\Temp), so no user name has to be edited by hand.

tensorboard_info_dir = os.path.join(tempfile.gettempdir(), ".tensorboard-info")

# The folder only exists once TensorBoard has been started at least once.
if os.path.isdir(tensorboard_info_dir):
    for info_file in os.listdir(tensorboard_info_dir):
        os.remove(os.path.join(tensorboard_info_dir, info_file))

In [13]:
#if tensorboard does not see the logs the first time, try restarting it by running this cell again

# check to see if the logdir setting the window below matches the logdir setting in the call of TensorBoard.  If it does not, exit Jupyter, shut down the 
# Jupyter server, delete all the log files and restart Jupyter,  run the notebook again.

%tensorboard --logdir /logsb/imdb-example/

In [14]:
%load_ext watermark
%watermark

Last updated: 2022-03-27T15:18:22.953623-04:00

Python implementation: CPython
Python version       : 3.9.7
IPython version      : 8.1.1

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 158 Stepping 13, GenuineIntel
CPU cores   : 16
Architecture: 64bit



In [15]:
%watermark --iversions

tensorboard        : 2.6.0
tensorflow         : 2.6.0
tensorflow_datasets: 4.2.0

