# Visualizing Sentence Embeddings

In [1]:
import os
import sys
import numpy as np
import tensorflow as tf

from tensorflow.contrib.tensorboard.plugins import projector

# sys.path.append("~/neuralmonkey")

from neuralmonkey.experiment import Experiment
from neuralmonkey.run import load_runtime_config
from neuralmonkey import dataset

## 1. Load Neural Monkey model

In [2]:
exp = Experiment("mmmt-en-de/experiment.ini")
exp.build_model()

[34m2018-08-25 16:03:13[0m: Loading INI file: 'mmmt-en-de/experiment.ini'
[33m2018-08-25 16:03:13[0m: INI file is parsed.
[33m2018-08-25 16:03:13[0m: Building model based on the config.
[33m2018-08-25 16:03:13[0m: Vocabulary from wordlist loaded, containing 12142 words
[33m2018-08-25 16:03:13[0m: Sample of the vocabulary: ['hard-hat', 'pineapple', 'devil', 'ons', 'boxes']
[33m2018-08-25 16:03:14[0m: Vocabulary from wordlist loaded, containing 30003 words
[33m2018-08-25 16:03:14[0m: Sample of the vocabulary: ['abgesägten', 'Kniestrümpfen', 'X', 'Münztelefon', 'Hunderennens']
[33m2018-08-25 16:03:14[0m: Initializing decoder, name: 'decoder'
[33m2018-08-25 16:03:14[0m: Using linear projection of encoders as the initial state
[33m2018-08-25 16:03:14[0m: No output projection specified - using tanh projection
[33m2018-08-25 16:03:14[0m: The inferred rnn_size of this encoder projection will be 600
[33m2018-08-25 16:03:14[0m: Decoder initalized. Cost var: Tensor("decode

## 2. Load data

In [3]:
run_cfg = load_runtime_config("val-data.ini")
data = run_cfg.test_datasets[0]

[34m2018-08-25 16:05:05[0m: Loading INI file: 'val-data.ini'
[33m2018-08-25 16:05:05[0m: INI file is parsed.
[33m2018-08-25 16:05:05[0m: Building model based on the config.
[33m2018-08-25 16:05:05[0m: Initializing dataset with: source, target
[33m2018-08-25 16:05:05[0m: Dataset length: 1015
[33m2018-08-25 16:05:05[0m: Model built.


In [5]:
data.get_series('source')

[['a', 'group', 'of', 'men', 'are', 'loading', 'cotton', 'onto', 'a', 'truck'],
 ['a', 'man', 'sleeping', 'in', 'a', 'green', 'room', 'on', 'a', 'couch', '.'],
 ['a',
  'boy',
  'wearing',
  'headphones',
  'sits',
  'on',
  'a',
  'woman',
  "'s",
  'shoulders',
  '.'],
 ['two',
  'men',
  'setting',
  'up',
  'a',
  'blue',
  'ice',
  'fishing',
  'hut',
  'on',
  'an',
  'iced',
  'over',
  'lake'],
 ['a',
  'balding',
  'man',
  'wearing',
  'a',
  'red',
  'life',
  'jacket',
  'is',
  'sitting',
  'in',
  'a',
  'small',
  'boat',
  '.'],
 ['a',
  'lady',
  'in',
  'a',
  'red',
  'coat',
  ',',
  'holding',
  'a',
  'bluish',
  'hand',
  'bag',
  'likely',
  'of',
  'asian',
  'descent',
  ',',
  'jumping',
  'off',
  'the',
  'ground',
  'for',
  'a',
  'snapshot',
  '.'],
 ['a', 'brown', 'dog', 'is', 'running', 'after', 'the', 'black', 'dog', '.'],
 ['a',
  'young',
  'boy',
  'wearing',
  'a',
  'Giants',
  'jersey',
  'swings',
  'a',
  'baseball',
  'bat',
  'at',
  'an',
 

## 3. Run model on data

In [7]:
_, outputs = exp.run_model(data, batch_size=256)

In [11]:
outputs['encoded'].shape

(1015, 600)

## 4. Visualize data in Projector

In [15]:
# Save embeddings in TSV format
np.savetxt("mmmt-en-de/embeddings.tsv", outputs['encoded'], delimiter='\t')

In [18]:
# Save source and output sentences in TSV format
with open("mmmt-en-de/metadata.tsv", "w") as f:
    # TSV header
    print("source\toutput", file=f)

    # Data
    for src_words, tgt_words in zip(data.get_series('source'), outputs['target']):
        src = ' '.join(src_words)
        tgt = ' '.join(tgt_words)
        print("{}\t{}".format(src, tgt), file=f)

In [16]:
# Create Projector config object
config = projector.ProjectorConfig()
emb = config.embeddings.add()
emb.tensor_name = "sent_embeddings"
emb.tensor_path = os.path.abspath("mmmt-en-de/embeddings.tsv")
emb.metadata_path = os.path.abspath("mmmt-en-de/metadata.tsv")

# Write Projector config to TensorBoard
writer = tf.summary.FileWriter("mmmt-en-de")
projector.visualize_embeddings(writer, config)