# Visualizing Sentence Embeddings

In [10]:
import os
import sys
import numpy as np
import tensorflow as tf

from tensorflow.contrib.tensorboard.plugins import projector

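# If Neural Monkey is not installed as a package, uncomment the line below and
# point it at your checkout. "~" must be expanded explicitly, because sys.path
# entries are used verbatim:
# sys.path.append(os.path.expanduser("~/neuralmonkey"))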

from neuralmonkey.experiment import Experiment
from neuralmonkey.run import load_runtime_config
from neuralmonkey import dataset

## 1. Load Neural Monkey model

In [11]:
# Location of the experiment configuration this notebook visualizes.
EXPERIMENT_INI = "mmmt-en-de/experiment.ini"

# Parse the INI file, then construct the model it describes.
exp = Experiment(EXPERIMENT_INI)
exp.build_model()

[34m2018-08-25 17:25:11[0m: Loading INI file: 'mmmt-en-de/experiment.ini'
[33m2018-08-25 17:25:11[0m: INI file is parsed.
[33m2018-08-25 17:25:11[0m: Building model based on the config.
[33m2018-08-25 17:25:11[0m: Vocabulary from wordlist loaded, containing 12142 words
[33m2018-08-25 17:25:11[0m: Sample of the vocabulary: ['hard-hat', 'pineapple', 'devil', 'ons', 'boxes']
[33m2018-08-25 17:25:11[0m: Vocabulary from wordlist loaded, containing 30003 words
[33m2018-08-25 17:25:11[0m: Sample of the vocabulary: ['abgesägten', 'Kniestrümpfen', 'X', 'Münztelefon', 'Hunderennens']
[33m2018-08-25 17:25:11[0m: Initializing decoder, name: 'decoder'
[33m2018-08-25 17:25:11[0m: Using linear projection of encoders as the initial state
[33m2018-08-25 17:25:11[0m: No output projection specified - using tanh projection
[33m2018-08-25 17:25:12[0m: The inferred rnn_size of this encoder projection will be 600
[33m2018-08-25 17:25:12[0m: Decoder initalized. Cost var: Tensor("decode

## 2. Load data

In [12]:
# Runtime configuration that declares the dataset(s) to run the model on.
RUNTIME_INI = "val-data.ini"

run_cfg = load_runtime_config(RUNTIME_INI)

# Only the first configured test dataset is used in the rest of the notebook.
data = run_cfg.test_datasets[0]

[34m2018-08-25 17:25:12[0m: Loading INI file: 'val-data.ini'
[33m2018-08-25 17:25:12[0m: INI file is parsed.
[33m2018-08-25 17:25:12[0m: Building model based on the config.
[33m2018-08-25 17:25:12[0m: Initializing dataset with: target, source
[33m2018-08-25 17:25:12[0m: Dataset length: 1014
[33m2018-08-25 17:25:12[0m: Model built.


## 3. Run model on data

In [13]:
# Run the model over the dataset in batches of 256. Only the second element of
# the returned pair is used in this notebook. The first element is bound to a
# named throwaway rather than `_`: assigning to `_` inside an IPython kernel
# shadows the interactive "last output" variable for the rest of the session.
_unused, outputs = exp.run_model(data, batch_size=256)

INFO:tensorflow:Restoring parameters from mmmt-en-de/variables.data


[33m2018-08-25 17:25:12[0m: Default variable file 'mmmt-en-de/variables.data' will be used for loading variables.
[33m2018-08-25 17:25:12[0m: Loading variables from mmmt-en-de/variables.data
[33m2018-08-25 17:25:14[0m: Variables loaded from mmmt-en-de/variables.data


In [14]:
# Show the dimensions of the encoded array as the cell's result.
np.shape(outputs["encoded"])

(1014, 600)

## 4. Visualize data in Projector

In [15]:
# Dump the encoded array as tab-separated text, one array row per line.
np.savetxt(
    fname="mmmt-en-de/embeddings.tsv",
    X=outputs["encoded"],
    delimiter="\t",
)

In [16]:
# Save source and output sentences in TSV format.
# The file is opened explicitly as UTF-8: the sentences contain non-ASCII
# characters (the target vocabulary is German), and a bare open() would fall
# back to the platform's locale encoding, which may be unable to encode them.
with open("mmmt-en-de/metadata.tsv", "w", encoding="utf-8") as f:
    # TSV header naming the two columns
    print("source\toutput", file=f)

    # Data: one line per sentence pair; the tokens of each sentence are
    # joined with single spaces and the two sentences are separated by a tab.
    for src_words, tgt_words in zip(data.get_series('source'), outputs['target']):
        src = ' '.join(src_words)
        tgt = ' '.join(tgt_words)
        print("{}\t{}".format(src, tgt), file=f)

In [17]:
# Create Projector config object with a single embedding entry
config = projector.ProjectorConfig()
emb = config.embeddings.add()
emb.tensor_name = "sent_embeddings"
# Bare file names: both files were written into "mmmt-en-de" above, the same
# directory the writer below uses as its log directory.
# NOTE(review): presumably TensorBoard resolves these relative to that
# directory - confirm against the Projector documentation.
emb.tensor_path = "embeddings.tsv"
emb.metadata_path = "metadata.tsv"

# Write Projector config to TensorBoard
writer = tf.summary.FileWriter("mmmt-en-de")
try:
    projector.visualize_embeddings(writer, config)
finally:
    # Close the writer so its event file is flushed and released; otherwise
    # every re-run of this cell leaves another open file handle in the kernel.
    writer.close()