## TensorBoard 2.0 Embeddings Visualizer

In [2]:
import os
import shutil
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector
from gensim.models import KeyedVectors

In [3]:
"""An object to generate tensorboard data

    arguments:
    vecs_file -- path to a file that contains all the vectors
       * each row in the file is a single comma separated vector. Example: 0.11,0.123,0.333....
    metadata_file -- path to the metadata file that describes the vectors
    """
class TF_visualizer(object):
    def __init__(self, vecs_file, metadata_file, output_path):
        
        self.metadata_file = metadata_file
        self.output_path = output_path
        
        
        with open(vecs_file, 'r') as vecs_fd:
            vecs = list(vecs_fd.readlines())
            vecs = [np.fromstring(v, sep=',') for v in vecs]
            self.vecs_tensor = tf.convert_to_tensor(vecs, dtype=tf.float32)
        
        if os.path.exists(output_path):
            shutil.rmtree(output_path)
        os.mkdir(output_path)
        
    def visualize(self):
            
        config = projector.ProjectorConfig()
        
        embedding_var = tf.Variable(self.vecs_tensor, trainable=False, name='embeddings')
        embed = config.embeddings.add()
#         embed.tensor_name = embedding_var.name
#         embed.metadata_path = self.metadata_file

     
        ckpt = tf.train.Checkpoint(step=embedding_var)
        manager = tf.train.CheckpointManager(ckpt, self.output_path, max_to_keep=1)
        path = manager.save()

        projector.visualize_embeddings(self.output_path, config)
        
        cmd = 'tensorboard --logdir={0} --bind_all'.format(self.output_path)
        
        print('Running {0}'.format(cmd))
        return cmd

## Prepare the data for Tensorboard
* Load an existing word2vec model
* Generate a file with all the vectors for a specific list of words
* Generate a file with all the words vectors
* Generate a file with the metadata for

In [3]:
dimension = 300
model = KeyedVectors.load_word2vec_format('/home/ds/data/GoogleNews-vectors-negative300.bin.gz', binary=True)

### create a single list of words from multiple files

In [4]:
files_lists = ['/home/ds/data/countries.txt', '/home/ds/data/fruits.txt']
with open('metadata_file.tsv','wb') as wfd:
    wfd.write(b'alias\tfile_name\n')
    for f in files_lists:
        with open(f,'rb') as fd:
            l = list(fd.readlines())
            for line in l:
                wfd.write(bytes('%s\t%s\n' %(line.decode('utf-8').strip(), f), encoding='utf8'))

### create the vectors file

In [5]:
with open('metadata_file.tsv','rb') as rfd:
    f = open('vectors_file.tsv','w+')
    found = 0
    not_found = 0
    data = rfd.readlines()
    
    for i, name in enumerate(data):
        if i == 0: continue
             
        name = name.decode('utf-8').strip().lower().split('\t')[0]
        
        if name in model:
            found += 1
            f.write(",".join([str(n) for n in model[name]]) + "\n")
        else:
            not_found += 1
            f.write(",".join([str(n) for n in np.zeros(dimension)]) + "\n")
    f.close()
    print('total found %s' % found)
    print('total not-found %s' % not_found)

total found 121
total not-found 106


## Run Tensorboard Service

In [3]:
base_path = os.getcwd()
# print(base_path)
# create a new tensor board visualizer
visualizer = TF_visualizer(vecs_file = os.path.join(base_path, 'vectors_file.tsv'),
                           metadata_file = os.path.join(base_path, 'metadata_file.tsv'),
                           output_path = os.path.join(base_path, 'outputs'))
cmd = visualizer.visualize()
! {cmd}

/home/ds


NameError: name 'TF_visualizer' is not defined