## Split the `glove.6B.300d.txt` file vertically into two files: `words` and `vectors`

In [1]:
!cat glove.6B.300d.txt|cut -d ' ' -f 1 > words
!cat glove.6B.300d.txt|cut -d ' ' --complement -f 1 > vectors

In [1]:
import tensorflow as tf
import numpy as np
import random
import collections
import csv
import sys

# Switch TensorFlow 1.x to eager mode so ops run immediately; the notebook later
# calls `.numpy()` on a tensor, which relies on eager tensors.
tf.enable_eager_execution()

In [2]:
# Fable used as the toy corpus. Punctuation is already separated by spaces, so a
# plain whitespace split yields one token per word or punctuation mark.
fable_text = """
long ago , the mice had a general council to consider what measures
they could take to outwit their common enemy , the cat . some said
this , and some said that but at last a young mouse got up and said
he had a proposal to make , which he thought would meet the case . 
you will all agree , said he , that our chief danger consists in the
sly and treacherous manner in which the enemy approaches us . now , 
if we could receive some signal of her approach , we could easily
escape from her . i venture , therefore , to propose that a small
bell be procured , and attached by a ribbon round the neck of the cat
. by this means we should always know when she was about , and could
easily retire while she was in the neighbourhood . this proposal met
with general applause , until an old mouse got up and said that is
all very well , but who is to bell the cat ? the mice looked at one
another and nobody spoke . then the old mouse said it is easy to
propose impossible remedies .
"""
# Replace each line break with a space. Replacing it with '' would fuse the last
# word of a line with the first word of the next (e.g. "measuresthey"), which
# silently corrupts the vocabulary built from this text.
fable_text = fable_text.replace('\n', ' ')

# This function splits the text on whitespace and returns the words as a 1-D numpy array

def read_data(raw_text):
    """Tokenize ``raw_text`` on whitespace and return the tokens as a 1-D numpy array."""
    # str.split() with no separator splits on any run of whitespace.
    tokens = raw_text.split()
    # Flatten explicitly so the result is always one-dimensional.
    return np.array(tokens).reshape(-1)

# Tokenized fable: a 1-D array of word/punctuation strings in reading order.
training_data = read_data(fable_text)

#Create dictionary and reverse dictionary with word ids

def build_dictionaries(words):
    """Build word <-> id lookup tables, with ids assigned by descending frequency.

    Args:
        words: iterable of hashable tokens (e.g. the 1-D array from read_data).

    Returns:
        A ``(dictionary, reverse_dictionary)`` tuple. ``dictionary`` maps each
        distinct word to an integer id, where id 0 is the most frequent word;
        ``reverse_dictionary`` maps each id back to its word. Both are empty
        dicts when ``words`` is empty.
    """
    # most_common() returns (word, count) pairs sorted by descending count.
    ranked = collections.Counter(words).most_common()
    dictionary = {word: idx for idx, (word, _) in enumerate(ranked)}
    # Build the inverse once, after the forward map is complete. Doing it inside
    # the loop was quadratic and left the name unbound for empty input.
    reverse_dictionary = {idx: word for word, idx in dictionary.items()}
    return dictionary, reverse_dictionary

# Map every distinct token of the fable to an integer id, and back.
dictionary, reverse_dictionary = build_dictionaries(training_data)

# Number of distinct tokens in the document text.
doc_vocab_size = len(dictionary) 


## Construct embedding vectors for the vocabulary of the document text
### Extract embedding vectors for words in the GloVe data file by looking up two file-backed hash tables:
look up the index of each word by the word itself (using the `words` file);

look up the embedding vector text by that word index (using the `vectors` file).
### Use random embedding vectors for words NOT included in the GloVe data file.
### Save the numpy array of constructed embedding vectors to the `embedding.npy` file

In [3]:
# NOTE(review): the return value is discarded here; this looks like a leftover
# from graph mode — confirm whether it is needed under eager execution.
tf.tables_initializer()

# String tensor of the document vocabulary ordered by word id: element i is the
# word whose id is i.
features = tf.convert_to_tensor([reverse_dictionary[i] for i in range(doc_vocab_size)],dtype=tf.string)
# File-backed table built from the `words` file; used to look up an id for each
# vocabulary word. With num_oov_buckets=0, words missing from the file get the
# table's default id (presumably -1 — confirm against the TF docs).
table = tf.contrib.lookup.index_table_from_file(
    vocabulary_file="words", num_oov_buckets=0)
ids = table.lookup(features)

# Second file-backed table, built from the `vectors` file; looks up a string for
# each id. make_vector treats the string 'UNK' as "not found", so ids that are
# out of range are assumed to come back as 'UNK' — TODO confirm the default.
table2 = tf.contrib.lookup.index_to_string_table_from_file(
    vocabulary_file="vectors")
lines = table2.lookup(ids)

# Number of components per embedding vector (the 300d GloVe file is used).
embedding_dim = 300


In [4]:
def make_vector(line):
    """Turn one looked-up `vectors` line into an embedding of length embedding_dim.

    A line equal to the sentinel string 'UNK' gets a random vector drawn
    uniformly between -0.2 and 0.2; any other line is parsed as embedding_dim
    space-separated floats.
    """
    def _random_vector():
        # Fallback for words that have no pre-trained vector.
        return tf.random.uniform([embedding_dim], minval=-0.2, maxval=0.2)

    def _parsed_vector():
        # One float default per column tells decode_csv the column count and type.
        column_defaults = [0.0] * embedding_dim
        columns = tf.decode_csv(line, column_defaults, field_delim=' ')
        return tf.convert_to_tensor(columns)

    is_unknown = tf.equal(tf.constant('UNK'), line)
    return tf.cond(is_unknown, _random_vector, _parsed_vector)

# Apply make_vector to every looked-up line; the results are stacked into one
# float32 tensor with one row per vocabulary word (expected shape:
# doc_vocab_size x embedding_dim).
embedding = tf.map_fn(make_vector,lines,dtype=tf.float32)

In [5]:
# Persist the embedding matrix as a NumPy .npy file; .numpy() converts the eager
# tensor to a numpy array first.
np.save('embedding.npy', embedding.numpy())