In [15]:
import numpy as np
import json
from collections import defaultdict
import string
import tensorflow as tf
import time
import os

In [36]:
def load_embedding_from_disks(glove_filename, with_indexes=True):
    """
    Read a GloVe txt file (one `word v1 v2 ... vd` entry per line, space separated).

    Parameters
    ----------
    glove_filename : str
        Path to the GloVe text file.
    with_indexes : bool, default True
        Selects the shape of the result (see Returns).

    Returns
    -------
    If `with_indexes=True`: a tuple `(word_to_index_dict, index_to_embedding_array)`.
        `word_to_index_dict` is a defaultdict mapping a word to its row in
        `index_to_embedding_array`; unknown words map to the last row, which
        holds a constant placeholder vector (0.01 in every dimension).
        When a word appears more than once in the file, its first row wins,
        but every line still gets its own row so indexes stay aligned.
    Otherwise: a single `word_to_embedding_dict` defaultdict mapping a word to
        its numpy vector; unknown words yield the placeholder (a list of 0.01).

    Raises
    ------
    ValueError
        If the file contains no embedding lines.
    """
    word_to_index_dict = dict()
    index_to_embedding_array = []
    word_to_embedding_dict = dict()
    representation = None

    # GloVe vocabularies contain non-ASCII tokens, so do not rely on the
    # platform default encoding.
    with open(glove_filename, 'r', encoding='utf-8') as glove_file:
        for line in glove_file:
            if not line.strip():
                # A blank (e.g. trailing) line carries no vector; skipping it
                # keeps the placeholder width and the row numbering correct.
                continue

            # Split on a single space only: some GloVe tokens contain other
            # whitespace characters that a bare split() would break apart.
            split = line.rstrip('\n').split(' ')
            word = split[0]
            representation = np.array([float(val) for val in split[1:]])

            if with_indexes:
                if word in word_to_index_dict:
                    print ("dup word: ", word)
                else:
                    # Row index == number of rows stored so far.
                    word_to_index_dict[word] = len(index_to_embedding_array)
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representation

    if representation is None:
        raise ValueError("no embeddings found in %r" % (glove_filename,))

    print ("load_embedding_from_disks representation: ", len(representation))

    _WORD_NOT_FOUND = [0.01]* len(representation)  # Placeholder representation for unknown words.
    if with_indexes:
        # The placeholder is appended as the final row, so its index is the
        # number of rows read from the file.
        _LAST_INDEX = len(index_to_embedding_array)
        word_to_index_dict = defaultdict(lambda: _LAST_INDEX, word_to_index_dict)
        print ("index_to_embedding_array: ", len(index_to_embedding_array))
        index_to_embedding_array = np.array(index_to_embedding_array + [_WORD_NOT_FOUND])
        print ("word_to_index_dict: ", len(word_to_index_dict))
        print ("index_to_embedding_array: ", index_to_embedding_array.shape)
        return word_to_index_dict, index_to_embedding_array
    else:
        # Seed the defaultdict with the loaded vectors; building it empty
        # would discard everything that was just read from disk.
        word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND, word_to_embedding_dict)
        return word_to_embedding_dict

In [37]:
# Load the 50-dimensional Twitter GloVe vectors in index mode: a word -> row
# lookup plus the matrix of embedding rows.
glove_path = "/home/ubuntu/cs224u/glove/glove.twitter.27B.50d.txt"
word_to_index_dict, index_to_embedding_array = load_embedding_from_disks(
    glove_path, with_indexes=True
)


dup word:  <unk>
dup word:  <unk>
load_embedding_from_disks representation:  50
index_to_embedding_array:  1193517
word_to_index_dict:  1193515
index_to_embedding_array:  (1193518, 50)


In [57]:
# Build a corpus-specific vocabulary (`word2id`) and the matching embedding
# matrix (`id2emb`) from the story files, reusing the GloVe vectors loaded above.
path = "/home/ubuntu/cs224u/processed_10_1k_mymodel/processed_combine_all/combine_all_story/"
# os.listdir returns entries in arbitrary order; sort so that the word ids
# assigned below are reproducible from run to run.
stories = sorted(os.listdir(path))

# Take the embedding width from the loaded matrix instead of hard-coding 50.
emb_dim = index_to_embedding_array.shape[1]
# load_embedding_from_disks stores the unknown-word vector as the final row.
glove_unk_index = index_to_embedding_array.shape[0] - 1
strip_punctuation = str.maketrans('', '', string.punctuation)

word2id = dict()
emb_rows = []  # emb_rows[k] is the embedding of the word whose id is k
for s in stories:
    with open(os.path.join(path, s), "r") as f:
        txt = f.read()
    for p in txt.split("\n"):
        for w in p.translate(strip_punctuation).split():
            if w in word2id:
                continue
            # .get() rather than [] so out-of-vocabulary tokens are not
            # inserted into the GloVe defaultdict as a side effect.
            ind = word_to_index_dict.get(w, glove_unk_index)
            word2id[w] = len(emb_rows)
            emb_rows.append(index_to_embedding_array[ind])
count = len(word2id)

# Unknown words map to one extra row appended after all known words.
_WORD_NOT_FOUND = [0.01] * emb_dim
_LAST_INDEX = len(word2id)
word2id = defaultdict(lambda: _LAST_INDEX, word2id)
# Append the placeholder as row `_LAST_INDEX`; adding it with `+` would
# broadcast 0.01 onto every embedding and leave `_LAST_INDEX` out of range.
id2emb = np.vstack(emb_rows + [_WORD_NOT_FOUND])

In [58]:
# Number of distinct corpus tokens that were assigned an id. The unknown-word
# default of the defaultdict is not a stored key, so it is not counted here.
len(word2id)

114527