# This notebook contains code for loading the GloVe vectors

In [None]:
import numpy as np
import pickle

In [None]:
# Vocabulary object for storing words and their properties
class Vocab(object):
    """Bidirectional word <-> integer id mapping with per-word counts.

    Ids 0 and 1 are reserved for the '<pad>' and '<unk>' tokens. Every other
    word receives the next free id (``n_words``) when it is first added.
    """

    def __init__(self):
        # Ids of the two reserved tokens.
        self.PADDING = 0
        self.UNKNOWN = 1
        self.word2index = {'<pad>':0, '<unk>':1}
        # Counts are only recorded for words that pass through addWord or
        # load_word, so the reserved tokens start without an entry here.
        self.word2count = {}
        self.index2word = {0: '<pad>', 1: '<unk>'}
        # Number of known words, which is also the next id to hand out.
        self.n_words = 2

    # Save vocabulary properties to pickle for future reuse
    def save_vocab(self):
        """Pickle the three mapping dicts into the current working directory.

        Writes 'word2index.pickle', 'word2count.pickle' and
        'index2word.pickle', overwriting any existing files of those names.
        """
        with open('word2index.pickle', 'wb') as handle:
            pickle.dump(self.word2index, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open('word2count.pickle', 'wb') as handle:
            pickle.dump(self.word2count, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open('index2word.pickle', 'wb') as handle:
            pickle.dump(self.index2word, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("saved vocabs")

    # Load saved vocab properties
    def load_vocab(self):
        """Replace the three mapping dicts with the ones written by save_vocab.

        Reads the same three file names from the current working directory
        and recomputes ``n_words`` from the loaded ``word2index``.

        SECURITY: pickle.load can execute arbitrary code embedded in the
        file. Only load pickles that this code (or a trusted party) wrote.
        """
        with open('word2index.pickle', 'rb') as handle:
            self.word2index = pickle.load(handle)
        with open('word2count.pickle', 'rb') as handle:
            self.word2count = pickle.load(handle)
        with open('index2word.pickle', 'rb') as handle:
            self.index2word = pickle.load(handle)
        self.n_words = len(self.word2index)

    # add words in the sentence to Vocab
    def add_sentence(self, sentence):
        """Add every item of ``sentence`` to the vocabulary via addWord.

        ``sentence`` is iterated directly, so it should be a sequence of
        tokens; a plain string would be added character by character.
        """
        for word in sentence:
            self.addWord(word)

    def word_to_id(self, word):
        """Return the id of ``word``, or the id of '<unk>' if it is unknown."""
        if not self.has_word(word):
            return self.word2index['<unk>']
        return self.word2index[word]

    def load_word(self, word, count):
        """Register ``word`` with a precomputed ``count``.

        Does nothing if the word is already known (its existing count is
        left untouched).
        """
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = count
            self.index2word[self.n_words] = word
            self.n_words += 1

    def id_to_word(self, id_):
        """Return the word stored under ``id_``; raises KeyError if absent."""
        return self.index2word[id_]

    def has_word(self, word):
        """Return True if ``word`` has an id in this vocabulary."""
        return word in self.word2index

    # add a word to vocab
    def addWord(self, word):
        """Add one occurrence of ``word``, assigning a new id on first sight."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            # '<pad>' and '<unk>' are in word2index from construction but
            # have no word2count entry yet, so a bare `+= 1` would raise
            # KeyError the first time either token shows up in the data.
            self.word2count[word] = self.word2count.get(word, 0) + 1

    def get_length(self):
        """Return the number of words known, including the reserved tokens."""
        return self.n_words


In [None]:
# code for loading pretrained GloVe into Vocab
def load_glove(path, vocab, init_weight: np.ndarray) -> np.ndarray:
    """Build an embedding matrix holding GloVe vectors for the words in ``vocab``.

    Args:
        path: Path to a GloVe text file, one ``word v1 v2 ... vD`` entry per
            line, whitespace separated, UTF-8 encoded.
        vocab: Object providing ``has_word(word)`` and ``word_to_id(word)``.
        init_weight: Array used only as a template for the shape and dtype
            of the result; its values are not copied.

    Returns:
        An array shaped like ``init_weight``. The row at ``word_to_id(word)``
        holds the vector of each vocab word found in the file; all other
        rows stay zero.

    Lines are skipped, rather than raising, when they are blank, when their
    number of values differs from the embedding width, or when a value is
    not a valid float. For a word listed more than once, only the first
    occurrence is used.
    """
    glove_weight = np.zeros_like(init_weight)
    # Expected number of values per line is the size of the last axis.
    dim = glove_weight.shape[-1]

    word_vectors = dict()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            # Rejects blank lines, "<count> <dim>" style header lines, and
            # tokens containing spaces (present in the 840B GloVe file),
            # all of which would otherwise yield a vector of the wrong
            # width and break or corrupt the row assignment below.
            if len(fields) != dim + 1:
                continue
            word, values = fields[0], fields[1:]
            if not vocab.has_word(word):
                continue
            if word in word_vectors:
                # Let's use the first occurrence only.
                continue
            try:
                word_vectors[word] = np.array([float(v) for v in values])
            except ValueError:
                # Right width but a non-numeric field: treat as a malformed
                # entry and ignore it.
                continue

    # glove_weight[:] = word_vectors[vocab.unk_word]
    for word, word_vector in word_vectors.items():
        word_index = vocab.word_to_id(word)
        glove_weight[word_index, :] = word_vector
    return glove_weight