# HW5: LSTM

In [16]:
import numpy as np
import torch
import torch.nn as nn
import numpy.random
from torch.utils.data import dataloader
from torchvision import datasets, transforms

### Pre-process the Training Data

`training_firsts` and `training_secs` will each be a list of length 6036. Each element of these lists is a sentence vector, i.e. a list whose elements are the words of that sentence. Both `<s>` and `</s>` are included in each sentence vector. 

In [17]:
# TRAINING
# Every line of the training file is split on whitespace. The first "</s>" token
# closes the first sentence; all remaining tokens form the second sentence.
training_firsts = []
training_secs = []
with open("bobsue-data/bobsue.seq2seq.train.tsv", encoding="utf-8") as trainfile:
    for line_number, line in enumerate(trainfile, start=1):
        separated = line.split()
        if not separated:
            # Blank line (e.g. a trailing newline at the end of the file):
            # there is no sentence pair to record.
            continue
        if "</s>" not in separated:
            # Without an end-of-sentence marker the pair cannot be split. Fail
            # loudly rather than re-appending the previous line's sentences.
            raise ValueError(
                f"line {line_number} of the training file has no '</s>' token"
            )
        boundary = separated.index("</s>") + 1
        training_firsts.append(separated[:boundary])
        training_secs.append(separated[boundary:])

# Number of sentence pairs actually read (6036 for the full bobsue training set).
training_size = len(training_firsts)


Now, we repeat for validation and testing sets:

In [18]:
def read_sentence_pairs(path):
    """Read a file of sentence pairs and split each line into its two sentences.

    Each line is split on whitespace. The first "</s>" token closes the first
    sentence; every token after it belongs to the second sentence. Blank lines
    are skipped.

    Args:
        path: Path of the file to read.

    Returns:
        A tuple ``(firsts, seconds)`` of equally long lists. ``firsts[k]`` and
        ``seconds[k]`` are the token lists of the two sentences found on the
        k-th non-blank line.

    Raises:
        ValueError: If a non-blank line contains no "</s>" token.
    """
    firsts, seconds = [], []
    with open(path, encoding="utf-8") as pair_file:
        for line_number, line in enumerate(pair_file, start=1):
            tokens = line.split()
            if not tokens:
                continue
            try:
                boundary = tokens.index("</s>") + 1
            except ValueError:
                # Fail loudly instead of silently reusing the previous pair.
                raise ValueError(
                    f"{path}:{line_number}: no '</s>' token found"
                ) from None
            firsts.append(tokens[:boundary])
            seconds.append(tokens[boundary:])
    return firsts, seconds


# VALIDATION
valid_firsts, valid_secs = read_sentence_pairs("bobsue-data/bobsue.seq2seq.dev.tsv")
# Number of validation pairs actually read (750 for the full bobsue dev set).
validation_size = len(valid_firsts)

# TESTING
test_firsts, test_secs = read_sentence_pairs("bobsue-data/bobsue.seq2seq.test.tsv")
# Number of test pairs actually read (750 for the full bobsue test set).
testing_size = len(test_firsts)

We now have to give each word in our target vocabulary an embedding. To do this, we'll use GloVe. I'll be using snippets from [this Medium post by @martinpella on pre-trained word embeddings in PyTorch](https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76) in my implementation. `word_to_index` maps each word in the GloVe file to an index (i.e. 0, 1, ...). Associated with each index is the corresponding entry in `embeddings_vector`. `embeddings_vector` has shape `(size of glove vocabulary, EMBED_SIZE)`. `EMBED_SIZE` will eventually be 200, but for now it is 50 to speed up training. `glove_voc` stores each word in the GloVe vocabulary. Lastly, we'll create a dictionary `glove` whose keys are the vocabulary words and whose values are the corresponding embedding vectors. 

In [19]:
#GloVe
#TODO: Change embedding size from 50 to 200
EMBED_SIZE = 50
# word_to_index:     word -> position of its vector in embeddings_vector
# embeddings_vector: list of float64 arrays, one per line of the embedding file
# glove_voc:         the words in the order they appear in the file
word_to_index = {}
embeddings_vector = []
glove_voc = []

index = 0
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 on every system (the GloVe 6B files are
# UTF-8 text with non-ASCII tokens).
with open(f'embedding-data/glove.6B.{EMBED_SIZE}d.txt', encoding='utf-8') as embedfile:
    for line in embedfile:
        split_vector = line.split()
        if not split_vector:
            # Blank line: no word and no vector to record.
            continue

        # First token is the word; the remaining tokens are its vector components.
        cur_embedded_word, *components = split_vector
        cur_embedded_vector = np.asarray(components, dtype=np.double)

        word_to_index[cur_embedded_word] = index
        glove_voc.append(cur_embedded_word)
        embeddings_vector.append(cur_embedded_vector)
        index += 1

# word -> embedding vector. glove_voc and embeddings_vector are filled in
# lockstep, so they can be zipped directly.
glove = dict(zip(glove_voc, embeddings_vector))

We'll generate our target vocabulary from our `bobsue` data

In [20]:
#Pre-process target vocabulary
# The first whitespace-separated token of every non-blank line is a vocabulary word.
vocab_words = []
with open('bobsue-data/bobsue.voc.txt', encoding='utf-8') as vocfile:
    for line in vocfile:
        tokens = line.split()
        if tokens:
            # Guard against blank lines, where tokens[0] would raise IndexError.
            vocab_words.append(tokens[0])
# Array of vocabulary words; position i holds the i-th word read from the file.
target_voc = np.array(vocab_words)

The next step is to ensure that we have embeddings for all words in our target vocabulary. To do this, we'll loop through each word in `target_voc`, and if the word appears in our glove dictionary with an embedding, we link that same embedding in the new `target_embeddings` dictionary. If not, we assign a normally distributed random vector of dimension EMBED_SIZE. `target_voc` is a `numpy` array where each element is a word that appears in our training/testing data.

In [21]:
#TODO: Remove me
np.random.seed(0)
tgt_vocab_len = len(target_voc)

# Map every target word to an embedding: the GloVe vector when one exists,
# otherwise a freshly drawn normal vector (std 0.6) of length EMBED_SIZE.
target_embeddings = {}
for vocab_word in target_voc:
    pretrained = glove.get(vocab_word)
    if pretrained is None:
        # Word unknown to GloVe: draw a random stand-in embedding.
        pretrained = np.random.normal(scale=0.6, size=(EMBED_SIZE, ))
    target_embeddings[vocab_word] = pretrained

Create a dictionary `target_to_index` from a target word to its corresponding index (does the inverse of calling `target_voc[i]`):

In [22]:
# Reverse lookup for target_voc: word -> its position in the array.
target_to_index = {word: position for position, word in enumerate(target_voc)}