## Setting  Up Dictionary

Functions to set up a vocab of words and perform simple preprocessing on the given dataset

In [1]:
import torch
import gensim 
import numpy as np

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0
    
    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
    
    def __len__(self):
        return len(self.word2idx)
    
class Corpus(object):
    def __init__(self):
        self.dictionary = Dictionary()
        # words_found += 1
    def get_data(self, path, batch_size=20):
        # Add words to the dictionary
        with open(path, 'r') as f:
            lines = f.readlines()
        tokens = 0
        # process sentences to tokens
        processedLines = [gensim.utils.simple_preprocess(sentence,min_len=1) for sentence in lines]
        #create word list from token using utf8 encoding
        for words in processedLines : 
            tokens += len(words) 
            for word in words : 
                self.dictionary.add_word(word)  
        
        # Tokenize the file content
        ids = torch.LongTensor(tokens)
        token = 0
        with open(path, 'r') as f:
            lines = f.readlines()
        processedLines = [gensim.utils.simple_preprocess(sentence,min_len=1) for sentence in lines]
        #create word list from token using utf8 encoding
        for words in processedLines :
            for word in words:
                ids[token] = self.dictionary.word2idx[word]
                token += 1
        
        return ids
        

In [2]:
UNK_TOKEN = "<UNK>"

## Glove Embeddings
using pretrained glove embeddings to map words to vectors 

In [49]:
#setup Glove word embeddings
with open('glove.6B/glove.6B.50d.txt', 'r') as f:
    lines = f.readlines()

glove = dict()

for line in lines:
    items = line.split()
    word = items[0]
    vector = torch.FloatTensor(list(map(float, items[1:])))
    glove[word] = vector

## Setting Up Embeddings In Pytorch
  

In [48]:
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# make dictionary
corpus = Corpus()
ids = corpus.get_data('./brown.txt', 20)
corpus.dictionary.add_word(UNK_TOKEN)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1)

print(vocab_size)
#weights for embedding layer

weights_matrix = torch.zeros(vocab_size, 50)
words_found = 0

for i in range(vocab_size-1):
    # print(corpus.dictionary.idx2word[word])
    try:
        weights_matrix[i] = glove[corpus.dictionary.idx2word[i]]
        words_found += 1
    except KeyError:
        weights_matrix[i] = torch.from_numpy(np.random.normal(scale=0.6, size=(50, )))
weights_matrix[vocab_size-1] =  torch.mean(weights_matrix, 0)

cpu
41455


## Q1 Neural Network Model

In [5]:
# Embedding layer
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim


# Neural Network Model

class language_model(nn.Module):
    def __init__(self, weights_matrix, hidden_size, vocab_size):
        super(language_model, self).__init__()
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.hidden_size = hidden_size
        self.linear = nn.Linear( 200,hidden_size)
        self.linear2 = nn.Linear( hidden_size, vocab_size)
        self.sf = nn.Softmax(dim=0)
       
    def forward(self, inp):
        out = self.embedding(inp)
        out1 = torch.concat([out[i] for i in range(out.size(0))])
        out2 = self.linear(out1)
        out3 = self.linear2(out2)
        out4 = self.sf(out3)
        return out4
        # return self.gru(self.embedding(inp), hidden)
    
    def init_hidden(self, batch_size):
        return Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))

In [33]:
model = language_model(weights_matrix, hidden_size=300, vocab_size=vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 5


In [46]:
for epoch in epochs:
    with open('./brown.txt', 'r') as f:
        lines = f.readlines()
    processedLines = [gensim.utils.simple_preprocess(sentence,min_len=1) for sentence in lines]
    

    for i in range(0, ids.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = ids[:, i:i+seq_length].to(device)
        targets = ids[:, (i+1):(i+1)+seq_length].to(device)
    inp = ["I" , "eat" , "a" , "red"]
    x = []
    for word in inp:
        try:
            x.append(corpus.dictionary.word2idx[word.lower()])
        except:
            x.append(corpus.dictionary.word2idx[UNK_TOKEN])

    x = torch.LongTensor(x)
    output = model(x)
    output


# print(output.size())
# corpus.dictionary.idx2word[int(torch.argmax(output).data)]

tensor([2.8963e-05, 2.6875e-05, 1.8563e-05,  ..., 2.3017e-05, 1.7878e-05,
        2.2275e-05], grad_fn=<SoftmaxBackward0>)