In [1]:
import nltk
import csv
from nltk.corpus import brown
from nltk.corpus import wordnet

nltk.download("brown")
nltk.download("wordnet")

len(brown.paras())

[nltk_data] Downloading package brown to /home/yizhouw/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yizhouw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


15667

In [2]:
num_train = 12000
UNK_symbol = '<UNK>'
vocab = set([UNK_symbol])

# create brown corpus again with all words
# no preprocessing, only lowercase
brown_corpus_train = []
for idx, paragraph in enumerate(brown.paras()):
    if idx == num_train:
        break
    words = []
    for sentence in paragraph:
        for word in sentence:
            words.append(word.lower())
    brown_corpus_train.append(words)
    
# create term frequency of the words
words_term_frequency_train = {}
for doc in brown_corpus_train:
    for word in doc:
        words_term_frequency_train[word] = words_term_frequency_train.get(word,0) + 1

# create vocabulary
for doc in brown_corpus_train:
    for word in doc:
        if words_term_frequency_train.get(word, 0) >= 5:
            vocab.add(word)

len(vocab)  # 12681
            # vocab has no dup

12681

In [3]:
import numpy as np 
x_train = []
y_train = []
x_dev = []
y_dev = []


# create word to id mappings
word_to_id_mappings = {}
for idx, word in enumerate(vocab):
    word_to_id_mappings[word] = idx

    
# function to get id for a given word
# return <UNK> id if not found
def get_id_of_word(word):
    unknown_word_id = word_to_id_mappings['<UNK>']
    return word_to_id_mappings.get(word, unknown_word_id)
                              
   
# create training and dev set
for idx, paragraph in enumerate(brown.paras()):   
    for sentence in paragraph:
        for i, word in enumerate(sentence):
            if i + 2 >= len(sentence):
                # sentence boundary reached
                # ignoring sentence less than 3 words
                break
            x_extract = [get_id_of_word(word.lower), get_id_of_word(sentence[i+1].lower())]
            y_extract = [get_id_of_word(sentence[i+2].lower())]
                                   
            if idx < num_train:
                x_train.append(x_extract)
                y_train.append(y_extract)
            else: 
                x_dev.append(x_extract)
                y_dev.append(y_extract)
                
x_train = np.array(x_train)
y_train = np.array(y_train)
x_dev   = np.array(x_dev)
y_dev   = np.array(y_dev)

print(x_train.shape)
print(y_train.shape)
print(x_dev.shape)
print(y_dev.shape)


(872823, 2)
(872823, 1)
(174016, 2)
(174016, 1)


In [5]:
# load libraries
import torch
import multiprocessing
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import time

# Trigram Neural Network Model
class TrigramNNmodel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, h):
        super(TrigramNNmodel, self).__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, h)
        self.linear2 = nn.Linear(h, vocab_size, bias = False)

    def forward(self, inputs):
        # compute x': concatenation of x1 and x2 embeddings
        embeds = self.embeddings(inputs).view((-1,self.context_size * self.embedding_dim))
        # compute h: tanh(W_1.x' + b)
        out = torch.tanh(self.linear1(embeds))
        # compute W_2.h
        out = self.linear2(out)
        # compute y: log_softmax(W_2.h)
        log_probs = F.log_softmax(out, dim=1)
        # return log probabilities
        # BATCH_SIZE x len(vocab)
        return log_probs

In [6]:
# create parameters
gpu = 0 
# word vectors size
EMBEDDING_DIM = 200
CONTEXT_SIZE = 2
BATCH_SIZE = 256
# hidden units
H = 100
torch.manual_seed(13013)

# check if gpu is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
available_workers = multiprocessing.cpu_count()

print("--- Creating training and dev dataloaders with {} batch size ---".format(BATCH_SIZE))
train_set = np.concatenate((x_train, y_train), axis=1)
dev_set = np.concatenate((x_dev, y_dev), axis=1)
train_loader = DataLoader(train_set, batch_size = BATCH_SIZE, num_workers = available_workers)
dev_loader = DataLoader(dev_set, batch_size = BATCH_SIZE, num_workers = available_workers)

--- Creating training and dev dataloaders with 256 batch size ---


In [7]:
# helper function to get accuracy from log probabilities
def get_accuracy_from_log_probs(log_probs, labels):
    probs = torch.exp(log_probs)
    predicted_label = torch.argmax(probs, dim=1)
    acc = (predicted_label == labels).float().mean()
    return acc

# helper function to evaluate model on dev data
def evaluate(model, criterion, dataloader, gpu):
    model.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        dev_st = time.time()
        for it, data_tensor in enumerate(dataloader):
            context_tensor = data_tensor[:,0:2]
            target_tensor = data_tensor[:,2]
            context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)
            log_probs = model(context_tensor)
            mean_loss += criterion(log_probs, target_tensor).item()
            mean_acc += get_accuracy_from_log_probs(log_probs, target_tensor)
            count += 1
            if it % 500 == 0: 
                print("Dev Iteration {} complete. Mean Loss: {}; Mean Acc:{}; Time taken (s): {}".format(it, mean_loss / count, mean_acc / count, (time.time()-dev_st)))
                dev_st = time.time()

    return mean_acc / count, mean_loss / count

In [149]:
# Using negative log-likelihood loss
loss_function = nn.NLLLoss()

# create model
model = TrigramNNmodel(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE, H)

# load it to gpu
model.cuda(gpu)

# using ADAM optimizer
optimizer = optim.Adam(model.parameters(), lr = 2e-3)


# ------------------------- TRAIN & SAVE MODEL ------------------------
best_acc = 0
best_model_path = None
for epoch in range(20):
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch+1))
    for it, data_tensor in enumerate(train_loader):       
        context_tensor = data_tensor[:,0:2]
        target_tensor = data_tensor[:,2]

        context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)

        # zero out the gradients from the old instance
        model.zero_grad()

        # get log probabilities over next words
        log_probs = model(context_tensor)

        # calculate current accuracy
        acc = get_accuracy_from_log_probs(log_probs, target_tensor)

        # compute loss function
        loss = loss_function(log_probs, target_tensor)

        # backward pass and update gradient
        loss.backward()
        optimizer.step()

        if it % 500 == 0: 
            print("Training Iteration {} of epoch {} complete. Loss: {}; Acc:{}; Time taken (s): {}".format(it, epoch, loss.item(), acc, (time.time()-st)))
            st = time.time()

    print("\n--- Evaluating model on dev data ---")
    dev_acc, dev_loss = evaluate(model, loss_function, dev_loader, gpu)
    print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(epoch, dev_acc, dev_loss))
    if dev_acc > best_acc:
        print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
        best_acc = dev_acc
        # set best model path
        best_model_path = 'best_model_{}.dat'.format(epoch)
        # saving best model
        torch.save(model.state_dict(), best_model_path)


--- Training model Epoch: 1 ---
Training Iteration 0 of epoch 0 complete. Loss: 9.512959480285645; Acc:0.0; Time taken (s): 0.4376358985900879
Training Iteration 500 of epoch 0 complete. Loss: 6.362466335296631; Acc:0.12109375; Time taken (s): 0.7990531921386719
Training Iteration 1000 of epoch 0 complete. Loss: 6.119363307952881; Acc:0.140625; Time taken (s): 0.7724261283874512
Training Iteration 1500 of epoch 0 complete. Loss: 6.059145450592041; Acc:0.11328125; Time taken (s): 0.7673954963684082
Training Iteration 2000 of epoch 0 complete. Loss: 5.998517036437988; Acc:0.11328125; Time taken (s): 0.7977237701416016
Training Iteration 2500 of epoch 0 complete. Loss: 6.223663330078125; Acc:0.12109375; Time taken (s): 0.7737562656402588
Training Iteration 3000 of epoch 0 complete. Loss: 5.791821002960205; Acc:0.14453125; Time taken (s): 0.7659680843353271

--- Evaluating model on dev data ---
Dev Iteration 0 complete. Mean Loss: 5.0850114822387695; Mean Acc:0.17578125; Time taken (s): 0

Training Iteration 2000 of epoch 6 complete. Loss: 5.132070541381836; Acc:0.1484375; Time taken (s): 0.7726919651031494
Training Iteration 2500 of epoch 6 complete. Loss: 5.074798107147217; Acc:0.1875; Time taken (s): 0.7776980400085449
Training Iteration 3000 of epoch 6 complete. Loss: 4.740074634552002; Acc:0.22265625; Time taken (s): 0.7776341438293457

--- Evaluating model on dev data ---
Dev Iteration 0 complete. Mean Loss: 5.252227783203125; Mean Acc:0.1875; Time taken (s): 0.4313945770263672
Dev Iteration 500 complete. Mean Loss: 5.349879317178935; Mean Acc:0.161208838224411; Time taken (s): 0.43974781036376953
Epoch 6 complete! Development Accuracy: 0.16123047471046448; Development Loss: 5.363751313966863

--- Training model Epoch: 8 ---
Training Iteration 0 of epoch 7 complete. Loss: 5.413658618927002; Acc:0.15625; Time taken (s): 0.44055867195129395
Training Iteration 500 of epoch 7 complete. Loss: 5.082083225250244; Acc:0.18359375; Time taken (s): 0.7970316410064697
Training

Training Iteration 0 of epoch 13 complete. Loss: 5.162001132965088; Acc:0.16015625; Time taken (s): 0.43438196182250977
Training Iteration 500 of epoch 13 complete. Loss: 4.908546447753906; Acc:0.1875; Time taken (s): 0.8007111549377441
Training Iteration 1000 of epoch 13 complete. Loss: 4.8617753982543945; Acc:0.23828125; Time taken (s): 0.7831194400787354
Training Iteration 1500 of epoch 13 complete. Loss: 5.111323833465576; Acc:0.1484375; Time taken (s): 0.7764487266540527
Training Iteration 2000 of epoch 13 complete. Loss: 5.000035285949707; Acc:0.16015625; Time taken (s): 0.7979497909545898
Training Iteration 2500 of epoch 13 complete. Loss: 4.8728156089782715; Acc:0.1953125; Time taken (s): 0.8000571727752686
Training Iteration 3000 of epoch 13 complete. Loss: 4.711226463317871; Acc:0.234375; Time taken (s): 0.7859194278717041

--- Evaluating model on dev data ---
Dev Iteration 0 complete. Mean Loss: 5.444770812988281; Mean Acc:0.1796875; Time taken (s): 0.4438636302947998
Dev It


--- Evaluating model on dev data ---
Dev Iteration 0 complete. Mean Loss: 5.420502185821533; Mean Acc:0.16796875; Time taken (s): 0.4417452812194824
Dev Iteration 500 complete. Mean Loss: 5.569921283188932; Mean Acc:0.15869823098182678; Time taken (s): 0.427410364151001
Epoch 19 complete! Development Accuracy: 0.15790633857250214; Development Loss: 5.5775646665517025


In [56]:
# ---------------------- Loading Best Model -------------------
best_model = TrigramNNmodel(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE, H)
best_model.load_state_dict(torch.load(best_model_path))
best_model.cuda(gpu)

cos = nn.CosineSimilarity(dim=0)

lm_similarities = {}

# word pairs to calculate similarity
words = {('computer','keyboard'),('cat','dog'),('dog','car'),('keyboard','cat')}

# ----------- Calculate LM similarities using cosine similarity ----------
for word_pairs in words:
    w1 = word_pairs[0]
    w2 = word_pairs[1]
    words_tensor = torch.LongTensor([get_id_of_word(w1),get_id_of_word(w2)])
    words_tensor = words_tensor.cuda(gpu)
    # get word embeddings from the best model
    words_embeds = best_model.embeddings(words_tensor)
    # calculate cosine similarity between word vectors
    sim = cos(words_embeds[0],words_embeds[1])
    lm_similarities[word_pairs] = sim.item()

print(lm_similarities)

{('computer', 'keyboard'): 0.008548855781555176, ('cat', 'dog'): -0.07967369258403778, ('dog', 'car'): 0.17890217900276184, ('keyboard', 'cat'): 0.01506723277270794}


In [160]:
# ---------------------- Loading Best Model -------------------
best_model = TrigramNNmodel(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE, H)
best_model.load_state_dict(torch.load(best_model_path))
best_model.cuda(gpu)

# predict word
context_tensor = torch.tensor([get_id_of_word('a'), get_id_of_word('lot')])

context_tensor = context_tensor.cuda(gpu)

# get log probabilities over next words
log_probs = best_model(context_tensor)

ix = torch.argmax(log_probs)
result = id_to_word_mappings[ix.item()]

print(result)




.
