In [1]:
# import pytorch libraries
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from torch.utils.data import Dataset, DataLoader

# Learning the Continuous Bag of Words Embeddings (CBOW)

## Get data
Project Gutenberg’s The Complete Works of William Shakespeare, by William
Shakespeare

In [2]:
def get_dataset():
    ! wget http://www.gutenberg.org/files/100/100-0.txt
    ! mkdir data
    ! mkdir models
    ! mv 100-0.txt data

In [4]:
#get_dataset()

In [5]:
!ls data

100-0.txt           glove.6B.200d.txt   glove.6B.zip        quote.tok.gt9.5000
11-0.txt            glove.6B.300d.txt   pg5200.txt          subjdata.README.1.0
glove.6B.100d.txt   glove.6B.50d.txt    plot.tok.gt9.5000


In [6]:
! head -2 data/100-0.txt

﻿
Project Gutenberg’s The Complete Works of William Shakespeare, by William


In [7]:
from pathlib import Path
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/glove.6B.300d.txt'),
 PosixPath('data/glove.6B.100d.txt'),
 PosixPath('data/100-0.txt'),
 PosixPath('data/glove.6B.50d.txt'),
 PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/11-0.txt'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/quote.tok.gt9.5000'),
 PosixPath('data/glove.6B.200d.txt'),
 PosixPath('data/glove.6B.zip'),
 PosixPath('data/pg5200.txt')]

## Tokenization
Tokenization is the task of chopping up text into pieces, called tokens.

spaCy is an open-source software library for advanced Natural Language Processing. Here we will use it for tokenization.  

In [8]:
filename = PATH/"100-0.txt"
file = open(filename, 'rt')
text = file.read()
file.close()

In [9]:
import re
def clean_up_split(line):
    return re.split(r'\s+|[,;.-:\n]\s*', line.lower())

In [10]:
text_split = clean_up_split(text)
text_split[:20]

['\ufeff',
 'project',
 'gutenberg’s',
 'the',
 'complete',
 'works',
 'of',
 'william',
 'shakespeare',
 'by',
 'william',
 'shakespeare',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone']

## Word to index mapping
In interest of time we will tokenize without spaCy. Here we will compute a vocabulary of words based on the training set and a mapping from word to an index.

In [11]:
from collections import defaultdict

In [12]:
def get_vocab(text_split):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for word in text_split:
        vocab[word] += 1
    return vocab      

In [13]:
#Getting the vocabulary from the training set
word_count = get_vocab(text_split)

In [14]:
#word_count

In [15]:
len(word_count.keys())

42434

In [19]:
# let's delete words that are very infrequent
for word in list(word_count):
    if word_count[word] < 10:
        del word_count[word]

In [20]:
len(word_count.keys())

6284

In [21]:
## Finally we need an index for each word in the vocab
vocab2index = {"UNK":0} # init with padding and unknown
words = ["UNK"]
for word in word_count:
    vocab2index[word] = len(words)
    words.append(word)

In [22]:
#vocab2index

## From text to list of ids

In [23]:
index_content = [vocab2index.get(x, 0) for x in text_split]

In [24]:
N = int(0.8*len(index_content))
N

773421

In [25]:
train_content = index_content[:N]
valid_content = index_content[N:]
len(train_content), len(valid_content)

(773421, 193356)

In [26]:
train_content[100:110]

[6, 7, 55, 6, 7, 0, 56, 0, 57, 57]

## Dataset
Given the list of index in text create a dataset such that y is the center word and x is the context of that word. Here is an example. Given `[6, 7, 58, 6, 7, 59, 60]` this is the result of your dataset.

`x = [6, 7, 6, 7], y = 58` <br>
`x = [7, 58, 7, 59], y = 6` <br>
`x = [58, 6, 59, 60], y = 7` <br>

In [27]:
class CBOWDataset(Dataset):
    def __init__(self, index_content, k=2):
        self.text_index = index_content
        self.k = k
    
    def __len__(self):
        return 
    
    def __getitem__(self, idx):
        
        
        return 

In [28]:
small_context = [6, 7, 58, 6, 7, 59, 60]

In [29]:
small_ds = CBOWDataset(small_context)

In [30]:
small_ds[0], small_ds[1], small_ds[2]

((array([6, 7, 6, 7]), 58),
 (array([ 7, 58,  7, 59]), 6),
 (array([58,  6, 59, 60]), 7))

In [31]:
len(small_ds)

3

## Continuous Bag of Words Model for training embeddings

In [32]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, emb_size=50):
        super(CBOW, self).__init__()
        self.word_emb = nn.Embedding(   )
        self.linear = nn.Linear(  )
        
    def forward(self, x):
        
        
        
        return x

# Training the CBOW model 

In [33]:
V = len(words)
model = CBOW(vocab_size=V, emb_size=50)
print(V)

6285


In [34]:
train_ds = CBOWDataset(train_content)
valid_ds = CBOWDataset(valid_content)
train_dl = DataLoader(train_ds, batch_size=10000, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=10000)

In [74]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [75]:
def val_metrics(model):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x, y in valid_dl:
        batch = y.shape[0]
        out = model(x.long())
        loss = F.cross_entropy(out, y)
        sum_loss += batch*(loss.item())
        total += batch
        pred = torch.max(out, dim=1)[1]
        correct += (pred == y).float().sum().item()
    val_loss = sum_loss/total
    val_acc = correct/total
    return val_loss, val_acc

In [80]:
def train_epocs(model, optimizer, epochs=10):
    prev_val_acc = 0
    for i in range(epochs):
        total_loss = 0
        total = 0
        model.train()
        for x, y in train_dl:
            y_hat = model(x.long())
            loss = F.cross_entropy(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += x.size(0)*loss.item()
            total += x.size(0)
        train_loss = total_loss/total
        val_loss, val_accuracy = val_metrics(model)
        
        print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (
            train_loss, val_loss, val_accuracy))
        
        # save model
        if val_accuracy > prev_val_acc:
            prev_val_acc = val_accuracy
            path = "models/embedding_{0:.1f}.pth".format(100*val_accuracy) 
            if val_accuracy > 0.12:
                save_model(model, path)
                print(path)

In [81]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [82]:
model = CBOW(vocab_size=V, emb_size=50)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
train_epocs(model, optimizer, epochs=10)

In [None]:
set_learning_rate(optimizer, 0.005)
train_epocs(model, optimizer, epochs=10)

In [162]:
## approximate nearest neighbour library 
##pip install --user annoy

In [44]:
from annoy import AnnoyIndex

In [47]:
W = model.word_emb.weight
W = W.detach().numpy()
W.shape

(6285, 50)

In [48]:
t = AnnoyIndex(50, "euclidean")
for i in range(W.shape[0]):
    t.add_item(i, W[i])
t.build(10) # 10 trees
t.save('W.ann')

True

In [52]:
t = AnnoyIndex(50, "euclidean")
t.load('W.ann')
t.get_nns_by_item(1, 4)

[1, 388, 2642, 635]

In [65]:
def gen_nns(w, t=t, k=2):
    ind = vocab2index[w]
    return [words[x] for x in t.get_nns_by_item(ind, k)]

In [66]:
gen_nns("beauty")

['beauty', 'passion']

In [67]:
gen_nns("he")

['he', 'she']

In [68]:
gen_nns("lovely")

['lovely', 'fair']

In [71]:
gen_nns("summer")

['summer', 'shadows']