# Word Embeddings with a language model
This model is a version of the one introduced in 2003 by Bengio et all [here](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf). 

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random
import os

## Data wikitext-2
The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia.

The data can be dowloaded here.
`https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip`

In [2]:
PATH=Path("/data/yinterian/wikitext-2")
list(PATH.iterdir())

[PosixPath('/data/yinterian/wikitext-2/wiki.train.tokens'),
 PosixPath('/data/yinterian/wikitext-2/wiki.valid.tokens'),
 PosixPath('/data/yinterian/wikitext-2/wiki.test.tokens'),
 PosixPath('/data/yinterian/wikitext-2/model20.pth'),
 PosixPath('/data/yinterian/wikitext-2/model10.pth'),
 PosixPath('/data/yinterian/wikitext-2/model.pth'),
 PosixPath('/data/yinterian/wikitext-2/mode117.pth')]

In [3]:
!head -4 /data/yinterian/wikitext-2/wiki.train.tokens

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 


## Tokenization / get vocab

In [4]:
def read_file(path):
    """ Read file returns a list of lines.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = f.readlines()
    return content

In [5]:
train_lines = read_file(PATH/'wiki.train.tokens')
valid_lines = read_file(PATH/'wiki.valid.tokens')

In [6]:
len(train_lines), len(valid_lines)

(36718, 3760)

In [7]:
train_lines[10]

' The game \'s battle system , the <unk> system , is carried over directly from <unk> Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action <unk> . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills unique to each character . They are divided into " Personal Potential " , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede a character

In [8]:
from collections import defaultdict

def get_vocab(content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for line in content:
        words = set(line.split())
        for word in words:
            vocab[word] += 1
    return vocab

In [9]:
vocab = get_vocab(train_lines)

In [10]:
len(vocab)

33280

In [11]:
#vocab

## Creating a word2index

In [12]:
vocab2index = {}
words = []
for word in vocab:
    vocab2index[word] = len(words)
    words.append(word)

In [13]:
#vocab2index

## Encoding datasets

In [14]:
# there is an "unk" already
vocab2index['<unk>'] #[y for y in x for x in non_flat]

36

In [15]:
train_content = np.array([vocab2index.get(w, vocab2index["<unk>"]) for line in train_lines for w in line.split()])

In [16]:
val_content = np.array([vocab2index.get(w, vocab2index["<unk>"]) for line in valid_lines for w in line.split()])

In [17]:
train_content.shape, val_content.shape

((2051961,), (213886,))

In [18]:
train_content[:30]

array([ 3,  2,  0,  1,  3, 46, 66,  2, 53, 27, 36,  0, 76,  5, 27, 79, 65,
       47, 72, 37,  2, 18, 28, 54, 53, 38, 47, 39, 61, 52])

## Dataset
Example say we have this dataset and we are using a window size of window=3.

Raw Dataset:
`the cat is walking in the bedroom`

`x                  y 
the cat is         walking 
cat is walking     in 
is walking in      the 
walking in the     bedroom` 

In [19]:
class WikiDataset(Dataset):
    def __init__(self, content, window=5):
        self.content = content
        self.window = window
        self.len = len(self.content) - self.window
    
    def __len__(self):
        return self.len
    
    def __getitem__(self, idx):
        x = self.content[idx : idx + self.window]
        y = self.content[idx + self.window]
        return x, y

In [20]:
dataset = WikiDataset([0,1,2,3,4,5,6], window=3)

In [21]:
print(len(dataset))
for i in range(4): 
    print(dataset[i])

4
([0, 1, 2], 3)
([1, 2, 3], 4)
([2, 3, 4], 5)
([3, 4, 5], 6)


In [22]:
train_ds = WikiDataset(train_content, window=5)
val_ds = WikiDataset(val_content, window=5)

In [23]:
batch_size = 4 # testing model
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

## Model

In [24]:
class LangModel(nn.Module):
    """Lang Model that needs to be regularized."""

    def __init__(self, V, D, hidden, window=5, dropout=0.5):
        super(LangModel, self).__init__()
        self.word_emb = nn.Embedding(V, D)
        self.linear1 = nn.Linear(window*D, hidden)
        self.dense_bn1 = nn.BatchNorm1d(hidden)
        self.linear2 = nn.Linear(hidden, V)

    def forward(self, x):
        x = self.word_emb(x)
        x = x.view(x.shape[0], -1)
        x = self.linear1(x)
        x = F.relu(self.dense_bn1(x))
        x = self.linear2(x)
        return x

In [25]:
x, y = next(iter(train_dl))

In [26]:
x, y

(tensor([[   92,  1785, 15820,    47,   234],
         [   37,   108, 29375,   449,    18],
         [ 5177,    82, 15644,    47,  6814],
         [   36,    47,    82,   843,  9748]]),
 tensor([ 479, 4287,   47,  690]))

In [27]:
x.size(0)

4

In [28]:
V = len(vocab)
D = 3 # for testing

In [29]:
word_emb = nn.Embedding(V, D)

In [30]:
x1 = word_emb(x)
x1

tensor([[[-0.2512, -0.5158, -0.2631],
         [ 0.5185, -1.3206, -1.0044],
         [ 0.8041, -0.9139,  0.3286],
         [-0.5446, -0.2846,  0.0880],
         [ 0.5323,  0.9674, -0.3900]],

        [[-1.1816, -0.0356, -0.0244],
         [-1.8873, -0.5559, -2.4781],
         [-1.2179,  0.0639,  0.5769],
         [ 1.7267,  2.3174, -1.7062],
         [-2.8953, -0.4923,  0.2094]],

        [[-0.5084, -0.0363, -0.0090],
         [-0.4232, -2.1233, -0.4622],
         [ 0.3782,  0.7960,  0.0047],
         [-0.5446, -0.2846,  0.0880],
         [ 2.5962,  0.1487,  0.3901]],

        [[ 0.2585,  0.4235, -0.5735],
         [-0.5446, -0.2846,  0.0880],
         [-0.4232, -2.1233, -0.4622],
         [-0.1327,  0.4522,  0.6747],
         [ 0.0530,  1.0400, -0.0339]]], grad_fn=<EmbeddingBackward>)

In [31]:
x2 = x1.view(x1.shape[0], -1)
x2

tensor([[-0.2512, -0.5158, -0.2631,  0.5185, -1.3206, -1.0044,  0.8041, -0.9139,
          0.3286, -0.5446, -0.2846,  0.0880,  0.5323,  0.9674, -0.3900],
        [-1.1816, -0.0356, -0.0244, -1.8873, -0.5559, -2.4781, -1.2179,  0.0639,
          0.5769,  1.7267,  2.3174, -1.7062, -2.8953, -0.4923,  0.2094],
        [-0.5084, -0.0363, -0.0090, -0.4232, -2.1233, -0.4622,  0.3782,  0.7960,
          0.0047, -0.5446, -0.2846,  0.0880,  2.5962,  0.1487,  0.3901],
        [ 0.2585,  0.4235, -0.5735, -0.5446, -0.2846,  0.0880, -0.4232, -2.1233,
         -0.4622, -0.1327,  0.4522,  0.6747,  0.0530,  1.0400, -0.0339]],
       grad_fn=<ViewBackward>)

In [32]:
x2.shape

torch.Size([4, 15])

## Testing model

In [33]:
V = len(vocab)
D = 50
model = LangModel(V, D, hidden=15).cuda()

In [34]:
x, y = next(iter(train_dl))

In [35]:
x = x.cuda()

In [36]:
y_hat = model(x)

In [37]:
y_hat

tensor([[-0.0943,  0.0098, -0.1051,  ..., -0.1925,  0.2966,  0.0325],
        [-0.3251, -0.0107, -0.4621,  ..., -0.1111,  0.4544, -0.1570],
        [-0.1425,  0.2231,  0.1544,  ..., -0.2237,  0.2928, -0.1015],
        [-0.2112,  0.0486, -0.1693,  ..., -0.0402, -0.0255,  0.1471]],
       device='cuda:0', grad_fn=<AddmmBackward>)

## Training

`F.cross_entropy` combines `log_softmax` and `nll_loss` in a single function.

In [38]:
batch_size = 5000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

In [39]:
def train_epocs(model, epochs=10, lr=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(epochs):
        model.train()
        total = 0
        total_loss = 0.
        for x, y in train_dl:
            x = torch.LongTensor(x).cuda()
            y = torch.LongTensor(y).cuda()
            y_hat = model(x)
            loss = F.cross_entropy(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss +=  x.size(0)* loss.item()
            total += x.size(0)
        val_loss = val_metrics(model)
        print("train_loss %.3f val_loss %.3f perplexity %.3f" % (total_loss/total, 
                                                                 val_loss, math.exp(val_loss)))

In [40]:
import math
def val_metrics(model):
    model.eval()
    total = 0
    total_loss = 0.
    for x,y in val_dl:
        x = torch.LongTensor(x).cuda()
        y = torch.LongTensor(y).cuda()
        y_hat = model(x)
        loss = F.cross_entropy(y_hat, y)
        total_loss +=  x.size(0)* loss.item()
        total += x.size(0)
    return total_loss/ total

In [41]:
V = len(vocab)
D = 50
model = LangModel(V, D, hidden=15).cuda()

In [42]:
val_metrics(model)

10.421753756367842

In [43]:
train_epocs(model, epochs=10, lr=0.01)

train_loss 6.586 val_loss 5.883 perplexity 358.788
train_loss 5.830 val_loss 5.724 perplexity 306.236
train_loss 5.622 val_loss 5.675 perplexity 291.376
train_loss 5.495 val_loss 5.653 perplexity 285.217
train_loss 5.404 val_loss 5.648 perplexity 283.629
train_loss 5.333 val_loss 5.655 perplexity 285.705
train_loss 5.275 val_loss 5.663 perplexity 287.979
train_loss 5.226 val_loss 5.672 perplexity 290.484
train_loss 5.184 val_loss 5.689 perplexity 295.519
train_loss 5.146 val_loss 5.700 perplexity 298.914


In [44]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [45]:
save_model(model, PATH/"model10.pth")

In [46]:
load_model(model, PATH/"model10.pth")

In [47]:
train_epocs(model, epochs=10, lr=0.001)

train_loss 5.035 val_loss 5.721 perplexity 305.320
train_loss 5.008 val_loss 5.744 perplexity 312.243
train_loss 4.997 val_loss 5.757 perplexity 316.547
train_loss 4.989 val_loss 5.771 perplexity 320.755
train_loss 4.982 val_loss 5.781 perplexity 324.137
train_loss 4.976 val_loss 5.788 perplexity 326.409
train_loss 4.971 val_loss 5.805 perplexity 331.821
train_loss 4.966 val_loss 5.812 perplexity 334.325
train_loss 4.962 val_loss 5.817 perplexity 336.089
train_loss 4.958 val_loss 5.825 perplexity 338.750


In [48]:
save_model(model, PATH/"model20.pth")