# Word Embeddings with a language model
This model is a version of the one introduced in 2003 by Bengio et all [here](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf). 

In [130]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random
import os

## Data wikitext-2
The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia.

The data can be dowloaded here.
`https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip`

In [131]:
PATH=Path("/data/yinterian/wikitext-2")
list(PATH.iterdir())

[PosixPath('/data/yinterian/wikitext-2/wiki.train.tokens'),
 PosixPath('/data/yinterian/wikitext-2/wiki.valid.tokens'),
 PosixPath('/data/yinterian/wikitext-2/wiki.test.tokens'),
 PosixPath('/data/yinterian/wikitext-2/model20.pth'),
 PosixPath('/data/yinterian/wikitext-2/model10.pth'),
 PosixPath('/data/yinterian/wikitext-2/model.pth'),
 PosixPath('/data/yinterian/wikitext-2/mode117.pth')]

In [132]:
!head -4 /data/yinterian/wikitext-2/wiki.train.tokens

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 


## Tokenization / get vocab

In [133]:
def read_file(path):
    """ Read file returns a list of lines.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = f.readlines()
    return content

In [134]:
train_lines = read_file(PATH/'wiki.train.tokens')
valid_lines = read_file(PATH/'wiki.valid.tokens')

In [135]:
len(train_lines), len(valid_lines)

(36718, 3760)

In [136]:
train_lines[10]

' The game \'s battle system , the <unk> system , is carried over directly from <unk> Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action <unk> . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills unique to each character . They are divided into " Personal Potential " , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede a character

In [137]:
from collections import defaultdict

def get_vocab(content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for line in content:
        words = set(line.split())
        for word in words:
            vocab[word] += 1
    return vocab

In [138]:
vocab = get_vocab(train_lines)

In [139]:
len(vocab)

33280

In [140]:
#vocab

## Creating a word2index

In [141]:
vocab2index = {}
words = []
for word in vocab:
    vocab2index[word] = len(words)
    words.append(word)

In [143]:
#vocab2index

## Encoding datasets

In [144]:
# there is an "unk" already
vocab2index['<unk>'] #[y for y in x for x in non_flat]

19

In [145]:
train_content = np.array([vocab2index.get(w, vocab2index["<unk>"]) for line in train_lines for w in line.split()])

In [146]:
val_content = np.array([vocab2index.get(w, vocab2index["<unk>"]) for line in valid_lines for w in line.split()])

In [147]:
train_content.shape, val_content.shape

((2051961,), (213886,))

In [148]:
train_content[:30]

array([ 3,  1,  0,  2,  3, 49, 54,  1,  9, 52, 19,  0, 18, 39, 52, 46, 70,
       15, 27,  5,  1, 34, 65, 40,  9, 35, 15, 50, 64,  7])

## Dataset
Example say we have this dataset and we are using a window size of window=3.

Raw Dataset:
`the cat is walking in the bedroom`

`x                  y 
the cat is         walking 
cat is walking     in 
is walking in      the 
walking in the     bedroom` 

In [149]:
class WikiDataset(Dataset):
    def __init__(self, content, window=5):
        self.content = content
        self.window = window
        self.len = len(self.content) - self.window
    
    def __len__(self):
        return self.len
    
    def __getitem__(self, idx):
        x = self.content[idx : idx + self.window]
        y = self.content[idx + self.window]
        return x, y

In [150]:
dataset = WikiDataset([0,1,2,3,4,5,6], window=3)

In [151]:
print(len(dataset))
for i in range(4): 
    print(dataset[i])

4
([0, 1, 2], 3)
([1, 2, 3], 4)
([2, 3, 4], 5)
([3, 4, 5], 6)


In [152]:
train_ds = WikiDataset(train_content, window=5)
val_ds = WikiDataset(val_content, window=5)

In [153]:
batch_size = 4 # testing model
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

## Model

In [154]:
class LangModel(nn.Module):
    """Lang Model that needs to be regularized."""

    def __init__(self, V, D, hidden, window=5, dropout=0.5):
        super(LangModel, self).__init__()
        self.word_emb = nn.Embedding(V, D)
        self.linear1 = nn.Linear(window*D, hidden)
        self.dense_bn1 = nn.BatchNorm1d(hidden)
        self.linear2 = nn.Linear(hidden, V)

    def forward(self, x):
        x = self.word_emb(x)
        x = x.view(x.shape[0], -1)
        x = self.linear1(x)
        x = F.relu(self.dense_bn1(x))
        x = self.linear2(x)
        return x

In [155]:
x, y = next(iter(train_dl))

In [156]:
x, y

(tensor([[15928,     6, 14699,    15,   153],
         [ 3616,    53,   296,  5184,    34],
         [  713,   101,  1488, 10004,  9994],
         [  320,    17, 14590,    17,  1572]]),
 tensor([  17, 2502, 1459,  188]))

In [157]:
x.size(0)

4

In [158]:
V = len(vocab)
D = 3 # for testing

In [159]:
word_emb = nn.Embedding(V, D)

In [160]:
x1 = word_emb(x)
x1

tensor([[[-0.1306,  0.9653, -0.0159],
         [-0.0256, -0.8275, -0.4398],
         [-1.5326, -1.0188,  2.5852],
         [-1.4296,  0.6531, -0.1261],
         [-0.5627,  1.3889, -0.3952]],

        [[ 0.2710, -0.6399, -0.9767],
         [ 0.1841,  0.5853, -0.4152],
         [ 1.6365,  0.5396,  0.0575],
         [-0.8606,  0.3476,  0.9160],
         [ 1.2647, -1.0991, -1.4873]],

        [[-0.0752,  0.2613,  0.7435],
         [ 0.6072, -1.4787,  0.3401],
         [-0.1738, -1.0006, -1.0554],
         [-0.8836, -1.0697,  0.3121],
         [-0.9620, -0.2892,  0.1314]],

        [[-0.0831, -0.7060,  0.5228],
         [-1.0072, -1.3731,  0.2329],
         [-0.4790, -1.4167, -0.1525],
         [-1.0072, -1.3731,  0.2329],
         [ 0.1868,  0.5473,  0.9505]]], grad_fn=<EmbeddingBackward>)

In [161]:
x2 = x1.view(x1.shape[0], -1)
x2

tensor([[-0.1306,  0.9653, -0.0159, -0.0256, -0.8275, -0.4398, -1.5326, -1.0188,
          2.5852, -1.4296,  0.6531, -0.1261, -0.5627,  1.3889, -0.3952],
        [ 0.2710, -0.6399, -0.9767,  0.1841,  0.5853, -0.4152,  1.6365,  0.5396,
          0.0575, -0.8606,  0.3476,  0.9160,  1.2647, -1.0991, -1.4873],
        [-0.0752,  0.2613,  0.7435,  0.6072, -1.4787,  0.3401, -0.1738, -1.0006,
         -1.0554, -0.8836, -1.0697,  0.3121, -0.9620, -0.2892,  0.1314],
        [-0.0831, -0.7060,  0.5228, -1.0072, -1.3731,  0.2329, -0.4790, -1.4167,
         -0.1525, -1.0072, -1.3731,  0.2329,  0.1868,  0.5473,  0.9505]],
       grad_fn=<ViewBackward>)

In [162]:
x2.shape

torch.Size([4, 15])

## Testing model

In [163]:
V = len(vocab)
D = 50
model = LangModel(V, D, hidden=15).cuda()

In [164]:
x, y = next(iter(train_dl))

In [165]:
x = x.cuda()

In [166]:
y_hat = model(x)

In [167]:
y_hat

tensor([[ 0.2578, -0.3228, -0.1330,  ...,  0.3378,  0.0733, -0.0093],
        [ 0.3104, -0.2697, -0.2017,  ...,  0.4531,  0.2132,  0.2790],
        [ 0.3274, -0.2656, -0.1990,  ...,  0.2809,  0.2792, -0.1894],
        [ 0.3502, -0.1914, -0.2320,  ...,  0.1110, -0.0466,  0.9675]],
       device='cuda:0', grad_fn=<AddmmBackward>)

## Training

`F.cross_entropy` combines `log_softmax` and `nll_loss` in a single function.

In [168]:
batch_size = 5000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

In [169]:
def train_epocs(model, epochs=10, lr=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(epochs):
        model.train()
        total = 0
        total_loss = 0.
        for x, y in train_dl:
            x = torch.LongTensor(x).cuda()
            y = torch.LongTensor(y).cuda()
            y_hat = model(x)
            loss = F.cross_entropy(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss +=  x.size(0)* loss.item()
            total += x.size(0)
        val_loss = val_metrics(model)
        print("train_loss %.3f val_loss %.3f perplexity %.3f" % (total_loss/total, 
                                                                 val_loss, math.exp(val_loss)))

In [170]:
import math
def val_metrics(model):
    model.eval()
    total = 0
    total_loss = 0.
    for x,y in val_dl:
        x = torch.LongTensor(x).cuda()
        y = torch.LongTensor(y).cuda()
        y_hat = model(x)
        loss = F.cross_entropy(y_hat, y)
        total_loss +=  x.size(0)* loss.item()
        total += x.size(0)
    return total_loss/ total

In [171]:
V = len(vocab)
D = 50
model = LangModel(V, D, hidden=15).cuda()

In [172]:
val_metrics(model)

10.430189487070546

In [173]:
train_epocs(model, epochs=10, lr=0.01)

train_loss 6.581 val_loss 5.870 perplexity 354.240
train_loss 5.824 val_loss 5.725 perplexity 306.337
train_loss 5.618 val_loss 5.675 perplexity 291.513
train_loss 5.491 val_loss 5.654 perplexity 285.335
train_loss 5.399 val_loss 5.646 perplexity 283.159
train_loss 5.327 val_loss 5.647 perplexity 283.465
train_loss 5.268 val_loss 5.658 perplexity 286.612
train_loss 5.219 val_loss 5.671 perplexity 290.464
train_loss 5.177 val_loss 5.682 perplexity 293.483
train_loss 5.140 val_loss 5.693 perplexity 296.928


In [174]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [175]:
save_model(model, PATH/"model10.pth")

In [178]:
load_model(model, PATH/"model10.pth")

In [179]:
train_epocs(model, epochs=10, lr=0.001)

train_loss 5.027 val_loss 5.715 perplexity 303.370
train_loss 4.999 val_loss 5.733 perplexity 309.009
train_loss 4.988 val_loss 5.746 perplexity 312.961
train_loss 4.980 val_loss 5.760 perplexity 317.437
train_loss 4.973 val_loss 5.769 perplexity 320.059
train_loss 4.967 val_loss 5.780 perplexity 323.618
train_loss 4.962 val_loss 5.792 perplexity 327.559
train_loss 4.958 val_loss 5.799 perplexity 329.857
train_loss 4.953 val_loss 5.807 perplexity 332.630
train_loss 4.949 val_loss 5.815 perplexity 335.326


In [180]:
save_model(model, PATH/"model20.pth")