In [129]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import math

## Data wikitext-2
The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good articles on Wikipedia.

The data can be dowloaded using: `wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip`

In [45]:
! head -5  wikitext-2/wiki.train.tokens 

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more <unk> for series newcomers . 

In [46]:
def read_file(path):
    """ Read file returns a list of lines.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = f.readlines()
    return content

In [47]:
from collections import defaultdict
def get_vocab(content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for line in content:
        words = set(line.split())
        for word in words:
            vocab[word] += 1
    return vocab      

In [48]:
train_tokens = read_file("wikitext-2/wiki.train.tokens")
valid_tokens = read_file("wikitext-2/wiki.valid.tokens")

In [49]:
word_count = get_vocab(train_tokens)

In [50]:
len(word_count.keys())

33280

In [51]:
def get_vocab_from_word_count(word_count):
    for word in list(word_count):
        if word_count[word] < 5:
            del word_count[word]
        
    vocab2index = {"UNK": 0}
    words = ["UNK"]
    for word in word_count:
        vocab2index[word] = len(words)
        words.append(word)
    return vocab2index, words

In [52]:
vocab2index, words = get_vocab_from_word_count(word_count)

In [53]:
len(words)

21590

In [54]:
words[:10]

['UNK',
 'Chronicles',
 '=',
 'III',
 'Valkyria',
 'runs',
 'video',
 'pitted',
 'and',
 'nation']

In [75]:
class LanguageModelDataset(Dataset):
    def __init__(self, text, vocab2index, K=3):
        tokens = np.concatenate([x.split() for x in text])
        self.text = np.array([ vocab2index.get(x, 0) for x in tokens])
        self.K = K
    
    def __len__(self):
        return len(self.text) - self.K
    
    def __getitem__(self, idx):
        return self.text[idx:idx+self.K], self.text[idx+self.K]

In [76]:
train_tokens[:3]

[' \n', ' = Valkyria Chronicles III = \n', ' \n']

In [77]:
tokens = np.concatenate([x.split() for x in train_tokens[:3]])

In [78]:
tokens

array(['=', 'Valkyria', 'Chronicles', 'III', '='], dtype='<U32')

In [79]:
np.array([ vocab2index.get(x, 0) for x in tokens])

array([2, 4, 1, 3, 2])

In [80]:
toy_train_ds = LanguageModelDataset(train_tokens[:3], vocab2index)

In [81]:
toy_train_ds[0]

(array([2, 4, 1]), 3)

In [82]:
toy_train_ds[1]

(array([4, 1, 3]), 2)

In [84]:
len(toy_train_ds)

2

In [115]:
len(train_tokens), len(valid_tokens)

(36718, 3760)

In [None]:
# restricting text for faster training
train_ds = LanguageModelDataset(train_tokens[:5000], vocab2index)
valid_ds = LanguageModelDataset(valid_tokens[:100], vocab2index)

In [117]:
class NeuralModel(nn.Module):
    def __init__(self, vocab_size, emb_size=50, K=3, M=100):
        """Initialize an embedding layer and a linear layer
        """
        super(NeuralModel, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.linear1 = nn.Linear(K*emb_size, M)
        self.linear2 = nn.Linear(M, vocab_size)
        
    def forward(self, x):
        x = self.emb(x)
        x = x.flatten(1)
        x = F.relu(self.linear1(x))
        return self.linear2(x)

In [118]:
train_dl = DataLoader(train_ds, batch_size=1000, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=1000, shuffle=False)

In [119]:
x, y = next(iter(train_dl))

In [120]:
vocab_size = len(words)
model = NeuralModel(vocab_size)

In [121]:
model(x).shape

torch.Size([1000, 21590])

In [122]:
def val_metrics(model, valid_dl):
    model.eval()
    losses = []
    for x, y in valid_dl:
        y_hat = model(x)
        loss = F.cross_entropy(y_hat, y)
        losses.append(loss.item())
    return np.mean(losses)

In [127]:
def train_epocs(model, optimizer, epochs=10):
    for i in range(epochs):
        losses = []
        model.train()
        for x, y in train_dl:
            y_hat = model(x)
            loss = F.cross_entropy(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            
        train_loss = np.mean(losses)
        val_loss = val_metrics(model, valid_dl)
        
        print("train_loss %.3f train_ppl %.3f val_loss %.3f val_ppl %.3f" % (
            train_loss, math.exp(train_loss), val_loss, math.exp(val_loss)))

In [124]:
val_metrics(model, valid_dl)

9.993993918100992

In [135]:
def update_learning_rate(optimizer, lr):
    for g in optimizer.param_groups:
        g['lr'] = lr

In [132]:
vocab_size = len(words)
model = NeuralModel(vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=1)

In [133]:
train_epocs(model, optimizer, 5)

train_loss 60.789 train_ppl 251339632163322557108322304.000 val_loss 104.546 val_ppl 2533378217971242035698838669910209404961030144.000
train_loss 34.861 train_ppl 1380432020013199.000 val_loss 35.286 val_ppl 2111226875019214.750
train_loss 12.375 train_ppl 236814.552 val_loss 49.514 val_ppl 3189923016260578705408.000
train_loss 7.548 train_ppl 1896.828 val_loss 50.759 val_ppl 11079817131936681295872.000
train_loss 7.318 train_ppl 1506.507 val_loss 50.863 val_ppl 12293597738840757370880.000


In [134]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
train_epocs(model, optimizer, 5)

train_loss 7.101 train_ppl 1212.768 val_loss 50.509 val_ppl 8625494110465340473344.000
train_loss 6.871 train_ppl 964.270 val_loss 50.415 val_ppl 7849021847812407558144.000
train_loss 6.829 train_ppl 924.011 val_loss 50.376 val_ppl 7550684868131036332032.000
train_loss 6.815 train_ppl 911.303 val_loss 50.350 val_ppl 7358852457708025544704.000
train_loss 6.810 train_ppl 906.539 val_loss 50.367 val_ppl 7487324502420585709568.000


In [136]:
train_epocs(model, optimizer, 5)

train_loss 6.809 train_ppl 905.969 val_loss 50.359 val_ppl 7423465666402831040512.000
train_loss 6.808 train_ppl 905.397 val_loss 50.359 val_ppl 7421399309995187634176.000
train_loss 6.809 train_ppl 906.262 val_loss 50.366 val_ppl 7476112253852637462528.000
train_loss 6.809 train_ppl 906.062 val_loss 50.374 val_ppl 7533881104917638152192.000
train_loss 6.809 train_ppl 906.279 val_loss 50.370 val_ppl 7502425736906208706560.000


## Lab:
Write a pipeline for word2vec with negative sample.  