In [2]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
SOS_token = 0
EOS_token = 1

MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [16]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('translation-data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['je saigne gravement .', 'i m bleeding badly .']


In [21]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

## Embed pairs

In [None]:
n_iters = 1000
training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]

In [90]:
val_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(200)]

In [50]:
# Show example pair embedded
for x in training_pairs[:3]:
    french_sent = x[0].flatten()
    english_sent = x[1].flatten()
    
    print("French:")
    print(french_sent)
    print([input_lang.index2word[i.item()] for i in french_sent])
    print("-"*30)
    print("English:")
    print(english_sent)
    print([output_lang.index2word[i.item()] for i in english_sent])
    print()

French:
tensor([145,  25,  66, 410,   5,   1])
['c', 'est', 'un', 'bigot', '.', 'EOS']
------------------------------
English:
tensor([ 14,  15,  42, 235,   4,   1])
['he', 's', 'a', 'bigot', '.', 'EOS']

French:
tensor([ 348, 1220, 3633,  689,    5,    1])
['ils', 'vont', 'torturer', 'tom', '.', 'EOS']
------------------------------
English:
tensor([ 221,   78,   61,  532, 2252,  378,    4,    1])
['they', 're', 'going', 'to', 'torture', 'tom', '.', 'EOS']

French:
tensor([ 123,  124,   34,  101, 2727,  241, 3767,    5,    1])
['nous', 'avons', 'fini', 'de', 'repondre', 'aux', 'questions', '.', 'EOS']
------------------------------
English:
tensor([  77,   78,   22, 1613, 2347,    4,    1])
['we', 're', 'done', 'answering', 'questions', '.', 'EOS']



### Define the model
In this tutorial, we train `nn.TransformerEncoder` model on a language modeling task. The language modeling task is to assign a probability for the likelihood of a given word (or a sequence of words) to follow a sequence of words. 

A sequence of tokens are passed to the embedding layer first, followed by a positional encoding layer to account for the order of the word (see the next paragraph for more details). 

The nn.TransformerEncoder consists of multiple layers of `nn.TransformerEncoderLayer`. Along with the input sequence, a square attention mask is required because the self-attention layers in `nn.TransformerEncoder` are only allowed to attend the earlier positions in the sequence. For the language modeling task, any tokens on the future positions should be masked. 

To have the actual words, the output of `nn.TransformerEncoder` model is sent to the final Linear layer, which is followed by a log-Softmax function.

In [1]:
import torch 
print(torch.__version__)

1.2.0


In [119]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class TransformerModel(nn.Module):

    def __init__(self, src_lang_size, target_lang_size, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(src_lang_size, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, target_lang_size)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        #output = output.argmax(dim=2)
        return output

### Positional Encoding
`PositionalEncoding` module injects some information about the relative or absolute position of the tokens in the sequence. The positional encodings have the same dimension as the embeddings so that the two can be summed. Here, we use sine and cosine functions of different frequencies.

In [120]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

### Load and batch data
The training process uses Wikitext-2 dataset from `torchtext`. The vocab object is built based on the train dataset and is used to numericalize tokens into tensors. 

Starting from sequential data, the `batchify()` function arranges the dataset into columns, trimming off any tokens remaining after the data has been divided into batches of size `batch_size`. 

For instance, with the alphabet as the sequence (total length of 26) and a batch size of 4, we would divide the alphabet into 4 sequences of length 6.

These columns are treated as independent by the model, which means that the dependence of G and F can not be learned, but allows more efficient batch processing.

In [121]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

### Initiate an instance
The model is set up with the hyperparameter below. The vocab size is equal to the length of the vocab object.

In [122]:
src_lang_size = input_lang.n_words # the size of the french vocab
target_lang_size = output_lang.n_words # the size of the english vocab

emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(src_lang_size, target_lang_size, emsize, nhead, nhid, nlayers, dropout).to(device)

model

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=200, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=200, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=20

### Run the model
`CrossEntropyLoss` is applied to track the loss and SGD implements stochastic gradient descent method as the optimizer. The initial learning rate is set to 5.0. StepLR is applied to adjust the learn rate through epochs. During the training, we use nn.utils.clip_grad_norm_ function to scale all the gradient together to prevent exploding.

In [123]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [126]:
test_x, test_y = training_pairs[0]
out = model.forward(test_x)
out.sum(dim=0)

tensor([[ 3.3119, -3.3002,  1.3234,  ..., -8.0747,  3.2964, -0.7686]],
       grad_fn=<SumBackward1>)

In [130]:
criterion(out, test_y)

ValueError: Expected target size (6, 2803), got torch.Size([6, 1])

In [134]:
x = torch.randn(10, 3)
output = F.sigmoid(x)
target = torch.Tensor(10, 3).random_(2)

criterion = nn.BCELoss(reduce=False)
loss = criterion(output, target)
print(loss)

tensor([[0.3818, 0.3180, 1.1355],
        [0.5823, 0.4457, 0.1678],
        [0.4351, 0.6102, 1.2674],
        [1.3871, 0.9824, 0.4858],
        [0.1288, 0.9224, 1.0233],
        [1.1999, 1.8882, 0.9079],
        [0.4688, 1.1591, 1.2756],
        [2.2619, 0.4256, 1.3807],
        [1.6341, 0.3187, 0.3346],
        [1.0537, 0.5703, 0.1285]])




In [135]:
output.shape, target.shape

(torch.Size([10, 3]), torch.Size([10, 3]))

In [127]:
def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    n_pairs = len(training_pairs)
    
    #for batch, i in enumerate(range(0, n_pairs - 1)):
    for i, (src, tar) in enumerate(training_pairs):
        optimizer.zero_grad()
        
        # Run source sentence through translator
        output = model(src)
        #output = output.view(-1, target_lang_size)
        
        print(output.shape, tar.shape)
        # Calculate loss and perform backprop
        loss = criterion(output, tar)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        # Add to total loss
        total_loss += loss.item()
        
        if i%200 == 0:
            print("Epoch: %i, Loss: %i" % (epoch, loss))        

In [128]:
def evaluate(eval_model, val_pairs):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.

    with torch.no_grad():
        for src, tar in val_pairs:
            output = eval_model(data)
            #output_flat = output.view(-1, target_lang_size)
            total_loss += len(src) * criterion(output_flat, targets).item()
    return total_loss / len(val_pairs)

In [129]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    #epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

torch.Size([6, 1, 2803]) torch.Size([6, 1])


ValueError: Expected target size (6, 2803), got torch.Size([6, 1])

In [None]:
def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [10]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

| epoch   1 |   200/ 3195 batches | lr 5.00 | ms/batch 40.03 | loss  2.12 | ppl     8.29
| epoch   1 |   400/ 3195 batches | lr 5.00 | ms/batch 39.14 | loss  2.06 | ppl     7.82
| epoch   1 |   600/ 3195 batches | lr 5.00 | ms/batch 39.28 | loss  1.95 | ppl     7.03
| epoch   1 |   800/ 3195 batches | lr 5.00 | ms/batch 39.43 | loss  2.01 | ppl     7.45
| epoch   1 |  1000/ 3195 batches | lr 5.00 | ms/batch 39.24 | loss  1.91 | ppl     6.77
| epoch   1 |  1200/ 3195 batches | lr 5.00 | ms/batch 39.37 | loss  1.92 | ppl     6.80
| epoch   1 |  1400/ 3195 batches | lr 5.00 | ms/batch 39.12 | loss  1.92 | ppl     6.83
| epoch   1 |  1600/ 3195 batches | lr 5.00 | ms/batch 39.30 | loss  1.94 | ppl     6.94
| epoch   1 |  1800/ 3195 batches | lr 5.00 | ms/batch 39.26 | loss  1.93 | ppl     6.91
| epoch   1 |  2000/ 3195 batches | lr 5.00 | ms/batch 39.24 | loss  1.94 | ppl     6.97
| epoch   1 |  2200/ 3195 batches | lr 5.00 | ms/batch 39.09 | loss  1.93 | ppl     6.87
| epoch   1 |  2400/ 

In [12]:
test_loss = evaluate(best_model, test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss  0.60 | test ppl     1.82


In [None]:
for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)

In [24]:
data.shape, targets.shape

(torch.Size([6, 20]), torch.Size([120]))

In [None]:
def index_to_string(indices):
    if len(indices.shape) > 1:
        indices = indices.view(-1)
    return [TEXT.vocab.itos[i] for i in indices]

def predict_text(model, x):
    pred = best_model.forward(x)
    pred = pred.view(-1, ntokens)
    pred = pred.argmax(dim=1)
    pred_text = index_to_string(pred)
    return pred_text

In [None]:

pred_text = predict_text(model, data)

In [None]:
target_text = index_to_string(targets)

In [None]:
input_text = index_to_string(data)

In [83]:
input_text

['simmons',
 'rapid',
 'only',
 '.',
 'to',
 'for',
 'listing',
 'in',
 'could',
 'the',
 'a',
 'unk',
 '21',
 'august',
 'ensure',
 '.',
 'polar',
 'into',
 '<',
 '.',
 'one',
 ';',
 'necessary',
 '<eos>',
 'their',
 'a',
 'under',
 'a',
 'merely',
 'arikamedu',
 'splendid',
 '>',
 'entering',
 '2004',
 'the',
 'in',
 'regions',
 'the',
 'unk',
 'even',
 'second',
 'in',
 'workers',
 ' ',
 'limit',
 'loss',
 'the',
 'spiral',
 'be',
 'archaeological',
 'idea',
 'scholar',
 'the',
 ',',
 'survival',
 'the',
 'to',
 'entity',
 '>',
 'when',
 'later',
 'particular',
 'remained',
 'smaller',
 ',',
 ',',
 'planning',
 'around',
 'a',
 'site',
 '"',
 ',',
 'fourth',
 'carpenter',
 'of',
 'episode',
 'contain',
 'that',
 'and',
 'correctly',
 ',',
 ',',
 'at',
 'groups',
 'and',
 'however',
 '(',
 'a',
 '<',
 'was',
 '.',
 'claimed',
 'on',
 'was',
 'the',
 ',',
 'permanently',
 'it',
 'walpole',
 'prepared',
 'stopping',
 'there',
 'the',
 'included',
 'resupply',
 ',',
 'listed',
 'woody',

In [74]:
xx

tensor([[12075],
        [ 2981],
        [   78],
        [    6],
        [   13],
        [   24],
        [ 3617],
        [   12],
        [  158],
        [    4],
        [   15],
        [   10],
        [  517],
        [  192],
        [ 3424],
        [    6],
        [ 8377],
        [   71],
        [    8],
        [    6],
        [   49],
        [   45],
        [ 2029],
        [    3],
        [   42],
        [   15],
        [  139],
        [   15],
        [ 4457],
        [ 6438],
        [14907],
        [    9],
        [ 2502],
        [  530],
        [    4],
        [   12],
        [ 1866],
        [    4],
        [   10],
        [  261],
        [  106],
        [   12],
        [ 1530],
        [   14],
        [ 4708],
        [  827],
        [    4],
        [12090],
        [   39],
        [ 1687],
        [ 1007],
        [ 3491],
        [    4],
        [    5],
        [ 4934],
        [    4],
        [   13],
        [ 8559],
        [    9