# REFERENCE
1. http://jalammar.github.io/illustrated-transformer/
2. https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb
3. Good lectures on attention mechanism and positional encoding https://www.youtube.com/watch?v=gJ9kaJsE78k
4. https://www.youtube.com/watch?v=-9vVhYEXeyQ
    
# Todo
- [ ] check why we are using src_mask in the 2nd multi-head attention of the decoder
- [ ] check why we are not using the sine/cosine positional encoding

# IMPORTS

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time
from tqdm import tqdm as tqdm

# GLOBAL SETTINGS

In [31]:
# Seed every random number generator the notebook touches with one value
# so that repeated runs start from the same state.
SEED = 1234

for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed):
    _seed_fn(SEED)

# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

# DATA PREPARATION

In [32]:
# Load the German and English spaCy pipelines by their full package names.
# The short names 'de' / 'en' were shortcut links; spaCy 3 removed them, so
# spacy.load('de') fails there. The full names work on both v2 and v3 as long
# as the model packages are installed
# (python -m spacy download de_core_news_sm / en_core_web_sm).
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [33]:
def tokenize_de(text):
    """
    Run the German spaCy tokenizer over a string and return the token texts
    as a list of strings
    """
    tokens = spacy_de.tokenizer(text)
    return [token.text for token in tokens]

def tokenize_en(text):
    """
    Run the English spaCy tokenizer over a string and return the token texts
    as a list of strings
    """
    tokens = spacy_en.tokenizer(text)
    return [token.text for token in tokens]

In [34]:
# Source (German) and target (English) fields share every option except the
# tokenizer: sentences are lower-cased, wrapped in <sos>/<eos> and batched
# with the batch dimension first.
_common_field_options = dict(init_token='<sos>',
                             eos_token='<eos>',
                             lower=True,
                             batch_first=True)

SRC = Field(tokenize=tokenize_de, **_common_field_options)
TRG = Field(tokenize=tokenize_en, **_common_field_options)

In [35]:
# Load the Multi30k train / validation / test splits, pairing the '.de' files
# with the SRC field and the '.en' files with the TRG field.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

In [36]:
# Build both vocabularies from the training split only, with a minimum token
# frequency of 2.
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=2)

In [37]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [38]:
BATCH_SIZE = 128

# One BucketIterator per split, all with the same batch size and all placing
# their batches on `device`.
_all_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    _all_splits,
    batch_size=BATCH_SIZE,
    device=device,
)

# BUILDING THE MODEL
We will do German-to-English translation: the input (source) sentence in German is passed through the encoder, which produces a sequence of context vectors (one per source token). The decoder then uses these context vectors to decode the German source sentence into English.

## Encoder
![](https://charon.me/img/15871364052016.jpg)

In [39]:
class Encoder(nn.Module):
    """
    Transformer encoder: embeds the source tokens, adds a learned positional
    embedding and passes the result through a stack of EncoderLayer blocks.

    Encoder arguments
        input_dim --> size of the source vocabulary (rows of the token embedding table)
        hid_dim --> embedding / hidden size used by every layer
        n_layers --> number of EncoderLayer blocks
        n_heads --> number of heads for multi head attention
        pf_dim --> hidden layer upscaling in pointwise feedforward layer
        dropout --> dropout value
        device --> device on which the scale factor and the position indices are created
        max_length --> number of rows in the positional embedding table, i.e. the
                       longest sentence that has a position vector to look up

    Encoder inputs
        src --> [batch_size, src_len] token indices
        src_mask --> handed unchanged to every layer; Seq2Seq.make_src_mask builds it
                     as [batch_size, 1, 1, src_len], False where the token is <pad>

    Encoder outputs
        src after changing it through the self attention and pointwise feed forward
        layers, shape [batch_size, src_len, hid_dim]

    Encoder description
        - combines source and positional embedding
        - passes it through self attention layer and calculates attention
        - passes attention output to pointwise feed forward layer
        - returns the src after passing it to the aforementioned layers
    """

    def __init__(self,
                 input_dim,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 device,
                 max_length=100):
        super().__init__()

        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        # token embeddings are multiplied by sqrt(hid_dim) before the
        # positional embedding is added
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        self.dropout = nn.Dropout(dropout)
        self.layers = nn.ModuleList([
            EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device)
            for _ in range(n_layers)
        ])

    def forward(self, src, src_mask):
        # src = [batch_size, src_len]
        batch_size = src.shape[0]
        src_len = src.shape[1]

        # pos = [batch_size, src_len], every row is 0 .. src_len - 1.
        # Built on self.device (the device given to the constructor) rather
        # than on a notebook-level global, so the module also works when no
        # global called `device` exists or when it names another device.
        pos = torch.arange(0, src_len, device=self.device)
        pos = pos.unsqueeze(0).repeat(batch_size, 1)

        # src = [batch_size, src_len, hid_dim]
        src = self.dropout((self.tok_embedding(src) * self.scale) +
                           self.pos_embedding(pos))

        # run the n_layers encoder layers one after the other
        for layer in self.layers:
            src = layer(src, src_mask)

        # src = [batch_size, src_len, hid_dim]
        return src

## Encoder Layer

In [40]:
class EncoderLayer(nn.Module):
    """
    One encoder block: self attention followed by a pointwise feedforward
    layer, each wrapped in dropout + residual connection + layer norm.

    EncoderLayer arguments
        hid_dim --> hidden size of the incoming and outgoing tensors
        n_heads --> number of heads for multi head attention
        pf_dim --> hidden layer upscaling in pointwise feedforward layer
        dropout --> dropout value
        device --> gpu or cpu, forwarded to the attention layer

    EncoderLayer outputs
        src after the attention and feedforward sub-layers,
        shape [batch_size, src_len, hid_dim]
    """

    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        self.self_attention = MultiHeadAttentionLayer(
            hid_dim, n_heads, dropout, device)
        self.dropout = nn.Dropout(dropout)
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.pointwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)

    def forward(self, src, src_mask):
        # src = [batch_size, src_len, hid_dim]
        # src_mask is applied to the attention energies; Seq2Seq.make_src_mask
        # builds it as [batch_size, 1, 1, src_len]

        # sub-layer 1: self attention (query, key and value are all src),
        # then dropout, residual connection and layer norm
        attended, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(attended))

        # sub-layer 2: pointwise feedforward, then dropout, residual
        # connection and layer norm
        transformed = self.pointwise_feedforward(src)
        src = self.ff_layer_norm(src + self.dropout(transformed))

        # src = [batch_size, src_len, hid_dim]
        return src

## MultiHeadAttention Layer
![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/9479fcb532214ad26fd4bda9fcf081a05e1aaf4e/assets/transformer-attention.png)

In [41]:
class MultiHeadAttentionLayer(nn.Module):
    """
    Scaled dot-product attention computed in n_heads parallel heads.

    MultiHeadAttentionLayer arguments
        hid_dim --> hidden size of query / key / value; must be divisible by n_heads
        n_heads --> number of heads for multi head attention
        dropout --> dropout value applied to the attention weights
        device --> device on which the scale factor is created

    MultiHeadAttentionLayer inputs
        query --> [batch_size, query_len, hid_dim]
        key --> [batch_size, key_len, hid_dim]
        value --> [batch_size, key_len, hid_dim]
        mask --> optional, broadcastable to [batch_size, n_heads, query_len, key_len];
                 positions where mask == 0 get (almost) no attention

    MultiHeadAttentionLayer outputs
        x --> [batch_size, query_len, hid_dim], the attended values after the output projection
        attention --> [batch_size, n_heads, query_len, key_len], the softmax weights
                      after dropout; can be used to visualise attention over words

    MultiHeadAttentionLayer description
        - projects query, key and value with their own linear layers
        - splits each projection into n_heads chunks of size hid_dim // n_heads
        - computes energy = Q.K^T / sqrt(head_dims)
        - applies the mask on the energy: in the encoder the mask hides <pad> tokens,
          in the decoder self attention it additionally hides future positions
        - softmax over the key axis gives the attention weights
        - the weights are multiplied by V, the heads are merged and projected by fc_o

    Raises
        ValueError if hid_dim is not divisible by n_heads
    """

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()

        # without this check the head split in forward() either fails late or
        # silently reshapes into the wrong layout
        if hid_dim % n_heads != 0:
            raise ValueError(
                f'hid_dim ({hid_dim}) must be divisible by n_heads ({n_heads})')

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dims = hid_dim // n_heads

        # separate learned projections for query, key and value
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        # output projection applied after the heads are merged again
        self.fc_o = nn.Linear(hid_dim, hid_dim)

        # hyper params
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dims])).to(device)

    def _split_heads(self, x, batch_size):
        # [batch_size, len, hid_dim] -> [batch_size, n_heads, len, head_dims]
        # e.g. 16 heads with hid_dim 1024 give head_dims = 1024 // 16 = 64
        x = x.view(batch_size, -1, self.n_heads, self.head_dims)
        return x.permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        # The key projection is applied to `key` and the value projection to
        # `value`. They used to be crossed (K = fc_v(value), V = fc_k(key)),
        # which only went unnoticed because every caller in this file passes
        # the same tensor for key and value.
        Q = self._split_heads(self.fc_q(query), batch_size)
        K = self._split_heads(self.fc_k(key), batch_size)
        V = self._split_heads(self.fc_v(value), batch_size)

        # Q.K^T scaled by sqrt(head_dims)
        # energy = [batch_size, n_heads, query_len, key_len]
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

        if mask is not None:
            # the mask broadcasts over the head axis (and over the query axis
            # when it is [batch_size, 1, 1, key_len]); masked positions get a
            # very negative energy so softmax gives them ~0 weight
            energy = energy.masked_fill(mask == 0, -1e10)

        # attention = [batch_size, n_heads, query_len, key_len]
        attention = self.dropout(torch.softmax(energy, dim=-1))

        # x = [batch_size, n_heads, query_len, head_dims]
        x = torch.matmul(attention, V)

        # merge the heads back: n_heads * head_dims == hid_dim, the remaining
        # axis is query_len
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hid_dim)

        # x = [batch_size, query_len, hid_dim]
        x = self.fc_o(x)

        return x, attention

In [42]:
class PositionwiseFeedforwardLayer(nn.Module):
    """
    Two linear layers with a relu and dropout in between, applied to the last
    (hidden) axis of the input.

    PositionwiseFeedforwardLayer arguments
        hid_dim --> size of the input and of the output
        pf_dim --> size of the intermediate layer
        dropout --> dropout value applied after the relu

    PositionwiseFeedforwardLayer outputs
        tensor with the same shape as the input, [batch size, seq len, hid dim]
    """

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x = [batch size, seq len, hid dim]
        expanded = torch.relu(self.fc_1(x))

        # expanded = [batch size, seq len, pf dim]
        expanded = self.dropout(expanded)

        # back to [batch size, seq len, hid dim]
        return self.fc_2(expanded)

# Decoder
![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/9479fcb532214ad26fd4bda9fcf081a05e1aaf4e/assets/transformer-decoder.png)

In [43]:
class Decoder(nn.Module):
    """
    Transformer decoder: embeds the target tokens, adds a learned positional
    embedding, runs a stack of DecoderLayer blocks and projects the result
    onto the target vocabulary.

    Decoder arguments
        output_dim --> size of the target vocabulary (embedding rows and size of the output layer)
        hid_dim --> embedding / hidden size used by every layer
        n_layers --> number of DecoderLayer blocks
        n_heads --> number of heads for multi head attention
        pf_dim --> hidden layer upscaling in pointwise feedforward layer
        dropout --> dropout value
        device --> device on which the scale factor and the position indices are created
        max_length --> number of rows in the positional embedding table, i.e. the
                       longest sentence that has a position vector to look up

    Decoder inputs
        trg --> [batch size, trg len] token indices
        enc_src --> [batch size, src len, hid dim] encoder output
        trg_mask --> [batch size, 1, trg len, trg len]
        src_mask --> [batch size, 1, 1, src len]

    Decoder outputs
        output --> [batch size, trg len, output dim] unnormalised scores over the
                   target vocabulary (softmax is applied later, e.g. inside the loss)
        attention --> [batch size, n heads, trg len, src len] encoder attention of the
                      last layer, or None when the decoder has no layers

    Decoder description
        - combines trg and positional embedding
        - passes it through the decoder layers
        - the final output is passed through linear layer to match output vocab dimension
    """

    def __init__(self,
                 output_dim,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 device,
                 max_length=100):
        super().__init__()
        self.device = device

        # initial layers
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        # processing layers
        self.layers = nn.ModuleList([
            DecoderLayer(hid_dim, n_heads, pf_dim, dropout, device)
            for _ in range(n_layers)
        ])

        # created once; the second, identical assignment was redundant
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

        # last layer
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        # trg = [batch size, trg len]
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        # pos = [batch size, trg len], every row is 0 .. trg_len - 1
        pos = torch.arange(0, trg_len, device=self.device)
        pos = pos.unsqueeze(0).repeat(batch_size, 1)

        # trg = [batch size, trg len, hid dim]
        trg = self.dropout((self.tok_embedding(trg) * self.scale) +
                           self.pos_embedding(pos))

        # stays None when n_layers == 0 instead of being an unbound name
        attention = None
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)

        # output = [batch size, trg len, output dim]
        output = self.fc_out(trg)

        return output, attention

![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/9479fcb532214ad26fd4bda9fcf081a05e1aaf4e/assets/transformer-decoder.png)

In [44]:
class DecoderLayer(nn.Module):
    """
    One decoder block: masked self attention, attention over the encoder
    output and a pointwise feedforward layer, each followed by dropout,
    a residual connection and layer norm.

    DecoderLayer arguments
        hid_dim --> hidden size of the incoming and outgoing tensors
        n_heads --> number of heads for multi head attention
        pf_dim --> hidden layer upscaling in pointwise feedforward layer
        dropout --> dropout value
        device --> gpu or cpu, forwarded to the attention layers

    DecoderLayer outputs
        trg --> [batch size, trg len, hid dim]
        attention --> [batch size, n heads, trg len, src len], weights of the
                      encoder attention
    """

    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()

        self.self_attention = MultiHeadAttentionLayer(
            hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(
            hid_dim, n_heads, dropout, device)

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)

        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        # trg = [batch size, trg len, hid dim]
        # enc_src = [batch size, src len, hid dim]
        # trg_mask = [batch size, 1, trg len, trg len]
        # src_mask = [batch size, 1, 1, src len]

        # 1) self attention over the target, restricted by trg_mask
        self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(self_attended))

        # 2) encoder attention: the query comes from the decoder, key and
        #    value are the encoder output. The keys are source positions,
        #    which is why the mask used here is src_mask and not trg_mask.
        enc_attended, attention = self.encoder_attention(
            trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(enc_attended))

        # 3) positionwise feedforward
        transformed = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(transformed))

        # trg = [batch size, trg len, hid dim]
        # attention = [batch size, n heads, trg len, src len]
        return trg, attention

In [45]:
class Seq2Seq(nn.Module):
    """
    Ties an encoder and a decoder together and builds the attention masks
    both of them need.

    Seq2Seq arguments
        encoder --> called as encoder(src, src_mask)
        decoder --> called as decoder(trg, enc_src, trg_mask, src_mask)
        src_pad_idx --> index of the padding token in the source vocabulary
        trg_pad_idx --> index of the padding token in the target vocabulary
        device --> device on which the causal part of the target mask is created
    """

    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a [batch size, 1, 1, src len] mask that is False on source padding."""
        # src = [batch size, src len]
        not_pad = src != self.src_pad_idx
        return not_pad[:, None, None, :]

    def make_trg_mask(self, trg):
        """Return a [batch size, 1, trg len, trg len] mask that is False on
        target padding and on every position after the current one."""
        # trg = [batch size, trg len]
        steps = trg.shape[1]

        # not_pad = [batch size, 1, 1, trg len]
        not_pad = (trg != self.trg_pad_idx)[:, None, None, :]

        # lower triangle = [trg len, trg len]; row i is True for columns 0..i
        no_peek = torch.ones((steps, steps), device=self.device).tril().bool()

        # broadcasting combines both into [batch size, 1, trg len, trg len]
        return not_pad & no_peek

    def forward(self, src, trg):
        # src = [batch size, src len]
        # trg = [batch size, trg len]

        # src_mask = [batch size, 1, 1, src len]
        # trg_mask = [batch size, 1, trg len, trg len]
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        # enc_src = [batch size, src len, hid dim]
        enc_src = self.encoder(src, src_mask)

        # output = [batch size, trg len, output dim]
        # attention = [batch size, n heads, trg len, src len]
        return self.decoder(trg, enc_src, trg_mask, src_mask)

In [17]:
# torch.arange(0,20).repeat(4,1).shape , torch.arange(0,20).unsqueeze(0).repeat(4,1).shape

In [18]:
# Vocabulary sizes come from the fields built above.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

# Shared hidden size, then per-side depth, head count, feedforward width and
# dropout.
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(input_dim=INPUT_DIM,
              hid_dim=HID_DIM,
              n_layers=ENC_LAYERS,
              n_heads=ENC_HEADS,
              pf_dim=ENC_PF_DIM,
              dropout=ENC_DROPOUT,
              device=device)

dec = Decoder(output_dim=OUTPUT_DIM,
              hid_dim=HID_DIM,
              n_layers=DEC_LAYERS,
              n_heads=DEC_HEADS,
              pf_dim=DEC_PF_DIM,
              dropout=DEC_DROPOUT,
              device=device)

In [19]:
# Padding indices are needed by Seq2Seq to build the attention masks.
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = Seq2Seq(encoder=enc,
                decoder=dec,
                src_pad_idx=SRC_PAD_IDX,
                trg_pad_idx=TRG_PAD_IDX,
                device=device)
model = model.to(device)

In [20]:
# Look up the index of the padding token in the source vocabulary; as the last
# expression of the cell its value is shown as the cell output.
SRC.vocab.stoi[SRC.pad_token]

1

In [21]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # parameters with requires_grad == False are not counted
        if param.requires_grad:
            total += param.numel()
    return total

# Report the number of trainable parameters, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,038,597 trainable parameters


In [22]:
def initialize_weights(m):
    """Xavier-uniform initialise the `weight` of a module, in place.

    Meant to be passed to `model.apply`, which calls it once per submodule.
    Only weights with more than one dimension are touched, so biases and
    1-d weights (e.g. those of LayerNorm) keep their default initialisation.
    Modules without a `weight`, or whose `weight` is None (for example
    nn.LayerNorm(..., elementwise_affine=False)), are skipped instead of
    raising AttributeError.
    """
    weight = getattr(m, 'weight', None)
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

In [23]:
# Call initialize_weights on the model and on each of its submodules; the
# trailing semicolon keeps the notebook from echoing the returned model.
model.apply(initialize_weights);

In [24]:
LEARNING_RATE = 0.0005

# Adam over every parameter of the model with a fixed learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [25]:
# Cross-entropy loss; target positions equal to the target <pad> index are
# ignored and do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [26]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over `iterator` and return the mean batch loss.

    model --> called as model(src, trg_input), must return (output, attention)
    iterator --> sized iterable of batches exposing `.src` and `.trg`
    optimizer --> optimizer stepped once per batch
    criterion --> loss called with flattened scores and flattened targets
    clip --> maximum gradient norm applied before every optimizer step
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # the decoder is fed the target without its last token ...
        # scores = [batch size, trg len - 1, output dim]
        scores, _ = model(source, target[:, :-1])
        vocab_size = scores.shape[-1]

        # ... and is scored against the target without its first token, so
        # position i of the input is trained to predict token i + 1
        # flat_scores = [batch size * (trg len - 1), output dim]
        # flat_gold = [batch size * (trg len - 1)]
        flat_scores = scores.contiguous().view(-1, vocab_size)
        flat_gold = target[:, 1:].contiguous().view(-1)

        batch_loss = criterion(flat_scores, flat_gold)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [27]:
def evaluate(model, iterator, criterion):
    """Return the mean batch loss of `model` over `iterator` without training.

    The model is switched to eval mode and gradients are disabled for the
    whole pass. Batches must expose `.src` and `.trg`; the model is called as
    model(src, trg_input) and must return (output, attention).
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # input: target without its last token
            # scores = [batch size, trg len - 1, output dim]
            scores, _ = model(source, target[:, :-1])
            vocab_size = scores.shape[-1]

            # gold labels: target without its first token
            # flat_scores = [batch size * (trg len - 1), output dim]
            # flat_gold = [batch size * (trg len - 1)]
            flat_scores = scores.contiguous().view(-1, vocab_size)
            flat_gold = target[:, 1:].contiguous().view(-1)

            running_loss += criterion(flat_scores, flat_gold).item()

    return running_loss / len(iterator)

In [28]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [29]:
# How the loss is set up (see also the last part of
# http://jalammar.github.io/illustrated-transformer/):
# train() and evaluate() feed the decoder the target without its last token
# and compare the predictions with the target without its first token, so the
# <sos> position is trained to predict the first real word, and so on.

N_EPOCHS = 10
CLIP = 1

# lowest validation loss seen so far; the weights are saved whenever it improves
best_valid_loss = math.inf

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # perplexity is reported as exp(loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 12s
	Train Loss: 4.230 | Train PPL:  68.700
	 Val. Loss: 2.995 |  Val. PPL:  19.991
Epoch: 02 | Time: 0m 13s
	Train Loss: 2.788 | Train PPL:  16.246
	 Val. Loss: 2.275 |  Val. PPL:   9.729
Epoch: 03 | Time: 0m 13s
	Train Loss: 2.220 | Train PPL:   9.205
	 Val. Loss: 1.975 |  Val. PPL:   7.207
Epoch: 04 | Time: 0m 13s
	Train Loss: 1.877 | Train PPL:   6.532
	 Val. Loss: 1.805 |  Val. PPL:   6.078
Epoch: 05 | Time: 0m 13s
	Train Loss: 1.636 | Train PPL:   5.132
	 Val. Loss: 1.706 |  Val. PPL:   5.507
Epoch: 06 | Time: 0m 13s
	Train Loss: 1.446 | Train PPL:   4.247
	 Val. Loss: 1.647 |  Val. PPL:   5.190
Epoch: 07 | Time: 0m 13s
	Train Loss: 1.296 | Train PPL:   3.654
	 Val. Loss: 1.627 |  Val. PPL:   5.087
Epoch: 08 | Time: 0m 12s
	Train Loss: 1.171 | Train PPL:   3.225
	 Val. Loss: 1.623 |  Val. PPL:   5.069
Epoch: 09 | Time: 0m 13s
	Train Loss: 1.062 | Train PPL:   2.892
	 Val. Loss: 1.614 |  Val. PPL:   5.022
Epoch: 10 | Time: 0m 13s
	Train Loss: 0.967 | Train PPL