In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
from torchtext.datasets import Multi30k
from torchtext.data import Field,BucketIterator

In [15]:

import spacy

from typing import Tuple
import random
import math
import time



In [16]:

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

In [17]:
SRC = Field(tokenize="spacy",
            tokenizer_language="it",
           init_token= '<sos>',
           eos_token= '<eos>',
           lower = True)

TRG = Field(tokenize= "spacy",
           tokenizer_language="it",
           init_token='<sos>',
           eos_token='<eos>',
           lower= True)

train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                    fields = (SRC, TRG))


In [19]:
!ls

 data					  tut3-model.pt
 it					  Untitled1.ipynb
 itwiki-latest-pages-articles.xml	  Untitled.ipynb
'Pytorch language model tutorial.ipynb'   wikiextractor
 tut3-model-modified_attn.pt


In [20]:
import torchtext

In [35]:
datasets =torchtext.datasets.TranslationDataset(path="data",exts = ('/itaerr', '/itacor'),
                                                    fields = (SRC, TRG))

In [42]:
train_data, valid_data, test_data= datasets.split([0.6,0.2,0.2])

In [43]:
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [44]:
SRC.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fb1c0e58850>>,
            {'<unk>': 0,
             '<pad>': 1,
             '<sos>': 2,
             '<eos>': 3,
             'di': 4,
             '_': 5,
             'e': 6,
             "'": 7,
             'il': 8,
             'la': 9,
             'num': 10,
             'in': 11,
             'a': 12,
             'che': 13,
             'del': 14,
             'un': 15,
             'l': 16,
             'è': 17,
             'per': 18,
             'della': 19,
             'unk': 20,
             'nel': 21,
             'si': 22,
             'le': 23,
             'i': 24,
             'una': 25,
             'con': 26,
             'da': 27,
             'al': 28,
             'dell': 29,
             'non': 30,
             'più': 31,
             'dei': 32,
             'alla': 33,
             'come': 34,
             'anche': 35,
             'delle': 36,
             'gli': 37,

In [45]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data,valid_data,test_data), \
    batch_size = BATCH_SIZE, \
    device= device) 

In [46]:
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch import Tensor

class Encoder(nn.Module):
    def __init__(self, 
                 input_dim: int, 
                 emb_dim: int, 
                 enc_hid_dim: int, 
                 dec_hid_dim: int, 
                 dropout: float):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, 
                src: Tensor) -> Tuple[Tensor]:
        
        #src = [src sent len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src sent len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded)
                
        #outputs = [src sent len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #initial decoder hidden is final hidden state of the forwards and backwards 
        #  encoder RNNs fed through a linear layer
        # Note: torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        # is of shape [batch_size, enc_hid_dim * 2]
        
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [src sent len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        
        return outputs, hidden

In [47]:
class Attention(nn.Module):
    def __init__(self, 
                 enc_hid_dim: int, 
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()
        
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim
        
        self.attn = nn.Linear(self.attn_in, attn_dim)
        self.v = nn.Parameter(torch.rand(attn_dim))
        
    def forward(self, 
                decoder_hidden: Tensor, 
                encoder_outputs: Tensor) -> Tensor:
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src sent len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat decoder hidden state src_len times
        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #decoder_hidden = [batch size, src sent len, dec hid dim]
        #encoder_outputs = [batch size, src sent len, enc hid dim * 2]
        
        # Step 1: to enable feeding through "self.attn" pink box above, concatenate 
        # `repeated_decoder_hidden` and `encoder_outputs`:
        # torch.cat((hidden, encoder_outputs), dim = 2) has shape 
        # [batch_size, seq_len, enc_hid_dim * 2 + dec_hid_dim]
        
        # Step 2: feed through self.attn to end up with:
        # [batch_size, seq_len, attn_dim]
        
        # Step 3: feed through tanh       
        
        energy = torch.tanh(self.attn(torch.cat((
            repeated_decoder_hidden, 
            encoder_outputs), 
            dim = 2))) 
        
        #energy = [batch size, src sent len, attn_dim]
        
        energy = energy.permute(0, 2, 1)
        
        #energy = [batch size, attn_dim, src sent len]
        
        #v = [attn_dim]
        
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        
        #v = [batch size, 1, attn_dim]
        
        # High level: energy a function of both encoder element outputs and most recent decoder hidden state,
        # of shape attn_dim x enc_seq_len for each observation
        # v, being 1 x attn_dim, transforms this into a vector of shape 1 x enc_seq_len for each observation
        # Then, we take the softmax over these to get the output of the attention function

        attention = torch.bmm(v, energy).squeeze(1)
        
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

In [48]:
class ModifiedAttention(nn.Module):
    def __init__(self, 
                 enc_hid_dim: int, 
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()
        
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim
        
        self.attn = nn.Linear(self.attn_in, attn_dim)
        
    def forward(self, 
                decoder_hidden: Tensor, 
                encoder_outputs: Tensor) -> Tensor:
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src sent len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat decoder hidden state src_len times
        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #decoder_hidden = [batch size, src sent len, dec hid dim]
        #encoder_outputs = [batch size, src sent len, enc hid dim * 2]
        
        # Step 1: to enable feeding through "self.attn" pink box above, concatenate 
        # `repeated_decoder_hidden` and `encoder_outputs`:
        # torch.cat((hidden, encoder_outputs), dim = 2) has shape 
        # [batch_size, seq_len, enc_hid_dim * 2 + dec_hid_dim]
        
        # Step 2: feed through self.attn to end up with:
        # [batch_size, seq_len, attn_dim]
        
        # Step 3: feed through tanh       
        
        energy = torch.tanh(self.attn(torch.cat((
            repeated_decoder_hidden, 
            encoder_outputs), 
            dim = 2))) 
        
        #energy = [batch size, src sent len, attn_dim]

        attention = torch.sum(energy, dim=2)
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

In [49]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim: int, 
                 emb_dim: int, 
                 enc_hid_dim: int, 
                 dec_hid_dim: int, 
                 dropout: int, 
                 attention: nn.Module):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.attention = attention
        # Note: from Attention: self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, attn_dim)
        
        # Note: `output_dim` same as `vocab_size`
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        
    def _weighted_encoder_rep(self, 
                              decoder_hidden: Tensor,
                              encoder_outputs: Tensor) -> Tensor:
        
        # Attention, at a high level, takes in:
        # The decoder hidden state
        # All the "seq_len" encoder outputs
        # Outputs a vector summing to 1 of length seq_len for each observation
        a = self.attention(decoder_hidden, encoder_outputs)

        #a = [batch size, src len]

        a = a.unsqueeze(1)

        #a = [batch size, 1, src len]

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        #encoder_outputs = [batch size, src sent len, enc hid dim * 2]

        weighted_encoder_rep = torch.bmm(a, encoder_outputs)

        #weighted_encoder_rep = [batch size, 1, enc hid dim * 2]

        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)

        #weighted_encoder_rep = [1, batch size, enc hid dim * 2]
        
        return weighted_encoder_rep
        
        
    def forward(self, 
                input: Tensor, 
                decoder_hidden: Tensor, 
                encoder_outputs: Tensor) -> Tuple[Tensor]:
             
        #input = [batch size] Note: "one character at a time"
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src sent len, batch size, enc hid dim * 2]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden, 
                                                          encoder_outputs)
        
        # Then, the input to the decoder _for this character_ is a concatenation of:
        # This weighted attention
        # The embedding itself
        
        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))
        
        #output = [sent len, batch size, dec hid dim * n directions]
        #decoder_hidden = [n layers * n directions, batch size, dec hid dim]
        
        #sent len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == decoder_hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)
        
        output = self.out(torch.cat((output, 
                                     weighted_encoder_rep, 
                                     embedded), dim = 1))
        
        #output = [bsz, output dim]
        
        return output, decoder_hidden.squeeze(0)

In [50]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder: nn.Module, 
                 decoder: nn.Module, 
                 device: torch.device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, 
                src: Tensor, 
                trg: Tensor, 
                teacher_forcing_ratio: float = 0.5) -> Tensor:
        
        #src = [src sent len, batch size]
        #trg = [trg sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        output = trg[0,:]
        
        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)

        return outputs

In [51]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ATTN_DIM = 64
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

In [52]:
mod_attn = ModifiedAttention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)

dec_mod_attn = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, mod_attn)

model_mod_attn = Seq2Seq(enc, dec_mod_attn, device).to(device)

In [53]:
def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)
model_mod_attn.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5979, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): ModifiedAttention(
      (attn): Linear(in_features=1536, out_features=64, bias=True)
    )
    (embedding): Embedding(6179, 256)
    (rnn): GRU(1280, 512)
    (out): Linear(in_features=1792, out_features=6179, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [54]:
def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')
print(f'The model has {count_parameters(model_mod_attn):,} trainable parameters')

The model has 19,935,651 trainable parameters
The model has 19,935,587 trainable parameters


In [55]:
optimizer = optim.Adam(model.parameters())

optimizer_mod_attn = optim.Adam(model_mod_attn.parameters())

In [56]:
PAD_IDX = TRG.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [57]:
def train(model: nn.Module, 
          iterator: BucketIterator, 
          optimizer: optim.Adam, 
          criterion: nn.modules.loss.CrossEntropyLoss, 
          clip: float):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]
        
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [58]:
def evaluate(model: nn.Module, 
             iterator: BucketIterator, 
             criterion: nn.modules.loss.CrossEntropyLoss):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)


In [59]:
def epoch_time(start_time: int, 
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [60]:
MODEL_PATH = 'tut3-model.pt'
MODEL_PATH_MOD_ATTN = 'tut3-model-modified_attn.pt'

In [61]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model_mod_attn, train_iterator, optimizer_mod_attn, criterion, CLIP)
    valid_loss = evaluate(model_mod_attn, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_mod_attn.state_dict(), MODEL_PATH_MOD_ATTN)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 12s
	Train Loss: 7.195 | Train PPL: 1333.066
	 Val. Loss: 6.028 |  Val. PPL: 414.790
Epoch: 02 | Time: 0m 12s
	Train Loss: 6.417 | Train PPL: 612.122
	 Val. Loss: 5.928 |  Val. PPL: 375.221
Epoch: 03 | Time: 0m 12s
	Train Loss: 6.289 | Train PPL: 538.410
	 Val. Loss: 5.967 |  Val. PPL: 390.212
Epoch: 04 | Time: 0m 12s
	Train Loss: 6.130 | Train PPL: 459.526
	 Val. Loss: 5.874 |  Val. PPL: 355.824
Epoch: 05 | Time: 0m 13s
	Train Loss: 6.022 | Train PPL: 412.314
	 Val. Loss: 5.877 |  Val. PPL: 356.563
Epoch: 06 | Time: 0m 13s
	Train Loss: 5.901 | Train PPL: 365.284
	 Val. Loss: 5.896 |  Val. PPL: 363.458
Epoch: 07 | Time: 0m 13s
	Train Loss: 5.789 | Train PPL: 326.632
	 Val. Loss: 5.947 |  Val. PPL: 382.624
Epoch: 08 | Time: 0m 12s
	Train Loss: 5.691 | Train PPL: 296.279
	 Val. Loss: 5.958 |  Val. PPL: 387.014
Epoch: 09 | Time: 0m 12s
	Train Loss: 5.601 | Train PPL: 270.697
	 Val. Loss: 5.931 |  Val. PPL: 376.467
Epoch: 10 | Time: 0m 12s
	Train Loss: 5.517 | Train PP

In [62]:
model_mod_attn.load_state_dict(torch.load(MODEL_PATH_MOD_ATTN))

test_loss = evaluate(model_mod_attn, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')


| Test Loss: 5.845 | Test PPL: 345.575 |


In [63]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_PATH)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 12s
	Train Loss: 6.900 | Train PPL: 992.536
	 Val. Loss: 6.002 |  Val. PPL: 404.248
Epoch: 02 | Time: 0m 12s
	Train Loss: 6.237 | Train PPL: 511.362
	 Val. Loss: 5.851 |  Val. PPL: 347.665
Epoch: 03 | Time: 0m 12s
	Train Loss: 6.076 | Train PPL: 435.426
	 Val. Loss: 5.854 |  Val. PPL: 348.612
Epoch: 04 | Time: 0m 12s
	Train Loss: 5.895 | Train PPL: 363.329
	 Val. Loss: 5.872 |  Val. PPL: 354.919
Epoch: 05 | Time: 0m 12s
	Train Loss: 5.742 | Train PPL: 311.661
	 Val. Loss: 5.949 |  Val. PPL: 383.499
Epoch: 06 | Time: 0m 12s
	Train Loss: 5.621 | Train PPL: 276.224
	 Val. Loss: 5.975 |  Val. PPL: 393.430
Epoch: 07 | Time: 0m 12s
	Train Loss: 5.495 | Train PPL: 243.468
	 Val. Loss: 6.039 |  Val. PPL: 419.428
Epoch: 08 | Time: 0m 13s
	Train Loss: 5.408 | Train PPL: 223.125
	 Val. Loss: 5.963 |  Val. PPL: 388.761
Epoch: 09 | Time: 0m 12s
	Train Loss: 5.285 | Train PPL: 197.408
	 Val. Loss: 5.974 |  Val. PPL: 393.264
Epoch: 10 | Time: 0m 13s
	Train Loss: 5.143 | Train PPL

In [64]:

model.load_state_dict(torch.load(MODEL_PATH))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')



| Test Loss: 5.817 | Test PPL: 335.975 |
