In [22]:
#!pip install apache_beam
!pip install datasets



In [2]:
from datasets import list_datasets, list_metrics, load_dataset, load_metric
from pprint import pprint
import pandas as pd

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset

import spacy
import numpy as np

import random
import math
import time

In [5]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
datasets = list_datasets()

In [9]:
print(f"🤩 Currently {len(datasets)} datasets are available on the hub:")
pprint(datasets, compact=True)

🤩 Currently 606 datasets are available on the hub:
['acronym_identification', 'ade_corpus_v2', 'aeslc', 'afrikaans_ner_corpus',
 'ag_news', 'ai2_arc', 'air_dialogue', 'ajgt_twitter_ar', 'allegro_reviews',
 'allocine', 'alt', 'amazon_polarity', 'amazon_reviews_multi',
 'amazon_us_reviews', 'ambig_qa', 'amttl', 'anli', 'app_reviews', 'aqua_rat',
 'aquamuse', 'ar_cov19', 'ar_res_reviews', 'arabic_billion_words',
 'arabic_pos_dialect', 'arcd', 'arsentd_lev', 'art', 'arxiv_dataset',
 'aslg_pc12', 'asnq', 'asset', 'assin', 'assin2', 'atomic', 'autshumato',
 'bc2gm_corpus', 'best2009', 'bianet', 'bible_para', 'big_patent', 'billsum',
 'bing_coronavirus_query_set', 'biomrc', 'blended_skill_talk', 'blimp',
 'blog_authorship_corpus', 'bookcorpus', 'bookcorpusopen', 'boolq', 'bprec',
 'break_data', 'brwac', 'bsd_ja_en', 'bswac', 'c3', 'c4', 'cail2018', 'capes',
 'catalonia_independence', 'cawac', 'cc100', 'cdsc', 'cdt', 'cfq', 'chr_en',
 'cifar10', 'circa', 'civil_comments', 'clickbait_news_bg', 

In [9]:
hotpot_qa = list_datasets(with_details=True)[datasets.index('hotpot_qa')]
pprint(hotpot_qa.__dict__) 

{'author': None,
 'citation': '@inproceedings{yang2018hotpotqa,\n'
             '  title={{HotpotQA}: A Dataset for Diverse, Explainable '
             'Multi-hop Question Answering},\n'
             '  author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and '
             'Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan '
             'and Manning, Christopher D.},\n'
             '  booktitle={Conference on Empirical Methods in Natural Language '
             'Processing ({EMNLP})},\n'
             '  year={2018}\n'
             '}',
 'description': 'HotpotQA is a new dataset with 113k Wikipedia-based '
                'question-answer pairs with four key features:\n'
                '(1) the questions require finding and reasoning over multiple '
                'supporting documents to answer;\n'
                '(2) the questions are diverse and not constrained to any '
                'pre-existing knowledge bases or knowledge schemas;\n'
                '(3) we 

Loading the hotpot_qa from datasets

In [10]:

dataset = load_dataset('hotpot_qa','fullwiki')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2355.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1422.0, style=ProgressStyle(description…


Downloading and preparing dataset hotpot_qa/fullwiki (download: 629.52 MiB, generated: 615.88 MiB, post-processed: Unknown size, total: 1.22 GiB) to /root/.cache/huggingface/datasets/hotpot_qa/fullwiki/1.0.0/5b529f51b10fc1cb0a543005fd58b683fc65d6ab784a0b11f901d5aae47dd137...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=566426227.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=47454698.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=46213747.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset hotpot_qa downloaded and prepared to /root/.cache/huggingface/datasets/hotpot_qa/fullwiki/1.0.0/5b529f51b10fc1cb0a543005fd58b683fc65d6ab784a0b11f901d5aae47dd137. Subsequent calls will reuse this data.


## **Exploring the Hotpot QA dataset**

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 90447
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 7405
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 7405
    })
})

In [12]:
dataset['train'].__getitem__('context')[0].get('sentences')

[["Radio City is India's first private FM radio station and was started on 3 July 2001.",
  ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).',
  ' It plays Hindi, English and regional songs.',
  ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.',
  ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.',
  ' The Radio station currently plays a mix of Hindi and Regional music.',
  ' Abraham Thomas is the CEO of the company.'],
 ['Football in Albania existed before the Albanian Football Federation (FSHF) was created.',
  " This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had

Context is a dictionary with two key value pairs - Sentences and Title.
Sentences is the text and title is the metadata. Sentences is a list of lists containing individual sentences

In [13]:
dataset['train'].__getitem__('context')[0].get('sentences')

[["Radio City is India's first private FM radio station and was started on 3 July 2001.",
  ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).',
  ' It plays Hindi, English and regional songs.',
  ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.',
  ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.',
  ' The Radio station currently plays a mix of Hindi and Regional music.',
  ' Abraham Thomas is the CEO of the company.'],
 ['Football in Albania existed before the Albanian Football Federation (FSHF) was created.',
  " This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had

In [14]:
import itertools
print("".join(list(itertools.chain.from_iterable(dataset['train'].__getitem__('context')[0].get('sentences')))))

Radio City is India's first private FM radio station and was started on 3 July 2001. It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003). It plays Hindi, English and regional songs. It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007. Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features. The Radio station currently plays a mix of Hindi and Regional music. Abraham Thomas is the CEO of the company.Football in Albania existed before the Albanian Football Federation (FSHF) was created. This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had pressure from the teams because of competition, com

In [15]:
train_context, train_question = dataset['train'].__getitem__('context'), dataset['train'].__getitem__('question')
train_ans = dataset['train'].__getitem__('answer')


In [16]:
train_context[10],train_question[10],train_ans[10]

({'sentences': [['Lights Out Paris is the first studio album by American hip hop artist Sims, a member of Minneapolis indie hip hop collective Doomtree.',
    ' It was released July 28, 2005 on Doomtree Records and includes guest appearances from P.O.S, Crescent Moon, and Toki Wright, among others.',
    ' The album was re-released with four remixes and five songs from Sims\' "False Hopes Four" on vinyl in June 2015.'],
   ['Jaime Meline (born March 2, 1975), better known by his stage name El-P (shortened from El Producto), is an American hip hop recording artist, record producer, and record executive.',
    ' Originally a member of Company Flow, El-P has been a major driving force in alternative hip hop for more than two decades, producing for several notable rappers such as Aesop Rock, Mr. Lif, and Cage, among others.'],
   ['Born and Raised is the debut EP by American hip hop duo Smif-N-Wessun, released on December 3, 2013, under Duck Down Music Inc..',
    ' Entirely produced by Be

In [17]:
type(train_context)

list

In [18]:
["".join(list(itertools.chain.from_iterable(dataset['train'].__getitem__('context')[i].get('sentences')))) for i in range(2)]

['Radio City is India\'s first private FM radio station and was started on 3 July 2001. It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003). It plays Hindi, English and regional songs. It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007. Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features. The Radio station currently plays a mix of Hindi and Regional music. Abraham Thomas is the CEO of the company.Football in Albania existed before the Albanian Football Federation (FSHF) was created. This was evidenced by the team\'s registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had pressure from the teams because of competition,

In [19]:
[train_ans[i] for i in range(2)]

["Arthur's Magazine", 'Delhi']

In [20]:
len(train_question)

90447

In [25]:
%%time
Train = [["".join(list(itertools.chain.from_iterable(dataset['train'].__getitem__('context')[i].get('sentences')))), train_question[i], train_ans[i]] for i in range(1000)]

CPU times: user 2h 12min 46s, sys: 6min 23s, total: 2h 19min 10s
Wall time: 2h 19min 10s


In [44]:
%%time
train_df = pd.DataFrame(Train, columns = ['Context','Question','Answer'])
train_df.to_csv('Train.csv', index =  False)

CPU times: user 124 ms, sys: 2.97 ms, total: 127 ms
Wall time: 131 ms


In [36]:
import os
os.getcwd()

'/content'

In [29]:
val_context, val_question = dataset['validation'].__getitem__('context'), dataset['validation'].__getitem__('question')
val_ans = dataset['validation'].__getitem__('answer')

In [32]:
val_ans[0], val_question[0], "".join(list(itertools.chain.from_iterable(dataset['validation'].__getitem__('context')[0].get('sentences'))))

('yes',
 'Were Scott Derrickson and Ed Wood of the same nationality?',
 'Adam Collis is an American filmmaker and actor. He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010. He also studied cinema at the University of Southern California from 1991 to 1997. Collis first work was the assistant director for the Scott Derrickson\'s short "Love in the Ruins" (1995). In 1998, he played "Crankshaft" in Eric Koyanagi\'s "Hundred Percent".Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood. The film concerns the period in Wood\'s life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau. Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.Tyler Bates (born June 5, 1965) is an American musician, music producer, and com

In [34]:
%%time
Val = [["".join(list(itertools.chain.from_iterable(dataset['validation'].__getitem__('context')[i].get('sentences')))), val_question[i], val_ans[i]] for i in range(1000)]

CPU times: user 11min 23s, sys: 116 ms, total: 11min 23s
Wall time: 11min 23s


In [45]:
val_df = pd.DataFrame(Val, columns = ['Context','Question','Answer'])
val_df.to_csv('Validation.csv', index = False)

In [46]:
spacy_en = spacy.load('en')
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [47]:
SRC = Field(tokenize=tokenize_en, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True)

TRG = Field(tokenize = tokenize_en, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True)

In [48]:
train_data, val_data = TabularDataset.splits(path='.',
                                             train = 'Train.csv',
                                             validation = 'Validation.csv',
                                             skip_header = True,
                                             format = 'csv',
                                             fields =[('src',SRC), ('trg', TRG)])

In [83]:
vars(val_data.examples[2])

{'src': ['animorphs',
  'is',
  'a',
  'science',
  'fantasy',
  'series',
  'of',
  'young',
  'adult',
  'books',
  'written',
  'by',
  'katherine',
  'applegate',
  'and',
  'her',
  'husband',
  'michael',
  'grant',
  ',',
  'writing',
  'together',
  'under',
  'the',
  'name',
  'k.',
  'a.',
  'applegate',
  ',',
  'and',
  'published',
  'by',
  'scholastic',
  '.',
  'it',
  'is',
  'told',
  'in',
  'first',
  'person',
  ',',
  'with',
  'all',
  'six',
  'main',
  'characters',
  'taking',
  'turns',
  'narrating',
  'the',
  'books',
  'through',
  'their',
  'own',
  'perspectives',
  '.',
  'horror',
  ',',
  'war',
  ',',
  'dehumanization',
  ',',
  'sanity',
  ',',
  'morality',
  ',',
  'innocence',
  ',',
  'leadership',
  ',',
  'freedom',
  'and',
  'growing',
  'up',
  'are',
  'the',
  'core',
  'themes',
  'of',
  'the',
  'series',
  '.',
  'science',
  'fantasy',
  ',',
  'which',
  'also',
  'appeared',
  'under',
  'the',
  'titles',
  'impulse',
  'and',

In [50]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [51]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [67]:
BATCH_SIZE = 32

train_iterator, val_iterator = BucketIterator.splits(
    (train_data, val_data), 
    batch_size = BATCH_SIZE, 
    sort = False,
    device = device)

In [69]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        
        self.embedding = nn.Embedding(input_dim, emb_dim) #no dropout as only one layer!
        
        self.rnn = nn.GRU(emb_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded) #no cell state!
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden

In [70]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, context):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #context = [n layers * n directions, batch size, hid dim]
        
        #n layers and n directions in the decoder will both always be 1, therefore:
        #hidden = [1, batch size, hid dim]
        #context = [1, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        emb_con = torch.cat((embedded, context), dim = 2)
            
        #emb_con = [1, batch size, emb dim + hid dim]
            
        output, hidden = self.rnn(emb_con, hidden)
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        
        #seq len, n layers and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), 
                           dim = 1)
        
        #output = [batch size, emb dim + hid dim * 2]
        
        prediction = self.fc_out(output)
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden

In [71]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is the context
        context = self.encoder(src)
        
        #context also used as the initial hidden state of the decoder
        hidden = context
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and the context state
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, context)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [72]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

In [73]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(27755, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(1534, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=1534, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [74]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 12,614,910 trainable parameters


In [75]:
optimizer = optim.Adam(model.parameters())

In [76]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [77]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [78]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [79]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [81]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 12s
	Train Loss: 4.828 | Train PPL: 124.989
	 Val. Loss: 4.448 |  Val. PPL:  85.494
Epoch: 02 | Time: 0m 12s
	Train Loss: 4.760 | Train PPL: 116.724
	 Val. Loss: 4.441 |  Val. PPL:  84.878
Epoch: 03 | Time: 0m 12s
	Train Loss: 4.668 | Train PPL: 106.440
	 Val. Loss: 4.457 |  Val. PPL:  86.217
Epoch: 04 | Time: 0m 13s
	Train Loss: 4.595 | Train PPL:  98.958
	 Val. Loss: 4.532 |  Val. PPL:  92.943
Epoch: 05 | Time: 0m 12s
	Train Loss: 4.547 | Train PPL:  94.376
	 Val. Loss: 4.458 |  Val. PPL:  86.276
Epoch: 06 | Time: 0m 12s
	Train Loss: 4.463 | Train PPL:  86.782
	 Val. Loss: 4.457 |  Val. PPL:  86.252
Epoch: 07 | Time: 0m 12s
	Train Loss: 4.421 | Train PPL:  83.192
	 Val. Loss: 4.538 |  Val. PPL:  93.527
Epoch: 08 | Time: 0m 12s
	Train Loss: 4.364 | Train PPL:  78.604
	 Val. Loss: 4.474 |  Val. PPL:  87.733
Epoch: 09 | Time: 0m 12s
	Train Loss: 4.328 | Train PPL:  75.786
	 Val. Loss: 4.548 |  Val. PPL:  94.437
Epoch: 10 | Time: 0m 12s
	Train Loss: 4.290 | Train PPL