In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn
import torch.optim as optim


In [2]:
data = load_dataset('wmt16','cs-en')

Reusing dataset wmt16 (/home/gaojunting/.cache/huggingface/datasets/wmt16/cs-en/1.0.0/28ebdf8cf22106c2f1e58b2083d4b103608acd7bfdb6b14313ccd9e5bc8c313a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 997240
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2656
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2999
    })
})

In [4]:
train_data=data['train'][:100000]

In [5]:
val_data=data['validation']
test_data=data['test']

In [6]:
class CustomSet(Dataset):
    def __init__(self,raw_data):
        self.raw_data=raw_data['translation']
    def __getitem__(self,idx):
        return self.raw_data[idx]
    def __len__(self):
        return len(self.raw_data)

In [7]:
train_ds=CustomSet(train_data)
val_ds=CustomSet(val_data)
test_ds=CustomSet(test_data)

In [8]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer

## a pretokenizer to segment the text into words
from tokenizers.pre_tokenizers import Whitespace

In [9]:
unk_token = "<UNK>"  # token for unknown words
spl_tokens = ["<UNK>", "<EOS>", "<PAD>", "<SOS>"]  # special tokens

def prepare_tokenizer_trainer(alg):
    """
    Prepares the tokenizer and trainer with unknown & special tokens.
    """
    if alg == 'BPE':
        tokenizer = Tokenizer(BPE(unk_token = unk_token))
        trainer = BpeTrainer(special_tokens = spl_tokens)
    elif alg == 'UNI':
        tokenizer = Tokenizer(Unigram())
        trainer = UnigramTrainer(unk_token= unk_token, special_tokens = spl_tokens)
    elif alg == 'WPC':
        tokenizer = Tokenizer(WordPiece(unk_token = unk_token))
        trainer = WordPieceTrainer(special_tokens = spl_tokens)
    else:
        tokenizer = Tokenizer(WordLevel(unk_token = unk_token))
        trainer = WordLevelTrainer(special_tokens = spl_tokens)
    
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer

In [10]:
cs_tokenizer,cs_trainer=prepare_tokenizer_trainer('BPE')
en_tokenizer,en_trainer=prepare_tokenizer_trainer('BPE')

In [11]:
cs_lang=[item['cs'] for item in train_ds] 

In [12]:
en_lang=[item['en'] for item in train_ds]

In [13]:
cs_tokenizer.train_from_iterator(cs_lang,trainer=cs_trainer)
en_tokenizer.train_from_iterator(en_lang,trainer=en_trainer)




In [14]:
cs_tokenizer.get_vocab_size()

30000

In [15]:
en_tokenizer.get_vocab_size()

30000

In [16]:
a=next(iter(train_ds))

In [17]:
a['cs']

'Následný postup na základě usnesení Parlamentu: viz zápis'

In [18]:
cs_tokenizer.encode(a['cs']).ids

[14044, 1183, 194, 1204, 1177, 822, 26, 1343, 1427]

In [19]:
src_pad_id=cs_tokenizer.token_to_id('<PAD>')
trg_pad_id=en_tokenizer.token_to_id('<PAD>')
src_eos_id=cs_tokenizer.token_to_id('<EOS>')
src_sos_id=cs_tokenizer.token_to_id('<SOS>')
trg_eos_id=en_tokenizer.token_to_id('<EOS>')
trg_sos_id=en_tokenizer.token_to_id('<SOS>')

In [20]:
src_pad_id,src_sos_id,src_eos_id

(2, 3, 1)

In [21]:
trg_pad_id,trg_sos_id,trg_eos_id

(2, 3, 1)

In [22]:
cs_tokenizer.enable_padding(length=50,pad_id=src_pad_id)
cs_tokenizer.enable_truncation(max_length=50)
en_tokenizer.enable_padding(length=50,pad_id=trg_pad_id)
en_tokenizer.enable_truncation(max_length=50)

In [23]:
def collate_fn(batch):
    cs_ls,en_ls=[],[]
    for item in batch:
        cs_sent='<SOS> '+item['cs']+' <EOS>'
        en_sent='<SOS> '+item['en']+' <EOS>'
        cs_ls.append(torch.LongTensor(cs_tokenizer.encode(cs_sent).ids))
        en_ls.append(torch.LongTensor(en_tokenizer.encode(en_sent).ids))
    return torch.vstack(cs_ls),torch.vstack(en_ls)

In [24]:
train_loader=DataLoader(dataset=train_ds,batch_size=64,shuffle=True,collate_fn=collate_fn)
valid_loader=DataLoader(dataset=val_ds,batch_size=64,shuffle=True,collate_fn=collate_fn)
test_loader=DataLoader(dataset=test_ds,batch_size=64,shuffle=True,collate_fn=collate_fn)

In [25]:
next(iter(train_loader))[0][0]

tensor([    3, 19228,  1306,   195,  1230,  1620,  7881,   194,  1790,    12,
          940,   195,   837,    12,   740,   362,  2765,  2058,    12,   508,
         7734,    14,     1,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2])

In [26]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self,hid_dim,n_heads,dropout,device):
        super().__init__()
        assert hid_dim%n_heads==0
        self.hid_dim=hid_dim
        self.n_heads=n_heads
        self.head_dim=hid_dim//n_heads
        self.q=nn.Linear(hid_dim,hid_dim)
        self.k=nn.Linear(hid_dim,hid_dim)
        self.v=nn.Linear(hid_dim,hid_dim)
        self.o=nn.Linear(hid_dim,hid_dim)
        self.dropout=nn.Dropout(dropout)
        self.scale=torch.sqrt(torch.LongTensor([self.head_dim])).to(device)

    def forward(self,query,key,value,mask=None):
        batch_size=query.shape[0]
        Q=self.q(query)
        K=self.k(key)
        V=self.v(value)
        #K V is the same size while Q may not

        Q=Q.view(batch_size,-1,self.n_heads,self.head_dim).permute(0,2,1,3)
        K=K.view(batch_size,-1,self.n_heads,self.head_dim).permute(0,2,1,3)
        V=V.view(batch_size,-1,self.n_heads,self.head_dim).permute(0,2,1,3)

        energy=(Q@K.permute(0,1,3,2))/self.scale

        if mask is not None:
            energy=energy.masked_fill(mask==0,-1e10)
        attention = torch.softmax(energy,dim=-1)

        x = self.dropout(attention) @ V
        x = x.permute(0,2,1,3).contiguous()
        x = x.view(batch_size,-1,self.hid_dim)
        x = self.o(x)
        return x,attention
        

In [27]:
class PositionwiseForwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x

In [28]:


class EncoderLayer(nn.Module):
    def __init__(self,hid_dim,n_heads,pf_dim,dropout,device):
        super().__init__()
        self.self_att_layer_norm=nn.LayerNorm(hid_dim)
        self.ff_layer_norm=nn.LayerNorm(hid_dim)
        self.self_attn=MultiHeadAttentionLayer(hid_dim,n_heads,dropout,device)
        self.positionwise_feedforward=PositionwiseForwardLayer(hid_dim,pf_dim,dropout)
        self.dropout=nn.Dropout(dropout)
    def forward(self,src,src_mask):
        _src,_=self.self_attn(src,src,src,src_mask)
        src = self.self_att_layer_norm(src+self.dropout(_src))
        _src = self.positionwise_feedforward(src)
        src=self.ff_layer_norm(src+self.dropout(_src))
        return src


In [29]:
class Encoder(nn.Module):
    def __init__(self,input_dim,hid_dim,n_layers,n_heads,pf_dim,dropout,device,max_length=50):
        super().__init__()
        self.device=device
        self.embedding = nn.Embedding(input_dim,hid_dim)
        self.pos_embedding=nn.Embedding(max_length,hid_dim)
        self.layers=nn.ModuleList([EncoderLayer(hid_dim,n_heads,pf_dim,dropout,device) for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)
        self.scale=torch.FloatTensor([hid_dim]).to(device)
    def forward(self,src,src_mask):
        #batch first
        batch_size=src.shape[0]
        src_len=src.shape[1]
        pos=torch.arange(0, src_len).unsqueeze(0).repeat(batch_size,1).to(self.device)
        
        src = self.dropout((self.embedding(src)*self.scale)+self.pos_embedding(pos))

        for layer in self.layers:
            src=layer(src,src_mask)
        return src

In [30]:
# def train_tokenizer(files, alg='WLV'):
#     """
#     Takes the files and trains the tokenizer.
#     """
#     tokenizer, trainer = prepare_tokenizer_trainer(alg)
#     tokenizer.train(files, trainer) # training the tokenzier
#     tokenizer.save("./tokenizer-trained.json")
#     tokenizer = Tokenizer.from_file("./tokenizer-trained.json")
#     return tokenizer

In [31]:
class DecoderLayer(nn.Module):
    def __init__(self,hid_dim,n_heads,pf_dim,dropout,device) -> None:
        super().__init__()
        self.multi_att=MultiHeadAttentionLayer(hid_dim,n_heads,dropout,device)
        self.multi_att_norm=nn.LayerNorm(hid_dim)
        self.feed_forward_layer=PositionwiseForwardLayer(hid_dim,pf_dim,dropout)
        self.feed_norm=nn.LayerNorm(hid_dim)

        self.encoder_attn=MultiHeadAttentionLayer(hid_dim,n_heads,dropout,device)
        self.encoder_norm=nn.LayerNorm(hid_dim)
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,trg,en_src,trg_mask,enc_mask):

        _trg,_=self.multi_att(trg,trg,trg,trg_mask)
        trg=self.multi_att_norm(trg+self.dropout(_trg))

        _trg,att=self.encoder_attn(trg,en_src,en_src,enc_mask)
        trg = self.encoder_norm(trg+self.dropout(_trg))

        _trg=self.feed_forward_layer(trg)
        trg=self.feed_norm(trg+self.dropout(_trg))

        return trg,att



In [32]:
class Decoder(nn.Module):
    def __init__(self,output_dim,hid_dim,n_layers,n_heads,pf_dim,dropout,device,max_length=50):
        super().__init__()
        self.device=device

        self.embedding=nn.Embedding(output_dim,hid_dim)
        self.pos_embedding=nn.Embedding(max_length,hid_dim)

        self.layers=nn.ModuleList([DecoderLayer(hid_dim,n_heads,pf_dim,dropout,device) for _ in range(n_layers)])

        self.fc_out=nn.Linear(hid_dim,output_dim)
        self.scale=torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        self.dropout=nn.Dropout(dropout)
    def forward(self,trg,enc_src,trg_mask,enc_mask):
        batch_size=trg.shape[0]
        trg_len=trg.shape[1]

        pos=torch.arange(0,trg_len).unsqueeze(0).repeat(batch_size,1).to(self.device)
        trg=self.dropout(self.embedding(trg)*self.scale +self.pos_embedding(pos))

        for layer in self.layers:
            trg,attention=layer(trg,enc_src,trg_mask,enc_mask)
        output=self.fc_out(trg)
        return output,attention
        



In [33]:
class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder,src_pad_idx,trg_pad_idx,device):
        super().__init__()
        self.encoder=encoder
        self.decoder=decoder
        self.src_pad_idx=src_pad_idx
        self.trg_pad_idx=trg_pad_idx
        self.device=device
    def make_src_mask(self,src):
        src_mask=(src!=self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask
    def make_trg_mask(self,trg):
        trg_pad_mask=(trg!=self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len=trg.shape[1]
        trg_sub_mask=torch.tril(torch.ones(trg_len,trg_len)).bool().to(self.device)
        #batch_size 1 seq_len seq_len
        trg_mask=trg_pad_mask&trg_sub_mask
        return trg_mask

    def forward(self,src,trg):
        src_mask=self.make_src_mask(src)
        trg_mask=self.make_trg_mask(trg)

        enc_src=self.encoder(src,src_mask)

        output,attention=self.decoder(trg,enc_src,trg_mask,src_mask)

        return output,attention


In [142]:
from torch import Tensor

In [143]:
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


In [169]:
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = nn.Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [170]:
input_d=cs_tokenizer.get_vocab_size()
output_d=en_tokenizer.get_vocab_size()

In [171]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [172]:
INPUT_DIM = input_d
OUTPUT_DIM = output_d
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

In [173]:
# model=nn.Transformer(d_model=HID_DIM,nhead=ENC_HEADS,dim_feedforward=ENC_PF_DIM)

In [192]:
model=Seq2SeqTransformer(num_encoder_layers=ENC_LAYERS,
                         num_decoder_layers=DEC_LAYERS,
                         src_vocab_size=INPUT_DIM,
                         emb_size=HID_DIM,
                         nhead=ENC_HEADS,
                         tgt_vocab_size=OUTPUT_DIM,
                         dropout=ENC_DROPOUT
                        )

In [193]:
model=model.to(device)

In [194]:
mask = (torch.tril(torch.ones((12, 12))))

In [195]:
mask

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [196]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=device).type(torch.bool)

    src_padding_mask = (src == src_pad_id).transpose(0, 1)
    tgt_padding_mask = (tgt == trg_pad_id).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask


In [197]:
# encoder=Encoder(INPUT_DIM,HID_DIM,ENC_LAYERS,ENC_HEADS,ENC_PF_DIM,ENC_DROPOUT,device)
# decoder=Decoder(OUTPUT_DIM,HID_DIM,DEC_LAYERS,DEC_HEADS,DEC_PF_DIM,DEC_DROPOUT,device)

In [198]:
# model=Seq2Seq(encoder,decoder,src_pad_id,trg_pad_id,device).to(device)

In [199]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 27,024,688 trainable parameters


In [200]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [201]:
model.apply(initialize_weights)

Seq2SeqTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_feature

In [202]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [203]:
criterion = nn.CrossEntropyLoss(ignore_index = trg_pad_id)

In [204]:
from tqdm import tqdm

In [205]:
next(iter(train_loader))[1][:,:-1].shape

torch.Size([64, 49])

In [249]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch[0].to(device).T
        trg = batch[1].to(device).T
        train_trg=trg[:-1,:]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, train_trg)
        optimizer.zero_grad()
        print(src.size(1))
        print(trg.size(1))
        
        output= model(src,train_trg,src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
        print(output.shape)
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[1:,:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [241]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(tqdm(iterator)):

            src = batch[0].to(device).T
            trg = batch[1].to(device).T

            output, _ = model(src, trg[:,:-1])
            #batch first
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)


In [242]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [243]:
import time
import math

In [250]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Size([49, 64, 30000])
64
64
torch.Si

KeyboardInterrupt: 

In [216]:
model.load_state_dict(torch.load('tut6-model.pt'))

test_loss = evaluate(model, test_loader, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

RuntimeError: Error(s) in loading state_dict for Seq2SeqTransformer:
	Missing key(s) in state_dict: "transformer.encoder.layers.0.self_attn.in_proj_weight", "transformer.encoder.layers.0.self_attn.in_proj_bias", "transformer.encoder.layers.0.self_attn.out_proj.weight", "transformer.encoder.layers.0.self_attn.out_proj.bias", "transformer.encoder.layers.0.linear1.weight", "transformer.encoder.layers.0.linear1.bias", "transformer.encoder.layers.0.linear2.weight", "transformer.encoder.layers.0.linear2.bias", "transformer.encoder.layers.0.norm1.weight", "transformer.encoder.layers.0.norm1.bias", "transformer.encoder.layers.0.norm2.weight", "transformer.encoder.layers.0.norm2.bias", "transformer.encoder.layers.1.self_attn.in_proj_weight", "transformer.encoder.layers.1.self_attn.in_proj_bias", "transformer.encoder.layers.1.self_attn.out_proj.weight", "transformer.encoder.layers.1.self_attn.out_proj.bias", "transformer.encoder.layers.1.linear1.weight", "transformer.encoder.layers.1.linear1.bias", "transformer.encoder.layers.1.linear2.weight", "transformer.encoder.layers.1.linear2.bias", "transformer.encoder.layers.1.norm1.weight", "transformer.encoder.layers.1.norm1.bias", "transformer.encoder.layers.1.norm2.weight", "transformer.encoder.layers.1.norm2.bias", "transformer.encoder.layers.2.self_attn.in_proj_weight", "transformer.encoder.layers.2.self_attn.in_proj_bias", "transformer.encoder.layers.2.self_attn.out_proj.weight", "transformer.encoder.layers.2.self_attn.out_proj.bias", "transformer.encoder.layers.2.linear1.weight", "transformer.encoder.layers.2.linear1.bias", "transformer.encoder.layers.2.linear2.weight", "transformer.encoder.layers.2.linear2.bias", "transformer.encoder.layers.2.norm1.weight", "transformer.encoder.layers.2.norm1.bias", "transformer.encoder.layers.2.norm2.weight", "transformer.encoder.layers.2.norm2.bias", "transformer.encoder.norm.weight", "transformer.encoder.norm.bias", "transformer.decoder.layers.0.self_attn.in_proj_weight", "transformer.decoder.layers.0.self_attn.in_proj_bias", "transformer.decoder.layers.0.self_attn.out_proj.weight", "transformer.decoder.layers.0.self_attn.out_proj.bias", "transformer.decoder.layers.0.multihead_attn.in_proj_weight", "transformer.decoder.layers.0.multihead_attn.in_proj_bias", "transformer.decoder.layers.0.multihead_attn.out_proj.weight", "transformer.decoder.layers.0.multihead_attn.out_proj.bias", "transformer.decoder.layers.0.linear1.weight", "transformer.decoder.layers.0.linear1.bias", "transformer.decoder.layers.0.linear2.weight", "transformer.decoder.layers.0.linear2.bias", "transformer.decoder.layers.0.norm1.weight", "transformer.decoder.layers.0.norm1.bias", "transformer.decoder.layers.0.norm2.weight", "transformer.decoder.layers.0.norm2.bias", "transformer.decoder.layers.0.norm3.weight", "transformer.decoder.layers.0.norm3.bias", "transformer.decoder.layers.1.self_attn.in_proj_weight", "transformer.decoder.layers.1.self_attn.in_proj_bias", "transformer.decoder.layers.1.self_attn.out_proj.weight", "transformer.decoder.layers.1.self_attn.out_proj.bias", "transformer.decoder.layers.1.multihead_attn.in_proj_weight", "transformer.decoder.layers.1.multihead_attn.in_proj_bias", "transformer.decoder.layers.1.multihead_attn.out_proj.weight", "transformer.decoder.layers.1.multihead_attn.out_proj.bias", "transformer.decoder.layers.1.linear1.weight", "transformer.decoder.layers.1.linear1.bias", "transformer.decoder.layers.1.linear2.weight", "transformer.decoder.layers.1.linear2.bias", "transformer.decoder.layers.1.norm1.weight", "transformer.decoder.layers.1.norm1.bias", "transformer.decoder.layers.1.norm2.weight", "transformer.decoder.layers.1.norm2.bias", "transformer.decoder.layers.1.norm3.weight", "transformer.decoder.layers.1.norm3.bias", "transformer.decoder.layers.2.self_attn.in_proj_weight", "transformer.decoder.layers.2.self_attn.in_proj_bias", "transformer.decoder.layers.2.self_attn.out_proj.weight", "transformer.decoder.layers.2.self_attn.out_proj.bias", "transformer.decoder.layers.2.multihead_attn.in_proj_weight", "transformer.decoder.layers.2.multihead_attn.in_proj_bias", "transformer.decoder.layers.2.multihead_attn.out_proj.weight", "transformer.decoder.layers.2.multihead_attn.out_proj.bias", "transformer.decoder.layers.2.linear1.weight", "transformer.decoder.layers.2.linear1.bias", "transformer.decoder.layers.2.linear2.weight", "transformer.decoder.layers.2.linear2.bias", "transformer.decoder.layers.2.norm1.weight", "transformer.decoder.layers.2.norm1.bias", "transformer.decoder.layers.2.norm2.weight", "transformer.decoder.layers.2.norm2.bias", "transformer.decoder.layers.2.norm3.weight", "transformer.decoder.layers.2.norm3.bias", "transformer.decoder.norm.weight", "transformer.decoder.norm.bias", "generator.weight", "generator.bias", "src_tok_emb.embedding.weight", "tgt_tok_emb.embedding.weight", "positional_encoding.pos_embedding". 
	Unexpected key(s) in state_dict: "encoder.embedding.weight", "encoder.pos_embedding.weight", "encoder.layers.0.self_att_layer_norm.weight", "encoder.layers.0.self_att_layer_norm.bias", "encoder.layers.0.ff_layer_norm.weight", "encoder.layers.0.ff_layer_norm.bias", "encoder.layers.0.self_attn.q.weight", "encoder.layers.0.self_attn.q.bias", "encoder.layers.0.self_attn.k.weight", "encoder.layers.0.self_attn.k.bias", "encoder.layers.0.self_attn.v.weight", "encoder.layers.0.self_attn.v.bias", "encoder.layers.0.self_attn.o.weight", "encoder.layers.0.self_attn.o.bias", "encoder.layers.0.positionwise_feedforward.fc_1.weight", "encoder.layers.0.positionwise_feedforward.fc_1.bias", "encoder.layers.0.positionwise_feedforward.fc_2.weight", "encoder.layers.0.positionwise_feedforward.fc_2.bias", "encoder.layers.1.self_att_layer_norm.weight", "encoder.layers.1.self_att_layer_norm.bias", "encoder.layers.1.ff_layer_norm.weight", "encoder.layers.1.ff_layer_norm.bias", "encoder.layers.1.self_attn.q.weight", "encoder.layers.1.self_attn.q.bias", "encoder.layers.1.self_attn.k.weight", "encoder.layers.1.self_attn.k.bias", "encoder.layers.1.self_attn.v.weight", "encoder.layers.1.self_attn.v.bias", "encoder.layers.1.self_attn.o.weight", "encoder.layers.1.self_attn.o.bias", "encoder.layers.1.positionwise_feedforward.fc_1.weight", "encoder.layers.1.positionwise_feedforward.fc_1.bias", "encoder.layers.1.positionwise_feedforward.fc_2.weight", "encoder.layers.1.positionwise_feedforward.fc_2.bias", "encoder.layers.2.self_att_layer_norm.weight", "encoder.layers.2.self_att_layer_norm.bias", "encoder.layers.2.ff_layer_norm.weight", "encoder.layers.2.ff_layer_norm.bias", "encoder.layers.2.self_attn.q.weight", "encoder.layers.2.self_attn.q.bias", "encoder.layers.2.self_attn.k.weight", "encoder.layers.2.self_attn.k.bias", "encoder.layers.2.self_attn.v.weight", "encoder.layers.2.self_attn.v.bias", "encoder.layers.2.self_attn.o.weight", "encoder.layers.2.self_attn.o.bias", "encoder.layers.2.positionwise_feedforward.fc_1.weight", "encoder.layers.2.positionwise_feedforward.fc_1.bias", "encoder.layers.2.positionwise_feedforward.fc_2.weight", "encoder.layers.2.positionwise_feedforward.fc_2.bias", "decoder.embedding.weight", "decoder.pos_embedding.weight", "decoder.layers.0.multi_att.q.weight", "decoder.layers.0.multi_att.q.bias", "decoder.layers.0.multi_att.k.weight", "decoder.layers.0.multi_att.k.bias", "decoder.layers.0.multi_att.v.weight", "decoder.layers.0.multi_att.v.bias", "decoder.layers.0.multi_att.o.weight", "decoder.layers.0.multi_att.o.bias", "decoder.layers.0.multi_att_norm.weight", "decoder.layers.0.multi_att_norm.bias", "decoder.layers.0.feed_forward_layer.fc_1.weight", "decoder.layers.0.feed_forward_layer.fc_1.bias", "decoder.layers.0.feed_forward_layer.fc_2.weight", "decoder.layers.0.feed_forward_layer.fc_2.bias", "decoder.layers.0.feed_norm.weight", "decoder.layers.0.feed_norm.bias", "decoder.layers.0.encoder_attn.q.weight", "decoder.layers.0.encoder_attn.q.bias", "decoder.layers.0.encoder_attn.k.weight", "decoder.layers.0.encoder_attn.k.bias", "decoder.layers.0.encoder_attn.v.weight", "decoder.layers.0.encoder_attn.v.bias", "decoder.layers.0.encoder_attn.o.weight", "decoder.layers.0.encoder_attn.o.bias", "decoder.layers.0.encoder_norm.weight", "decoder.layers.0.encoder_norm.bias", "decoder.layers.1.multi_att.q.weight", "decoder.layers.1.multi_att.q.bias", "decoder.layers.1.multi_att.k.weight", "decoder.layers.1.multi_att.k.bias", "decoder.layers.1.multi_att.v.weight", "decoder.layers.1.multi_att.v.bias", "decoder.layers.1.multi_att.o.weight", "decoder.layers.1.multi_att.o.bias", "decoder.layers.1.multi_att_norm.weight", "decoder.layers.1.multi_att_norm.bias", "decoder.layers.1.feed_forward_layer.fc_1.weight", "decoder.layers.1.feed_forward_layer.fc_1.bias", "decoder.layers.1.feed_forward_layer.fc_2.weight", "decoder.layers.1.feed_forward_layer.fc_2.bias", "decoder.layers.1.feed_norm.weight", "decoder.layers.1.feed_norm.bias", "decoder.layers.1.encoder_attn.q.weight", "decoder.layers.1.encoder_attn.q.bias", "decoder.layers.1.encoder_attn.k.weight", "decoder.layers.1.encoder_attn.k.bias", "decoder.layers.1.encoder_attn.v.weight", "decoder.layers.1.encoder_attn.v.bias", "decoder.layers.1.encoder_attn.o.weight", "decoder.layers.1.encoder_attn.o.bias", "decoder.layers.1.encoder_norm.weight", "decoder.layers.1.encoder_norm.bias", "decoder.layers.2.multi_att.q.weight", "decoder.layers.2.multi_att.q.bias", "decoder.layers.2.multi_att.k.weight", "decoder.layers.2.multi_att.k.bias", "decoder.layers.2.multi_att.v.weight", "decoder.layers.2.multi_att.v.bias", "decoder.layers.2.multi_att.o.weight", "decoder.layers.2.multi_att.o.bias", "decoder.layers.2.multi_att_norm.weight", "decoder.layers.2.multi_att_norm.bias", "decoder.layers.2.feed_forward_layer.fc_1.weight", "decoder.layers.2.feed_forward_layer.fc_1.bias", "decoder.layers.2.feed_forward_layer.fc_2.weight", "decoder.layers.2.feed_forward_layer.fc_2.bias", "decoder.layers.2.feed_norm.weight", "decoder.layers.2.feed_norm.bias", "decoder.layers.2.encoder_attn.q.weight", "decoder.layers.2.encoder_attn.q.bias", "decoder.layers.2.encoder_attn.k.weight", "decoder.layers.2.encoder_attn.k.bias", "decoder.layers.2.encoder_attn.v.weight", "decoder.layers.2.encoder_attn.v.bias", "decoder.layers.2.encoder_attn.o.weight", "decoder.layers.2.encoder_attn.o.bias", "decoder.layers.2.encoder_norm.weight", "decoder.layers.2.encoder_norm.bias", "decoder.fc_out.weight", "decoder.fc_out.bias". 

In [161]:
src=next(iter(test_ds))['cs']
trg=next(iter(test_ds))['en']

In [162]:
# cs_tokenizer.encode(src).ids

In [163]:
# src='Bílá kočka a černá kočka'

In [164]:
trg_pad_id

2

In [165]:
tids=torch.LongTensor(en_tokenizer.encode(trg).ids)

In [166]:
(tids!=trg_pad_id).unsqueeze(1).unsqueeze(2).shape[1]

1

In [167]:
make_trg_mask_(tids.unsqueeze(0))

tensor([[[[ True, False, False,  ..., False, False, False],
          [ True,  True, False,  ..., False, False, False],
          [ True,  True,  True,  ..., False, False, False],
          ...,
          [ True,  True,  True,  ..., False, False, False],
          [ True,  True,  True,  ..., False, False, False],
          [ True,  True,  True,  ..., False, False, False]]]], device='cuda:0')

In [168]:
def make_trg_mask_(trg):
        trg_pad_mask = (trg != trg_pad_id).unsqueeze(1).unsqueeze(2).to(device)
        
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones(trg_len, trg_len)).bool().to(device)
        # batch_size 1 seq_len seq_len
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

In [172]:
def translate_sentence(sentence, model, device, max_len = 50):
    
    model.eval()    
        
    src_indexes = cs_tokenizer.encode('<SOS> '+sentence+' <EOS>').ids

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    # print(src_tensor)
    
    src_mask = model.make_src_mask(src_tensor)

    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [trg_sos_id]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        
        # print(trg_tensor.shape)
        trg_mask = model.make_trg_mask(trg_tensor)
        # print(trg_mask)
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        # print(output.shape)
        pred_token = output.argmax(2)[:,-1].item()
        # print(pred_token)
        trg_indexes.append(pred_token)

        if pred_token == trg_eos_id:
            break
    
    trg_tokens = en_tokenizer.decode(trg_indexes)
    
    return trg_tokens,trg_indexes

In [173]:
translate_sentence(src,model,device)

('He accused his friend Milan metro following emergency disaster .',
 [3, 1361, 7387, 951, 2214, 14669, 12105, 1704, 3967, 3876, 15, 1])

In [171]:
trg

'In 911 Call, Professor Admits to Shooting Girlfriend'

In [131]:
sth = torch.randn(1,1,1,16940)

In [132]:
sth.argmax(2)[:,-1].shape

torch.Size([1, 16940])