In [1]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every bare expression in a cell, not just the last one.
# Several later cells rely on this to show more than one object per cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd

import torch.nn.functional as F
import torch.nn as nn
import torch
import numpy as np
from tqdm.auto import tqdm

from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, BertTokenizer, EncoderDecoderModel


In [3]:
# WMT14 German-English pairs; the returned object is indexed by split name below.
dataset = load_dataset('wmt14', 'de-en')

Found cached dataset wmt14 (/home/administrator/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4)


  0%|          | 0/3 [00:00<?, ?it/s]

# Tokenizer

In [4]:
# One tokenizer per language, loaded from the same checkpoints that are later
# used for the decoder (German) and the encoder (English) of the model.
de_tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
en_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Data Loader

In [13]:
# Prefer the first CUDA device when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [6]:
# Keep only the first 8 examples of the train and validation splits.
# The 'test' split is left untouched.
subset = list(range(8))
for split in ('train', 'validation'):
    dataset[split] = torch.utils.data.Subset(dataset[split], subset)

In [7]:
BS = 2  # batch size shared by all three loaders

# Only the training split is shuffled; validation and test keep dataset order.
_shuffle_by_split = {'train': True, 'validation': False, 'test': False}
train_loader, val_loader, test_loader = (
    torch.utils.data.DataLoader(dataset[split], batch_size=BS, shuffle=do_shuffle)
    for split, do_shuffle in _shuffle_by_split.items()
)

In [None]:
# Pull a single batch and show it, then show how both tokenizers encode it.
b = next(iter(train_loader))
b

# Identical tokenizer settings for both languages.
tokenize_kwargs = dict(padding=True, truncation=True, return_tensors="pt")
d = (
    de_tokenizer(b['translation']['de'], **tokenize_kwargs),
    en_tokenizer(b['translation']['en'], **tokenize_kwargs),
)
d

# Train

In [8]:
# Negative log-likelihood over German target tokens; positions whose target is the
# German pad token are ignored. train_model feeds this log_softmax outputs via compute_loss.
criterion = nn.NLLLoss(ignore_index=de_tokenizer.pad_token_id)

In [9]:
def compute_loss(predictions, targets):
    """Next-token loss between per-position predictions and the target ids.

    The prediction at position ``t`` is scored against the target at position
    ``t + 1``: the last prediction and the first target are dropped before both
    tensors are flattened and handed to the notebook-level ``criterion``.

    Args:
        predictions: tensor indexed as ``[batch, position, vocab]``.
        targets: tensor indexed as ``[batch, position]``.

    Returns:
        Whatever ``criterion`` returns for the flattened inputs.
    """
    shifted_preds = predictions[:, :-1, :].contiguous()
    shifted_targets = targets[:, 1:].contiguous()

    # Collapse batch and position into one axis so each row is one token decision.
    n_rows = shifted_preds.shape[0] * shifted_preds.shape[1]
    flat_preds = shifted_preds.view(n_rows, -1)
    flat_targets = shifted_targets.view(-1)

    return criterion(flat_preds, flat_targets)

In [10]:
# Encoder is initialised from the English checkpoint, decoder from the German one
# (the same checkpoints as en_tokenizer / de_tokenizer), i.e. an English -> German model.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "dbmdz/bert-base-german-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['bert.encoder.laye

In [11]:
# Special-token ids used when generating German output. BERT vocabularies have no
# dedicated BOS/EOS tokens, so [CLS] starts a decoded sequence and [SEP] ends it.
model.config.decoder_start_token_id = de_tokenizer.cls_token_id
model.config.pad_token_id = de_tokenizer.pad_token_id
# Without an EOS id, generate() has no end-of-sequence signal. Use the same [SEP]
# end marker that the greedy decoding loop in eval_model checks for.
model.config.eos_token_id = de_tokenizer.sep_token_id

In [14]:
# Move the model's parameters to the selected device before building the optimizer.
model = model.to(device)

In [None]:
# transformers.AdamW is deprecated (and removed in recent releases) in favour of the
# PyTorch implementation. eps and weight_decay are set explicitly to the values the
# transformers optimizer used by default, so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

In [None]:
def train_model(train_loader):
    """Run one training epoch over ``train_loader`` and return the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``de_tokenizer``,
    ``en_tokenizer``, ``device`` and ``compute_loss``.

    Args:
        train_loader: sized iterable of batches that expose the sentence lists
            as ``batch['translation']['en']`` and ``batch['translation']['de']``.

    Returns:
        Sum of the per-batch losses divided by ``len(train_loader)``.
    """
    model.train()
    epoch_loss = 0

    num_train_batches = len(train_loader)
    # Wrap the loader itself rather than enumerate(...): enumerate has no length,
    # which left the progress bar without a total.
    for b in tqdm(train_loader, total=num_train_batches):
        optimizer.zero_grad()

        de_token = de_tokenizer(b['translation']['de'], padding=True, truncation=True, return_tensors="pt", add_special_tokens=True)
        en_token = en_tokenizer(b['translation']['en'], padding=True, truncation=True, return_tensors="pt", add_special_tokens=True)

        de_output = de_token['input_ids'].to(device)
        de_masks = de_token['attention_mask'].to(device)

        en_input = en_token['input_ids'].to(device)
        en_masks = en_token['attention_mask'].to(device)

        # The loss comes from compute_loss below, so `labels` is not passed: the model
        # would otherwise compute a second loss internally that is thrown away.
        # The decoder mask tells the decoder which German positions are padding.
        out = model(
            input_ids=en_input,
            attention_mask=en_masks,
            decoder_input_ids=de_output,
            decoder_attention_mask=de_masks,
        )
        # Read the logits by name; their positional index depends on whether a loss
        # is present in the output.
        predictions = F.log_softmax(out.logits, dim=-1)
        loss = compute_loss(predictions, de_output)

        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / num_train_batches

In [None]:
# Number of passes over train_loader.
NUM_EPOCHS = 100

for epoch in range(NUM_EPOCHS):
    train_epoch_loss = train_model(train_loader)
    print(f"epoch: {epoch}, train_loss: {train_epoch_loss}")
    

### Save Model Weights

In [None]:
# PATH = "./Checkpoint/translate_en_de.pth"

In [None]:
# PATH has to be defined in this cell: its only other definition in the notebook is
# commented out, so on a fresh kernel the load below raised NameError.
PATH = "./Checkpoint/translate_en_de.pth"

# torch.save(model.state_dict(), PATH)
# map_location keeps the load working when the checkpoint was saved on another device.
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()

In [None]:
# Restore the model weights stored under 'model_state_dict' in a saved run checkpoint.
resume_epoch = 249
resume_file = "runs/en_de_maxlen100_subset8/translator_epoch_{}_batch_N.pth".format(resume_epoch)
resume_dict = torch.load(resume_file, map_location=device)

model.load_state_dict(resume_dict['model_state_dict'])

# Eval Model

In [None]:
def _greedy_decoding(de_token, en_token, device):
    """Greedily decode one batch and print each prediction above its reference.

    Decoding starts from the German [CLS] id and runs for as many steps as the
    tokenized reference has positions. Once a sequence has produced [SEP], every
    later position of that sequence is filled with the pad id.

    Args:
        de_token: German tokenizer output for the batch (reference sentences).
        en_token: English tokenizer output for the batch (model input).
        device: device the tensors are moved to.

    Returns:
        Always 0. No loss is computed for the decoded output; the value only
        keeps the caller's running sum well-defined.
    """
    de_output = de_token['input_ids'].to(device)

    en_input = en_token['input_ids'].to(device)
    en_masks = en_token['attention_mask'].to(device)

    batch_size = en_input.shape[0]

    # Every sequence starts with [CLS]; build the tensor directly on the target device.
    dec_out = torch.full((batch_size, 1), de_tokenizer.cls_token_id, dtype=torch.long, device=device)
    # True for sequences that have already emitted [SEP].
    finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

    for _ in range(de_output.shape[1]):
        output = model(input_ids=en_input, attention_mask=en_masks, decoder_input_ids=dec_out)
        # Most likely token at the newest decoder position.
        next_tokens = torch.argmax(output.logits[:, -1, :], dim=-1)
        next_tokens[finished] = de_tokenizer.pad_token_id
        dec_out = torch.cat((dec_out, next_tokens.unsqueeze(1)), dim=1)

        finished |= next_tokens == de_tokenizer.sep_token_id

    pred_sent = list(map(de_tokenizer.decode, dec_out))
    input_sent = list(map(de_tokenizer.decode, de_output))
    for x in zip(pred_sent, input_sent):
        print(*x, sep="\n")
        print()

    return 0


def eval_model(val_loader):
    """Greedy-decode every batch of ``val_loader`` and print the results.

    Uses the notebook-level ``model``, ``de_tokenizer``, ``en_tokenizer`` and
    ``device``. The model is switched to eval mode and decoding runs under
    ``torch.no_grad()`` so no autograd graph is built for the repeated forwards.

    Args:
        val_loader: sized iterable of batches that expose the sentence lists
            as ``batch['translation']['en']`` and ``batch['translation']['de']``.

    Returns:
        Accumulated per-batch value divided by ``len(val_loader)``; currently
        always 0.0 because the per-batch helper returns 0.
    """
    model.eval()
    epoch_loss = 0

    num_valid_batches = len(val_loader)
    with torch.no_grad():
        for b in val_loader:
            de_token = de_tokenizer(b['translation']['de'], padding=True, truncation=True, return_tensors="pt", add_special_tokens=True)
            en_token = en_tokenizer(b['translation']['en'], padding=True, truncation=True, return_tensors="pt", add_special_tokens=True)

            epoch_loss += _greedy_decoding(de_token, en_token, device)

    return (epoch_loss / num_valid_batches)

# Note: this decodes the training loader (not val_loader), so the printout shows
# how well the model reproduces sentences it was trained on.
eval_model(train_loader)

# Generate 


In [None]:
# Keep one iterator around so the next cell can be re-run to step through batches.
train_iterator = iter(train_loader)

In [None]:
# Advance to the next training batch; raises StopIteration once the iterator is exhausted.
b = next(train_iterator)

In [None]:
# Tokenize the English side of the current batch (input for generate below), then show the raw batch.
en_token = en_tokenizer(b['translation']['en'], padding=True, truncation=True, return_tensors="pt", add_special_tokens=True)
b


In [None]:
# Display-only: the German reference token ids for the current batch (result is not stored).
de_tokenizer(b['translation']['de'], padding=True, truncation=True, return_tensors="pt", add_special_tokens=True)


In [None]:
# The batch is padded, so pass the attention mask explicitly: it tells the encoder
# which positions are padding instead of leaving that to be inferred.
out = model.generate(
    en_token['input_ids'].to(device),
    attention_mask=en_token['attention_mask'].to(device),
)
out

In [None]:
# Generated German text for each sequence, followed by the reference sentences.
[de_tokenizer.decode(token_ids) for token_ids in out]
b['translation']['de']
