In [2]:
import logging
logging.getLogger().setLevel(logging.CRITICAL)

import torch
import numpy as np

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import EncoderDecoderModel, BertTokenizer, RobertaTokenizer


device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [3]:
# from datasets import load_dataset

# dataset = load_dataset("librispeech_asr", 'clean')

In [7]:
# Reference - 1) Leveraging Pre-trained ... https://huggingface.co/blog/warm-starting-encoder-decoder
# Reference - 1) https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Leveraging_Pre_trained_Checkpoints_for_Encoder_Decoder_Models.ipynb#scrollTo=EZokD01chq3x
# Reference - 2) EncoderDecoderModel ... https://huggingface.co/docs/transformers/main/en/model_doc/encoder-decoder#transformers.EncoderDecoderModel


# initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
# model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", 'gpt2-medium')
# model = EncoderDecoderModel.from_encoder_decoder_pretrained("gpt2", "gpt2")

tokenizer_roberta = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer_roberta.bos_token = tokenizer_roberta.cls_token  # CLS token will work as BOS token
tokenizer_roberta.eos_token = tokenizer_roberta.sep_token  # SEP token will work as EOS token


def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs

GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2-medium')
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
tokenizer_gpt2.pad_token = tokenizer_gpt2.unk_token

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['h.13.crossattention.q_attn.weight', 'h.7.crossattention.c_proj.bias', 'h.10.crossattention.bias', 'h.4.crossattention.q_attn.weight', 'h.3.crossattention.masked_bias', 'h.20.

In [9]:
# forward
input_ids = tokenizer_roberta(
    "Hello, my dog is cute", add_special_tokens=True, return_tensors="pt"
)  # Batch size 1
print(input_ids)

output_ids = tokenizer_gpt2(
    "Hello, my dog is cute", add_special_tokens=True, return_tensors="pt"
)  # Batch size 1
print(output_ids)

{'input_ids': tensor([[    0, 31414,     6,   127,  2335,    16, 11962,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[50256, 15496,    11,   616,  3290,   318, 13779, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [10]:
outputs = model(input_ids=input_ids.input_ids, decoder_input_ids=output_ids.input_ids)

In [11]:
outputs

Seq2SeqLMOutput(loss=None, logits=tensor([[[ -69.2139,  -61.2195,  -64.3711,  ...,  -84.6100,  -81.8234,
           -66.4667],
         [ -59.1535,  -66.9265,  -69.9804,  ...,  -71.2427,  -72.1222,
           -65.0375],
         [ -53.7141,  -56.9994,  -59.0328,  ...,  -61.6606,  -62.3242,
           -53.2519],
         ...,
         [-106.1582, -108.8470, -111.9173,  ..., -115.2189, -115.1562,
          -108.9926],
         [ -42.9007,  -49.2121,  -52.2798,  ...,  -60.9998,  -58.5005,
           -49.0492],
         [ -64.3020,  -57.7391,  -60.1545,  ...,  -78.5611,  -77.4039,
           -64.9706]]], grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 3.5759e-01,  1.0449e-01,  3.3007e-01,  ..., -6.1403e-01,
            9.6434e-02, -1.5244e-01],
          [-2.5180e-01,  6.3556e-02, -5.4910e-02,  ..., -1.4660e-02,
           -4.5628e-01,  7.9456e-02],
          [-4.6958e-01,  3.1659e-01,  2.2025e-01,  ..., -9.6201e-01,
            3.4632e-02, -7.5118e-01],
          ...,
      

tensor([[15496,    11,   616,  3290,   318, 13779]])

In [12]:
BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 3e-5
WARMUP_STEPS = 0
MAX_SEQ_LEN = 400
from transformers import AdamW, get_linear_schedule_with_warmup

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
    
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps = -1)


# training
for i in range(EPOCHS):
    outputs = model(input_ids=input_ids.input_ids, decoder_input_ids=output_ids.input_ids, labels=output_ids.input_ids)
    loss, logits = outputs.loss, outputs.logits
    loss.backward()
    optimizer.step()
    scheduler.step() 
    optimizer.zero_grad()
    model.zero_grad()

#     if EPOCHS % 10 != 0:
#         continue


    # # save and load from pretrained
    # model.save_pretrained("bert2gpt2")
    # model = EncoderDecoderModel.from_pretrained("bert2gpt2")




    # generation
    generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.bos_token_id)

    output_text = tokenizer_gpt2.decode(generated.squeeze().to('cpu').numpy())
    print(output_text)



AttributeError: 

In [None]:
# https://huggingface.co/patrickvonplaten/bert2gpt2-cnn_dailymail-fp16

#!/usr/bin/env python3
import nlp
from datasets import load_dataset
import logging
from transformers import RobertaTokenizer, GPT2Tokenizer, EncoderDecoderModel, Trainer, TrainingArguments

logging.basicConfig(level=logging.INFO)

model = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "gpt2")
# cache is currently not supported by EncoderDecoder framework
model.decoder.config.use_cache = False
bert_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# CLS token will work as BOS token
bert_tokenizer.bos_token = bert_tokenizer.cls_token

# SEP token will work as EOS token
bert_tokenizer.eos_token = bert_tokenizer.sep_token


# make sure GPT2 appends EOS in begin and end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs


GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token


# set decoding params
model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id
model.config.eos_token_id = gpt2_tokenizer.eos_token_id
model.config.max_length = 142
model.config.min_length = 56
model.config.no_repeat_ngram_size = 3
model.early_stopping = True
model.length_penalty = 2.0
model.num_beams = 4

# load train and validation data
train_dataset = load_dataset("ccdv/cnn_dailymail", "3.0.0", split="train")
val_dataset = load_dataset("ccdv/cnn_dailymail", "3.0.0", split="validation[:5%]")

# load rouge for validation
rouge = nlp.load_metric("rouge", experiment_id=1)

encoder_length = 512
decoder_length = 128
batch_size = 16


# map data correctly
def map_to_encoder_decoder_inputs(batch):    # Tokenizer will automatically set [BOS] <text> [EOS] 
    # use bert tokenizer here for encoder
    inputs = bert_tokenizer(batch["article"], padding="max_length", truncation=True, max_length=encoder_length)
    # force summarization <= 128
    outputs = gpt2_tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=decoder_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    batch["decoder_attention_mask"] = outputs.attention_mask

    # complicated list comprehension here because pad_token_id alone is not good enough to know whether label should be excluded or not
    batch["labels"] = [
        [-100 if mask == 0 else token for mask, token in mask_and_tokens] for mask_and_tokens in [zip(masks, labels) for masks, labels in zip(batch["decoder_attention_mask"], batch["labels"])]
    ]

    assert all([len(x) == encoder_length for x in inputs.input_ids])
    assert all([len(x) == decoder_length for x in outputs.input_ids])

    return batch


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = gpt2_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = gpt2_tokenizer.eos_token_id
    label_str = gpt2_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }


# make train dataset ready
train_dataset = train_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "highlights"],
)
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# same for validation dataset
val_dataset = val_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "highlights"],
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

In [None]:
# set training arguments - these params are not really tuned, feel free to change
training_args = TrainingArguments(
    output_dir="./",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
#     predict_from_generate=True,
#     evaluate_during_training=True,
    do_train=True,
    do_eval=True,
    logging_steps=100,
    save_steps=1000,
    eval_steps=1000,
    overwrite_output_dir=True,
    warmup_steps=2000,
    save_total_limit=10,
#     fp16=True,
)

# instantiate trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# start training
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: id.
***** Running training *****
  Num examples = 287113
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 53835
