### 0. Initial Setting

In [1]:
# %%capture
# !pip install datasets==1.0.2
# !pip install transformers==4.2.1

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

cache_dir = "/data4/yoomcache"
model_cache_dir = os.path.join(cache_dir, 'huggingface')
data_cache_dir = os.path.join(cache_dir, 'datasets')
checkpoint_dir = os.path.join(cache_dir, 'checkpoint')

import logging
logging.getLogger().setLevel(logging.CRITICAL)
logging.basicConfig(level=logging.INFO)


import torch
from datasets import load_dataset, load_metric, load_from_disk
from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
from transformers import AutoConfig, EncoderDecoderConfig, EncoderDecoderModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

import wandb
wandb.init(project="testing-roberta2gpt", entity="yoom618")

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myoom618[0m (use `wandb login --relogin` to force relogin)


### 1. Initialize Model

In [3]:
config_encoder = AutoConfig.from_pretrained("roberta-base", cache_dir=model_cache_dir)
config_decoder = AutoConfig.from_pretrained("gpt2", cache_dir=model_cache_dir)
config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder, cache_dir=model_cache_dir)
model = EncoderDecoderModel(config=config)
# model.save_pretrained("roberta2gpt", cache_dir=model_cache_dir)
# model = EncoderDecoderModel.from_pretrained("roberta2gpt", cache_dir=model_cache_dir)

model.encoder.encoder.layer = model.encoder.encoder.layer[:6]
model.decoder.transformer.h = model.decoder.transformer.h[-6:]

In [4]:
encoder_tokenizer = RobertaTokenizer.from_pretrained("roberta-base", cache_dir=model_cache_dir)
encoder_tokenizer.bos_token = encoder_tokenizer.cls_token  # CLS token will work as BOS token
encoder_tokenizer.eos_token = encoder_tokenizer.sep_token  # SEP token will work as EOS token

# make sure GPT2 appends EOS in begin and end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs

GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
decoder_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", cache_dir=model_cache_dir)
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
decoder_tokenizer.pad_token = decoder_tokenizer.unk_token


model.config.decoder_start_token_id = encoder_tokenizer.cls_token_id
model.config.eos_token_id = encoder_tokenizer.sep_token_id
model.config.pad_token_id = encoder_tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size


# set decoding params
model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
model.config.eos_token_id = decoder_tokenizer.eos_token_id
model.config.max_length = 142
model.config.min_length = 56
model.config.no_repeat_ngram_size = 3
model.early_stopping = True
model.length_penalty = 2.0
model.num_beams = 4

In [5]:
# Freeze decoder parameters
for param in model.decoder.parameters():
    param.requires_grad = False


### 2. Preparing Dataset

In [6]:
# map data correctly
def map_to_encoder_decoder_inputs(batch):    # Tokenizer will automatically set [BOS] <text> [EOS] 
    encoder_length, decoder_length = 512, 128
    inputs = encoder_tokenizer(batch["article"], 
                               padding="max_length", 
                               truncation=True, 
                               max_length=encoder_length)
    outputs = decoder_tokenizer(batch["highlights"], 
                                padding="max_length", 
                                truncation=True, 
                                max_length=decoder_length)
    
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    batch["decoder_attention_mask"] = outputs.attention_mask

    # complicated list comprehension here because pad_token_id alone is not good enough to know whether label should be excluded or not
    batch["labels"] = -100 if batch["decoder_attention_mask"] == 0 else batch["labels"]

    assert len(inputs.input_ids) == encoder_length
    assert len(outputs.input_ids) == decoder_length

    return batch

In [7]:
if os.path.exists(os.path.join(cache_dir, 'preprocessed/train')):
    train_dataset = load_from_disk(os.path.join(cache_dir, 'preprocessed/train'))
else:
    train_dataset = load_dataset("ccdv/cnn_dailymail", "3.0.0", split="train", cache_dir=data_cache_dir)
    train_dataset = train_dataset.map(
        map_to_encoder_decoder_inputs, 
        # batched=True, 
        # batch_size=batch_size, 
        remove_columns=['id', 'article', 'highlights'],
    )
    train_dataset.set_format(
        type="torch", 
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )
    
    train_dataset.save_to_disk(os.path.join(cache_dir, 'preprocessed/train'))


if os.path.exists(os.path.join(cache_dir, 'preprocessed/val')):
    val_dataset = load_from_disk(os.path.join(cache_dir, 'preprocessed/val'))
else:
    val_dataset = load_dataset("ccdv/cnn_dailymail", "3.0.0", split="validation", cache_dir=data_cache_dir)
    val_dataset = val_dataset.map(
        map_to_encoder_decoder_inputs, 
        # batched=True, 
        # batch_size=batch_size, 
        remove_columns=['id', 'article', 'highlights'],
    )
    val_dataset.set_format(
        type="torch", 
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )
    val_dataset.save_to_disk(os.path.join(cache_dir, 'preprocessed/val'))

### Training Model

In [8]:
# load rouge for validation
rouge = load_metric("rouge")
# rouge = load_metric("rouge", experiment_id=1)

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = decoder_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = decoder_tokenizer.eos_token_id
    label_str = decoder_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:
batch_size = 8

# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    output_dir=os.path.join(checkpoint_dir, "roberta2gpt"),
    # do_train=True,
    # do_eval=True,
    # do_predict=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
#     learning_rate=1e-4, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0,
    num_train_epochs=100,
    max_steps=-1,
    # lr_scheduler_type='linear', warmup_ratio=0.0, 
    
    logging_strategy='steps',
    save_strategy='steps',
    evaluation_strategy='steps',
    logging_steps=1000,
    save_steps=2000,
    eval_steps=1000,
    warmup_steps=10000,
    save_total_limit=3,
    overwrite_output_dir=True,
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# start training
trainer.train()

***** Running training *****
  Num examples = 28711
  Num Epochs = 100
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 358900
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
1000,5.4566,3.91702,0.0,0.0,0.0
2000,5.0106,3.927585,0.0,0.0,0.0
3000,4.9954,3.937867,0.0,0.0,0.0
4000,4.9966,3.9373,0.0,0.0,0.0
5000,4.9815,3.948406,0.0,0.0,0.0
6000,4.9728,3.936208,0.0,0.0,0.0
7000,4.9664,3.953957,0.0,0.0,0.0
8000,4.9578,3.950956,0.0,0.0,0.0
9000,4.9463,3.942316,0.0,0.0,0.0
10000,4.9512,3.949973,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 1337
  Batch size = 1
***** Running Evaluation *****
  Num examples = 1337
  Batch size = 1
Saving model checkpoint to /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-2000
Configuration saved in /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-2000/config.json
Model weights saved in /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [/data4/yoomcache/checkpoint/roberta2gpt/checkpoint-33] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1337
  Batch size = 1
***** Running Evaluation *****
  Num examples = 1337
  Batch size = 1
Saving model checkpoint to /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-4000
Configuration saved in /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-4000/config.json
Model weights saved in /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-4000/pytorch_model.bin
Deleting older checkpoint [/data4/yoomcache/checkpoint/roberta2gpt/

***** Running Evaluation *****
  Num examples = 1337
  Batch size = 1
***** Running Evaluation *****
  Num examples = 1337
  Batch size = 1
Saving model checkpoint to /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-18000
Configuration saved in /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-18000/config.json
Model weights saved in /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-18000/pytorch_model.bin
Deleting older checkpoint [/data4/yoomcache/checkpoint/roberta2gpt/checkpoint-12000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1337
  Batch size = 1
***** Running Evaluation *****
  Num examples = 1337
  Batch size = 1
Saving model checkpoint to /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-20000
Configuration saved in /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-20000/config.json
Model weights saved in /data4/yoomcache/checkpoint/roberta2gpt/checkpoint-20000/pytorch_model.bin
Deleting older checkpoint [/data4/yoomcache/checkpoint/rob

https://huggingface.co/patrickvonplaten/bert2gpt2-cnn_dailymail-fp16

In [None]:
model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
model.to(device)

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# CLS token will work as BOS token
bert_tokenizer.bos_token = bert_tokenizer.cls_token

# SEP token will work as EOS token
bert_tokenizer.eos_token = bert_tokenizer.sep_token


# make sure GPT2 appends EOS in begin and end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs


GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token


# set decoding params
model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id
model.config.eos_token_id = gpt2_tokenizer.eos_token_id
model.config.max_length = 142
model.config.min_length = 56
model.config.no_repeat_ngram_size = 3
model.early_stopping = True
model.length_penalty = 2.0
model.num_beams = 4

test_dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
batch_size = 4


# map data correctly
def generate_summary(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = bert_tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # all special tokens including will be removed
    output_str = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch


results = test_dataset.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])

In [None]:
# load rouge for validation
rouge = load_metric("rouge")

pred_str = results["pred"]
label_str = results["highlights"]

rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)