In [1]:
%%capture
!pip install datasets
!pip install tokenizers==0.9.4
!pip install transformers==4.2.1
!pip install rouge_score

In [None]:
#!/usr/bin/env python3
import datasets
import logging
import torch
import pandas as pd
from transformers import BertTokenizer, GPT2Tokenizer, EncoderDecoderModel
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Surface INFO-level log records from the libraries used below.
logging.basicConfig(level=logging.INFO)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from an already fine-tuned BERT2GPT2 checkpoint.
# Alternative: assemble a fresh encoder-decoder from the two base checkpoints:
#   model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
model.to(device)

# cache is currently not supported by EncoderDecoder framework
model.decoder.config.use_cache = False

# Encoder-side tokenizer. The CLS / SEP tokens are registered under the
# BOS / EOS names so both tokenizers expose the same attribute names.
bert_tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
bert_tokenizer.bos_token = bert_tokenizer.cls_token
bert_tokenizer.eos_token = bert_tokenizer.sep_token


# make sure GPT2 wraps every sequence with BOS at the beginning and EOS at the end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """Return ``token_ids_0`` wrapped as ``[bos] + token_ids_0 + [eos]``.

    Meant to be installed as a tokenizer method, so ``self`` must expose
    ``bos_token_id`` and ``eos_token_id``. ``token_ids_1`` is accepted only
    to keep the expected signature; it is ignored.
    """
    leading = [self.bos_token_id]
    trailing = [self.eos_token_id]
    return leading + token_ids_0 + trailing


# Patch the class so GPT2Tokenizer instances wrap sequences in BOS ... EOS.
GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token


# set decoding params
# generate() reads its defaults from model.config, so every setting has to be
# stored there. Attributes assigned directly on the model object (as this cell
# used to do for early_stopping / length_penalty / num_beams) are never
# consulted, which silently left beam search disabled.
# NOTE(review): the max/min length values look tuned for another corpus -
# confirm they suit the impression lengths in this dataset.
model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id
model.config.eos_token_id = gpt2_tokenizer.eos_token_id
model.config.max_length = 142
model.config.min_length = 56
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4

# load train, validation and test data
DATA_DIR = "rrs-mimiciii/all"


def _read_lines(path):
    """Return the whitespace-stripped lines of the text file at ``path``.

    Blank lines in the middle of the file are kept (as empty strings) so that
    line ``i`` of one file still corresponds to line ``i`` of its sibling;
    only blank lines at the very end of the file are dropped.
    """
    with open(path, encoding="utf-8") as handle:
        lines = [line.strip() for line in handle]
    while lines and not lines[-1]:
        lines.pop()
    return lines


def load_split(split, data_dir=DATA_DIR):
    """Load one split as a DataFrame with aligned ``text`` / ``summary`` columns.

    The files are read line by line instead of through
    ``pd.read_csv(sep="delimiter")``. That call relied on a separator that is
    assumed never to occur, forced the slow python parser (with a
    ParserWarning), skipped blank lines independently in each file and turned
    lines such as ``NA`` into NaN - any of which can shift findings against
    impressions or feed non-string values to the tokenizers.

    Args:
        split: Split name used in the file names, e.g. ``"train"``.
        data_dir: Directory holding ``<split>.findings.tok`` and
            ``<split>.impression.tok``.

    Returns:
        DataFrame with one row per report: ``text`` holds the findings line
        and ``summary`` the impression line.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    findings = _read_lines(f"{data_dir}/{split}.findings.tok")
    impressions = _read_lines(f"{data_dir}/{split}.impression.tok")
    if len(findings) != len(impressions):
        # Previously an inner join on the row index hid this by truncating.
        raise ValueError(
            f"{split}: {len(findings)} findings lines vs {len(impressions)} impression lines"
        )
    return pd.DataFrame({"text": findings, "summary": impressions})


train_df = load_split("train")
valid_df = load_split("validate")
test_df = load_split("test")

def _frame_to_dataset(frame):
    """Wrap the ``text`` and ``summary`` columns of ``frame`` in a Dataset."""
    columns = {column: frame[column].tolist() for column in ("text", "summary")}
    return datasets.Dataset.from_dict(columns)


train_dataset = _frame_to_dataset(train_df)
val_dataset = _frame_to_dataset(valid_df)

# Uncomment to run on a small subset:
#train_dataset = train_dataset.select(range(512))
#val_dataset = val_dataset.select(range(128))

# load rouge for validation
rouge = datasets.load_metric("rouge", experiment_id=1)

# Padded / truncated sequence lengths for the encoder and decoder inputs,
# and the batch size shared by preprocessing and training.
encoder_length = 512
decoder_length = 256
batch_size = 8


# map data correctly
def map_to_encoder_decoder_inputs(batch):
    """Tokenize a batch of ``text`` / ``summary`` pairs into model inputs.

    The source text goes through the BERT tokenizer (padded / truncated to
    ``encoder_length``) and the summary through the GPT-2 tokenizer (padded /
    truncated to ``decoder_length``). The batch gains ``input_ids``,
    ``attention_mask``, ``decoder_input_ids``, ``labels`` and
    ``decoder_attention_mask`` and is returned.
    """
    # Tokenizer will automatically set [BOS] <text> [EOS]
    source = bert_tokenizer(batch["text"], padding="max_length", truncation=True, max_length=encoder_length)
    target = gpt2_tokenizer(batch["summary"], padding="max_length", truncation=True, max_length=decoder_length)

    batch["input_ids"] = source.input_ids
    batch["attention_mask"] = source.attention_mask
    batch["decoder_input_ids"] = target.input_ids
    # The pad token id alone cannot tell padding from a real token, so the
    # attention mask decides which label positions are replaced by -100.
    batch["labels"] = [
        [token if mask != 0 else -100 for mask, token in zip(mask_row, token_row)]
        for mask_row, token_row in zip(target.attention_mask, target.input_ids)
    ]
    batch["decoder_attention_mask"] = target.attention_mask

    # Every sequence must have been padded / truncated to the fixed lengths.
    assert all(len(ids) == encoder_length for ids in source.input_ids)
    assert all(len(ids) == decoder_length for ids in target.input_ids)

    return batch


def compute_metrics(pred):
    """Score generated summaries against the references with ROUGE.

    ``pred`` must expose ``predictions`` and ``label_ids``. Returns a dict
    mapping each ROUGE key to its mid F-measure, scaled to 0-100 and rounded
    to four decimals.
    """
    predicted_ids = pred.predictions
    reference_ids = pred.label_ids

    # all unnecessary tokens are removed
    decoded_predictions = gpt2_tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    # -100 label positions are swapped for EOS so they decode as a special
    # token and get skipped. NOTE: this overwrites pred.label_ids in place.
    reference_ids[reference_ids == -100] = gpt2_tokenizer.eos_token_id
    decoded_references = gpt2_tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    scores = rouge.compute(predictions=decoded_predictions, references=decoded_references)

    return {name: round(score.mid.fmeasure * 100, 4) for name, score in scores.items()}


# Columns handed to the model as torch tensors.
model_input_columns = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]

# make train dataset ready: tokenize, drop the raw strings, expose tensors
train_dataset = train_dataset.map(
    map_to_encoder_decoder_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["text", "summary"],
)
train_dataset.set_format(type="torch", columns=model_input_columns)

# same for validation dataset
val_dataset = val_dataset.map(
    map_to_encoder_decoder_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["text", "summary"],
)
val_dataset.set_format(type="torch", columns=model_input_columns)

# set training arguments - these params are not really tuned, feel free to change

# Use the decoder's EOS id as the model-level padding id.
model.config.pad_token_id = model.config.decoder.eos_token_id

# Evaluation during training is switched off for this run. To turn it back on,
# set evaluation_strategy="steps", do_eval=True, eval_steps=<n> and
# per_device_eval_batch_size=batch_size below, and pass eval_dataset=val_dataset
# to the trainer. (An earlier configuration also used save_steps=1000 and
# save_total_limit=10.)
training_args = Seq2SeqTrainingArguments(
    # NOTE: machine-specific absolute path - adjust before running elsewhere.
    output_dir="/scratch/vgvinodv_root/vgvinodv0/varu/jupyter/",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    weight_decay=0.01,
    do_train=True,
    do_eval=False,
    predict_with_generate=True,
    overwrite_output_dir=True,
    logging_steps=1000,
    warmup_steps=2000,
    # Mixed precision needs CUDA; stay in full precision when the notebook
    # falls back to the CPU (see the device selection at the top).
    fp16=torch.cuda.is_available(),
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
)

trainer.train()


  return func(*args, **kwargs)


  0%|          | 0/7415 [00:00<?, ?ba/s]

  0%|          | 0/927 [00:00<?, ?ba/s]

Step,Training Loss
10000,2.4042


In [None]:
#!/usr/bin/env python3
import datasets
from transformers import BertTokenizer, GPT2Tokenizer, EncoderDecoderModel

# This cell reuses `model` and `device` from the training cell above.
# To evaluate a saved checkpoint instead, load it here:
#model = EncoderDecoderModel.from_pretrained("./checkpoint-16")
model.to(device)
# The model comes straight out of trainer.train() and is therefore still in
# training mode; switch to eval mode so dropout is disabled while generating.
model.eval()

bert_tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# CLS token will work as BOS token
bert_tokenizer.bos_token = bert_tokenizer.cls_token

# SEP token will work as EOS token
bert_tokenizer.eos_token = bert_tokenizer.sep_token


# make sure GPT2 wraps every sequence with BOS at the beginning and EOS at the end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """Surround ``token_ids_0`` with the tokenizer's BOS and EOS ids.

    ``self`` supplies ``bos_token_id`` / ``eos_token_id``; ``token_ids_1`` is
    part of the signature but is not used.
    """
    start_marker, end_marker = [self.bos_token_id], [self.eos_token_id]
    return start_marker + token_ids_0 + end_marker


# Patch the class so GPT2Tokenizer instances wrap sequences in BOS ... EOS.
GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token


# set decoding params
# generate() reads its defaults from model.config, so all of them are stored
# there. Assigning early_stopping / length_penalty / num_beams on the model
# object itself (as before) had no effect and left beam search disabled.
model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id
model.config.eos_token_id = gpt2_tokenizer.eos_token_id
model.config.max_length = 142
model.config.min_length = 56
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4

# `test_df` is the test split loaded in the training cell above.
test_dataset = datasets.Dataset.from_dict({"text":test_df['text'].tolist(),"summary":test_df['summary'].tolist()})
#test_dataset = test_dataset.select(range(128))
batch_size = 64


# map data correctly
def generate_summary(batch):
    """Generate a summary for every ``text`` entry and store it under ``pred``.

    The texts are tokenized with the BERT tokenizer (padded / truncated to 512
    tokens), moved to ``device`` and passed to ``model.generate``; the
    generated ids are decoded with the GPT-2 tokenizer, skipping special
    tokens. Returns the batch with the added ``pred`` column.
    """
    # Tokenizer will automatically set [BOS] <text> [EOS]
    encoded = bert_tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    generated_ids = model.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )

    # special tokens are stripped while decoding
    batch["pred"] = gpt2_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    return batch


# Generate a prediction for every test example; the raw text column is dropped.
results = test_dataset.map(
    generate_summary,
    batched=True,
    batch_size=batch_size,
    remove_columns=["text"],
)

# load rouge for validation
rouge = datasets.load_metric("rouge")

pred_str, label_str = results["pred"], results["summary"]

rouge_output = rouge.compute(predictions=pred_str, references=label_str)

# Mid F-measure per ROUGE key on a 0-100 scale; rounded only for display.
res = {key: value.mid.fmeasure * 100 for key, value in rouge_output.items()}
rounded_scores = {key: round(score, 4) for key, score in res.items()}
print(rounded_scores)


In [None]:
# Index of the test example inspected in this and the following cells.
sample = 10
print(test_dataset[sample]["text"])

In [None]:
# Generated summary for the selected test example.
print(pred_str[sample])

In [None]:
# Reference summary for the same test example.
print(label_str[sample])