In [None]:
import evaluate
import numpy as np
import tensorflow as tf
from datasets import load_dataset, Sequence, Value
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, TFAutoModelForSeq2SeqLM, create_optimizer, AdamWeightDecay
from transformers.keras_callbacks import KerasMetricCallback

In [None]:
# Fetch the dataset by its Hub identifier; the later cells index the
# "train", "test" and "validation" splits of the returned object.
dataset = load_dataset(path='kmfoda/booksum')

In [None]:
# Checkpoint shared by the tokenizer and the TF seq2seq model.
model_checkpoint = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Both collators batch for the same model and emit TensorFlow tensors.
_collator_kwargs = dict(model=model, return_tensors="tf")

# Collator used when building the tf.data pipelines.
collator = DataCollatorForSeq2Seq(tokenizer, **_collator_kwargs)

# Same collator, but pads each batch to a multiple of 128 tokens.
generate_collator = DataCollatorForSeq2Seq(
    tokenizer,
    pad_to_multiple_of=128,
    **_collator_kwargs,
)

In [4]:
# Task prefix prepended to every source document before tokenization.
prefix = "summarize: "
def process(example):
    """Tokenize a batch of chapters and their reference summaries.

    Args:
        example: a batched mapping with a "chapter" list (source texts) and
            a "summary" list (target texts), as passed by ``dataset.map``
            with ``batched=True``.

    Returns:
        The tokenizer output for the prefixed chapters (truncated to 1024
        tokens) with an added ``labels`` entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    inputs = [prefix + doc for doc in example["chapter"]]
    # Tokenize the *prefixed* text; the previous version built `inputs` but
    # then tokenized the raw chapters, so the prefix was silently dropped.
    model_data = tokenizer(inputs, max_length=1024, truncation=True)
    with tokenizer.as_target_tokenizer():
        model_label = tokenizer(example["summary"], max_length=128, truncation=True)
    model_data['labels'] = model_label['input_ids']
    return model_data

In [5]:
# Apply `process` to every split in batched mode; the tokenized columns
# are added alongside the existing ones.
tok_df = dataset.map(function=process, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [6]:
def _as_tf_dataset(split, shuffle):
    """Wrap one tokenized split as a batched dataset via `model.prepare_tf_dataset`.

    Args:
        split: key into ``tok_df`` ("train", "test" or "validation").
        shuffle: whether the resulting dataset is shuffled.

    Returns:
        Whatever ``model.prepare_tf_dataset`` returns for that split, using
        the shared ``collator`` and a batch size of 8.
    """
    return model.prepare_tf_dataset(
        tok_df[split],
        collate_fn=collator,
        shuffle=shuffle,
        batch_size=8,
    )


# Only the training split is shuffled; test and validation keep their order.
tf_train_df = _as_tf_dataset("train", shuffle=True)
tf_test_df = _as_tf_dataset("test", shuffle=False)
tf_eval_df = _as_tf_dataset("validation", shuffle=False)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [7]:
# AdamWeightDecay optimizer with a 2e-5 learning rate and 0.01 weight decay.
optimizer = AdamWeightDecay(
    weight_decay_rate=0.01,
    learning_rate=2e-5,
)
# No `loss` argument is passed to compile(); per the message Transformers
# prints for this cell, the model's internal loss computation is used.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [10]:
# Load the ROUGE metric once at cell level so `compute_metrics` does not rely
# on a `rouge` object left over from an earlier, no-longer-present cell.
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for one evaluation pass.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays.
            Label positions equal to -100 are treated as padding.

    Returns:
        A dict of the values returned by ``rouge.compute`` plus ``gen_len``
        (mean count of non-pad tokens per prediction), each rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # -100 is not a valid vocabulary id; swap it for the pad token id so the
    # labels can be decoded.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of tokens in each prediction that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [12]:
# `compute_metrics` decodes token ids with `tokenizer.batch_decode`, so the
# callback must produce predictions with `model.generate()` rather than a
# plain forward pass; `predict_with_generate=True` enables that.
# NOTE(review): `generate_collator` (pads to a multiple of 128) is defined in
# an earlier cell but never used; it looks intended for a dedicated generation
# dataset, possibly combined with XLA generation - confirm against the
# installed transformers version before wiring it in.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_eval_df,
    predict_with_generate=True,
)

In [14]:
# Train for a single epoch, validating on the validation split and running
# the metric callback.
# NOTE(review): the saved output of this cell is a ValueError about creating
# variables on a non-first call to a tf.function; the cause cannot be
# determined from the visible cells - try a fresh kernel with Run All.
train_results = model.fit(
    tf_train_df,
    validation_data=tf_eval_df,
    epochs=1,
    callbacks=[metric_callback],
)

ValueError: Creating variables on a non-first call to a function decorated with tf.function.