In [2]:
import pandas as pd
import os
import json
import logging
from datasets import Dataset
import nltk
import numpy as np
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, pipeline
import keras_hub
from transformers.keras_callbacks import KerasMetricCallback

# Set TOKENIZERS_PARALLELISM to "false" in this process's environment
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Restrict TensorFlow's logger to messages at ERROR level
tf.get_logger().setLevel(logging.ERROR)

In [3]:
# Fraction of the dataset held out as the test split (passed as `test_size`);
# the remainder is used for training
TRAIN_TEST_SPLIT = 0.1

MAX_INPUT_LENGTH = 384  # Tokenizer max_length for the inputs (abstracts)
MAX_TARGET_LENGTH = 48  # Tokenizer max_length for the targets (titles)
MIN_TARGET_LENGTH = 5   # Minimum length of the output by the model
BATCH_SIZE = 8          # Batch size used when building the tf.data datasets
# NOTE(review): the compile cell uses optimizer="adam" without referencing this
# constant, so it does not appear to be applied -- confirm.
LEARNING_RATE = 0.001   # Learning-rate for training our model
# NOTE(review): the training cell passes epochs=3 directly rather than this
# constant -- confirm which value is intended.
MAX_EPOCHS = 1          # Maximum number of epochs we will train the model for

# This notebook is built on the t5-small checkpoint from the Hugging Face Model Hub
MODEL_CHECKPOINT = "t5-small"

These cells import the required libraries and frameworks, set up logging and environment configuration, and define global constants for dataset splitting and model hyperparameters. They also select the T5-small checkpoint and the training parameters for a sequence-to-sequence learning task.  

# Load the Data

In [4]:
# Maximum number of records to load from the JSON-lines snapshot
N = 200_000

# Only these fields are used downstream; keeping just them avoids holding the
# full metadata of every record in memory.
KEEP_FIELDS = ["id", "title", "abstract"]

records = []

# Read the arXiv metadata file line by line (one JSON object per line)
with open("arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as f:
    for line in f:
        # Stop once N records have been collected. Counting parsed records
        # (rather than raw lines) means blank lines do not eat into the budget.
        if len(records) >= N:
            break
        # Skip blank lines
        if not line.strip():
            continue
        record = json.loads(line)
        # A missing field becomes None and is removed by dropna() below
        records.append({field: record.get(field) for field in KEEP_FIELDS})

# Build a DataFrame with the 'id', 'title', and 'abstract' columns,
# drop any rows with missing values, and reset the index.
# Passing `columns` keeps the frame well-formed even when nothing was loaded.
df = (
    pd.DataFrame(records, columns=KEEP_FIELDS)
      .dropna()
      .reset_index(drop=True)
)

# Confirm how many records were successfully loaded
print(f"Loaded {len(df)} records")

Loaded 200000 records


In [5]:
# Wrap the DataFrame in a Hugging Face Dataset and give the text columns
# seq2seq-oriented names: the abstract is the model input, the title the target
ds = (
    Dataset.from_pandas(df)
    .rename_column("abstract", "input_text")
    .rename_column("title", "target_text")
)

# Hold out TRAIN_TEST_SPLIT of the rows as the test split, with a fixed seed
splits = ds.train_test_split(seed=42, test_size=TRAIN_TEST_SPLIT)
train_ds, test_ds = splits["train"], splits["test"]

These cells load the first 200,000 arXiv records from a JSON snapshot into a pandas DataFrame, filter and clean the data, convert it into a Hugging Face Dataset with appropriately named input and target columns, and perform a reproducible train/test split.

# Load tokenizer, model, and collator

In [6]:
# Fetch the tokenizer and the TensorFlow seq2seq model for MODEL_CHECKPOINT
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# Collator that assembles examples into batches of TensorFlow tensors
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [7]:
# Checkpoints named "t5-*" get the "summarize: " task prefix on every input;
# any other checkpoint gets an empty prefix
prefix = "summarize: " if MODEL_CHECKPOINT.startswith("t5-") else ""

In [8]:
def preprocess_function(examples):
    """Tokenize a batch of abstracts and titles for seq2seq training.

    Args:
        examples: A batch (dict of lists) with "input_text" and
            "target_text" columns.

    Returns:
        The tokenizer output for the prefixed inputs, padded/truncated to
        MAX_INPUT_LENGTH, with an added "labels" entry holding the title
        token IDs padded/truncated to MAX_TARGET_LENGTH. Padding positions
        in "labels" are set to -100.
    """
    ignore_index = -100

    # Add the prefix to each abstract
    inputs = [prefix + txt for txt in examples["input_text"]]
    # Tokenize inputs with truncation and padding
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Tokenize target texts (titles) similarly; the attention mask tells us
    # which label positions are real tokens and which are padding
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
    )

    # Replace padding positions with the ignore index so they do not
    # contribute to the training loss. Left as pad token IDs, most of each
    # fixed-length label row would be padding the model is trained to emit.
    model_inputs["labels"] = [
        [tok if keep else ignore_index for tok, keep in zip(ids, mask)]
        for ids, mask in zip(labels["input_ids"], labels["attention_mask"])
    ]
    return model_inputs

In [9]:
# Tokenize both splits batch-wise with preprocess_function; the raw id/text
# columns are dropped from the result
tokenized_splits = splits.map(
    function=preprocess_function,
    remove_columns=["id", "input_text", "target_text"],
    batched=True,
)

Map: 100%|██████████| 180000/180000 [01:53<00:00, 1590.72 examples/s]
Map: 100%|██████████| 20000/20000 [00:12<00:00, 1619.21 examples/s]


These cells load the pretrained tokenizer and T5 model, configure the data collator and optional “summarize:” prefix, define a preprocessing function to tokenize inputs and target titles to fixed lengths, and map this function over the train/test splits to produce tokenized datasets ready for conversion to TensorFlow datasets.  

# Build tf datasets

In [10]:
def _as_tf_dataset(split, shuffle):
    """Convert one tokenized split into a batched tf.data.Dataset.

    'input_ids' and 'attention_mask' are the inputs and 'labels' the targets;
    batches of BATCH_SIZE are assembled by `data_collator`.
    """
    return split.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols=["labels"],
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
    )


# Training split: shuffled
train_dataset = _as_tf_dataset(tokenized_splits["train"], shuffle=True)

# Test split for evaluation: kept in its original order
test_dataset = _as_tf_dataset(tokenized_splits["test"], shuffle=False)

# Small generation/inference subset: shuffle the test split once with a fixed
# seed, keep the first 200 examples, and convert without further shuffling
generation_dataset = _as_tf_dataset(
    tokenized_splits["test"].shuffle(seed=42).select(range(200)),
    shuffle=False,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [11]:
# Compile the model with the "adam" optimizer, selected by name. No loss is
# passed; NOTE(review): presumably the model then falls back to its own
# internal loss computed from `labels` -- confirm.
# NOTE(review): LEARNING_RATE from the config cell is not referenced here, so
# the optimizer's default learning rate is what applies -- confirm this is
# intended before tuning that constant.
model.compile(optimizer="adam")

These cells transform the tokenized train and test splits into TensorFlow datasets with appropriate batching, shuffling, and collator settings for training, evaluation, and inference, then compile the model using the Adam optimizer.  

# Training and Evaluating the Model

In [12]:
# Initialize the RougeL metric for sequence-level evaluation
rouge_l = keras_hub.metrics.RougeL()


def metric_fn(eval_predictions):
    """Compute the RougeL F1 score for a single evaluation pass.

    Args:
        eval_predictions: A ``(predictions, labels)`` pair of token-ID arrays
            (NumPy arrays, or tensors exposing ``.numpy()``).

    Returns:
        dict: ``{"RougeL": f1}`` with ``f1`` as a Python float.
    """
    preds, labels = eval_predictions
    pad_id = tokenizer.pad_token_id

    def _to_token_ids(ids):
        # Accept tensors as well as arrays, and force integer token IDs
        if hasattr(ids, "numpy"):
            ids = ids.numpy()
        ids = np.asarray(ids).astype(np.int64)
        # Negative values are padding / ignore markers (e.g. -100), not real
        # tokens: map them to the pad token so they are skipped when decoding
        return np.where(ids < 0, pad_id, ids)

    # Keep predicted IDs inside the tokenizer's vocabulary
    preds = np.minimum(_to_token_ids(preds), tokenizer.vocab_size - 1)
    labels = _to_token_ids(labels)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds.tolist(), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)

    # The metric object accumulates state across calls; clear it first so the
    # reported score reflects only this pass and not earlier ones as well
    rouge_l.reset_state()

    # Compute RougeL F1
    res = rouge_l(decoded_labels, decoded_preds)
    return {"RougeL": float(res["f1_score"])}

In [13]:
# Keras callback that feeds generated predictions for generation_dataset
# into metric_fn to report RougeL during training
metric_callback = KerasMetricCallback(
    metric_fn=metric_fn,
    eval_dataset=generation_dataset,
    predict_with_generate=True,
)
callbacks = [metric_callback]

In [None]:
# Train for 3 epochs of 500 steps each (with BATCH_SIZE = 8 that is 4,000
# training examples per epoch, not a full pass over the training split).
# Validation runs on 50 batches of test_dataset; RougeL is reported by the
# metric callback, which evaluates on generation_dataset rather than on the
# validation data passed here.
# NOTE(review): the epoch count is hard-coded; MAX_EPOCHS in the config cell
# is 1 and is not used here -- confirm which value is intended.
model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,
    steps_per_epoch=500,
    validation_steps=50,
    callbacks=callbacks,
)

Epoch 1/3

I0000 00:00:1752015637.429716 3310612 service.cc:152] XLA service 0x310cbb4e0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1752015637.429730 3310612 service.cc:160]   StreamExecutor device (0): Host, Default Version
2025-07-08 16:00:37.485121: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1752015637.884299 3310612 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2025-07-08 16:01:27.773392: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 2/3

In [None]:
# Build a Hugging Face "summarization" pipeline around the in-memory `model`
# and `tokenizer`, explicitly selecting the TensorFlow framework.
# NOTE(review): the model is only fine-tuned if the training cell above has
# been run in this kernel session; otherwise this wraps the pretrained weights.
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    framework="tf"
)

Device set to use 0


These cells define and attach a custom RougeL callback to track sequence-level F1 scores during training, run the training loop for three epochs, then construct a summarization pipeline and demonstrate inference by generating and printing a title (“Work Function Algorithm for the k-server problem”) for one test abstract.  
