In [2]:
from datasets import load_dataset
import torch

In [3]:
# Pick the compute device: CUDA first, then the MPS backend, then CPU.
# Every branch yields a torch.device so later code always sees one type
# (previously two branches produced plain strings and one a torch.device).
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # The getattr guard keeps this cell working on PyTorch builds that do not
    # ship the `torch.backends.mps` module.
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [6]:
# %%capture
# arxiv_abstracts_2021 = load_dataset("gfissore/arxiv-abstracts-2021",split="train")

In [7]:
# arxiv_abstracts_2021[0]

{'id': '0704.0001',
 'submitter': 'Pavel Nadolsky',
 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'comments': '37 pages, 15 figures; published version',
 'journal-ref': 'Phys.Rev.D76:013009,2007',
 'doi': '10.1103/PhysRevD.76.013009',
 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with C

In [8]:
# %%capture
# Load only the "train" split of the ccdv/arxiv-summarization dataset.
arxiv_summarization = load_dataset(
    "ccdv/arxiv-summarization",
    split="train",
)

In [9]:
# Display the first record to inspect which fields each example carries.
arxiv_summarization[0]

{'article': 'additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models . \n it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models . \n many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.g. @xcite . in the last years \n many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g. @xcite , @xcite , @xcite , @xcite , @xcite , @xcite and the references therein

In [32]:
# Hold out 20% of the records as the "test" split and keep the rest as "train".
# The fixed seed makes the split reproducible across kernel restarts; without
# it every run trains and evaluates on a different partition.
# NOTE(review): this rebinds `arxiv_summarization` from the loaded dataset to
# the split result, so the cell is presumably not safe to run twice in a row
# without reloading the dataset first — confirm.
arxiv_summarization = arxiv_summarization.train_test_split(test_size=0.2, seed=42)

In [33]:
# Display the first record of the "train" split as a sanity check after splitting.
arxiv_summarization["train"][0]

{'article': "clusters of galaxies represent unique signposts in the universe , where the physical properties of the cosmic diffuse baryons can be studied in great details and used to trace the past history of cosmic structure formation ( e.g. rosati et al . \n 2002 ; voit 2005 , for reviews ) . as a result of adiabatic compression and shocks generated by supersonic motion during shell crossing and virialization , \n a hot thin gas permeating the cluster gravitational potential well is formed . \n typically this gas , which is enriched with metals ejected form supernovae ( sne ) explosions through subsequent episodes of star formation ( e.g. matteucci & vettolani 1988 ; renzini 1997 ) , reaches temperatures of several @xmath7 k and therefore emits mainly via thermal bremsstrahlung in the x - rays . at such temperatures most of the elements are either fully ionized or in a high ionization state . \n + particularly evident in x  ray spectra of galaxy clusters are the strong transitions to

In [10]:
from transformers import AutoTokenizer

In [11]:
# Name of the pretrained checkpoint; passed to the `from_pretrained` loaders
# for both the tokenizer and the model so the two stay in sync.
model_checkpoint = "t5-small"

In [17]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
# Text prepended to every article before tokenization
# (used by summarize_preprocess_function).
prefix = "summarize: "

In [18]:
def summarize_preprocess_function(article_data, *, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of articles and their reference abstracts.

    Args:
        article_data: Batch mapping with an "article" list (source texts) and
            an "abstract" list (target summaries) of the same length.
        max_input_length: Token cap for the prefixed articles; longer inputs
            are truncated. Defaults to 1024.
        max_target_length: Token cap for the abstracts; longer targets are
            truncated. Defaults to 128.

    Returns:
        The tokenizer output for the prefixed articles, with an extra
        "labels" entry holding the token ids of the abstracts.
    """
    # Prepend the task prefix to every source article.
    sources = [prefix + article for article in article_data["article"]]
    model_inputs = tokenizer(sources, max_length=max_input_length, truncation=True)

    # Targets go through `text_target` so they are tokenized as labels.
    targets = tokenizer(
        text_target=article_data["abstract"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

In [34]:
# Tokenize both splits; batched=True hands the preprocessing function whole
# batches of examples instead of one example at a time.
tokenized_arxiv_summarization = arxiv_summarization.map(
    summarize_preprocess_function,
    batched=True,
)

Map:   0%|          | 0/162429 [00:00<?, ? examples/s]

In [20]:
from transformers import DataCollatorForSeq2Seq

In [21]:
# Data collator that is later handed to the trainer.
# NOTE(review): `model` receives the checkpoint name string here, not a loaded
# model object (the model is only loaded in a later cell) — confirm this is
# intended for the installed transformers version.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_checkpoint,
)

In [23]:
import evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
# Load the ROUGE metric; compute_metrics calls `rouge.compute` on decoded text.
rouge = evaluate.load("rouge")

In [26]:
import numpy as np


def compute_metrics(eval_pred):
    """Score generated summaries against the reference abstracts with ROUGE.

    Args:
        eval_pred: Pair of ``(predictions, labels)`` token-id arrays.
            ``predictions`` may also be a tuple whose first item holds the
            generated ids; only that first item is used.

    Returns:
        Dict with the ROUGE scores plus "gen_len" (mean number of non-padding
        tokens per prediction), every value rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a masking placeholder, not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token in the labels AND in the predictions
    # (generated batches can be padded with -100 when they are gathered).
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Measured on the cleaned predictions so -100 padding is not counted as output.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [27]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [28]:
# Load the pretrained sequence-to-sequence model for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

  with safe_open(checkpoint_file, framework="pt") as f:
  return self.fget.__get__(instance, owner)()
  storage = cls(wrap_storage=untyped_storage)
  with safe_open(filename, framework="pt", device=device) as f:


In [30]:
# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="arxiv_summarization_model",  # directory that receives the run's artifacts
    evaluation_strategy="epoch",  # presumably evaluates once per epoch — confirm for the installed transformers version
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,  # presumably caps how many checkpoints are kept on disk — confirm
    num_train_epochs=4,
    predict_with_generate=True,  # compute_metrics decodes token ids, so evaluation needs generated sequences
)

In [31]:
# Bundle the model, training arguments, tokenized train/test splits, collator
# and metric function into one trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_arxiv_summarization["train"],
    eval_dataset=tokenized_arxiv_summarization["test"],  # the 20% hold-out produced by train_test_split
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)