In [1]:
!pip install transformers
!pip install datasets
!pip install SentencePiece
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m117.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [1]:
from datasets import load_dataset
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
# Load pre-trained Pegasus model and tokenizer
model_name = 'google/pegasus-cnn_dailymail'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [9]:
# Load and preprocess the dataset
train_dataset = load_dataset('cnn_dailymail', '3.0.0', split='train[:50]')  # Load a subset of the "cnn_dailymail" dataset



In [10]:
def preprocess_function(examples):
    inputs = tokenizer(examples['article'], truncation=True, padding='longest')
    targets = tokenizer(examples['highlights'], truncation=True, padding='longest')
    inputs['labels'] = targets['input_ids']
    return inputs

In [11]:
train_dataset = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [12]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=1,
    predict_with_generate=True,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=3,
    eval_steps=100,
    logging_steps=100,
    logging_dir='./logs',
    learning_rate=1e-5,
    warmup_steps=500,
    weight_decay=0.01,
)

In [13]:
# Create the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
)

In [14]:
# Start training
trainer.train()

Step,Training Loss


TrainOutput(global_step=50, training_loss=4.350811462402344, metrics={'train_runtime': 43.0072, 'train_samples_per_second': 1.163, 'train_steps_per_second': 1.163, 'total_flos': 144473220710400.0, 'train_loss': 4.350811462402344, 'epoch': 1.0})

In [15]:
model.save_pretrained("path/to/save/directory")

In [None]:
from transformers import PegasusForConditionalGeneration

model = PegasusForConditionalGeneration.from_pretrained("path/to/save/directory")

In [None]:
input_text = ["Your input text goes here"]
input_encoding = tokenizer(input_text, truncation=True, padding="longest", max_length=512, return_tensors="pt")

# Generate predictions
output = model.generate(input_encoding["input_ids"])
predicted_text = tokenizer.batch_decode(output, skip_special_tokens=True)