Load Data

In [40]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [41]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset('cnn_dailymail', '3.0.0')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




  0%|          | 0/3 [00:00<?, ?it/s]

In [42]:
train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

In [4]:
train_data

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 287113
})

In [5]:
validation_data

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 13368
})

In [6]:
test_data

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 11490
})

Preprocessing

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.3 transformers-4.28.1


In [45]:
from transformers import BartTokenizer

model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [49]:
def tokenize_data(example):
    input_text = example["article"]
    target_text = example["highlights"]
    input_tokenized = tokenizer(input_text, truncation=True, padding='max_length', max_length=512)
    target_tokenized = tokenizer(target_text, truncation=True, padding='max_length', max_length=512)

    return {
        "input_ids": input_tokenized.input_ids[0],
        "attention_mask": input_tokenized.attention_mask[0],
        "labels": target_tokenized.input_ids[0],
    }

In [None]:
train_dataset = train_data.map(tokenize_data, batched=True, remove_columns=['article', 'highlights', 'id'])
validation_dataset = validation_data.map(tokenize_data, batched=True, remove_columns=['article', 'highlights', 'id'])

In [35]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=8, shuffle=False)

Bart-base

In [None]:
# from transformers import T5ForConditionalGeneration, T5Config

# config = T5Config.from_pretrained(model_name)
# config.gradient_checkpointing = True
# model = T5ForConditionalGeneration.from_pretrained(model_name, config=config)

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
from transformers import BartForConditionalGeneration, BartConfig

config = BartConfig.from_pretrained(model_name)
config.gradient_checkpointing = True
model = BartForConditionalGeneration.from_pretrained(model_name, config=config)

In [15]:
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [16]:
device

device(type='cuda')

In [17]:
from transformers import AdamW, get_linear_schedule_with_warmup

epochs = 1
num_training_steps = len(train_loader) * epochs
warmup_steps = int(num_training_steps * 0.1)

optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)



Training

In [18]:
from tqdm import tqdm

In [21]:
def train_model(model, train_loader, validation_loader, optimizer, scheduler, device, epochs):
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        print("-" * 30)

        train_loss = 0.0
        for batch in tqdm(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels, use_cache=False)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_loader)
        print(f"Training loss: {avg_train_loss}")

        # Evaluate the model on the validation set
        model.eval()
        validation_loss = 0.0
        for batch in validation_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels, use_cache=False)
                loss = outputs.loss

            validation_loss += loss.item()

        avg_validation_loss = validation_loss / len(validation_loader)
        print(f"Validation loss: {avg_validation_loss}")

In [20]:
train_model(model, train_loader, validation_loader, optimizer, scheduler, device, 2)

Epoch 1/2
------------------------------


100%|██████████| 35890/35890 [1:34:58<00:00,  6.30it/s]


Training loss: 0.6990862742718231
Validation loss: 0.5651097472897552
Epoch 2/2
------------------------------


100%|██████████| 35890/35890 [1:24:11<00:00,  7.10it/s]


Training loss: 0.5072323016048009
Validation loss: 0.5651097472897552


In [22]:
train_model(model, train_loader, validation_loader, optimizer, scheduler, device, 1)

Epoch 1/1
------------------------------


100%|██████████| 35890/35890 [1:34:48<00:00,  6.31it/s]


Training loss: 0.551385216374655
Validation loss: 0.5651097472897552


Use bart-large model for fine tuned

In [23]:
tokenizer_1 = BartTokenizer.from_pretrained('facebook/bart-large')
model_1 = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [30]:
model_1.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [24]:
batch_size = 4  
learning_rate = 3e-5 
gradient_accumulation_steps = 4

In [25]:
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader_1 = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

optimizer_1 = AdamW(model_1.parameters(), lr=learning_rate)




In [33]:
epochs_1 = 2
num_training_steps_1 = len(train_loader) * epochs_1
warmup_steps_1 = int(num_training_steps_1 * 0.1)

scheduler_1 = get_linear_schedule_with_warmup(optimizer_1, num_warmup_steps=warmup_steps_1, num_training_steps=num_training_steps_1)

In [28]:
def train_model_1(model, train_loader, validation_loader, optimizer, scheduler, device, epochs, gradient_accumulation_steps=4):
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        print("-" * 30)

        train_loss = 0.0
        optimizer.zero_grad()
        for i, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels, use_cache=False)
            loss = outputs.loss
            loss.backward()

            train_loss += loss.item()

            if (i + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Training loss: {avg_train_loss}")

        # Evaluate the model on the validation set
        model.eval()
        validation_loss = 0.0
        for batch in validation_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels, use_cache=False)
                loss = outputs.loss

            validation_loss += loss.item()

        avg_validation_loss = validation_loss / len(validation_loader)
        print(f"Validation loss: {avg_validation_loss}")


In [None]:
train_model_1(model_1, train_loader_1, validation_loader_1, optimizer_1, scheduler_1, device, 2,gradient_accumulation_steps)

In [None]:
print("Epoch 1/2
------------------------------
100%|██████████| 71779/71779 [4:00:05<00:00,  4.98it/s]
Training loss: 13.817555772621963
Validation loss: 16.257276042502202
Epoch 2/2
------------------------------
100%|██████████| 71779/71779 [3:55:04<00:00,  5.09it/s]
Training loss: 16.934705126247188")")

Use flan-t5-large

In [1]:
!pip install accelerate
!pip install sentencepiece
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.18.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.98
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0

In [5]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [6]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model_2 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
model_2.to(device)

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [None]:
def preprocess_data(batch):
    input_texts = ["summarize: " + article for article in batch["article"]]
    target_texts = batch["highlights"]

    input_tokenized = tokenizer(input_texts, truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    target_tokenized = tokenizer(target_texts, truncation=True, padding='max_length', max_length=150, return_tensors="pt")

    return {"input_ids": input_tokenized.input_ids, "labels": target_tokenized.input_ids}

In [9]:
train_dataset = train_data.map(preprocess_data, batched=True, remove_columns=["article", "highlights", "id"])
validation_dataset = validation_data.map(preprocess_data, batched=True, remove_columns=["article", "highlights", "id"])

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-large",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=1000,
    save_total_limit=2,
    fp16=True,
    gradient_accumulation_steps=4,
)

In [13]:
trainer = Seq2SeqTrainer(
    model=model_2,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()



Step,Training Loss,Validation Loss
100,No log,
200,No log,
300,No log,
400,No log,


In [51]:
torch.cuda.empty_cache()

T5-base

In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset('cnn_dailymail', '3.0.0')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

In [4]:
train_data

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 287113
})

In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.3 transformers-4.28.1


In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [7]:
def preprocess_data(batch):
    input_texts = ["summarize: " + article for article in batch["article"]]
    target_texts = batch["highlights"]

    input_tokenized = tokenizer(input_texts, truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    target_tokenized = tokenizer(target_texts, truncation=True, padding='max_length', max_length=150, return_tensors="pt")

    return {"input_ids": input_tokenized.input_ids, "labels": target_tokenized.input_ids}

In [8]:
train_data = train_data.map(preprocess_data, batched=True, batch_size=16, remove_columns=['article', 'highlights', 'id'])
validation_data = validation_data.map(preprocess_data, batched=True, batch_size=16, remove_columns=['article', 'highlights', 'id'])

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

In [9]:
train_data

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 287113
})

In [10]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.98


In [11]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-base")

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [13]:
from transformers import Trainer, TrainingArguments

In [20]:
training_args = TrainingArguments(
    output_dir="./t5_base_news",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=1000,
    learning_rate=3e-5,
    warmup_steps=200,
    save_total_limit=2,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.7732,0.847432


TrainOutput(global_step=4486, training_loss=0.854962543244347, metrics={'train_runtime': 10664.4204, 'train_samples_per_second': 26.923, 'train_steps_per_second': 0.421, 'total_flos': 1.7483424621133824e+17, 'train_loss': 0.854962543244347, 'epoch': 1.0})

In [17]:
torch.cuda.empty_cache()

Baseline

In [34]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

In [35]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [25]:
import re

def first_n_sentences(text, n):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return ' '.join(sentences[:n])

def first_sentence(text):
    return first_n_sentences(text, 1)

test_data_first_3_sentences = [first_n_sentences(row['article'], 3) for row in test_data]
test_data_first_sentence = [first_sentence(row['article']) for row in test_data]

In [26]:
from rouge import Rouge

def calculate_rouge_scores(hypotheses, references):
    rouge = Rouge()
    scores = rouge.get_scores(hypotheses, references, avg=True)
    return scores

test_data_highlights = [row['highlights'] for row in test_data]

rouge_scores_first_3_sentences = calculate_rouge_scores(test_data_first_3_sentences, test_data_highlights)
rouge_scores_first_sentence = calculate_rouge_scores(test_data_first_sentence, test_data_highlights)

print("ROUGE scores for first 3 sentences:", rouge_scores_first_3_sentences)
print("ROUGE scores for first sentence:", rouge_scores_first_sentence)

ROUGE scores for first 3 sentences: {'rouge-1': {'r': 0.44673285691118386, 'p': 0.3166649314677224, 'f': 0.35984567498421866}, 'rouge-2': {'r': 0.18997002540392843, 'p': 0.12702930460945516, 'f': 0.146553109147204}, 'rouge-l': {'r': 0.41061720531432794, 'p': 0.29125301351751404, 'f': 0.330865336568481}}
ROUGE scores for first sentence: {'rouge-1': {'r': 0.1969565936659986, 'p': 0.36670973644942156, 'f': 0.2468008482101198}, 'rouge-2': {'r': 0.06262446435547418, 'p': 0.12811433523799115, 'f': 0.08016134930310587}, 'rouge-l': {'r': 0.17388121811045126, 'p': 0.32620801357423784, 'f': 0.21830610774359815}}


Evaluation

In [27]:
from tqdm import tqdm

In [29]:
def generate_summaries(model, tokenizer, test_data, device):
    model.eval()
    generated_summaries = []

    for i in tqdm(range(len(test_data))):
        input_text = "summarize: " + test_data[i]['article']
        input_ids = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512).input_ids.to(device)
       
        outputs = model.generate(input_ids).to(device)
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_summaries.append(summary)

    return generated_summaries

In [30]:
generated_summaries = generate_summaries(model, tokenizer, test_data, device)

100%|██████████| 11490/11490 [58:08<00:00,  3.29it/s]


In [38]:
from rouge import Rouge

def calculate_rouge_scores(generated_summaries, test_data):
    rouge = Rouge()
    target_summaries = [data['highlights'] for data in test_data]

    scores = rouge.get_scores(generated_summaries, target_summaries, avg=True)
    return scores

rouge_scores = calculate_rouge_scores(generated_summaries, test_data)
print(rouge_scores)

{'rouge-1': {'r': 0.161355381033138, 'p': 0.5396914255883426, 'f': 0.2433150103443015}, 'rouge-2': {'r': 0.06764291677613885, 'p': 0.26900742389319215, 'f': 0.10551134642310808}, 'rouge-l': {'r': 0.15339219191677025, 'p': 0.5139240245270047, 'f': 0.23140071040629134}}


In [39]:
num_examples = 5

for i in range(num_examples):
    print(f"Example {i + 1}")
    print(f"Article: {test_data[i]['article']}")
    print(f"Generated summary: {generated_summaries[i]}")
    print(f"Reference summary: {test_data[i]['highlights']}")
    print("-" * 80)

Example 1
Article: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wedne