## Install Dependencies

In [14]:
%%capture
!pip install datasets evaluate rouge_score
!pip install transformers

In [18]:
# Hyperparameters
# Hugging Face Hub identifier of the pretrained checkpoint; the tokenizer and the
# TF seq2seq model in later cells are both loaded from this name.
checkpoint = "facebook/blenderbot-400M-distill"

## Load BillSum dataset

In [2]:
from datasets import load_dataset

# Load only the "ca_test" split of the "billsum" dataset; it is divided into
# train/test subsets in a later cell.
billsum = load_dataset("billsum", split="ca_test")
print(billsum)

Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})


In [3]:
# Display the first record of the loaded split to inspect its fields.
billsum[0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nThe Legislature finds and declares all of the following:\n(a) (1) Since 1899 congressionally chartered veterans’ organizations have provided a valuable service to our nation’s returning service members. These organizations help preserve the memories and incidents of the great hostilities fought by our nation, and preserve and strengthen comradeship among members.\n(2) These veterans’ organizations also own and manage various properties including lodges, posts, and fraternal halls. These properties act as a safe haven where veterans of all ages and their families can gather together to find camaraderie and fellowship, share stories, and seek support from people who understand their unique experiences. This aids in the healing process for these returning veterans, and ensures their health and happiness.\n(b) As a result of congressional chartering of these veterans’ organizations, the United States Inte

In [4]:
# Hold out 20% of the rows for evaluation. A fixed seed makes the shuffle, and
# therefore the train/test membership, reproducible across kernel restarts.
# NOTE: this rebinds `billsum` from a single Dataset to a DatasetDict, so re-run the
# loading cell before re-running this one.
billsum = billsum.train_test_split(test_size=0.2, seed=42)
print(billsum)

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})


In [5]:
# Pull the first training record to eyeball the raw fields before tokenization.
example = billsum["train"][0]
print(example)

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 17052.6 of the Revenue and Taxation Code is amended to read:\n17052.6.\n(a) For each taxable year beginning on or after January 1, 2000, there shall be allowed as a credit against the “net\ntax”,\ntax,\n”\nas defined in Section 17039, an amount determined in accordance with Section 21 of the Internal Revenue Code, relating to expenses for household and dependent care services necessary for gainful employment, except that the amount of the credit shall be a percentage, as provided in subdivision (b) of the allowable federal credit without taking into account whether there is a federal tax liability.\n(b) For the purposes of subdivision (a), the percentage of the allowable federal credit shall be determined as follows:\n(1) For taxable years beginning before January 1, 2003:\n\nIf the adjusted gross income is:\nThe percentage of\ncredit is:\n$40,000 or less ........................\n63%\nOver $4

## Preprocess

In [19]:
from transformers import AutoTokenizer

# Load the tokenizer published under the same Hub name as `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [20]:
prefix = "summarize: "

# Requested upper bounds (in tokens) for the tokenized document and summary.
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 128


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: A batch mapping with a "text" list (source documents) and a
            "summary" list (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents, with an added "labels"
        entry holding the token ids of the tokenized summaries.
    """
    # Never request more tokens than the tokenizer reports the checkpoint can take:
    # sequences longer than the model's position table cannot be embedded correctly.
    # NOTE(review): BlenderBot checkpoints are believed to have a much shorter limit
    # than 1024 tokens -- confirm against the model config.
    input_limit = min(MAX_INPUT_LENGTH, tokenizer.model_max_length)
    target_limit = min(MAX_TARGET_LENGTH, tokenizer.model_max_length)

    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=input_limit, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=target_limit, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [21]:
# Apply preprocess_function across the dataset; batched=True hands it a batch of
# examples per call instead of one example at a time.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [22]:
# Print the tokenized dataset to check which columns the mapping step added.
print(tokenized_billsum)

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 248
    })
})


In [23]:
# Show the tokenized training split on its own.
tokenized_billsum["train"]

Dataset({
    features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 989
})

In [24]:
from transformers import DataCollatorForSeq2Seq

# Pads each batch dynamically and returns TensorFlow tensors.
# The `model` argument is intentionally omitted: it expects a model *instance*, and the
# checkpoint name (a plain string) that used to be passed here was silently ignored.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, return_tensors="tf")

## Evaluate

In [25]:
import evaluate

# Load the ROUGE metric; compute_metrics uses it to score decoded predictions
# against decoded reference summaries.
rouge = evaluate.load("rouge")

In [26]:
import numpy as np


def compute_metrics(eval_pred):
    """Score predicted token ids against reference token ids with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        A dict of the values returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-pad tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Label positions holding -100 are swapped for the pad token id so they can be decoded.
    label_ids = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Length of each prediction, counted as the tokens that differ from the pad id.
    result["gen_len"] = np.mean(
        [np.count_nonzero(pred != pad_id) for pred in predictions]
    )

    return {name: round(value, 4) for name, value in result.items()}

## Train

In [27]:
from transformers import create_optimizer, AdamWeightDecay

# Optimizer settings, named so they are easy to find and tune.
LEARNING_RATE = 2e-5
WEIGHT_DECAY_RATE = 0.01

optimizer = AdamWeightDecay(
    learning_rate=LEARNING_RATE,
    weight_decay_rate=WEIGHT_DECAY_RATE,
)

In [28]:
from transformers import TFAutoModelForSeq2SeqLM

# Load the TensorFlow sequence-to-sequence model for `checkpoint`.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading tf_model.h5:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBlenderbotForConditionalGeneration.

Some layers of TFBlenderbotForConditionalGeneration were not initialized from the model checkpoint at facebook/blenderbot-400M-distill and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

In [29]:
# Settings shared by the training and evaluation pipelines.
shared_dataset_kwargs = dict(batch_size=16, collate_fn=data_collator)

# Training batches are shuffled; evaluation batches keep their original order.
tf_train_set = model.prepare_tf_dataset(
    tokenized_billsum["train"], shuffle=True, **shared_dataset_kwargs
)
tf_test_set = model.prepare_tf_dataset(
    tokenized_billsum["test"], shuffle=False, **shared_dataset_kwargs
)

You're using a BlenderbotTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [30]:
import tensorflow as tf

# compile() is given only the optimizer. NOTE(review): this presumably relies on the
# Transformers model computing its own loss from the `labels` column -- confirm against the docs.
model.compile(optimizer=optimizer)  # No loss argument!

In [31]:
from transformers.keras_callbacks import KerasMetricCallback

# predict_with_generate=True makes the callback call model.generate(), so compute_metrics
# receives generated token ids that it can decode. By default the callback forwards the
# raw model outputs, which tokenizer.batch_decode cannot turn into text.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_test_set,
    predict_with_generate=True,
)

In [None]:
# Fine-tune for 3 epochs with tf_test_set as validation data. The metric callback is
# wrapped in a list, which is the form Keras documents for the `callbacks` argument.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[metric_callback])

Epoch 1/3
