In [None]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 8.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[K     |████████████████████████████████| 452 kB 76.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 72.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 65.0 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 82.8 MB/s 
Collecting x

In [None]:
from datasets import load_dataset

# Load only the "ca_test" split of the "billsum" dataset.
# The preprocessing cell further down reads its "text" and "title" columns.
ds = load_dataset("billsum", split="ca_test")
# Bare expression: display the dataset object as the cell output.
ds

In [None]:
# Show the record at index 1 as a sample of what the dataset contains.
ds[1]

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the "facebook/bart-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
# Bare expression: display the tokenizer configuration as the cell output.
tokenizer

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [None]:
# Truncation limits (in tokens) for the source documents and for the target titles.
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        examples: Mapping with a "text" entry (source documents) and a
            "title" entry (generation targets). When called through
            ``Dataset.map(..., batched=True)`` each entry is a list of strings.

    Returns:
        The tokenizer output for ``examples["text"]`` (truncated to
        ``max_input_length``) with an extra ``"labels"`` key holding the
        ``input_ids`` of ``examples["title"]`` (truncated to
        ``max_target_length``).
    """
    model_inputs = tokenizer(
        examples["text"], max_length=max_input_length, truncation=True
    )
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. A separate call is
    # used so the targets get their own, shorter truncation limit.
    labels = tokenizer(
        text_target=examples["title"], max_length=max_target_length, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing to the whole dataset; batched=True hands the function
# a batch of examples per call instead of one example at a time.
tokenized_datasets = ds.map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Hold out 15% of the tokenized dataset as the evaluation ("test") split.
test_size = 0.15
# Fixed seed so the split is identical on every Restart & Run All.
split_seed = 42

# train_test_split shuffles the rows by default before splitting, so a separate
# unseeded .shuffle() call is unnecessary.
processed_dataset = tokenized_datasets.train_test_split(
    test_size=test_size, seed=split_seed
)
# Bare expression: display the resulting DatasetDict as the cell output.
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1051
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 186
    })
})

In [None]:
from transformers import TFAutoModelForSeq2SeqLM
# Load the TensorFlow sequence-to-sequence model with the pre-trained
# "facebook/bart-base" weights (the same checkpoint the tokenizer was loaded from).
model = TFAutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')


All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [None]:
from transformers import DataCollatorForSeq2Seq

train_batch_size = 2
eval_batch_size = 2

# Data collator that will dynamically pad the inputs received, as well as the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

# Convert the train split to a tf.data.Dataset; shuffled so batches differ per epoch.
tf_train_dataset = processed_dataset["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    batch_size=train_batch_size,
    collate_fn=data_collator,
)

# Convert the held-out split to a tf.data.Dataset. Evaluation data is NOT
# shuffled: order does not help validation and a fixed order keeps runs comparable.
tf_eval_dataset = processed_dataset["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    batch_size=eval_batch_size,
    collate_fn=data_collator,
)


In [None]:
import tensorflow as tf

# Adam optimizer used for fine-tuning.
adam = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
)

# No `loss` argument is passed: the model's internal loss computation is used.
model.compile(optimizer=adam)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# Fine-tune for a single epoch, validating on the held-out split after it;
# the returned object is kept in `train_results`.
train_results = model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=1)




In [None]:
# Sample passage (by its wording, a preamble to a Korean constitution) used as the
# input document for the inference cells that follow.
text = '''
We, the people of Korea, proud of a resplendent history and traditions dating from time immemorial,
 upholding the cause of the Provisional Republic of Korea Government born of 
 the March First Independence Movement of 1919 and the democratic ideals of the April Nineteenth 
 Uprising of 1960 against injustice, having assumed the mission of democratic reform and peaceful 
 unification of our homeland and having determined to consolidate national unity with justice, 
 humanitarianism and brotherly love, and To destroy all social vices and injustice, and To afford 
 equal opportunities to every person and provide for the fullest development of individual capabilities in all fields, including political, economic, social and cultural life by further strengthening the basic free and democratic order conducive to private initiative and public harmony, and To help each person discharge those duties and responsibilities concomitant to freedoms and rights, and To elevate the quality of life for all citizens and contribute to lasting world peace and the common prosperity of mankind and thereby to ensure security, liberty and happiness for ourselves and our posterity forever, Do hereby amend, through national referendum following a resolution by the National Assembly, the Constitution, ordained and established on the Twelfth Day of July anno Domini Nineteen hundred and forty-eight, and amended eight times subsequently.
Oct. 29, 1987'''

In [None]:
# Encode the sample passage with the same truncation limit used for the
# training inputs.
model_inputs = tokenizer(text, truncation=True, max_length=max_input_length)

In [None]:
# Generate a title for the single encoded passage (wrapped in a list to form a
# batch of one). The output is capped at max_target_length — the limit the
# training labels were truncated to — rather than the much larger encoder
# input limit.
predictions = model.generate([model_inputs['input_ids']], max_length=max_target_length)

In [None]:
# Turn the generated token ids back into text, dropping special tokens;
# the resulting list of strings is displayed as the cell output.
tokenizer.batch_decode(predictions, skip_special_tokens=True)

['An act to amend, repeal, and add the Constitution of the Provisional Republic of Korea, relating to the Korean people.']