In [None]:
!pip install datasets tqdm evaluate rouge_score transformers[torch]

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline, DataCollatorForSeq2Seq, GenerationConfig
import evaluate
import numpy as np
import huggingface_hub
ilc = load_dataset("d0r1h/ILC",split="test")
ilc = ilc.train_test_split(test_size=0.2)
checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["Case"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=examples["Summary"], max_length=142, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_ilc = ilc.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


Map:   0%|          | 0/812 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

In [4]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="Super_legal_text_summarizer",
    evaluation_strategy="epoch",
    learning_rate=5e-6,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    push_to_hub=False
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ilc["train"],
    eval_dataset=tokenized_ilc["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,No log,2.069108,0.3965,0.1608,0.2317,0.2325,134.8522
1,No log,1.958098,0.4184,0.1826,0.2539,0.255,133.4433
2,No log,1.904086,0.4129,0.1792,0.2554,0.2563,127.0591
4,No log,1.853876,0.4122,0.1754,0.258,0.2586,126.0542
5,No log,1.841387,0.4197,0.1806,0.2603,0.2613,130.8177
6,No log,1.833359,0.4058,0.1712,0.2532,0.2539,126.1281
8,1.966900,1.824581,0.4129,0.1802,0.257,0.2582,126.6158
9,1.966900,1.824247,0.4168,0.1843,0.26,0.2614,126.1232


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=670, training_loss=1.89604637943097, metrics={'train_runtime': 4743.6262, 'train_samples_per_second': 1.712, 'train_steps_per_second': 0.141, 'total_flos': 1.7398154872307712e+16, 'train_loss': 1.89604637943097, 'epoch': 9.88929889298893})

In [6]:
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
model.save_pretrained("Super_legal_text_summarizer")
tokenizer.save_pretrained("Super_legal_text_summarizer")
trainer.create_model_card()
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


events.out.tfevents.1714294447.38a8dd3a6211.686.1:   0%|          | 0.00/5.84k [00:00<?, ?B/s]

events.out.tfevents.1714294401.38a8dd3a6211.686.0:   0%|          | 0.00/5.84k [00:00<?, ?B/s]

events.out.tfevents.1714294627.38a8dd3a6211.686.2:   0%|          | 0.00/5.84k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

events.out.tfevents.1714294780.38a8dd3a6211.686.3:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1714294865.38a8dd3a6211.3508.0:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aiguy68/Super_legal_text_summarizer/commit/03ed3dfe31487b37f875539d170c6931a825584a', commit_message='End of training', commit_description='', oid='03ed3dfe31487b37f875539d170c6931a825584a', pr_url=None, pr_revision=None, pr_num=None)