<a href="https://colab.research.google.com/github/tamaskecskemeti/nlp_thesis/blob/main/Large_Language_Models_based_Automatic_Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install huggingface_hub
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install bert_score
!pip install meteor_score
!pip install gradio
!pip install bitsandbytes

[31mERROR: Could not find a version that satisfies the requirement meteor_score (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for meteor_score[0m[31m


In [2]:
!pip install --upgrade transformers accelerate bitsandbytes



In [3]:
from pathlib import Path
import torch
import itertools
import random
from huggingface_hub import login
from datasets import Dataset, load_dataset
import evaluate
import pandas as pd
from transformers import AutoTokenizer, TextDataset, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import gradio as gr
import gc

In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [5]:
hf_token = "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"
login(token=hf_token,add_to_git_credential=True)

In [6]:
random.seed(42)

In [7]:
df = pd.read_csv("news_and_summaries.csv", sep=',')

In [8]:
dataset = Dataset.from_pandas(df)

In [9]:
# Perform the 80-20 train-test split
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
holdout_dataset = train_test_split['test']

In [10]:
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [34]:
# Function to generate summary
def generate_summary(text):
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True).to(device)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [None]:
# Apply the summarization function on the test set
test_summaries = [generate_summary(text) for text in holdout_dataset['text']]

In [35]:
# Load the necessary metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Compute ROUGE
rouge_score = rouge.compute(predictions=test_summaries, references=holdout_dataset['summary'])
print("ROUGE Score:", rouge_score)

ROUGE Score: {'rouge1': np.float64(0.21272742290379149), 'rouge2': np.float64(0.14208596083500785), 'rougeL': np.float64(0.16153359427771685), 'rougeLsum': np.float64(0.16246595196720093)}


In [None]:
# Compute BLEU
bleu_predictions = [summary for summary in test_summaries]
bleu_references = [[ref] for ref in holdout_dataset['summary']]

bleu_score = bleu.compute(predictions=bleu_predictions, references=bleu_references)
print("BLEU Score:", bleu_score)

BLEU Score: {'bleu': 0.0015607417680619175, 'precisions': [0.7094755661501788, 0.4731051344743276, 0.3996235884567127, 0.3592139175257732], 'brevity_penalty': 0.0033312421986585916, 'length_ratio': 0.14915555555555557, 'translation_length': 3356, 'reference_length': 22500}


In [None]:
# Compute METEOR
meteor_score = meteor.compute(predictions=test_summaries, references=holdout_dataset['summary'])
print("METEOR Score:", meteor_score)

METEOR Score: {'meteor': np.float64(0.11578474633228335)}


In [36]:
def preprocess_data(examples):
    inputs = ["summarize: " + doc for doc in examples['text']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize summaries
    labels = tokenizer(examples['summary'], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]

    # Ensure padding tokens are ignored in the loss calculation
    model_inputs["labels"] = [
      [(token if token != tokenizer.pad_token_id else -100) for token in label]
      for label in labels["input_ids"]
    ]
    return model_inputs

# Tokenize dataset
tokenized_dataset = train_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/333 [00:00<?, ? examples/s]

In [37]:
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

In [38]:
train_dataset = train_test_split['train']
validation_dataset = train_test_split['test']

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
learning_rates = [1e-05, 2e-05, 3e-05, 5e-05]
batch_sizes = [4, 8, 16]
combinations = [(lr, bs) for lr in learning_rates for bs in batch_sizes]

model_name = "facebook/bart-large-cnn"

for lr, bs in combinations:
  save_path = f"/content/drive/My Drive/my_summarizer_model/lora_finetuned_model_{lr}_{bs}"

  # Load Tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  tokenizer.save_pretrained(save_path)

  # Configure LoRA
  lora_config = LoraConfig(
      r=8,
      lora_alpha=32,
      lora_dropout=0.1
      )

  model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

  model = get_peft_model(model, lora_config)
  model.save_pretrained(save_path)

  training_args = TrainingArguments(
      output_dir=save_path,
      learning_rate=lr,
      per_device_train_batch_size=bs,
      per_device_eval_batch_size=4,
      num_train_epochs=4,
      weight_decay=0.01,
      save_strategy="epoch",
      remove_unused_columns=False
  )

  trainer = Trainer(
      model=model.to(device),
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=validation_dataset
  )

  # Start training
  trainer.train()
  trainer.save_model()

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


In [None]:
evaluation = pd.DataFrame(columns=['model', 'ROUGE Score', 'BLEU Score', 'METEOR Score'])
for lr, bs in combinations:
  tokenizer = AutoTokenizer.from_pretrained(f"/content/drive/My Drive/my_summarizer_model/lora_finetuned_model_{lr}_{bs}")
  model = AutoModelForSeq2SeqLM.from_pretrained(f"/content/drive/My Drive/my_summarizer_model/lora_finetuned_model_{lr}_{bs}").to(device)
  test_summaries = [generate_summary(text) for text in holdout_dataset['text']]
  rouge_score = rouge.compute(predictions=test_summaries, references=holdout_dataset['summary'])
  bleu_predictions = [summary for summary in test_summaries]
  bleu_references = [[ref] for ref in holdout_dataset['summary']]
  bleu_score = bleu.compute(predictions=bleu_predictions, references=bleu_references)
  meteor_score = meteor.compute(predictions=test_summaries, references=holdout_dataset['summary'])
  evaluation.loc[len(evaluation)] = [f'lora_finetuned_model_{lr}_{bs}', rouge_score, bleu_score, meteor_score]

In [None]:
evaluation.head(12)

Unnamed: 0,model,ROUGE Score,BLEU Score,METEOR Score
0,lora_finetuned_model_1e-05_4,"{'rouge1': 0.2587893209854084, 'rouge2': 0.183...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.14584541051751784}
1,lora_finetuned_model_1e-05_8,"{'rouge1': 0.22099021121960352, 'rouge2': 0.14...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.11916943514200977}
2,lora_finetuned_model_1e-05_16,"{'rouge1': 0.2151910392114938, 'rouge2': 0.146...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.11593232128359614}
3,lora_finetuned_model_2e-05_4,"{'rouge1': 0.3358343425054999, 'rouge2': 0.237...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.19967711060202192}
4,lora_finetuned_model_2e-05_8,"{'rouge1': 0.2641341828172985, 'rouge2': 0.187...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.14805167298277744}
5,lora_finetuned_model_2e-05_16,"{'rouge1': 0.23006024707057698, 'rouge2': 0.15...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.125403132295305}
6,lora_finetuned_model_3e-05_4,"{'rouge1': 0.48638701932689987, 'rouge2': 0.35...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.32569553432297865}
7,lora_finetuned_model_3e-05_8,"{'rouge1': 0.2958996483918966, 'rouge2': 0.203...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.16938293759481154}
8,lora_finetuned_model_3e-05_16,"{'rouge1': 0.2471489894245783, 'rouge2': 0.173...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.13630875782039265}
9,lora_finetuned_model_5e-05_4,"{'rouge1': 0.5272811310618907, 'rouge2': 0.411...","{'bleu': 0.0015607417680619175, 'precisions': ...",{'meteor': 0.3527301579749625}


In [None]:
learning_rates = [2e-05, 3e-05, 5e-05]
batch_sizes = [4, 8]
combinations = [(lr, bs) for lr in learning_rates for bs in batch_sizes]

model_name = "facebook/bart-large-cnn"
del model
import gc         # garbage collect library
gc.collect()
torch.cuda.empty_cache()

for lr, bs in combinations:


    save_path = f"/content/drive/My Drive/my_summarizer_model/full_finetuned_model_{lr}_{bs}"
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.save_pretrained(save_path)

    # Load and quantize the model in 4-bit
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        device_map="auto"
    )

    model.train()
    model.save_pretrained(save_path)

    training_args = TrainingArguments(
        output_dir=save_path,
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=4,
        num_train_epochs=4,
        weight_decay=0.01,
        save_strategy="epoch",
        optim="adamw_torch_fused"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    )

    trainer.train()
    trainer.save_model(save_path)

    del model
    gc.collect()
    torch.cuda.empty_cache()

  trainer = Trainer(


Step,Training Loss


  trainer = Trainer(


Step,Training Loss


  trainer = Trainer(


Step,Training Loss


In [None]:
for lr, bs in combinations:
  tokenizer = AutoTokenizer.from_pretrained(f"/content/drive/My Drive/my_summarizer_model/full_finetuned_model_{lr}_{bs}")
  model = AutoModelForSeq2SeqLM.from_pretrained(f"/content/drive/My Drive/my_summarizer_model/full_finetuned_model_{lr}_{bs}").to(device)
  test_summaries = [generate_summary(text) for text in holdout_dataset['text']]
  rouge_score = rouge.compute(predictions=test_summaries, references=holdout_dataset['summary'])
  bleu_score = bleu.compute(predictions=bleu_predictions, references=bleu_references)
  meteor_score = meteor.compute(predictions=test_summaries, references=holdout_dataset['summary'])
  evaluation.loc[len(evaluation)] = [f'finetuned_model_{lr}_{bs}', rouge_score, bleu_score, meteor_score]

In [None]:
iface = gr.Interface(
    fn=generate_summary,
    inputs=gr.Textbox(lines=10, label="Enter text to summarize"),
    outputs=gr.Textbox(label="Summary"),
    title="Text Summarizer",
    description="Enter a paragraph and the model will generate a summary."
)

iface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a04a9e1f17d82ba5a5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


