<a href="https://colab.research.google.com/github/tamaskecskemeti/financial_nlp/blob/main/Large_Language_Models_based_Automatic_Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install huggingface_hub
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install bert_score
!pip install meteor_score

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import itertools
import os
import random
from getpass import getpass
from pathlib import Path

import evaluate
import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

In [None]:
# SECURITY: never commit an access token to a notebook. The token that was
# previously hard-coded here must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable when it is set;
# otherwise the user is prompted for it without echoing the input.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# Fix the state of Python's built-in RNG so reruns are repeatable.
SEED = 42
random.seed(SEED)

In [4]:
# Load only the two columns the notebook uses. The CSV also carries a
# serialized index that pandas would otherwise surface as a spurious
# "Unnamed: 0" column and that would then leak into the Hugging Face dataset.
df = pd.read_csv("news_and_summaries.csv", sep=',', usecols=["text", "summary"])

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,text,summary
0,Labour plans maternity pay rise\n\nMaternity p...,She said her party would boost maternity pay i...
1,Watchdog probes e-mail deletions\n\nThe inform...,All e-mails are subject to the freedom of info...
2,Hewitt decries 'career sexism'\n\nPlans to ext...,Ms Hewitt also announced a new drive to help w...
3,Labour chooses Manchester\n\nThe Labour Party ...,The Labour Party will hold its 2006 autumn con...
4,Brown ally rejects Budget spree\n\nChancellor ...,"But Mr Balls, a prospective Labour MP, said he..."


In [6]:
# Wrap the pandas frame in a Hugging Face `Dataset`.
dataset = Dataset.from_pandas(df=df)

In [7]:
# Hold out 20% of the examples for evaluation; the fixed seed makes the
# partition identical on every run.
train_test_split = dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, test_dataset = train_test_split['train'], train_test_split['test']

In [8]:
# Checkpoint used for summarization; swap in another seq2seq checkpoint here if needed.
model_name = "google-t5/t5-small"
# Tokenizer first, then model weights, both loaded from the same checkpoint.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [19]:
def generate_summary(
    text,
    max_input_length=512,
    max_length=150,
    min_length=10,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
):
    """Summarize a single document with the notebook-level `model` and `tokenizer`.

    Args:
        text: The document to summarize. It is prefixed with "summarize: "
            before tokenization.
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated.
        max_length: Upper bound passed to `model.generate` for the output length.
        min_length: Lower bound passed to `model.generate` for the output length.
        length_penalty: Length penalty passed to `model.generate`.
        num_beams: Number of beams passed to `model.generate`.
        early_stopping: Early-stopping flag passed to `model.generate`.

    Returns:
        The decoded summary string with special tokens removed.
    """
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Keep the inputs on the same device as the model so the function keeps
    # working after the model has been moved (e.g. to a GPU).
    encoded = encoded.to(model.device)
    # Forward the whole encoding (input ids and attention mask), not just the ids.
    summary_ids = model.generate(
        **encoded,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=early_stopping,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize every article in the held-out split, preserving order.
test_summaries = list(map(generate_summary, test_dataset['text']))

In [20]:
# Fetch the three evaluation metrics, in the order ROUGE, BLEU, METEOR.
rouge, bleu, meteor = (
    evaluate.load(metric_name) for metric_name in ("rouge", "bleu", "meteor")
)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [21]:
# Score the generated summaries against the reference summaries with ROUGE.
rouge_score = rouge.compute(
    references=test_dataset['summary'],
    predictions=test_summaries,
)
print("ROUGE Score:", rouge_score)

ROUGE Score: {'rouge1': 0.2592649595021869, 'rouge2': 0.1752341916677011, 'rougeL': 0.1916811442622505, 'rougeLsum': 0.19099905235593118}


In [22]:
# BLEU inputs: a copy of the generated summaries, and each reference summary
# wrapped in its own single-element list.
bleu_predictions = list(test_summaries)
bleu_references = [[reference] for reference in test_dataset['summary']]

bleu_score = bleu.compute(
    references=bleu_references,
    predictions=bleu_predictions,
)
print("BLEU Score:", bleu_score)

BLEU Score: {'bleu': 0.004453174883514457, 'precisions': [0.7397704590818364, 0.45642201834862384, 0.37265625, 0.323482428115016], 'brevity_penalty': 0.009914346245496225, 'length_ratio': 0.17813333333333334, 'translation_length': 4008, 'reference_length': 22500}


In [23]:
# Score the generated summaries against the reference summaries with METEOR.
meteor_score = meteor.compute(
    references=test_dataset['summary'],
    predictions=test_summaries,
)
print("METEOR Score:", meteor_score)

METEOR Score: {'meteor': 0.1474278260433635}


In [26]:
# Display the source article of the first example in the test split.
test_dataset[0]["text"]

'Goldsmith denies war advice claim\n\nThe attorney general has denied his statement to Parliament about the legality of the Iraq war was drafted by Downing Street officials.\n\nLord Goldsmith said Lord Falconer and Baroness Morgan played no part in drafting the answer. He added the answer represented his view that the war was legal, but was not a summary of his advice to the PM. The government has resisted calls to publish the full advice, saying such papers are always kept confidential.\n\nIn a statement, Lord Goldsmith said: "I was fully involved throughout the drafting process and personally finalised, and of course approved, the answer." He said the answer had been prepared in his office with the involvement of Solicitor General Harriet Harman, two of his own officials, three Foreign Office officials, a QC, Christopher Greenwood and the then Lord Chancellor, Lord Irvine of Lairg.\n\n"No other minister or official was involved in any way." He suggested the claim that Lord Falconer a

In [24]:
# Display the reference summary of the first example in the test split.
test_dataset[0]["summary"]

'Former minister Clare Short, who resigned from the government over the Iraq war, said the ministerial answer was the same statement that was earlier shown to the cabinet as it discussed military action.Former foreign secretary Robin Cook said Lord Goldsmith\'s admission that his parliamentary answer was not a summary of his legal opinion suggested Parliament may have been misled.In a statement, Lord Goldsmith said: "I was fully involved throughout the drafting process and personally finalised, and of course approved, the answer."He added the answer represented his view that the war was legal, but was not a summary of his advice to the PM.Lord Goldsmith said Lord Falconer and Baroness Morgan played no part in drafting the answer."The attorney general may never have presented his answer as a summary, but others certainly did," he said."If his original advice of 7 March accepted that military action might be illegal, how was it that he resolved any such doubts by the time the Parliamenta

In [25]:
# Display the first generated summary, for comparison with the reference summary.
test_summaries[0]

'the attorney general denies war advice claims. he says the answer represented his view that the war was legal. the government has resisted calls to publish the full advice.'

In [None]:
# TODO: implement LoRA fine-tuning plus one additional fine-tuning method, then compare the two.