# News Article Summarization With Flan-T5 Small LLM

# 1.0 Installations and Libraries

In [None]:
!pip install --upgrade pip
!pip install --disable-pip-version-check
!pip install torch==1.13.1
!pip install torchdata==0.5.1 --quiet

!pip install transformers==4.27.2
!pip install datasets==2.11.0 --quiet

In [1]:
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

# 2.0 Load Dataset

In [2]:
# Load the XSum news-article dataset (served from the local cache when present)
dataset = load_dataset(path = "xsum")

Found cached dataset xsum (/home/ec2-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Show the available splits together with their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [4]:
# Keep the first five training rows as a small working sample and render them as a dataframe
dataset_sample = dataset["train"].select(indices = range(5))
display(dataset_sample.to_pandas())

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984


# 3.0 Summarizing News Articles With An Instruction Prompt

## 3.1 Summarizing Using Pipeline and Zero-Shot Inference

The `pipeline` function provides a high-level API for performing various NLP tasks with pre-trained models. It simplifies using transformer models for tasks such as text generation, sentiment analysis, and summarization.

In [7]:
# Summarization pipeline backed by the small Flan-T5 checkpoint.
# min_length / max_length bound the length of each generated summary, and
# truncation = True asks for over-long inputs to be cut down rather than rejected.
summarizer = pipeline(
    "summarization",
    model = "google/flan-t5-small",
    truncation = True,
    min_length = 20,
    max_length = 40,
)

## 3.2 Generating Summaries

In [8]:
# Summarize only the first sampled article, then print it next to its reference summary
summary = summarizer(dataset_sample["document"][0])

# A single print with a blank-line separator emits the same text as alternating
# print(...) / print() calls.
print(
    f"ARTICLE: {dataset_sample['document'][0]}",
    f"BASELINE SUMMARY: {dataset_sample['summary'][0]}",
    f"GENERATED SUMMARY: {summary}",
    sep = "\n\n",
)



ARTICLE: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
"It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate that - but

In [9]:
# Perform summarization on entire sample dataset.
# NOTE: this overwrites `summary` from the single-article cell; the pipeline is now
# given the whole list of sampled documents in one call.
summary = summarizer(dataset_sample["document"])

## 3.3 Analyzing Results

In [10]:
# Collect the pipeline outputs into a dataframe for inspection
import pandas as pd

# `summary` is a list of records, which the DataFrame constructor accepts directly
results = pd.DataFrame(summary)
results

Unnamed: 0,summary_text
0,The cost of a flood warning in Dumfries and Ga...
1,A group of people have been killed in a fire a...
2,Sebastian Vettel and Nico Rosberg won the Gran...
3,A scout leader accused of sexually abusing a b...
4,A hospital in Istanbul has been evacuated from...


In [11]:
# Build the comparison table straight from the raw inputs (`dataset_sample` and
# `summary`) instead of renaming and joining `results` onto itself. The in-place
# version could only run once: on a second run `results` already holds the
# "document"/"summary" columns and the join fails on the overlapping names.
dataset_sample_df = dataset_sample.to_pandas()

results = dataset_sample_df[["document", "summary"]].assign(
    # one generated summary per sampled article, in the same row order
    generated_summary = [entry["summary_text"] for entry in summary]
)
results

Unnamed: 0,document,summary,generated_summary
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,The cost of a flood warning in Dumfries and Ga...
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,A group of people have been killed in a fire a...
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,Sebastian Vettel and Nico Rosberg won the Gran...
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,A scout leader accused of sexually abusing a b...
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,A hospital in Istanbul has been evacuated from...


# 4.0 Generating Summaries with One-Shot Inference

In [None]:
# Loading Model
# `model_name` is also read by the tokenizer cell, so both come from the same checkpoint.

model_name = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [12]:
# Loading the tokenizer for the same checkpoint as the model
# (relies on `model_name` from the model-loading cell)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)

In [13]:
# function to generate the prompt for our LLM

def make_prompt(examples_full, examples_to_summarize, data = None):
    """
    Build a summarization prompt from dataset rows.

    Every index in `examples_full` contributes a solved example (the document followed
    by its reference summary). Every index in `examples_to_summarize` contributes a
    document whose answer is left open for the model to complete.

    Inputs:
        examples_full (List[int]): Row indices shown to the model with both their
            document and their summary. May be empty (zero-shot prompt).
        examples_to_summarize (List[int]): Row indices whose document is appended
            without a summary.
        data (optional): Indexable collection of rows where `data[i]` is a mapping with
            "document" and "summary" keys. Defaults to the training split of the
            notebook-level `dataset`.
    Outputs:
        prompt (str): The concatenated prompt; an empty string when both index lists
        are empty.
    """
    # The layouts are spelled with explicit escapes because their whitespace
    # (including the whitespace-only lines) is part of the text the model receives.
    solved_template = "\n    Document:\n        \n    {}\n        \n    What was going on?\n    {}\n    "
    open_template = "\n    Document:\n        \n    {}\n        \n    What was going on?\n    "

    if data is None:
        # Fall back to the notebook's global dataset
        data = dataset["train"]

    pieces = []
    for index in examples_full:
        row = data[index]  # fetch the row once, then read both fields from it
        pieces.append(solved_template.format(row["document"], row["summary"]))

    for index in examples_to_summarize:
        pieces.append(open_template.format(data[index]["document"]))

    return "".join(pieces)

In [19]:
# One-shot setup: a single solved example (row 5) precedes the article to summarize (row 4)
examples_full, examples_to_summarize = [5], [4]

one_shot_prompt = make_prompt(
    examples_full = examples_full,
    examples_to_summarize = examples_to_summarize,
)

In [20]:
# Inspect the raw prompt string that is passed to the tokenizer
one_shot_prompt

'\n    Document:\n        \n    Simone Favaro got the crucial try with the last move of the game, following earlier touchdowns by Chris Fusaro, Zander Fagerson and Junior Bulumakau.\nRynard Landman and Ashton Hewitt got a try in either half for the Dragons.\nGlasgow showed far superior strength in depth as they took control of a messy match in the second period.\nHome coach Gregor Townsend gave a debut to powerhouse Fijian-born Wallaby wing Taqele Naiyaravoro, and centre Alex Dunbar returned from long-term injury, while the Dragons gave first starts of the season to wing Aled Brew and hooker Elliot Dee.\nGlasgow lost hooker Pat McArthur to an early shoulder injury but took advantage of their first pressure when Rory Clegg slotted over a penalty on 12 minutes.\nIt took 24 minutes for a disjointed game to produce a try as Sarel Pretorius sniped from close range and Landman forced his way over for Jason Tovey to convert - although it was the lock\'s last contribution as he departed with a

In [21]:
# Perform one-shot inference: one prompt and one generated summary per target article

results_one_shot = []

for index in examples_to_summarize:
    # Build the prompt for THIS article. The loop variable used to be ignored and
    # the same combined prompt was re-run on every iteration, which produces
    # duplicate outputs as soon as more than one article is requested. With a
    # single target the prompt is identical to the combined one.
    prompt = make_prompt(examples_full, [index])
    inputs = tokenizer(prompt, return_tensors = "pt")
    # Unpack the encoding so generate() receives the attention mask along with the ids
    generated = model.generate(
        **inputs,
        max_new_tokens = 40,
        min_new_tokens = 20,
    )
    # generate() returns a batch; the single prompt's sequence is row 0
    output = tokenizer.decode(generated[0], skip_special_tokens = True)
    results_one_shot.append(output)

In [22]:
# Show the generated one-shot summaries (a list with one string per target article)
results_one_shot

['A man has been taken hostage by a police officer in Istanbul after he threatened to shoot himself and others.']

In [27]:
# Compare the reference summary of training row 4 with the zero-shot and one-shot generations
baseline_summary = dataset["train"][4]["summary"]
zero_shot = results["generated_summary"][4]

# One print with a blank-line separator yields the same text as alternating
# print(...) / print() calls.
print(
    f"BASELINE SUMMARY: {baseline_summary}",
    f"GENERATED ZERO SHOT: {zero_shot}",
    f"GENERATED ONE SHOT: {results_one_shot}",
    sep = "\n\n",
)

BASELINE SUMMARY: An armed man who locked himself into a room at a psychiatric hospital in Istanbul has ended his threat to kill himself, Turkish media report.

GENERATED ZERO SHOT: A hospital in Istanbul has been evacuated from a hospital after a man threatened to shoot himself and others in an attack on a patient.

GENERATED ONE SHOT: ['A man has been taken hostage by a police officer in Istanbul after he threatened to shoot himself and others.']


# 5.0 Generating Summaries with Few-Shot Inference

In [28]:
# Few-shot setup: three solved examples (rows 5, 6, 7) precede the article to summarize (row 4)
examples_full, examples_to_summarize = [5, 6, 7], [4]

few_shot_prompt = make_prompt(
    examples_full = examples_full,
    examples_to_summarize = examples_to_summarize,
)

In [29]:
# Perform few-shot inference: one prompt and one generated summary per target article

results_few_shot = []

for index in examples_to_summarize:
    # Build the prompt for THIS article. The loop variable used to be ignored and
    # the same combined prompt was re-run on every iteration, which produces
    # duplicate outputs as soon as more than one article is requested. With a
    # single target the prompt is identical to the combined one.
    prompt = make_prompt(examples_full, [index])
    inputs = tokenizer(prompt, return_tensors = "pt")
    # Unpack the encoding so generate() receives the attention mask along with the ids
    generated = model.generate(
        **inputs,
        max_new_tokens = 40,
        min_new_tokens = 20,
    )
    # generate() returns a batch; the single prompt's sequence is row 0
    output = tokenizer.decode(generated[0], skip_special_tokens = True)
    results_few_shot.append(output)

In [30]:
# Show the generated few-shot summaries (a list with one string per target article)
results_few_shot

['A man has been taken hostage by a hospital in Istanbul after he threatened to shoot himself and others.']

In [31]:
# Side-by-side comparison of the reference summary and the three prompting strategies.
# `baseline_summary` and `zero_shot` come from the earlier one-shot analysis cell.

print(
    f"BASELINE SUMMARY: {baseline_summary}",
    f"GENERATED ZERO SHOT: {zero_shot}",
    f"GENERATED ONE SHOT: {results_one_shot}",
    f"GENERATED FEW SHOT: {results_few_shot}",
    sep = "\n\n",
)

BASELINE SUMMARY: An armed man who locked himself into a room at a psychiatric hospital in Istanbul has ended his threat to kill himself, Turkish media report.

GENERATED ZERO SHOT: A hospital in Istanbul has been evacuated from a hospital after a man threatened to shoot himself and others in an attack on a patient.

GENERATED ONE SHOT: ['A man has been taken hostage by a police officer in Istanbul after he threatened to shoot himself and others.']

GENERATED FEW SHOT: ['A man has been taken hostage by a hospital in Istanbul after he threatened to shoot himself and others.']
