# Summarize data with T5 summarization and question answering

- Summarization: `google/flan-t5-base` (FLAN-T5) - working well
- QA: `deepset/roberta-base-squad2` - not so much

In [19]:
import pandas as pd
import os
import sentencepiece
import torch

from transformers import pipeline, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, AutoModelForQuestionAnswering, AutoTokenizer, pipeline

In [15]:
# Notebook configuration: Hugging Face checkpoints and the input workbook

qa_model = "deepset/roberta-base-squad2"  # extractive question-answering checkpoint
summarization_model = "google/flan-t5-base"  # seq2seq checkpoint used for the summaries
summary_file = "summary_orig.xlsx"  # Excel workbook; joined onto the working directory when loaded


In [16]:
# Load the Excel data into a pandas dataframe.
# The workbook name from the config cell is joined onto the current working directory.
# os.path.join returns an absolute second argument unchanged, so re-running this cell
# after summary_file has already been expanded is harmless.
directory_path = os.getcwd()
summary_file = os.path.join(directory_path, summary_file)
print(summary_file)
df = pd.read_excel(summary_file)

# Fail fast: every later cell reads this column, and discovering that it is missing
# only after the models have been loaded wastes a lot of time.
if "source_original_text" not in df.columns:
    raise KeyError(
        "Expected a 'source_original_text' column in " + summary_file
        + "; found columns: " + ", ".join(str(col) for col in df.columns)
    )

/Users/tjordan/code/git/topic_explorer_ml/summary_orig.xlsx


In [17]:
# Summarize every row of df with the configured T5 checkpoint.
# The text is read from the source_original_text column; the summary and its token
# statistics are collected per row in df_results, which shares df's index labels.

summarizer = T5ForConditionalGeneration.from_pretrained(summarization_model)
tokenizer = T5Tokenizer.from_pretrained(summarization_model)

scaling_factor = 0.5 #how much we want to scale the summary length
max_input_tokens = 1024 # inputs longer than this are truncated by the tokenizer
default_min_length = 50 # minimum summary length (in tokens) when the budget allows it

# The QA columns (killed ... location, "qa_model used") are only declared here;
# they are filled in by the question-answering cell.
df_results = pd.DataFrame(columns=["source_text", "summary_text", "source_text_tokens", "summary_text_tokens", "max_length", "min_length", 
                                   "killed", "injured", "displaced", "affected", "location", "summary_model used", "qa_model used"])

for index, row in df.iterrows():
    source_text = row["source_original_text"]

    # Blank Excel cells come back from pandas as NaN (a float), which cannot be
    # concatenated with the "summarize: " prefix; skip such rows instead of crashing.
    if not isinstance(source_text, str) or not source_text.strip():
        print("Skipping row without source text: ", index)
        continue

    print("Original text: ", source_text)

    # tokenize the input text (T5 expects the task prefix in front of the text)
    inputs = tokenizer.encode("summarize: " + source_text, return_tensors="pt", max_length=max_input_tokens, truncation=True)
    input_length = len(inputs[0])
    print("Input Token length: ", input_length)

    # scale the summary length based on the input length; for short inputs the
    # minimum is lowered to half the maximum so that min_length never exceeds max_length
    max_length = int(input_length * scaling_factor)
    min_length = default_min_length
    if max_length < default_min_length:
        min_length = int(max_length * 0.5)

    summary_ids = summarizer.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary_length = len(summary_ids[0])
    print("Summary Token length: ", summary_length)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("Summary text: ", summary)

    df_results.at[index, "source_text"] = source_text
    df_results.at[index, "summary_text"] = summary
    df_results.at[index, "source_text_tokens"] = input_length
    df_results.at[index, "summary_text_tokens"] = summary_length
    df_results.at[index, "max_length"] = max_length
    df_results.at[index, "min_length"] = min_length
    # Use the declared column name; writing to "model used" created a stray column
    # and left "summary_model used" empty.
    df_results.at[index, "summary_model used"] = summarization_model

# df_results is not saved here: the question-answering cell writes the Excel file
# once the QA columns have been filled in.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Original text:  On 10 May 2024, flash floods swept across multiple provinces in the northeast region of Afghanistan, leaving more than 250,000 people homeless. The provinces most impacted by the flash floods were Badakhshan, Baghlan, and Takhar. These floods followed earlier flooding from 16 to 20 April 2024, which had affected 24 provinces.
Input Token length:  79
Summary Token length:  21
Summary text:  More than 250,000 people were left homeless after flash floods in Afghanistan in May 2024.
Original text:  Based on data consolidated by the Afghan Red Crescent Society (ARCS), as of 18 May 2024, at least 300 deaths had been reported in 33 provinces. More than 35,400 households (257,850 people) have been directly affected across the country, with one million people at risk of being indirectly affected nationwide. An estimated 22,000 livestock and close to 120,000 acres of agricultural land were destroyed. There are also numerous reports of bridges being destroyed, though the total num

In [20]:
# Set up a question-answering pipeline and extract key facts for every row.
# NOTE: the context given to the model is the original text (source_original_text),
# not the generated summary.

qa_pipe = pipeline("question-answering", model=qa_model, tokenizer=qa_model)

killed_q = "How many people were killed?"
injured_q= "How many people were injured?"
missing_q = "How many people are missing?"
displaced_q = "How many people were displaced?"
affected_q = "How many people were affected?"
location_q = "Where did the event happen?"

# df_results column -> question put to the model (asked in this order)
questions = {
    "killed": killed_q,
    "injured": injured_q,
    "missing": missing_q,
    "displaced": displaced_q,
    "affected": affected_q,
    "location": location_q,
}

#TODO:  Evaluate the confidence of the answers and only accept answers with a confidence above a certain threshold
#TODO:  Consider how to incorporate synonyms into questions (e.g. displaced/homeless/migrant/migration)

# Make sure every target column exists with object dtype before the loop: the answers
# are dicts, and assigning a dict to a not-yet-existing column through .at is unreliable.
for column in [*questions, "qa_model used"]:
    if column not in df_results.columns:
        df_results[column] = None

for index, row in df.iterrows():
    source_text = row["source_original_text"]

    # Blank Excel cells come back from pandas as NaN; the pipeline needs a non-empty string.
    if not isinstance(source_text, str) or not source_text.strip():
        print("Skipping row without source text: ", index)
        continue

    print("Original text: ", source_text)

    for column, question in questions.items():
        # Each answer is the pipeline's dict: score, start, end and answer text.
        answer = qa_pipe(question=question, context=source_text)
        print(column.capitalize() + ": ", answer)
        df_results.at[index, column] = answer

    df_results.at[index, "qa_model used"] = qa_model

# Write the workbook once, after all rows are processed, rather than on every iteration.
df_results.to_excel("sum_flan_T5_base_qa_roberta.xlsx")

Original text:  On 10 May 2024, flash floods swept across multiple provinces in the northeast region of Afghanistan, leaving more than 250,000 people homeless. The provinces most impacted by the flash floods were Badakhshan, Baghlan, and Takhar. These floods followed earlier flooding from 16 to 20 April 2024, which had affected 24 provinces.
Killed:  {'score': 0.00028329723863862455, 'start': 314, 'end': 316, 'answer': '24'}
Injured:  {'score': 0.0001770278176991269, 'start': 109, 'end': 142, 'answer': 'more than 250,000 people homeless'}
Missing:  {'score': 0.0886244997382164, 'start': 119, 'end': 126, 'answer': '250,000'}
Displaced:  {'score': 0.40163159370422363, 'start': 119, 'end': 126, 'answer': '250,000'}
Affected:  {'score': 0.15461550652980804, 'start': 109, 'end': 126, 'answer': 'more than 250,000'}
Location:  {'score': 0.3871031403541565, 'start': 68, 'end': 99, 'answer': 'northeast region of Afghanistan'}
Original text:  Based on data consolidated by the Afghan Red Crescent