# Infer data

Use the data gathered through Eric's collection script and run it through summarization and NER models to add additional summary columns

In [1]:
import pandas as pd
import os

from transformers import pipeline, AutoTokenizer, BartForConditionalGeneration, BartTokenizer

In [2]:
# config variables

summary_file = "summary_orig.xlsx"
summarization_model = "facebook/bart-large-xsum"
question_answer_model = "deepset/roberta-base-squad2"

In [3]:
# load the excel data into a pandas dataframe
directory_path = os.getcwd()
summary_file = os.path.join(directory_path, summary_file)
print(summary_file)
df = pd.read_excel(summary_file)

/home/tjordan/code/git/topic_surveyor/summary_orig.xlsx


In [5]:
# create the summarization pipeline and loop through the dataframe to summarize the data 
# in the source_original_text column and add the summary to the dataframe in a column named source_summary

summarizer = BartForConditionalGeneration.from_pretrained(summarization_model)
tokenizer = BartTokenizer.from_pretrained(summarization_model)

scaling_factor = 0.5 #how much we want to scale the summary length

df_results = pd.DataFrame(columns=["source_text", "summary_text", "source_text_tokens", "summary_text_tokens", "max_length", "min_length", "model used"])

for index, row in df.iterrows():
    source_text = row["source_original_text"]
    print("Original text: ", source_text)

    # tokenize the input text
    inputs = tokenizer.encode("summarize: " + source_text, return_tensors="pt", max_length=1024, truncation=True)
    input_length = len(inputs[0])
    print("Input Token length: ", input_length)

    # scale the summary length based on the input length
    max_length = int(input_length * scaling_factor)
    min_length = 50
    if max_length < 50:
        min_length = int(max_length * 0.5)

    summary_ids = summarizer.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    print("Summary Token length: ", len(summary_ids[0]))
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("Summary text: ", summary)
    df_results.at[index, "source_text"] = source_text
    df_results.at[index, "summary_text"] = summary
    df_results.at[index, "source_text_tokens"] = input_length
    df_results.at[index, "summary_text_tokens"] = len(summary_ids[0])
    df_results.at[index, "max_length"] = max_length
    df_results.at[index, "min_length"] = min_length
    df_results.at[index, "model used"] = summarization_model

# now write an excel file with the summarized data
df_results.to_excel("summarized_data.xlsx")

Original text:  On 10 May 2024, flash floods swept across multiple provinces in the northeast region of Afghanistan, leaving more than 250,000 people homeless. The provinces most impacted by the flash floods were Badakhshan, Baghlan, and Takhar. These floods followed earlier flooding from 16 to 20 April 2024, which had affected 24 provinces.
Input Token length:  73
Summary Token length:  25
Summary text:  The year 2024 was the worst on record for flash floods in Afghanistan, according to the World Health Organization (WHO).
Original text:  Based on data consolidated by the Afghan Red Crescent Society (ARCS), as of 18 May 2024, at least 300 deaths had been reported in 33 provinces. More than 35,400 households (257,850 people) have been directly affected across the country, with one million people at risk of being indirectly affected nationwide. An estimated 22,000 livestock and close to 120,000 acres of agricultural land were destroyed. There are also numerous reports of bridges being d