# Summarize data with FLAN-T5 summarization and BERT-Large (whole-word-masking, SQuAD 2.0) question answering


In [None]:
import pandas as pd
import os
import sentencepiece
import torch

from transformers import pipeline, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, AutoModelForQuestionAnswering, AutoTokenizer, pipeline

In [None]:
# Notebook configuration

# Hugging Face model identifiers: one for question answering, one for
# summarization.
qa_model = "deepset/bert-large-uncased-whole-word-masking-squad2"
summarization_model = "google/flan-t5-base"

# File name of the input CSV that is loaded into the dataframe.
summary_file = "reliefweb_summaries.csv"


In [None]:
# Load the input CSV (the file named by summary_file, resolved against the
# current working directory) into a pandas dataframe.
directory_path = os.getcwd()
# Rebind summary_file from the bare file name to its full path.
summary_file = os.path.join(directory_path, summary_file)
print(summary_file)  # show which file is about to be read
df = pd.read_csv(summary_file)

In [None]:
# Load the summarization model and tokenizer, then summarize the text in the
# source_original_text column of df row by row. Each input row produces one
# row in df_results (same index label as in df) holding the source text, the
# summary, the token counts and the length limits that were used.

summarizer = T5ForConditionalGeneration.from_pretrained(summarization_model)
tokenizer = T5Tokenizer.from_pretrained(summarization_model)

scaling_factor = 0.5  # target summary length as a fraction of the input token count

df_results = pd.DataFrame(columns=["source_text", "summary_text", "source_text_tokens", "summary_text_tokens", "max_length", "min_length", 
                                   "killed", "injured", "displaced", "affected", "location", "summary_model used", "qa_model used"])

for index, row in df.iterrows():
    source_text = row["source_original_text"]
    print("Original text: ", source_text)

    df_results.at[index, "source_text"] = source_text
    # Use the column name declared above ("summary_model used"); writing to a
    # differently spelled key would append a stray column and leave the
    # declared one empty.
    df_results.at[index, "summary_model used"] = summarization_model

    # pandas reads empty CSV cells as NaN (a float); concatenating that with
    # the prompt prefix would raise TypeError and abort the whole run, so
    # rows without text are recorded but not summarized.
    if not isinstance(source_text, str):
        print("Skipping row ", index, ": no source text to summarize")
        continue

    # tokenize the input text (inputs longer than 1024 tokens are truncated)
    inputs = tokenizer.encode("summarize: " + source_text, return_tensors="pt", max_length=1024, truncation=True)
    input_length = inputs.shape[-1]
    print("Input Token length: ", input_length)

    # scale the summary length based on the input length; for short inputs
    # the usual 50-token minimum would exceed the maximum, so it is lowered
    # to half of the maximum instead
    max_length = int(input_length * scaling_factor)
    min_length = 50 if max_length >= 50 else int(max_length * 0.5)

    summary_ids = summarizer.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary_length = len(summary_ids[0])
    print("Summary Token length: ", summary_length)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("Summary text: ", summary)

    df_results.at[index, "summary_text"] = summary
    df_results.at[index, "source_text_tokens"] = input_length
    df_results.at[index, "summary_text_tokens"] = summary_length
    df_results.at[index, "max_length"] = max_length
    df_results.at[index, "min_length"] = min_length

# now write an excel file with the summarized data
# df_results.to_excel("summarized_data_flan_T5_base.xlsx")

In [None]:
# Set up a question-answering pipeline and ask a fixed set of questions about
# every row of df. NOTE: the context passed to the model is the original text
# (source_original_text), not the generated summary. Each answer is stored as
# returned by the pipeline in the matching df_results column, and the combined
# results are written to a CSV file once all rows are processed.

qa_pipe = pipeline("question-answering", model=qa_model, tokenizer=qa_model)

killed_q = "How many people were killed?"
injured_q= "How many people were injured?"
missing_q = "How many people are missing?"
displaced_q = "How many people were displaced?"
affected_q = "How many people were affected?"
location_q = "Where did the event happen?"

# result column -> (label printed in front of the answer, question asked)
qa_fields = {
    "killed": ("Killed: ", killed_q),
    "injured": ("Injured: ", injured_q),
    "missing": ("Missing: ", missing_q),
    "displaced": ("Displaced: ", displaced_q),
    "affected": ("Affected: ", affected_q),
    "location": ("Location: ", location_q),
}

# Create any answer column that does not exist yet (e.g. "missing") up front,
# so every answer is written into an already existing object column the same
# way as the others.
for column in qa_fields:
    if column not in df_results.columns:
        df_results[column] = None

#TODO:  Evaluate the confidence of the answers and only accept answers with a confidence above a certain threshold
#TODO:  Consider how to incorporate synonyms into questions (e.g. displaced/homeless/migrant/migration)

for index, row in df.iterrows():
    source_text = row["source_original_text"]
    print("Original text: ", source_text)

    # Record the model for every row, under the declared column name
    # ("qa_model used"), rather than once for whatever row a previous loop
    # happened to finish on.
    df_results.at[index, "qa_model used"] = qa_model

    # Empty CSV cells arrive as NaN (a float), which the pipeline rejects as
    # a context; skip those rows instead of aborting the run.
    if not isinstance(source_text, str):
        print("Skipping row ", index, ": no source text to question")
        continue

    for column, (label, question) in qa_fields.items():
        answer = qa_pipe(question=question, context=source_text)
        print(label, answer)
        df_results.at[index, column] = answer

# Write the results once, after all rows are done, instead of rewriting the
# whole file on every iteration.
df_results.to_csv("5 - T5_summary_bert_masking_qa.csv")