# Summarize data with T5

Experimenting with T5 (FLAN-T5) summarization models

In [None]:
import pandas as pd
import os
import sentencepiece

from transformers import pipeline, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration

In [None]:
# config variables

# Hugging Face model identifiers
question_answer_model = "deepset/roberta-base-squad2"
summarization_model = "google/flan-t5-base"

# Excel workbook that holds the source texts (file name only at this point)
summary_file = "summary_orig.xlsx"

In [None]:
# load the excel data into a pandas dataframe
# The configured file name is joined onto the current working directory, the
# resulting path is printed so it is visible which file is being read, and the
# workbook is then loaded into df.
directory_path = os.getcwd()
# summary_file is rebound from the bare file name to the joined path
summary_file = os.path.join(directory_path, summary_file)
print(summary_file)
df = pd.read_excel(summary_file)

In [None]:
# Load the T5 model and tokenizer named by summarization_model, summarize every
# value in the source_original_text column of df, and collect one result row per
# summarized input in df_results, which is written to an Excel file at the end.

summarizer = T5ForConditionalGeneration.from_pretrained(summarization_model)
tokenizer = T5Tokenizer.from_pretrained(summarization_model)

scaling_factor = 0.5  # how much we want to scale the summary length

result_columns = [
    "source_text",
    "summary_text",
    "source_text_tokens",
    "summary_text_tokens",
    "max_length",
    "min_length",
    "model used",
]

# Result rows are gathered in plain lists and turned into a DataFrame once,
# after the loop, instead of enlarging the frame one cell at a time.
result_rows = []
result_index = []

for index, row in df.iterrows():
    source_text = row["source_original_text"]

    # pandas reads empty Excel cells as NaN, which cannot be concatenated with
    # the "summarize: " prefix; skip such rows rather than abort the whole run.
    if pd.isna(source_text):
        print("Skipping row with no source text: ", index)
        continue
    # Non-string cells (e.g. numbers) are coerced so the prefix can be added.
    source_text = str(source_text)
    print("Original text: ", source_text)

    # tokenize the input text (inputs longer than 1024 tokens are truncated)
    inputs = tokenizer.encode(
        "summarize: " + source_text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    input_length = len(inputs[0])
    print("Input Token length: ", input_length)

    # scale the summary length based on the input length; for short inputs the
    # default minimum of 50 would exceed the maximum, so it is halved instead
    max_length = int(input_length * scaling_factor)
    min_length = 50
    if max_length < 50:
        min_length = int(max_length * 0.5)

    summary_ids = summarizer.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary_length = len(summary_ids[0])
    print("Summary Token length: ", summary_length)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("Summary text: ", summary)

    # Keep the original row label so results can be matched back to df.
    result_index.append(index)
    result_rows.append(
        {
            "source_text": source_text,
            "summary_text": summary,
            "source_text_tokens": input_length,
            "summary_text_tokens": summary_length,
            "max_length": max_length,
            "min_length": min_length,
            "model used": summarization_model,
        }
    )

df_results = pd.DataFrame(result_rows, index=result_index, columns=result_columns)

# now write an excel file with the summarized data
df_results.to_excel("summarized_data_flan_T5_base.xlsx")