## Initialization of model, constants, paths etc.

In [None]:
# Machine-specific paths are read from config.cfg (sections [mT5] and [Unsloth]).
import configparser
config = configparser.ConfigParser()
config.read('config.cfg')

# Model checkpoint, CUDA library locations, input dataset and output directory.
model_summarize_path = config['mT5']['summarizer']
libcuda_path = config['Unsloth']['libcuda_path']
library_path = config['Unsloth']['library_path']
dataset_path = config['mT5']['dataset_path']
output_path = config['mT5']['output_path']

import os
# These variables are exported before torch/transformers are imported below.
# NOTE(review): presumably needed so Triton can locate libcuda - confirm.
os.environ["TRITON_LIBCUDA_PATH"]=libcuda_path
os.environ["LIBRARY_PATH"]=library_path

import json
import torch
import re
from transformers import pipeline
import rouge_raw
# NOTE: this rebinds the name `eval` (shadowing the builtin) to a RougeRaw
# instance; evaluate_dataset uses it through this module-level name.
eval = rouge_raw.RougeRaw()


# Quick sanity check of the visible GPUs and CUDA memory information.
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")
print(torch.cuda.mem_get_info())


In [6]:
def load_model(model_path, max_seq_length = 512):
    """Create a Hugging Face summarization pipeline for the model at ``model_path``.

    Args:
        model_path: Model identifier or local path handed to ``pipeline`` as ``model``.
        max_seq_length: Forwarded to the pipeline as ``max_length``.

    Returns:
        The pipeline object built with ``task='summarization'`` and
        ``device_map="auto"``.
    """
    # Import the factory locally under a private alias. The notebook later
    # rebinds the module-level name `pipeline` to the loaded model, so looking
    # the factory up in globals would call the model object instead when this
    # function is re-run in the same session.
    from transformers import pipeline as _hf_pipeline

    summarizer_t5 = _hf_pipeline(task='summarization', model=model_path, max_length = max_seq_length, device_map="auto")
    return summarizer_t5
def clean_text(text):
    """Flatten ``text`` onto a single line.

    Words hyphenated across a line break are re-joined (the hyphen and the
    line break are dropped); every remaining carriage return or newline is
    replaced by a single space.

    Args:
        text: Raw page text.

    Returns:
        The text without any ``\\r`` or ``\\n`` characters.
    """
    return (
        # Handle Windows line endings first; otherwise "-\r\n" would leave the
        # hyphen in place and split the word with spaces.
        text.replace("-\r\n", "")
        .replace("-\n", "")
        .replace('\r', ' ')
        .replace('\n', ' ')
    )

def load_dataset(path):
    """Load a JSON dataset from ``path`` and return the parsed object.

    The file is always decoded as UTF-8 so that non-ASCII text is read the
    same way regardless of the platform's default locale encoding.

    Args:
        path: Path to a JSON file.

    Returns:
        The deserialized JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as j:
        return json.load(j)

def remove_enumeration(text):
    """Strip a leading ``<digits>.`` list marker from each line of ``text``.

    Each line is handled on its own: optional leading whitespace, a run of
    digits, a dot and any whitespace that follows are removed from the start
    of the line. Lines without such a prefix are returned untouched.
    """
    leading_marker = re.compile(r'^\s*\d+\.\s*')
    stripped_rows = (leading_marker.sub('', row) for row in text.split('\n'))
    return '\n'.join(stripped_rows)

In [24]:
def summarize_dataset(dataset_path, 
                      output_path, 
                      pipeline,
                      prompt_func,
                      max_new_tokens=1024, 
                      save_steps=-1, 
                      page_text_key="text", 
                      page_summary_key="summary", issue_summary_total_key="summary_total", overwrite = False):
    """Summarize every page and every issue of a JSON dataset and save the result.

    The dataset is traversed as ``{journal: {issue: {page_key: [page, ...]}}}``
    where each page is a dict. Every page gets a summary stored under
    ``page_summary_key``; the page summaries of an issue are then joined and
    summarized again, and that result is stored on the issue under
    ``issue_summary_total_key``. The annotated dataset is written to
    ``{output_path}/summarized_<dataset basename>.json``.

    Args:
        dataset_path: Path of the JSON dataset to load.
        output_path: Directory the result (and checkpoints) are written to.
        pipeline: Summarization callable forwarded to ``summarize_text``.
        prompt_func: Callable turning the cleaned text into the model prompt.
        max_new_tokens: Forwarded to ``summarize_text``.
        save_steps: If positive, the dataset is saved every ``save_steps``
            newly summarized pages.
        page_text_key: Key of a page's source text.
        page_summary_key: Key under which a page summary is stored.
        issue_summary_total_key: Key under which an issue summary is stored.
        overwrite: When ``False``, pages/issues that already carry a summary
            are left untouched.
    """
    # Derive the output name up front: the final save needs it even when no
    # checkpoint was ever written (e.g. with the default save_steps=-1).
    filename = f"summarized_{os.path.splitext(os.path.basename(dataset_path))[0]}"
    pages_processed = 0
    dataset = load_dataset(dataset_path)
    journals_processed = 0
    for key_journal, journal in dataset.items():
        print(f"Summarization: Processing journal {key_journal}, {journals_processed}/{len(dataset.items())}:") 
        journals_processed += 1
        for key_issue, issues in journal.items():
            print(f"Processing issue {key_issue}:") 
            issue_summary_total = []
            for key_page, pages in issues.items():
                # Issue-level summaries are stored next to the page lists;
                # they are not pages and must not be iterated.
                if key_page.startswith("summary_total") or key_page == issue_summary_total_key:
                    continue
                for i, page in enumerate(pages): 
                    print(f"Processing {key_journal}, {key_issue}, page {i} out of {len(pages)}:")
                    if page_summary_key in page and overwrite is False:
                        # Keep the existing summary in the issue-level input so
                        # a resumed run still summarizes the whole issue.
                        issue_summary_total.append(page[page_summary_key])
                        continue
                    text_to_summarize = clean_text(page[page_text_key])
                    # Pages with ten words or fewer are not worth summarizing.
                    if len(text_to_summarize.split()) > 10:
                        summarized_page = (summarize_text(prompt_func(text_to_summarize), pipeline, max_new_tokens))
                    else:
                        summarized_page = " "
                    page[page_summary_key] = summarized_page
                    print(summarized_page)
                    issue_summary_total.append(summarized_page)
                    pages_processed += 1
                    if save_steps > 0 and pages_processed % save_steps == 0:
                        with open(f"{output_path}/{filename}.json", "w") as myfile:
                            print("Saving checkpoint")
                            myfile.write(json.dumps(dataset))
            if issue_summary_total_key in issues and overwrite is False:
                continue
            text_to_summarize = clean_text('\n'.join(issue_summary_total))
            summarized_issue = (summarize_text(prompt_func(text_to_summarize), pipeline, max_new_tokens))
            print(summarized_issue)
            issues[issue_summary_total_key] = summarized_issue
    with open(f"{output_path}/{filename}.json", "w") as myfile:
        myfile.write(json.dumps(dataset))
        print(f"Finished summarizing. Saved to {output_path}/{filename}.json")
        
def summarize_text(prompt, pipeline, max_new_tokens=512, temperature=1, top_p = 1, do_sample = True, num_beams = 4, top_k = 50, repetition_penalty = 1.2, no_repeat_ngram_size = 4):
    """Run ``pipeline`` on ``prompt`` and return the generated summary text.

    All generation settings are forwarded to the pipeline call as keyword
    arguments. ``max_new_tokens`` used to be accepted but silently dropped;
    it is now forwarded as well so that the caller's limit takes effect.

    Args:
        prompt: Text handed to the pipeline.
        pipeline: Callable returning a list of dicts with a ``"summary_text"`` key.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature, top_p, do_sample, num_beams, top_k, repetition_penalty:
            Generation settings passed through unchanged.
        no_repeat_ngram_size: Passed through unchanged; previously hard-coded to 4.

    Returns:
        The ``"summary_text"`` field of the first result.
    """
    res = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=do_sample,
        num_beams=num_beams,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
    )
    return res[0]["summary_text"]


def t5_prompt(text):
    """Identity prompt builder: return ``text`` exactly as given."""
    prompt = text
    return prompt


In [7]:
def evaluate_dataset(
    dataset_path,
    page_gold_key="summary",
    page_system_key="summary_reference",
    issue_gold_key="summary_total",
    issue_reference_key="summary_total_reference",
):
    """Score page-level and issue-level summaries of a dataset.

    The dataset is traversed as ``{journal: {issue: {page_key: [page, ...]}}}``.
    Entries whose key starts with ``"summary_total"`` are treated as
    issue-level summaries and are not iterated as pages. Gold and system
    texts are collected and passed to ``corpus`` on the module-level ``eval``
    object.

    Returns:
        A ``(page_scores, issue_scores)`` tuple.
    """
    corpus = load_dataset(dataset_path)
    gold_pages, system_pages = [], []
    gold_issues, system_issues = [], []

    for journal in corpus.values():
        for issue in journal.values():
            for entry_key, entries in issue.items():
                if not entry_key.startswith("summary_total"):
                    for entry in entries:
                        gold_text = entry[page_gold_key]
                        system_text = entry[page_system_key]
                        gold_pages.append(gold_text)
                        system_pages.append(system_text)
            gold_issues.append(issue[issue_gold_key])
            system_issues.append(issue[issue_reference_key])

    page_scores = eval.corpus(gold=gold_pages, system=system_pages)
    issue_scores = eval.corpus(gold=gold_issues, system=system_issues)
    return page_scores, issue_scores

def print_rougeraw(score):
    """Print F, P and R (scaled by 100) for the "1", "2" and "L" entries of ``score``.

    ``score`` is indexed by the keys ``"1"``, ``"2"`` and ``"L"``; each entry
    must expose ``f``, ``p`` and ``r`` attributes.
    """
    for metric in ("1", "2", "L"):
        entry = score[metric]
        for label, attr in (("F", "f"), ("P", "p"), ("R", "r")):
            print(f"ROUGE-{metric} {label}: ", getattr(entry, attr)*100)
    
def write_rougeraw_to_file(score, filename):
    """Write F, P and R (scaled by 100) for the "1", "2" and "L" entries of ``score`` to ``filename``.

    The file is opened in write mode and receives one ``ROUGE-<n> <X>: <value>``
    line per metric/measure combination, in the order 1, 2, L and F, P, R.
    """
    with open(filename, 'w') as report:
        for metric in ("1", "2", "L"):
            entry = score[metric]
            for label, attr in (("F", "f"), ("P", "p"), ("R", "r")):
                value = getattr(entry, attr)*100
                report.write(f"ROUGE-{metric} {label}: {value!s}\n")

## Generate summaries for evaluation

In [None]:
# Load the summarization model from the path configured in config.cfg.
# NOTE: this rebinds the module-level name `pipeline` (imported from
# transformers at the top of the notebook) to the loaded pipeline object.
pipeline = load_model(model_summarize_path)

In [None]:
# Summarize the configured dataset with the loaded pipeline. Page summaries are
# stored under "summary_mt5", issue summaries under "summary_total_mt5", and
# save_steps=5 requests a checkpoint every 5 summarized pages.
summarize_dataset(dataset_path, output_path, pipeline, t5_prompt, save_steps=5, page_summary_key="summary_mt5", issue_summary_total_key="summary_total_mt5")

## Evaluate (compute the model's ROUGE scores)

In [None]:
# Rebuild the path of the file written by summarize_dataset:
# <output_path>/summarized_<dataset basename>.json
basename = f"summarized_{os.path.splitext(os.path.basename(dataset_path))[0]}"
filename = f"{output_path}/{basename}.json"
# evaluate_dataset returns a (page-level, issue-level) pair of scores; the
# mT5 outputs are compared against the default gold keys.
scores = evaluate_dataset(filename, page_system_key ="summary_mt5", issue_reference_key="summary_total_mt5")
# Page-level scores: saved to a text file and echoed to the notebook output.
print("summary")
write_rougeraw_to_file(scores[0], f"{output_path}/score_pages_poc_dataset.txt")
print_rougeraw(scores[0])
# Issue-level scores: saved to a text file and echoed to the notebook output.
print("summary total")
write_rougeraw_to_file(scores[1], f"{output_path}/score_issues_poc_dataset.txt")
print_rougeraw(scores[1])