## Task: Summarization Batch Process

* Data source: arXiv papers for math/AI/physics (5 papers each), already processed by the previous data-preparation step.

* File path: '../data_source/arxiv.org/<AI/math/physcis>/formrecognized_output/...'

* Azure OpenAI Completion - summarize page by page, feeding the running summary plus the next page into each request (append approach)

* Save the final summary into a JSON file.

In [6]:
import os
from dotenv import load_dotenv
load_dotenv()

True

#### Set up Azure OpenAI

In [12]:
# Configure the module-level settings of the `openai` client for an Azure
# OpenAI resource, reading the key, endpoint and deployment name from
# environment variables. Relies on `os` having been imported by an earlier cell.
#
# Note: The openai-python library support for Azure OpenAI is in preview.
# NOTE(review): the original note read "This version is not supported in
# ChatCompletion" - presumably it refers to the API version pinned below;
# confirm before switching this notebook to ChatCompletion.
# import os
import openai

openai.api_type = "azure"
# Previously used API version, kept for reference:
# openai.api_version = "2023-07-01-preview"
openai.api_version = "2023-09-15-preview"
# API key: surrounding whitespace is stripped; an unset or blank value fails the assert.
API_KEY = os.getenv("OPENAI_API_KEY","").strip()
assert API_KEY, "ERROR: Azure OpenAI Key is missing"
openai.api_key = API_KEY
# Endpoint: must be non-empty and contain "openai.azure.com" (case-insensitive check).
RESOURCE_ENDPOINT = os.getenv("OPENAI_API_ENDPOINT","").strip()
assert RESOURCE_ENDPOINT, "ERROR: Azure OpenAI Endpoint is missing"
assert "openai.azure.com" in RESOURCE_ENDPOINT.lower(), "ERROR: Azure OpenAI Endpoint should be in the form: \n\n\t<your unique endpoint identifier>.openai.azure.com"
openai.api_base = RESOURCE_ENDPOINT

# Deployment name, later passed as `engine=` to openai.Completion.create().
# Not validated here: it is None when DEPLOYMENT_NAME is not set.
COMPLETIONS_MODEL = os.getenv('DEPLOYMENT_NAME')

#### Use Azure OpenAI 'Completion' to summarize content

In [26]:
# Summarize text with the Azure OpenAI completion endpoint
def openai_completion_summarization(previous_summary, new_content):
    """Request a summary covering an earlier summary plus new text.

    The prompt is a fixed instruction line followed by ``previous_summary``
    and ``new_content``; it is sent to the deployment named by
    ``COMPLETIONS_MODEL`` through ``openai.Completion.create``.

    Args:
        previous_summary: Summary text carried over from earlier content.
        new_content: Text to fold into the summary.

    Returns:
        The ``text`` field of the first choice in the completion response.
    """
    debug = False  # set to True to trace the inputs, the prompt and the raw reply
    if debug:
        print("openai_completion_summarization(): start")
        print("openai_completion_summarization(): previous_summary: ", previous_summary)
        print("openai_completion_summarization(): new_content: ", new_content)

    # Instruction line, then the running summary, then the new text.
    prompt_text = "".join((
        'Provide a summary of the contents below.\n',
        previous_summary,
        ' \n ',
        new_content,
    ))
    if debug:
        print("prompt_text:", prompt_text)

    # temperature=0 with a single candidate keeps sampling as deterministic
    # as the service allows.
    request_options = {
        "engine": COMPLETIONS_MODEL,
        "prompt": prompt_text,
        "temperature": 0,
        "max_tokens": 2000,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "best_of": 1,
        "stop": None,
    }
    response = openai.Completion.create(**request_options)

    summary_text = response['choices'][0]['text']
    if debug:
        print(f"\nOpenAI completion summary: [{repr(summary_text)}]")
        print("openai_completion_summarization(): end")
    return summary_text

#### Summarize whole document

In [29]:
# Summarization with append approach
def summerization_append(json_filename, page_content_in_json_file):
    """Build one running summary for a document by folding in its pages.

    Each page's text is sent, together with the summary accumulated so far,
    to ``openai_completion_summarization``; the reply (minus anything before
    its first blank line) becomes the new running summary.

    Args:
        json_filename: Name of the JSON file the pages came from. The part
            before the first ``_formrecognized`` is reported as the original
            file name; when that marker is absent the name is used unchanged.
        page_content_in_json_file: Sequence of dicts. The keys
            ``page_number`` and ``page_content`` are read, each defaulting
            to ``''`` when missing.

    Returns:
        A one-element list ``[{"File name": ..., "Summary": ...}]``. With no
        pages the summary is the initial placeholder string ``'None'``.
    """
    print("summerization_append(): start")
    print("summerization_append(): json_filename: ", json_filename)
    debug = False

    # The literal string 'None' seeds the first prompt.
    # NOTE(review): this text is sent to the model verbatim - confirm it is
    # intended rather than an empty string.
    previous_summary_text = 'None'

    # NOTE(review): pages are visited last-to-first. A running summary would
    # normally go first-to-last; confirm the stored page order before changing.
    for page in reversed(page_content_in_json_file):
        new_page_number = page.get('page_number', '')
        new_page_content = page.get('page_content', '')
        if debug:
            print("Page number: ", new_page_number)

        summary_result = openai_completion_summarization(previous_summary_text, new_page_content)
        # Keep only what follows the first blank line of the reply (the whole
        # reply when there is no blank line).
        previous_summary_text = summary_result.split('\n\n', 1)[-1]
        if debug:
            print("Summary: ", previous_summary_text)
            print("=" * 50)  # Separating page contents for better readability

    # partition() returns the whole name when the marker is missing, whereas
    # find() would return -1 and the slice would silently drop the last
    # character of the name.
    orig_filename, _, _ = json_filename.partition('_formrecognized')
    final_summary = [{"File name": orig_filename,
                      "Summary": previous_summary_text}]
    print("summerization_append(): final_summary: ", final_summary)
    print("summerization_append(): end")
    return final_summary

#### Enter the folder path for JSON files
For example:
* './data_source/arxiv.org/AI/formrecognized_output'
* './data_source/arxiv.org/math/formrecognized_output'
* './data_source/arxiv.org/physcis/formrecognized_output'

In [34]:
import os
import json

# Batch driver: summarize every *.json file in a user-supplied folder and
# write one "<name>_summary.json" per input into a sibling folder named
# 'finalsummary_output'.
print("Summarization batch process >>>")
# Ask the user to provide the folder path for JSON files
json_folder_path = input('Enter the path to the folder containing JSON files: ')

# Require an existing directory: os.listdir() below raises on a plain file path
if not os.path.isdir(json_folder_path):
    print(f'The folder path "{json_folder_path}" does not exist.')
else:
    # Normalize first: with a trailing separator, dirname() would return the
    # JSON folder itself and the summaries would land inside the input folder.
    parent_folder = os.path.dirname(os.path.normpath(json_folder_path))

    # The summary folder is a sibling of the JSON folder
    finalsummary_folder_path = os.path.join(parent_folder, 'finalsummary_output')

    # Create the summary folder on first use
    if not os.path.exists(finalsummary_folder_path):
        os.makedirs(finalsummary_folder_path)
        print(f'Summary folder "{finalsummary_folder_path}" created.')
    else:
        print(f'Summary folder "{finalsummary_folder_path}" already exists.')

    # Sorted so the processing order is the same on every run and platform
    for filename in sorted(os.listdir(json_folder_path)):
        # Only JSON files are processed
        if not filename.lower().endswith('.json'):
            continue

        full_path = os.path.join(json_folder_path, filename)
        print(f'Found JSON file: {filename}, Full Path: {full_path}')

        # Read the JSON content and close the file before the (slow) API calls
        with open(full_path, 'r', encoding='utf-8') as json_file:
            json_content = json.load(json_file)

        # 1. Summarize the whole document
        summary_result = summerization_append(filename, json_content)

        # 2. Save the summary next to the other outputs.
        # Use the part before '_formrecognized' as the base name; when the
        # marker is absent fall back to the name without its extension
        # (find() would return -1 and chop off the last character instead).
        base_name, marker, _ = filename.partition('_formrecognized')
        if not marker:
            base_name = os.path.splitext(filename)[0]
        summary_filename = base_name + '_summary.json'
        summary_file_full_path = os.path.join(finalsummary_folder_path, summary_filename)
        with open(summary_file_full_path, 'w', encoding='utf-8') as summary_json_file:
            json.dump(summary_result, summary_json_file, indent=4)
        print(f"JSON data has been saved to {summary_file_full_path}")

    print('Finished processing JSON files.')
print("Summarization batch process is done! <<<")

Summarization batch process >>>
Summary folder "./data_source/arxiv.org/physcis\finalsummary_output" created.
Found JSON file: 2111.07895.pdf_formrecognized.json, Full Path: ./data_source/arxiv.org/physcis/formrecognized_output\2111.07895.pdf_formrecognized.json
summerization_append(): start
summerization_append(): json_filename:  2111.07895.pdf_formrecognized.json
summerization_append(): final_summary:  [{'File name': '2111.07895.pdf', 'Summary': " et al. 2017) or gas (Trilling et al. 2018) was detected from 'Oumuamua, so the acceleration is not due to cometary outgassing. The acceleration is consistent with the behavior of a solar sail (Loeb & Bialy 2018). The acceleration is also consistent with the behavior of a rocket (Jackson & Arkani-Hamed 2018). The acceleration is not consistent with the behavior of a comet (Bannister & Dybczynski 2019). The acceleration is not consistent with the behavior of a dust cloud (Bannister & Dybczynski 2019). The acceleration is not consistent with t