## Task: Data Preparation

* Data source: Arxiv papers for math/AI/physics (5 papers each)

* File path: './data_source/arxiv.org/...'

* Form Recognizer - analyzes each PDF file and returns the contents of every page.

* Save the page contents into JSON file for later processing.

In [10]:
import os
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment.
# Later cells read AZURE_FORM_RECOGNIZER_ENDPOINT and
# AZURE_FORM_RECOGNIZER_KEY from os.environ, so this must run first.
load_dotenv()

True

#### Read PDF file using form recognizer

In [11]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

# Shared Form Recognizer client used by formrecognizer_document_analysis_client().
# Endpoint and key are read from the environment; a missing variable raises
# KeyError here rather than later during the first analysis call.
document_analysis_client = DocumentAnalysisClient(
    endpoint=os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"],
    credential=AzureKeyCredential(os.environ["AZURE_FORM_RECOGNIZER_KEY"]),
)

def formrecognizer_document_analysis_client(file_name):
    """Analyze one local file with Form Recognizer and return its pages.

    The file is opened in binary mode and submitted to the module-level
    ``document_analysis_client`` with the "prebuilt-read" model; the function
    then waits on the returned poller for the analysis result.

    Args:
        file_name: Path of the document to analyze.

    Returns:
        The ``pages`` attribute of the analysis result.
    """
    print("formrecognizer_document_analysis_client(): start: file name: ", file_name)

    # Only the submission needs the open stream; the poller is awaited after
    # the file has been closed.
    with open(file_name, "rb") as pdf_stream:
        analysis_poller = document_analysis_client.begin_analyze_document(
            "prebuilt-read",
            document=pdf_stream,
        )

    analysis = analysis_poller.result()
    print("formrecognizer_document_analysis_client(): end")
    return analysis.pages

#### Put page contents into a JSON structure and write it to a JSON file so that we can use it later.

In [12]:
import json

def write_page_content_to_json_file(orig_file, formrecognize_result, output_json_file_folder):
    """Flatten Form Recognizer pages to text and save them as one JSON file.

    Each page becomes a dict with the keys ``filename``, ``page_number`` and
    ``page_content``; ``page_content`` is every word of every line on the page
    joined by single spaces. The list of page dicts is written to
    ``<output_json_file_folder>/<orig_file>_formrecognized.json`` so that
    Form Recognizer does not need to be called again for the same document.

    Args:
        orig_file: File name of the analyzed document. It is stored in every
            page record and used as the prefix of the output file name.
        formrecognize_result: Iterable of page objects exposing
            ``page_number`` and ``lines``, where each line provides
            ``get_words()`` returning objects with a ``content`` attribute.
        output_json_file_folder: Folder that receives the JSON file. It is
            created if it does not exist yet.

    Returns:
        The path of the JSON file that was written, or ``None`` when
        ``formrecognize_result`` is empty or ``None`` and nothing was written.
    """
    print("write_page_content_to_json_file(): start")
    print("write_page_content_to_json_file(): orig_file file name: ", orig_file)
    print("write_page_content_to_json_file(): output_json_file_folder: ", output_json_file_folder)

    if not formrecognize_result:
        # TODO: more error handling ...
        print("write_page_content_to_json_file(): error: form recognizer returned no pages")
        return None

    # One record per page: words are joined into lines, lines into the page text.
    page_content = [
        {
            'filename': orig_file,
            'page_number': page.page_number,
            'page_content': ' '.join(
                ' '.join(word.content for word in line.get_words())
                for line in page.lines
            ),
        }
        for page in formrecognize_result
    ]

    # The output name keeps the original file name (including its extension)
    # as a prefix, e.g. "paper.pdf_formrecognized.json".
    output_filename = os.path.join(output_json_file_folder, orig_file + '_formrecognized.json')

    # An empty folder string means "current directory"; os.makedirs('') would raise.
    if output_json_file_folder:
        os.makedirs(output_json_file_folder, exist_ok=True)

    # Explicit encoding keeps the file identical across platforms.
    with open(output_filename, 'w', encoding='utf-8') as json_file:
        json.dump(page_content, json_file, indent=4)

    print(f"write_page_content_to_json_file(): JSON data has been saved to {output_filename}")
    print("write_page_content_to_json_file(): end")
    return output_filename

#### Enter the path to the PDF file folder.
For example:
* './data_source/arxiv.org/AI'
* './data_source/arxiv.org/math'
* './data_source/arxiv.org/physcis'

In [16]:
import os

# Driver cell: analyze every PDF in a user-supplied folder with Form Recognizer
# and store each document's page text as JSON in a subfolder of that folder.
print("Data preparation >>>")
# Ask the user to provide the folder path
folder_path = input('Enter the path to the folder: ')

# Name of the subfolder that receives the JSON files; also used in the log
# messages below so they always match the folder that is actually created.
output_subfolder_name = 'formrecognized_output'

# The path must be an existing directory: os.listdir() below fails on a
# regular file even though os.path.exists() would accept it.
if not os.path.isdir(folder_path):
    print(f'The folder path "{folder_path}" does not exist or is not a folder.')
else:
    # Create the output subfolder if it doesn't exist
    output_folder_path = os.path.join(folder_path, output_subfolder_name)

    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)
        print(f'Subfolder "{output_subfolder_name}" created under {folder_path}')
    else:
        print(f'Subfolder "{output_subfolder_name}" already exists under {folder_path}')
    print(f'Subfolder "{output_subfolder_name}" folder path {output_folder_path}')

    # Loop through the files in the folder; sorted so that runs are
    # reproducible (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(folder_path)):
        # Only PDF files are processed (extension check is case-insensitive)
        if not filename.lower().endswith('.pdf'):
            continue

        orig_file_with_full_path = os.path.join(folder_path, filename)
        print(f'Found PDF file: {orig_file_with_full_path}')

        # 1. Call formrecognizer to process pdf document
        formrecognize_result = formrecognizer_document_analysis_client(orig_file_with_full_path)

        # 2. Write page content into a json file for later use, like embedding, summary, ...
        write_page_content_to_json_file(filename, formrecognize_result, output_folder_path)
print("Data preparation is done! <<<")

Data preparation >>>


Subfolder "output_json_file" already exists under ./data_source/arxiv.org/physcis
Subfolder "output_json_file" folder path ./data_source/arxiv.org/physcis\formrecongnized_output
Found PDF file: ./data_source/arxiv.org/physcis\2111.07895.pdf
formrecognizer_document_analysis_client(): start: file name:  ./data_source/arxiv.org/physcis\2111.07895.pdf
formrecognizer_document_analysis_client(): end
write_page_content_to_json_file(): start
write_page_content_to_json_file(): orig_file file name:  2111.07895.pdf
write_page_content_to_json_file(): output_json_file_folder:  ./data_source/arxiv.org/physcis\formrecongnized_output
write_page_content_to_json_file(): JSON data has been saved to ./data_source/arxiv.org/physcis\formrecongnized_output\2111.07895.pdf_formrecognized.json
write_page_content_to_json_file(): end
Found PDF file: ./data_source/arxiv.org/physcis\2207.00634.pdf
formrecognizer_document_analysis_client(): start: file name:  ./data_source/arxiv.org/physcis\2207.00634.pdf
formrecogn