# Installs and Imports

In [None]:
!pip install openai





In [None]:
#utilities
import os
import time
import json
import tqdm
from tqdm import tqdm

#data handling
import pandas as pd
import numpy as np

#deep learning
import torch
print(torch.__version__)
print(torch.cuda.is_available())

#llm
import openai
from openai import OpenAI



2.5.1+cu121
False


# Utils/Helper Functions

Input Format - Unless noted otherwise, the helper functions below expect the same nested input JSON (the model's output):

- **First Level**: Keys represent document names, with values as dictionaries of entity extractions.
- **Second Level**: Keys represent entity names, with values as the model's extraction.

---

Workflow Steps

1. **Load OCR JSON**  
   Load the nested JSON file containing entity extractions.

2. **Check Keys**  
   Ensure all columns (keys) are present across documents and have uniform naming conventions and expected JSON structures.

3. **Check for Hallucination**  
   Ensure the extractions are actually present in the OCR output.

4. **Flatten**  
   Transform the nested structure into a flattened format.

5. **Convert to DataFrame**  
   Convert the flattened data into a structured DataFrame for further analysis.


In [None]:
def load_ocr_json(path):
    """
    Loads an OCR JSON file and splits it into document names and their OCR texts.

    Parameters:
    - path (str): The path to the JSON file containing OCR data. The top-level JSON object
                  maps document names (keys) to the extracted OCR text (values).

    Returns:
    - document_names (list): The keys of the JSON object, in file order.
    - texts (list): The values of the JSON object, aligned index-for-index with document_names.

    Raises:
    - OSError: If the file cannot be opened.
    - json.JSONDecodeError: If the file does not contain valid JSON.
    """

    # JSON text is UTF-8 by specification. Passing the encoding explicitly keeps
    # non-ASCII OCR characters intact regardless of the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        ocr_by_document = json.load(f)

    # dicts preserve insertion order, so both lists stay aligned with the file.
    document_names = list(ocr_by_document.keys())
    texts = list(ocr_by_document.values())

    return document_names, texts



In [None]:
#Twin Functions:

    #1. The first flattens the keys of a nested JSON and simply returns the flattened key structure.
    #2. The second calls the first to get the flattened key names, then checks whether all records in the JSON have the same columns and structure.
      #It is used to check the output of an LLM, ensuring that all requested entities and their nested entities are present and that their column names are uniform.

def get_flattened_keys(json, parent_key=""):
    """
    Collects the flattened key names of a nested dictionary.

    Nested keys are joined to their parents with underscores, e.g. {"a": {"b": 1}} -> {"a_b"}.
    Only leaf (non-dict) values contribute a key; an empty nested dict contributes nothing.

    Parameters:
    - json (dict): The nested dictionary to process.
    - parent_key (str, optional): Prefix prepended to every key. Defaults to an empty string.

    Returns:
    - keys (set): The set of flattened key names.
    """

    flattened = set()

    # Explicit work stack of (prefix, mapping) pairs instead of recursion.
    pending = [(parent_key, json)]
    while pending:
        prefix, mapping = pending.pop()
        for name, child in mapping.items():
            path = name if not prefix else f"{prefix}_{name}"
            if isinstance(child, dict):
                pending.append((path, child))
            else:
                flattened.add(path)

    return flattened


def check_keys(json):
    """
    Reports whether every document in the model's output shares the same flattened key structure.

    Parameters:
    - json (dict): The model's output nested JSON. Each key is a document name and each value
                   is that document's (possibly nested) dictionary of entity extractions.

    Returns:
    - None: Results are printed to the console.

    Workflow:
    1. Takes each document's entity dictionary in order.
    2. Uses the flattened keys of the first document as the baseline.
    3. Compares every document's flattened keys against the baseline.
    4. Prints each mismatch (zero-based document position, expected keys, found keys)
       followed by an overall summary.

    Example:
    >>> data = {
    ...     "doc1": {"page1": {"text": "Hello"}, "page2": {"text": "World"}},
    ...     "doc2": {"page1": {"text": "Hello"}, "page2": {"words": 50}}
    ... }
    >>> check_keys(data)  # set ordering in the printed output may vary
    Document 1 has different keys.
    Expected keys: {'page1_text', 'page2_text'}
    Found keys: {'page1_text', 'page2_words'}
    Not all documents have the same keys.
    """

    # One entry per document, document names dropped.
    records = list(json.values())

    if not records:
        print("No documents found in the JSON file.")
        return

    base_keys = get_flattened_keys(records[0])

    mismatch_found = False
    for position, record in enumerate(records):
        entry_keys = get_flattened_keys(record)
        if entry_keys == base_keys:
            continue
        mismatch_found = True
        print(f"Document {position} has different keys.")
        print(f"Expected keys: {base_keys}")
        print(f"Found keys: {entry_keys}")

    if mismatch_found:
        print("Not all documents have the same keys.")
    else:
        print("All documents have the same keys.")
        print("Baseline keys:", base_keys)



In [None]:
#These two functions handle the model's nested output JSON and prepare it for conversion into a DataFrame:
  #1. The rec_flatten function takes a single nested JSON structure and flattens it.
  #2. The flatten_json function takes the model's output JSON, calls rec_flatten on each document's nested JSON, and returns a list of flattened dictionaries.

def rec_flatten(single_nested_json, parent_key='', sep='_'):
    """
    Recursively flattens a nested dictionary, concatenating parent and child keys using a separator.

    Parameters:
    - single_nested_json (dict): A single nested JSON (dictionary) to be flattened.
    - parent_key (str, optional): The base key to prepend to every key. Defaults to an empty string.
    - sep (str, optional): The separator used to concatenate keys. Defaults to an underscore ('_').

    Returns:
    - dict: A flattened dictionary whose keys are the concatenated key paths. Key order follows
            a depth-first walk of the input, so it matches the order of the nested document.
    """

    flat = {}
    for key, value in single_nested_json.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # Nested dictionary: flatten it under the extended key path.
            flat.update(rec_flatten(value, new_key, sep=sep))
        else:
            flat[new_key] = value

    return flat


def flatten_json(json, parent_key='', sep='_'):
    """
    Takes the model's outputted nested JSON, flattens each document's nested JSON by calling
    rec_flatten, and outputs a list of flattened dictionaries - one for each document.

    Args:
    json (dict): The model's outputted nested JSON (document name -> nested entity dictionary).
    parent_key (str): The base key to prepend to every flattened key. Default is ''.
    sep (str): The separator to use between parent and child keys. Default is '_'.

    Returns:
    list: list of flattened JSONs (dictionaries), in the same order as the input documents.
    """

    # parent_key and sep are forwarded so that non-default values take effect;
    # previously they were accepted but silently ignored.
    return [rec_flatten(record, parent_key, sep) for record in json.values()]

In [None]:
#This function is a check for hallucination. It compares the model output against the corresponding document's OCR text
# to verify that the extracted entities are actually present in the document.


def check_hallucination(ocr_json, entity_json):
    """
    Checks for hallucinations by verifying whether the entities extracted by the LLM actually
    appear in the OCR output. Each document's extracted entities are compared with that
    document's OCR text (case-insensitive substring match).

    Parameters:
    - ocr_json (dict): The OCR outputs. Keys are document names and values are the OCR texts.
    - entity_json (dict): The model's outputted JSON. First-level keys are document names,
                          second-level keys are entity names whose values are the extractions
                          (possibly nested one level further, e.g. Borrower -> First/Last).

    Returns:
    - logs (list): Exactly one dictionary per document in entity_json. Each log holds the
                   document name under 'Document' and, per flattened entity, one of
                   'N/A', 'Found' or 'Not Found'. For 'Not Found' entities the model's
                   extraction is stored under '<entity>_Entity' for analysis.

    Raises:
    - KeyError: If a document in entity_json has no entry in ocr_json.

    Workflow:
    1. Iterates through entity_json to retrieve each document's extracted entities.
    2. Flattens the entities and checks each value against the lowered OCR text.
    3. Logs the result for each entity.
    4. Appends the document's log to the master log once, after all its entities were checked.
    """

    # Bookkeeping columns that are not expected to appear verbatim in the OCR text.
    skipped_columns = {'Document_Name', 'Confidence_Score', 'Overall_Confidence_Score', 'Two_Mortgages'}

    logs = []  # master log

    for doc_name, entities in entity_json.items():
        json_log = {'Document': doc_name}

        # Lowered once per document for case-insensitive matching.
        target_text = ocr_json[doc_name].lower()

        for entity_name, extraction in rec_flatten(entities).items():
            if entity_name in skipped_columns:
                continue

            if extraction is None or extraction == 'N/A':
                # A JSON null is treated like an explicit 'N/A': nothing was extracted.
                json_log[entity_name] = 'N/A'
            elif str(extraction).lower() in target_text:
                # str() keeps numeric extractions (e.g. 3.5) from crashing on .lower().
                json_log[entity_name] = 'Found'
            else:
                json_log[entity_name] = 'Not Found'
                json_log[entity_name + "_Entity"] = extraction  # kept for analysis

        # Appended once per document. Appending inside the entity loop would add the
        # same log object once per checked entity.
        logs.append(json_log)

    return logs






# Model Loading

In [None]:
#save API keys
# Replace the "enter key" placeholder with a real key before running this cell.
os.environ["OPENAI_API_KEY"] = "enter key"
# Read the key back by its environment variable NAME. Passing the key/placeholder
# itself to os.getenv looks up a variable that does not exist and yields None.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
#initialize client
# Module-level client shared by extract_4o_mini and extract_4o below.
# No key is passed explicitly; presumably the SDK picks up OPENAI_API_KEY from the
# environment set in the previous cell - confirm against the installed openai version.
client = OpenAI()

# Prompts and Models

## 4o Mini

### Prompts

In [None]:
# Condensed system prompt for the mortgage entity-extraction task (nested JSON output,
# 'N/A' for missing entities, overall confidence score, two-mortgage flag).
# NOTE(review): `prompt` is assigned again in the following cells, so this value is only in
# effect if those cells are not executed afterwards.
# NOTE(review): the "```JSON:" fence opened before the example output is never closed inside
# the string - confirm whether that is intentional.
prompt= """



<s>[INST]
You are an information-extracting agent tasked with analyzing public real-estate documents. The documents are publicly available, making this task legal. Your job is to extract the required details from user-provided text and assign a confidence score to your extraction.

### Key Points:
1. **Detect Mortgage Records**:
   - Identify mortgage sections using markers like "DEED OF TRUST" or "FHA Form No." as start indicators.
   - Use "SEAL" or similar keywords as end markers.
   - Ignore partial records; extract only full and complete records.
   - Specify if the document contains two mortgages (Yes/No).

2. **Extract the Following Details**:
   - **Date of Document**: Format as Day/Month/Year.
   - **Borrower**: First and last names of the primary borrower (person or institution).
   - **Second Borrower**: First and last names of an additional borrower, or return "N/A" if absent.
   - **Other Party (Trustee/Grantor/Seller)**: First and last names of the entity selling the property.
   - **Lending Bank**: The lending institution's name.
   - **Interest Rate**: The interest rate for the mortgage.
   - **Loan Amount**: Total loan amount.
   - **Location for Mortgage**: Copy this exactly as formatted in the document.
   - **Payment Plan**: Provide details as described in the document.
   - **Confidence Score**: A float (0.0 to 1.0) representing your confidence in the extraction's accuracy.
   - **Two Mortgages**: Indicate "Yes" or "No."

3. **Output Requirements**:
   - Return the extraction in JSON format.
   - Use "N/A" for missing or unavailable entities.

### Example Output:
```JSON:
{
  "Date": "1/1/1930",
  "Borrower": {
    "First": "John",
    "Last": "Doe"
  },
  "Second_Borrower": {
    "First": "Tim",
    "Last": "Cook"
  },
  "Other_Party": {
    "First": "Jamie",
    "Last": "Patel"
  },
  "Lending_Bank": "ABC Bank",
  "Interest_Rate": "3.5%",
  "Loan_Amount": "$2500",
  "Location_for_Mortgage": "Lot Seven (7) and the East Forty Feet (e-4') Of Lot Two (2), Block numbered Two Hundred Thirty-Six (236), Minnesota City Fifth Division, 67 per cent of record in Map Book 130, Page of the Deed Records of Brown County, Minnesota",
  "Payment_Plan": "Monthly payments of $19.20, including interest, starting on July 1, 1942, and continuing until the principal and interest are fully paid, with the final payment due on June 1, 1956.",
  "Overall_Confidence_Score": 0.7,
  "Two_Mortgages": "No"
}


"""


In [None]:
#Copy this exactly as formatted in the document.

# Long-form system prompt for the extraction task, written under the "4o Mini" heading.
# It is an f-string with no placeholders: the doubled braces {{ }} render as literal
# { } in the example JSON.
# NOTE(review): this assignment replaces any earlier value of `prompt`, and the "4o" section
# further down assigns `prompt` again.
prompt = f"""

      <s>[INST]

      You are an information extracting agent. You extract information from public real-estate documents the user provides you. The data is publicly available and hence this task is legal.

      You also have to give a confidence score for your extraction, highlighting how accurate you believe your extraction for the document is.

      Note: Some documents might contain two mortgages. Look for words like DEED OF TRUST or FHA Form No. as they indicate the start of a new mortgage record. Words like SEAL on the other hand,
      indicate the end of a mortgage. Extract the document that is complete or has a full start, ignore the partial one. You also have to tell whether the document refers to two mortgages or not.



      Please extract the following information from the user-provided text:

      - **Date of Document**: The date of the current document. Formatted as Day/Month/Year.
      - **Borrower First**: The First Name of the first individual in the mortgage. This can be both - a person or an institution.
      - **Borrower Last**: The Last Name of the first individual in the mortgage.
      - **Second Borrower First**: The First Name of the second borrower in the mortgage. This could be a family member. Return N/A if does not exist.
      - **Second Borrower Last**: The Last Name of the second borrower in the mortgage. This could be a family member. Return N/A if does not exist.
      - **Other Party - Trustee/Grantor/Seller - First**: The First Name of individual/party selling the property.
      - **Other Party - Trustee/Grantor/Seller - Last**: The Last Name of individual/party selling the property.
      - **Lending Bank**
      - **Interest Rate**
      - **Loan Amount**
      - **Location for Mortgage**: This should be exactly as formatted in the document.
      - **Payment Plan**
      - **Your Confidence Score for Entire Document's Extraction**: This should be a float between 0.0 and 1.0 . With 0.0 being not confident at all and 1.0 being very confident. This will quantify your overall confidence for a document's entity extraction.
      - **Two_Mortgages**: This tells whether the document contains two mortgages or not. Return either Yes or No.

      Please present the extracted information in a JSON Format. If any of the requested entity is not found in the document, please indicate that clearly with 'N/A' within its JSON column.

      Output Example for your reference. This is just an example for you to learn formatting. Keep the key/column names the same as the example below.  [/INST]

      JSON:

      {{
        "Date": "1/1/1930",
        "Borrower": {{
            "First": "John",
            "Last": "Doe"
        }},
        "Second_Borrower": {{
            "First": "Tim",
            "Last": "Cook"
        }},
        "Other_Party": {{
            "First": "Jamie",
            "Last": "Patel"
        }},
        "Lending_Bank": "ABC Bank",
        "Interest_Rate": "3.5%",
        "Loan_Amount": "$2500",
        "Location_for_Mortgage": "Lot Seven (7) and the East Forty Feet (e-4') Of Lot Two (2), Block numbered Two Hundred Thirty-Six (236), Minnesota City Fifth Division, 67 per cent of record in Map Book 130, Page of the Deed Records of Brown County, Minnesota",
        "Payment_Plan": "Monthly payments of $19.20, including interest, starting on July 1, 1942, and continuing until the principal and interest are fully paid, with the final payment due on June 1, 1956.",
        "Overall_Confidence_Score": "0.7",
        "Two_Mortgages": "No"
 }}

       Please return only the JSON object. </s>

    """

### Running Function

In [None]:
#this function will pass the prompt and ocr to the LLM. one call processes one document.
def extract_4o_mini(text, prompt):
    """
    Processes a single mortgage document using the GPT-4o-mini model.

    Sends the given OCR text and the system prompt to the model and returns the extracted
    information as a dictionary when the response is valid JSON.

    The message history is rebuilt for every call, so the model has no access to previously
    processed documents. Each input is its own context.

    Parameters:
        text (str): The OCR text to be processed.
        prompt (str): The system prompt defining the task and expected output format.

    Workflow:
    1. Send the prompt and text to the GPT-4o-mini model.
    2. Take the content of the first choice of the response.
    3. Remove a surrounding Markdown code fence (``` / ```json / ```JSON), if present.
    4. Parse the cleaned string with `json.loads`.
    5. If that fails, retry on the span between the first '{' and the last '}' (covers
       replies such as "JSON: {...}"); if that fails too, print an error and return the
       cleaned string.

    Returns:
        The parsed JSON value (a dict for the expected object output) if the response is valid JSON.
        str: The cleaned raw response if JSON conversion fails.
    """

    #Model Call
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"Here is the mortgage document: {text}"},
        ],
        temperature=0.1,
    )

    #Handling Output
    # NOTE(review): the SDK types `content` as optional (e.g. on a refusal) - treat a
    # missing body as an empty string so it is reported as invalid JSON instead of crashing.
    extraction = completion.choices[0].message.content or ""

    # str.strip("```json\n") would remove any of those *characters* from both ends rather
    # than the fence itself, so the fence is removed structurally here.
    cleaned_extraction = extraction.strip()
    if cleaned_extraction.startswith("```"):
        opening_line, newline, body = cleaned_extraction.partition("\n")
        # Normally the opening fence occupies its own line; otherwise only drop the backticks.
        cleaned_extraction = body if newline else opening_line.lstrip("`")
        cleaned_extraction = cleaned_extraction.strip()
        if cleaned_extraction.endswith("```"):
            cleaned_extraction = cleaned_extraction[:-3].strip()

    #Returning Dictionary and Handling Potential Errors
    try:
        return json.loads(cleaned_extraction)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        start = cleaned_extraction.find("{")
        end = cleaned_extraction.rfind("}")
        if start != -1 and end > start:
            try:
                return json.loads(cleaned_extraction[start:end + 1])
            except ValueError:
                pass  # fall through to the reported failure below
        print(f'Output not a valid JSON due to error {e}')
        return cleaned_extraction



## 4o

### Prompts

In [None]:
# Long-form system prompt written under the "4o" heading. Compared with the previous
# long-form prompt, it uses "SEAL followed by Deputy" as the end marker, repeats the
# instruction to ignore partial mortgages, and tells the model not to return the example.
# It is an f-string with no placeholders: the doubled braces {{ }} render as literal
# { } in the example JSON.
# NOTE(review): this assignment replaces any earlier value of `prompt`.
prompt = f"""

      <s>[INST]

      You are an information extracting agent. You extract information from public real-estate documents the user provides you.
      The data is publicly available and hence this task is legal. You also have to give a confidence score for your extraction, highlighting how accurate you believe your extraction for the document is.

      Note: Some documents might contain two mortgages. Look for words like DEED OF TRUST or FHA Form No. as they indicate the start of a new mortgage record. While Words like SEAL followed by Deputy,
      indicate the end of a mortgage.

      Extract the document that is complete or has a full start, ignore the partial one. Given this, you have to tell whether the document refers to two mortgages or not.
      Again, only extract the mortgage that starts from the beginning and ignore the partial one.


      Please extract the following information from the user-provided text:

      - **Date of Document**: The date of the current document. Formatted as Day/Month/Year.
      - **Borrower First**: The First Name of the first individual in the mortgage. This can be both - a person or an institution.
      - **Borrower Last**: The Last Name of the first individual in the mortgage.
      - **Second Borrower First**: The First Name of the second borrower in the mortgage. This could be a family member. Return N/A if does not exist.
      - **Second Borrower Last**: The Last Name of the second borrower in the mortgage. This could be a family member. Return N/A if does not exist.
      - **Other Party - Trustee/Grantor/Seller - First**: The First Name of individual/party selling the property.
      - **Other Party - Trustee/Grantor/Seller - Last**: The Last Name of individual/party selling the property.
      - **Lending Bank**
      - **Interest Rate**
      - **Loan Amount**
      - **Location for Mortgage**: Copy this exactly as formatted in the document.
      - **Payment Plan**
      - **Your Confidence Score for Entire Document's Extraction**: This should be a float between 0.0 and 1.0 . With 0.0 being not confident at all and 1.0 being very confident. This will quantify your overall confidence for a document's entity extraction.
      - **Two_Mortgages**: This tells whether the document contains two mortgages or not. Return either Yes or No.


      Please present the extracted information in a JSON Format. If any of the requested information is not found in the document, please indicate that clearly with 'N/A' within its JSON column.

      Output Example for your reference. This is just an example for you to learn formatting. Do not return this example. Keep the key/column names the same as the example below. [/INST]

      JSON:


      {{
        "Date": "1/1/1930",
        "Borrower": {{
            "First": "John",
            "Last": "Doe"
        }},
        "Second_Borrower": {{
            "First": "Tim",
            "Last": "Cook"
        }},
        "Other_Party": {{
            "First": "Jamie",
            "Last": "Patel"
        }},
        "Lending_Bank": "ABC Bank",
        "Interest_Rate": "3.5%",
        "Loan_Amount": "$2500",
        "Location_for_Mortgage": "Lot Seven (7) and the East Forty Feet (e-4') Of Lot Two (2), Block numbered Two Hundred Thirty-Six (236), Minnesota City Fifth Division, 67 per cent of record in Map Book 130, Page of the Deed Records of Brown County, Minnesota",
        "Payment_Plan": "Monthly payments of $19.20, including interest, starting on July 1, 1942, and continuing until the principal and interest are fully paid, with the final payment due on June 1, 1956.",
        "Overall_Confidence_Score": "0.7",
        "Two_Mortgages": "No"
        }}


     Please return only the JSON object. </s>



    """

### Running Function

In [None]:
def extract_4o(text, prompt):
    """
    Processes a single mortgage document using the GPT-4o model.

    Sends the given OCR text and the system prompt to the model and returns the extracted
    information as a dictionary when the response is valid JSON.

    The message history is rebuilt for every call, so the model has no access to previously
    processed documents. Each input is its own context.

    Parameters:
        text (str): The OCR text to be processed.
        prompt (str): The system prompt defining the task and expected output format.

    Workflow:
    1. Send the prompt and text to the GPT-4o model.
    2. Take the content of the first choice of the response.
    3. Remove a surrounding Markdown code fence (``` / ```json / ```JSON), if present.
    4. Parse the cleaned string with `json.loads`.
    5. If that fails, retry on the span between the first '{' and the last '}' (covers
       replies such as "JSON: {...}"); if that fails too, print an error and return the
       cleaned string.

    Returns:
        The parsed JSON value (a dict for the expected object output) if the response is valid JSON.
        str: The cleaned raw response if JSON conversion fails.
    """

    #Model Call
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"Here is the mortgage document: {text}"},
        ],
        temperature=0.1,
    )

    #Processing Output
    # NOTE(review): the SDK types `content` as optional (e.g. on a refusal) - treat a
    # missing body as an empty string so it is reported as invalid JSON instead of crashing.
    extracted_data = completion.choices[0].message.content or ""

    # str.strip("```JSON\n") would remove any of those *characters* from both ends rather
    # than the fence itself, so the fence is removed structurally here.
    cleaned_extraction = extracted_data.strip()
    if cleaned_extraction.startswith("```"):
        opening_line, newline, body = cleaned_extraction.partition("\n")
        # Normally the opening fence occupies its own line; otherwise only drop the backticks.
        cleaned_extraction = body if newline else opening_line.lstrip("`")
        cleaned_extraction = cleaned_extraction.strip()
        if cleaned_extraction.endswith("```"):
            cleaned_extraction = cleaned_extraction[:-3].strip()

    #Converting to Dictionary and Handling Errors
    try:
        return json.loads(cleaned_extraction)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        start = cleaned_extraction.find("{")
        end = cleaned_extraction.rfind("}")
        if start != -1 and end > start:
            try:
                return json.loads(cleaned_extraction[start:end + 1])
            except ValueError:
                pass  # fall through to the reported failure below
        # f-prefix added: the message previously printed the literal text "{e}".
        print(f'Output not a valid JSON due to error {e}')
        return cleaned_extraction



# Extract Entities

In [None]:
#This function runs all the OCR inputs through the model and loads all extractions into a single master JSON.


def get_output_json(document_names, texts, model, system_prompt=None):
    """
    Runs all the OCR inputs through a chosen extraction function and loads all extractions
    into a single master JSON. Invalid outputs are collected separately and returned too.

    Parameters:
    - document_names (list): A list of document names corresponding to the OCR texts.
    - texts (list): A list of OCR texts, one for each document.
    - model (function): The extraction function to use (e.g. extract_4o). It is called as
                        model(text, prompt) and should return a dict on success.
    - system_prompt (str, optional): The prompt passed to the model. When omitted, the
                        notebook-level `prompt` variable is used, as before.

    Returns:
    - output_json (dict): The output for all successfully processed documents. This is a
                          nested JSON with 2 levels:

                              Level 1: Document Names as Keys, Nested JSON as Value.
                              Level 2: Entity Names as Keys, Extractions as Value.

                              {
                                "doc_1.tif": {
                                  "Borrower": {"First": "John", "Last": "Doe"},
                                  "Interest_Rate": "3.5%",
                                  "Document_Name": "doc_1.tif"
                                },
                                "doc_2.tif": { ... }
                              }

    - jsons (list): The raw value returned by the model for each document, in input order.
    - failed_jsons (list): One dictionary per document whose output was not a JSON object,
                           holding the document name and the returned text. For debugging.

    Workflow:
    1. Iterates over the OCR texts and document names, processing each with the provided model.
    2. If the model returned a dictionary, adds the document's name to it and stores it in
       the master JSON under that name.
    3. Otherwise records the document name and the returned value in `failed_jsons`.
    4. Logs progress per document and the total run time.
    """

    if system_prompt is None:
        # Backward-compatible default: fall back to the prompt defined at notebook level.
        system_prompt = prompt

    start_time = time.time()

    #Storage
    jsons = []  #raw model output
    failed_jsons = []  #fallback for debugging
    output_json = {}  #final output JSON for a batch

    #Running the LLM
    progress = tqdm(zip(texts, document_names), total=len(document_names), desc='Document')
    for doc_number, (text, name) in enumerate(progress, start=1):
        json_data = model(text, system_prompt)  #Run the model
        jsons.append(json_data)  #Append output

        if isinstance(json_data, dict):
            #Handling Valid JSONs
            json_data['Document_Name'] = name  #Add name feature to each document's JSON
            output_json[name] = json_data  #Add document to the master JSON with key as document_name
            print(f'         --------Doc {doc_number}')
        else:
            #Handling Invalid JSONs (the extraction functions return the raw string on failure)
            failed_jsons.append({'Document_Name': name, 'Extracted_Text': json_data})
            # doc_number is the document's position in the batch, so the failure message
            # points at the document that actually failed.
            print(f'Doc {doc_number} Failed')
            print(f"Skipping document '{name}' as it is not valid JSON format.")

    end_time = time.time()
    print(f'Model run time for {len(document_names)} documents: {end_time - start_time} seconds.')

    return output_json, jsons, failed_jsons


# Main

In [None]:
# End-to-end pipeline for one batch: load OCR -> extract with the LLM -> verify key
# structure -> save JSON -> flatten -> DataFrame -> CSV.
# NOTE(review): `failed_jsons` is returned but not inspected in this cell, and
# check_hallucination is not called here.

#1. Load OCR JSON
names, texts = load_ocr_json("/content/output_json__two_25_2.json") #call helper function to get the document names and texts


#2. Extract Entities by Running LLM
output_json, jsons, failed_jsons = get_output_json(names, texts, extract_4o) #passes all OCR JSONS sequentially through the model and collects entity JSONS


#3. Verify Column Names and JSON Structure
check_keys(output_json) # prints the comparison result; it does not stop the pipeline on a mismatch


#4. Save Output JSON Locally - Post Verification
with open('output_json.json', 'w') as f:
    json.dump(output_json, f, indent = 4)


#5. Flatten Output JSON
flattened_output = flatten_json(output_json) # list with one flat dict per document


#6. Convert to DataFrame Object
df = pd.DataFrame(flattened_output) # one row per document, one column per flattened key


#7. Save as CSV
df.to_csv('output_csv.csv', index = False)



Document:   4%|▍         | 1/25 [00:04<01:41,  4.22s/it]

         --------Doc 1


Document:   8%|▊         | 2/25 [00:08<01:39,  4.32s/it]

         --------Doc 2


Document:  12%|█▏        | 3/25 [00:12<01:30,  4.14s/it]

         --------Doc 3


Document:  16%|█▌        | 4/25 [00:15<01:17,  3.70s/it]

         --------Doc 4


Document:  20%|██        | 5/25 [00:19<01:12,  3.64s/it]

         --------Doc 5


Document:  24%|██▍       | 6/25 [00:22<01:07,  3.55s/it]

         --------Doc 6


Document:  28%|██▊       | 7/25 [00:26<01:08,  3.78s/it]

         --------Doc 7


Document:  32%|███▏      | 8/25 [00:29<00:59,  3.49s/it]

         --------Doc 8


Document:  36%|███▌      | 9/25 [00:37<01:19,  4.98s/it]

         --------Doc 9


Document:  40%|████      | 10/25 [00:53<02:05,  8.40s/it]

         --------Doc 10


Document:  44%|████▍     | 11/25 [01:09<02:29, 10.68s/it]

         --------Doc 11


Document:  48%|████▊     | 12/25 [01:14<01:54,  8.77s/it]

         --------Doc 12


Document:  52%|█████▏    | 13/25 [01:19<01:32,  7.68s/it]

         --------Doc 13


Document:  56%|█████▌    | 14/25 [01:26<01:21,  7.42s/it]

         --------Doc 14


Document:  60%|██████    | 15/25 [01:43<01:45, 10.52s/it]

         --------Doc 15


Document:  64%|██████▍   | 16/25 [01:57<01:43, 11.52s/it]

         --------Doc 16


Document:  68%|██████▊   | 17/25 [02:02<01:15,  9.49s/it]

         --------Doc 17


Document:  72%|███████▏  | 18/25 [02:08<00:59,  8.56s/it]

         --------Doc 18


Document:  76%|███████▌  | 19/25 [02:13<00:44,  7.50s/it]

         --------Doc 19


Document:  80%|████████  | 20/25 [02:22<00:39,  7.85s/it]

         --------Doc 20


Document:  84%|████████▍ | 21/25 [02:32<00:34,  8.58s/it]

         --------Doc 21


Document:  88%|████████▊ | 22/25 [02:37<00:21,  7.33s/it]

         --------Doc 22


Document:  92%|█████████▏| 23/25 [02:42<00:13,  6.64s/it]

         --------Doc 23


Document:  96%|█████████▌| 24/25 [02:52<00:07,  7.58s/it]

         --------Doc 24


Document: 100%|██████████| 25/25 [02:58<00:00,  7.15s/it]

         --------Doc 25
Model run time for 25 documents: 178.70416808128357 seconds.
All documents have the same keys.
Baseline keys: {'Loan_Amount', 'Two_Mortgages', 'Second_Borrower_Last', 'Overall_Confidence_Score', 'Document_Name', 'Interest_Rate', 'Other_Party_First', 'Second_Borrower_First', 'Lending_Bank', 'Location_for_Mortgage', 'Date', 'Payment_Plan', 'Other_Party_Last', 'Borrower_First', 'Borrower_Last'}



