Import Required Libraries and Load Configuration Files

In [1]:
# # Import required libraries  
import os  
import base64
from pathlib import Path
from shutil import rmtree
from requests import get, post
import json
import time
import copy 
from datetime import datetime, timedelta  
import pdfkit
from langchain.text_splitter import TokenTextSplitter, MarkdownHeaderTextSplitter


from openai import AzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

In [10]:
# Load the configuration details for the Azure services used in this notebook
# (Azure OpenAI, Document Intelligence, data folder).
# NOTE: credentials should be secured using a more secure method such as Azure Key Vault.
# The file is opened in a context manager so the handle is closed right after parsing.
with open("config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

Create Azure OpenAI and Document Intelligence (Pre-Built Layout) Instances. Set Chunking Config to split on Markdown Headers

In [11]:
# Azure OpenAI settings (all read from config.json)
api_base = config["openai_api_base"]
api_key = config["openai_api_key"]
openai_api_version = config["openai_api_version"]
embeddings_model = config["openai_embedding_model"]
gpt_model = config["openai_gpt_model"]

# Doc Intelligence Config
di_endpoint = config["doc_intelligence_endpoint"]
di_apim_key = config["doc_intelligence_apim_key"]
# Headers for the REST analyze call: the raw PDF bytes are sent as the request body
di_headers = {
    'Content-Type': 'application/pdf',
    'Ocp-Apim-Subscription-Key': di_apim_key,
}
# Strip any trailing "/" before joining so the URL is well-formed whether or not
# the configured endpoint ends with a slash.
di_post_url = di_endpoint.rstrip("/") + "/documentintelligence/documentModels/prebuilt-layout:analyze?api-version=2023-10-31-preview&stringIndexType=utf16CodeUnit&outputContentFormat=markdown"
credential = DefaultAzureCredential()

# Chunking Config: token-based splitter plus a splitter keyed on Markdown headers
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=52)
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)


# Azure OpenAI client, authenticated with the API key loaded from config.json above
client = AzureOpenAI(
    api_version=openai_api_version,
    azure_endpoint=api_base,
    api_key=api_key)

# Document Intelligence SDK client, authenticated through DefaultAzureCredential
# (the REST calls built from di_post_url use the subscription key in di_headers instead)
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=di_endpoint,
    credential=credential
)

Load Files in 'Data' Folder, Test Layout Analysis on cleared_ex2.pdf

In [12]:
# Root folder that is scanned for input documents; the value comes from config.json
data_root_dir = config["data_root_dir"]

def get_files_in_dir(in_dir):
    """Return the paths of all files found under ``in_dir``, including sub-folders.

    Args:
        in_dir: Directory to walk.

    Returns:
        A list with one entry per file, each formed by joining the containing
        directory (as reported by ``os.walk``) with the file name. The list is
        empty when nothing is found.
    """
    collected = []
    for folder, _subfolders, names in os.walk(in_dir):
        for name in names:
            collected.append(os.path.join(folder, name))
    return collected

# Every file path under the data folder; the analysis loop filters this list further
files = get_files_in_dir(data_root_dir)

In [None]:
# Submit one sample PDF to the Document Intelligence layout model over REST and
# poll the returned operation URL until the analysis finishes.
# On success `resp` holds the "analyzeResult" payload that the following cells
# read (resp["tables"], resp["content"]); otherwise it stays None.

# Only this one sample document is analyzed. The expected path is built with
# os.path.join instead of the literal "data\cleared_ex2.pdf", whose "\c" is an
# invalid escape sequence and which only matched Windows-style paths.
target_file = os.path.join("data", "cleared_ex2.pdf")

n_tries = 10   # maximum number of polls of the analyze operation
wait_sec = 2   # pause between two polls, in seconds

# Reset first so a result left over from an earlier run is never reused
resp = None

for local_file in files:

    file_type = os.path.splitext(local_file)[1].lower()
    if file_type != ".pdf":
        continue

    if os.path.normpath(local_file) != target_file:
        continue

    print("Analyzing %s" % local_file)

    # Read the document and close the file before any network call is made
    with open(local_file, "rb") as f:
        data_bytes = f.read()

    # Submit the document; a 202 answer carries the URL to poll for the result
    post_resp = post(url = di_post_url, data = data_bytes, headers = di_headers)
    if post_resp.status_code != 202:
        # Raise instead of calling quit(), which would shut down the notebook kernel
        raise RuntimeError("POST analyze failed:\n%s" % post_resp.text)
    print("POST analyze succeeded:\n%s" % post_resp.headers)
    get_url = post_resp.headers["operation-location"]
    print(get_url)

    # Bound before polling so the check below never sees an unbound name,
    # even when the very first GET raises.
    resp_json = {}

    # Poll at most n_tries times; the else branch runs only if no break happened
    for n_try in range(n_tries):
        try:
            poll_resp = get(url = get_url, headers = {"Ocp-Apim-Subscription-Key": di_apim_key})
            if poll_resp.status_code != 200:
                print("GET Layout results failed:\n")
                break

            resp_json = json.loads(poll_resp.text)
            status = resp_json["status"]
            if status == "succeeded":
                print("Layout Analysis succeeded:\n")
                print("--------------------------------")
                break
            if status == "failed":
                print("Layout Analysis failed:\n")
                break

            # Analysis still running. Wait and retry.
            print('Waiting to complete processing...')
            time.sleep(wait_sec)
        except Exception as e:
            # Best effort: report the problem and stop polling this document
            print("GET analyze results failed:\n%s" % str(e))
            break
    else:
        print("Layout Analysis did not complete after %d polls" % n_tries)

    # Persist the Doc Int Output for further processing
    if 'analyzeResult' in resp_json:
        resp = resp_json['analyzeResult']

        break
    


data\cleared_ex2.pdf
Analyzing data\cleared_ex2.pdf
POST analyze succeeded:
{'Content-Length': '0', 'Operation-Location': 'https://klau-docintel.cognitiveservices.azure.com/documentintelligence/documentModels/prebuilt-layout/analyzeResults/b5106a0a-d79d-48bf-8d1a-e020c22ae162?api-version=2023-10-31-preview', 'x-envoy-upstream-service-time': '144', 'apim-request-id': 'b5106a0a-d79d-48bf-8d1a-e020c22ae162', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'x-content-type-options': 'nosniff', 'x-ms-region': 'East US', 'Date': 'Sat, 01 Mar 2025 14:12:57 GMT'}
https://klau-docintel.cognitiveservices.azure.com/documentintelligence/documentModels/prebuilt-layout/analyzeResults/b5106a0a-d79d-48bf-8d1a-e020c22ae162?api-version=2023-10-31-preview
Waiting to complete processing...
Layout Analysis succeeded:

--------------------------------


Print Tables Found

In [17]:
# Summarize every table detected by the layout model, then list its cells one per line
for table_idx, table in enumerate(resp["tables"]):
    n_rows = table["rowCount"]
    n_cols = table["columnCount"]
    print(f"Table # {table_idx} has {n_rows} rows and {n_cols} columns")

    for cell in table["cells"]:
        row = cell["rowIndex"]
        col = cell["columnIndex"]
        text = cell["content"]
        print(f"...Cell[{row}][{col}] has content '{text}'")
        

Table # 0 has 3 rows and 8 columns
...Cell[0][0] has content 'US Government Security'
...Cell[0][1] has content 'Maturity <9M'
...Cell[0][2] has content '9M≤ Maturity < 2.5Y'
...Cell[0][3] has content '2.5Y ≤ Maturity < 5.5Y'
...Cell[0][4] has content '2.5Y ≤ Maturity < 5.5Y'
...Cell[0][5] has content '5.5Y ≤ Maturity <12Y'
...Cell[0][6] has content '12Y ≤ Maturity < 18Y'
...Cell[0][7] has content '18Y ≤ Maturity'
...Cell[1][0] has content 'U.S. Government Treasury (Bills, Notes, and Bonds) Excluding: Floating Rate Notes and STRIPS'
...Cell[1][1] has content '2.10%'
...Cell[1][2] has content '4.20%'
...Cell[1][3] has content '5.80%'
...Cell[1][4] has content '8.90%'
...Cell[1][5] has content '13.30%'
...Cell[1][6] has content '17.60%'
...Cell[1][7] has content '19.68%'
...Cell[2][0] has content 'U.S. Government Treasury (Inflation- Protected Securities, "TIPS")'
...Cell[2][1] has content '2.75%'
...Cell[2][2] has content '4.50%'
...Cell[2][3] has content '6.25%'
...Cell[2][4] has conte

Print Raw Markdown

In [18]:
# Show the document text returned by the layout analysis (Markdown was requested as the output format)
print(resp["content"])

Collateral Management

| US Government Security | Maturity <9M | 9M≤ Maturity < 2.5Y | 2.5Y ≤ Maturity < 5.5Y | 2.5Y ≤ Maturity < 5.5Y | 5.5Y ≤ Maturity <12Y | 12Y ≤ Maturity < 18Y | 18Y ≤ Maturity |
| - | - | - | - | - | - | - | - |
| U.S. Government Treasury (Bills, Notes, and Bonds) Excluding: Floating Rate Notes and STRIPS | 2.10% | 4.20% | 5.80% | 8.90% | 13.30% | 17.60% | 19.68% |
| U.S. Government Treasury (Inflation- Protected Securities, "TIPS") | 2.75% | 4.50% | 6.25% | 9.50% | 9.50% | 14.00% | 18.75% |



Use Structured Outputs with Azure OpenAI. Define the BaseModel classes and their properties (e.g., asset name, party type, tenure max and min years)

In [19]:
from typing import Optional
from pydantic import BaseModel

In [25]:
# One extracted collateral entry. Instances are the elements of Invoice.collaterals,
# and Invoice is the response_format handed to the structured-output call.
# Every field is a string that may be None.
class Collateral(BaseModel):
    # Asset name, e.g. "Bills" in the sample output
    name: Optional[str]
    # Lower maturity bound together with its comparison sign, e.g. "9M <=" (None when the header has no lower bound)
    tenure_min: Optional[str]
    # Upper maturity bound together with its comparison sign, e.g. "< 2.5Y"
    tenure_max: Optional[str]
    # Percentage kept as text, e.g. "2.10%"; the system prompt refers to it as the Valuation Percentage
    haircut_percentage: Optional[str]

# Top-level structured-output schema: passed as response_format to the chat completion call.
# NOTE(review): despite the name, this wraps collateral entries, not invoice data.
class Invoice(BaseModel):
    # All collateral entries extracted from the document; may be None
    collaterals: Optional[list[Collateral]]

In [28]:
# Ask the chat model to turn the Markdown produced by the layout analysis into
# an Invoice object (structured output). The call is skipped when the layout
# result contains no tables.
# Reset first so a completion left over from an earlier run is never reused.
completion = None
if resp["tables"]:
    completion = client.beta.chat.completions.parse(
    # Use the model/deployment name from config.json rather than a hard-coded value
    model=gpt_model,
    messages=[
        {
            "role": "system",
            "content": "You are an AI assistant that specializes in extracting parties and collateral information from ISDA agreement. \
            The collateral Table with US Government Security and different Maturity levels might span multiple pages, wrap everything under eligible collateral section,  all of which must be extracted for each asset value. \
            Extract the assets in US Government Security. Extract into multiple rows if different asset types are separated by a comma i.e (Bills, Notes, Bonds) should be extracted separately as 'Bills', 'Notes', and 'Bonds', unless it is the same asset type. Extract as many asset types that are present\
            The Tenure Min Maturity is part of the row headers. Anything on the LEFT side of the sign (<) OR more than sign (>), is the value WITH the sign. Example: 2yr <= Maturity <= 5 yrs would return '2yr <='. Another example: 5yrs <= Maturity < 7 yrs would return '5yrs <='. Note that the values can repeat \
            The Tenure Max Maturity is part of the row headers. Anything on the RIGHT side of the less than sign (<), is the value AND the sign. Example:  Maturity < 9M would return '< 9M'.  2.5Y < Maturity <= 5.5Y would return '<= 5.5Y' Note that the values can repeat \
            The Valuation Percentage is in its own column.                                         ",
        },

        {
            "role": "user",
            "content": resp["content"],
        }
    ],
    response_format=Invoice,
    max_tokens=4096,
    # Low temperature / top_p to keep the extraction as repeatable as possible
    temperature=0.1,
    top_p=0.1
)
else:
    print("No tables found in the layout result; skipping extraction.")

In [30]:
# Pull the parsed Invoice out of the first choice and pretty-print it as JSON.
message = completion.choices[0].message
event = message.parsed

if event is None:
    # parsed is None when the model refused or produced no schema-conforming reply;
    # report that instead of failing with an AttributeError on None.
    print("No structured result was returned. Refusal: %s" % message.refusal)
else:
    print(event.model_dump_json(indent=2))

{
  "collaterals": [
    {
      "name": "Bills",
      "tenure_min": null,
      "tenure_max": "< 9M",
      "haircut_percentage": "2.10%"
    },
    {
      "name": "Notes",
      "tenure_min": null,
      "tenure_max": "< 9M",
      "haircut_percentage": "2.10%"
    },
    {
      "name": "Bonds",
      "tenure_min": null,
      "tenure_max": "< 9M",
      "haircut_percentage": "2.10%"
    },
    {
      "name": "Bills",
      "tenure_min": "9M <=",
      "tenure_max": "< 2.5Y",
      "haircut_percentage": "4.20%"
    },
    {
      "name": "Notes",
      "tenure_min": "9M <=",
      "tenure_max": "< 2.5Y",
      "haircut_percentage": "4.20%"
    },
    {
      "name": "Bonds",
      "tenure_min": "9M <=",
      "tenure_max": "< 2.5Y",
      "haircut_percentage": "4.20%"
    },
    {
      "name": "Bills",
      "tenure_min": "2.5Y <=",
      "tenure_max": "< 5.5Y",
      "haircut_percentage": "5.80%"
    },
    {
      "name": "Notes",
      "tenure_min": "2.5Y <=",
      "tenure_m