Import Required Libraries and Load Configuration Files

In [1]:
# # Import required libraries  
import os  
import base64
from pathlib import Path
from shutil import rmtree
from requests import get, post
import json
import time
import copy 
from datetime import datetime, timedelta  
import pdfkit
from langchain.text_splitter import TokenTextSplitter, MarkdownHeaderTextSplitter


from openai import AzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

In [2]:
# Load the configuration details for the Azure services used in this notebook
# (Azure OpenAI and Document Intelligence).
# Credentials should be secured using a more secure method such as Azure Key Vault.
# The context manager closes the file handle as soon as parsing is done, and the
# explicit encoding keeps the read independent of the platform's locale.
with open("config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

Create Azure OpenAI and Document Intelligence (Pre-Built Layout) Instances. Set Chunking Config to Split on Markdown Headers

In [3]:
# Azure OpenAI settings, all read from config.json
api_base = config["openai_api_base"]
api_key = config["openai_api_key"]
openai_api_version = config["openai_api_version"]
embeddings_model = config["openai_embedding_model"]
gpt_model = config["openai_gpt_model"]

# Doc Intelligence Config
di_endpoint = config["doc_intelligence_endpoint"]
di_apim_key = config["doc_intelligence_apim_key"]
# Headers for the REST call that uploads a PDF for layout analysis
di_headers = {
    'Content-Type': 'application/pdf',
    'Ocp-Apim-Subscription-Key': di_apim_key,
}
# Strip any trailing "/" before appending the path so the URL is well-formed
# whether or not the configured endpoint ends with a slash.
di_post_url = di_endpoint.rstrip("/") + "/documentintelligence/documentModels/prebuilt-layout:analyze?api-version=2023-10-31-preview&stringIndexType=utf16CodeUnit&outputContentFormat=markdown"
credential = DefaultAzureCredential()

# Chunking Config
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=52)
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)


# Azure OpenAI client. The API key is passed explicitly from config.json
# (it is not read from the AZURE_OPENAI_API_KEY environment variable here).
client = AzureOpenAI(
    api_version=openai_api_version,
    azure_endpoint=api_base,
    api_key=api_key)

# SDK client for Document Intelligence, authenticated with DefaultAzureCredential
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=di_endpoint,
    credential=credential
)

Load Files in 'Data' Folder, Test Layout Analysis on ISDA-2.pdf

In [4]:
# Root folder that is scanned for input documents; the value comes from config.json.
data_root_dir = config["data_root_dir"]

def get_files_in_dir(in_dir):
    """Return the paths of all files found under ``in_dir``, including sub-directories.

    Each path is the directory reported by ``os.walk`` joined with the file
    name, in the order ``os.walk`` yields them. A directory with no files
    (or one that cannot be walked) gives an empty list.
    """
    collected_paths = []
    for current_dir, _subdirs, file_names in os.walk(in_dir):
        for file_name in file_names:
            collected_paths.append(os.path.join(current_dir, file_name))
    return collected_paths

# Paths of every file under the data folder (sub-directories included).
files = get_files_in_dir(data_root_dir)

In [5]:
# Only this document is analyzed in the demo. The comparison is made on the
# file name alone so it works with both Windows ("\\") and POSIX ("/") paths;
# the previous literal "data\ISDA-2.pdf" relied on an invalid escape sequence
# and could only ever match on Windows.
target_file_name = "ISDA-2.pdf"

# Polling limits for the asynchronous analyze operation: at most n_tries
# status checks, wait_sec seconds apart. Raise n_tries for large documents.
n_tries = 10
wait_sec = 2

# Holds the 'analyzeResult' payload once an analysis succeeds. Starting from
# None guarantees later cells never pick up a value left over from an earlier run.
resp = None

for local_file in files:

    file_type = os.path.splitext(local_file)[1].lower()
    if file_type != ".pdf":
        continue

    if os.path.basename(local_file) != target_file_name:
        continue

    print("Analyzing %s" % local_file)

    # Read the whole file up front so the handle is closed before any network I/O.
    with open(local_file, "rb") as f:
        data_bytes = f.read()

    # Submit the document. A 202 reply means the job was accepted; the URL to
    # poll for the result is returned in the operation-location header.
    post_resp = post(url = di_post_url, data = data_bytes, headers = di_headers)
    if post_resp.status_code != 202:
        # Raise instead of calling quit(), which would shut down the notebook kernel.
        raise RuntimeError("POST analyze failed:\n%s" % post_resp.text)
    print("POST analyze succeeded:\n%s" % post_resp.headers)
    get_url = post_resp.headers["operation-location"]
    print (get_url)

    # Reset per file so the check after the loop never sees an undefined or stale value.
    resp_json = None
    n_try = 0
    processing = True
    while processing:
        try:
            get_resp = get(url = get_url, headers = {"Ocp-Apim-Subscription-Key": di_apim_key})
            if get_resp.status_code != 200:
                # Check the status before parsing: an error body may not be JSON.
                print("GET Layout results failed:\n")
                processing = False
            else:
                resp_json = json.loads(get_resp.text)
                if resp_json["status"] == "succeeded":
                    print("Layout Analysis succeeded:\n")
                    print("--------------------------------")
                    processing = False
                elif resp_json["status"] == "failed":
                    print("Layout Analysis failed:\n")
                    processing = False
                else:
                    # Analysis still running. Wait and retry, but give up after n_tries checks.
                    n_try += 1
                    if n_try >= n_tries:
                        print("Layout Analysis did not complete after %d status checks" % n_tries)
                        processing = False
                    else:
                        print ('Waiting to complete processing...')
                        time.sleep(wait_sec)
        except Exception as e:
            msg = "GET analyze results failed:\n%s" % str(e)
            print(msg)
            processing = False

    # Persist the Doc Int Output for further processing
    if resp_json is not None and 'analyzeResult' in resp_json:
        resp = resp_json['analyzeResult']
        break

# Fail here with a clear message rather than with an obscure error in a later cell.
if resp is None:
    raise RuntimeError("No layout analysis result was produced for %s" % target_file_name)
    


Analyzing data\ISDA-2.pdf
POST analyze succeeded:
{'Content-Length': '0', 'Operation-Location': 'https://klau-docintel.cognitiveservices.azure.com/documentintelligence/documentModels/prebuilt-layout/analyzeResults/b10b9dd9-02de-4da9-bc78-dffb386d903d?api-version=2023-10-31-preview', 'x-envoy-upstream-service-time': '83', 'apim-request-id': 'b10b9dd9-02de-4da9-bc78-dffb386d903d', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'x-content-type-options': 'nosniff', 'x-ms-region': 'East US', 'Date': 'Mon, 10 Mar 2025 17:28:19 GMT'}
https://klau-docintel.cognitiveservices.azure.com/documentintelligence/documentModels/prebuilt-layout/analyzeResults/b10b9dd9-02de-4da9-bc78-dffb386d903d?api-version=2023-10-31-preview
Waiting to complete processing...
Waiting to complete processing...
Waiting to complete processing...
Layout Analysis succeeded:

--------------------------------


Print Tables Found

In [6]:
# Summarise every table found by the layout model: its dimensions first,
# then one line per cell with the cell's row index, column index and text.
table_summary = "Table # {} has {} rows and {} columns"
cell_summary = "...Cell[{}][{}] has content '{}'"

for table_idx, table in enumerate(resp["tables"]):
    print(table_summary.format(table_idx, table["rowCount"], table["columnCount"]))

    for cell in table["cells"]:
        print(cell_summary.format(cell["rowIndex"], cell["columnIndex"], cell["content"]))
        

Table # 0 has 7 rows and 5 columns
...Cell[0][0] has content ''
...Cell[0][2] has content 'Party A'
...Cell[0][3] has content 'Party B'
...Cell[0][4] has content 'Valuation Percentage'
...Cell[1][0] has content '(A)'
...Cell[1][1] has content 'Cash: U.S. Dollars in depositary account form.'
...Cell[1][2] has content 'YES'
...Cell[1][3] has content 'YES'
...Cell[1][4] has content '100%'
...Cell[2][0] has content '(B)'
...Cell[2][1] has content 'Treasury Bills: negotiable obligations issued by the U.S.
Treasury Department having a remaining maturity of not more than one year.'
...Cell[2][2] has content 'YES'
...Cell[2][3] has content 'YES'
...Cell[2][4] has content '98%'
...Cell[3][0] has content '(C)'
...Cell[3][1] has content 'Treasury Notes: negotiable debt obligations issued by the
U.S. Treasury Department having a remaining maturity of more than one year but not more than 10 years.'
...Cell[3][2] has content 'YES'
...Cell[3][3] has content 'YES'
...Cell[3][4] has content '98%'
...Ce

Print Raw Markdown

In [7]:
# Show the document text returned under the "content" key of the analyze result
# (the request asked for outputContentFormat=markdown).
print(resp["content"])

<!-- PageHeader="2/11/25, 5:14 PM" -->

<!-- PageHeader="Credit Support Annex" -->

EX-10.11 13 a8821 ex10-11.htm CREDIT SUPPORT ANNEX

(Bilateral Form)

<!-- PageHeader="Exhibit 10.11" -->

(ISDA Agreements Subject to New York Law Only)

ISDA®
===

International Swaps and Derivatives Association, Inc.

CREDIT SUPPORT ANNEX

to the Schedule to the

ISDA MASTER AGREEMENT dated as of November 16, 2004 between

WACHOVIA BANK, NATIONAL ASSOCIATION ("Party A")

and


## GOLD BANK ("Party B")

This Annex supplements, forms part of, and is subject to, the ISDA Master Agreement referred to above (this "Agreement"), is part of its Schedule and is a Credit Support Document under this Agreement with respect to each party.

Accordingly, the parties agree as follows: -

Paragraphs 1 - 12. Incorporation

Paragraphs 1 through 12 inclusive of the ISDA Credit Support Annex (Bilateral Form) (ISDA Agreements Subject to New York Law Only) published in 1994 by the International Swaps and Derivatives Associ

Use Structured Outputs with Azure OpenAI. Define the Pydantic BaseModel classes and their properties (e.g., asset name, party names, minimum and maximum tenure in years)

In [8]:
from typing import Optional
from pydantic import BaseModel

In [9]:
# Structured-output schema for one eligible-collateral entry extracted from the agreement.
# Comments (not a docstring) are used on purpose: a class docstring would be
# emitted into the JSON schema that is passed as response_format.
class Collateral(BaseModel):
    # Asset type name only, e.g. "Cash" or "Treasury Bills" (per the extraction prompt).
    name: Optional[str]
    # Minimum maturity in years; the prompt asks for null when the asset is Cash.
    tenure_min: Optional[float]
    # Maximum maturity in years; the prompt asks for null when the asset is Cash.
    tenure_max: Optional[float]
    # NOTE(review): the prompt fills this from the "Valuation Percentage" column,
    # while the field name says "haircut" - confirm which quantity is intended.
    haircut_percentage: Optional[str]

# Top-level structured-output schema: the two parties plus the list of collateral entries.
# NOTE(review): the name "Invoice" does not match the content (an ISDA credit support
# annex); it is kept because the completion call passes it as response_format=Invoice.
class Invoice(BaseModel):
    # Name of "Party A" as stated in the agreement.
    partyA: Optional[str]
    # Name of "Party B" as stated in the agreement.
    partyB: Optional[str]
    # One Collateral entry per eligible-collateral row.
    collaterals: Optional[list[Collateral]]

In [10]:
# Ask the chat model to extract the parties and the eligible-collateral rows
# as an `Invoice` object, using the Markdown produced by Document Intelligence
# as the user message. The call is only made when the layout result has tables.
#
# NOTE(review): the prompt still tells the model to answer "N/A" in places while
# tenure_min / tenure_max are Optional[float] in the schema - consider rewording to null.
if resp.get("tables"):
    completion = client.beta.chat.completions.parse(
        # Use the model/deployment name from config.json (openai_gpt_model)
        # instead of a hard-coded value, so the notebook follows the configuration.
        model=gpt_model,
        messages=[
            {
                "role": "system",
                # The stray "\ " after the first heading was an invalid escape
                # sequence that sent a literal backslash to the model; it is removed.
                "content": """You are an AI assistant that specializes in extracting parties and collateral information from ISDA agreement.
            
            1. **Multi-Page Table Handling**:
            - Ensure **all pages are processed** together as part of the same table. 
            - **Do not stop after the first page**—continue extracting all consecutive pages. 
            - **Wrap everything under the eligible collateral section**.

            2. **Columns to Extract**:            
            - **Party A and Party B names** are typically found on the **first page** of the document.
            - **Asset Type** is in the first column of the table. It might not have a column name. But only get the name, like Cash, Treasury Bills, Treasury Notes, etc. The rest describes the maturity years.
            - If the Asset Type is Cash, please put null for the maturity tenure min and max.
            - The **Tenure Min Maturity** is part of the description in the asset name column. Extract the text describing the Minimum tenure maturity years, so if it says LESS than one year, the result would be one year. if it says more than ten years, the min would be ten years. otherwise, if it says not more than put 0. it is always part of the asset type text, not its own column if the asset type is cash it is N/A.
            - The **Tenure Max Maturity** is part of the description in the asset name column. Extract the text describing the Maximum tenure maturity years, so if it says one year or more but less than five, the max would be five year. otherwise put N/A. it is always part of the asset type text, not its own column.
            - The **Valuation Percentage** is in its own column. It is usually a percentage value
            
            3. **Data Structuring**:
            - Ensure each **collateral type** is extracted in **its own row**.
            - Maintain **data consistency** across multiple pages.
            - If multiple maturity tenures are present for the same asset type, **list them separately**.
                        """,
            },

            {
                "role": "user",
                "content": resp["content"],
            }
        ],
        response_format=Invoice,
        max_tokens=4096,
        temperature=0.1,
        top_p=0.1
    )
else:
    # Define the name explicitly so a result left over from an earlier run is never reused.
    completion = None
    print("No tables found in the layout result; skipping structured extraction.")

In [11]:
# `parsed` holds the validated Invoice instance. It is None when the model
# declined to answer, in which case the explanation is in `refusal`; check for
# that instead of failing with an AttributeError on None.
message = completion.choices[0].message
event = message.parsed

if event is None:
    print("No structured result was returned. Refusal: %s" % message.refusal)
else:
    print(event.model_dump_json(indent=2))

{
  "partyA": "WACHOVIA BANK, NATIONAL ASSOCIATION",
  "partyB": "GOLD BANK",
  "collaterals": [
    {
      "name": "Cash",
      "tenure_min": null,
      "tenure_max": null,
      "haircut_percentage": "100%"
    },
    {
      "name": "Treasury Bills",
      "tenure_min": 0.0,
      "tenure_max": 1.0,
      "haircut_percentage": "98%"
    },
    {
      "name": "Treasury Notes",
      "tenure_min": 1.0,
      "tenure_max": 10.0,
      "haircut_percentage": "98%"
    },
    {
      "name": "Treasury Bonds",
      "tenure_min": 10.0,
      "tenure_max": 30.0,
      "haircut_percentage": "92%"
    },
    {
      "name": "Agency Securities",
      "tenure_min": 0.0,
      "tenure_max": 30.0,
      "haircut_percentage": "92%"
    },
    {
      "name": "FHLMC Certificates",
      "tenure_min": 0.0,
      "tenure_max": 30.0,
      "haircut_percentage": "92%"
    },
    {
      "name": "FNMA Certificates",
      "tenure_min": 0.0,
      "tenure_max": 30.0,
      "haircut_percentage": "92%