In [20]:
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from typing import Any, Dict, List, Optional
from fastapi import  UploadFile

class AzureDocIntelligenceReader(BaseReader):
    """Reader that extracts text from a document via Azure Document Intelligence.

    The document is sent to the Azure ``DocumentAnalysisClient`` and the
    analysis result is converted into llama_index ``Document`` objects.

    Args:
        azure_api_endpoint: Endpoint URL of the Azure resource.
        azure_api_key: API key used to build the ``AzureKeyCredential``.
        model_id: Model passed to ``begin_analyze_document``
            (defaults to ``"prebuilt-read"``).
        per_page: When true, one ``Document`` is produced per page; otherwise
            a single ``Document`` holding the whole content is produced.
    """

    def __init__(
        self,
        azure_api_endpoint: str,
        azure_api_key: str,
        model_id: Optional[str] = "prebuilt-read",
        per_page: Optional[bool] = True,
    ) -> None:
        # Define the attribute up front so a failed construction leaves the
        # reader in a well-defined state instead of missing `client` entirely.
        self.client = None
        try:
            self.client = DocumentAnalysisClient(
                endpoint=azure_api_endpoint,
                credential=AzureKeyCredential(azure_api_key),
            )
        except Exception as e:
            print("the following error occured while creating azure client ", e)
        self.model_id = model_id
        self.per_page = per_page

    def load_data(self, document: UploadFile) -> List[Document]:
        """Analyze ``document`` and return its text as ``Document`` objects.

        Args:
            document: The upload to analyze. Objects exposing a ``file``
                attribute (such as ``UploadFile``) are unwrapped to that
                stream; anything else (raw ``bytes``, an open binary file) is
                handed to the Azure client unchanged.

        Returns:
            With ``per_page`` enabled, one ``Document`` per page whose text is
            the page's lines joined by newlines and whose metadata holds
            ``total_pages``, ``page`` and, when the input exposes one,
            ``filename``. Otherwise a single ``Document`` with the full
            content. An empty list is returned when the client is missing or
            the analysis fails; the error is printed.
        """
        if self.client is None:
            print(
                "the following error occured while loading data ",
                "azure client was not created",
            )
            return []

        # Unwrap upload wrappers to their underlying binary stream; bytes and
        # plain file objects have no `file` attribute and pass through as-is.
        data = getattr(document, "file", document)
        filename = getattr(document, "filename", None)

        try:
            poller = self.client.begin_analyze_document(self.model_id, data)
            result = poller.result()

            if not self.per_page:
                return [Document(text=result.content)]

            pages = result.pages or []
            documents = []
            for page in pages:
                # `lines` may be absent on a page without recognized text.
                content = "\n".join(line.content for line in (page.lines or []))
                metadata = {"total_pages": len(pages), "page": page.page_number}
                if filename is not None:
                    metadata["filename"] = filename
                documents.append(Document(text=content, metadata=metadata))
            return documents
        except Exception as e:
            print("the following error occured while loading data ", e)
            return []

In [1]:
import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

In [21]:
# Read the sample PDF into memory; the context manager closes the handle even
# if the read fails.
with open("../data/124/EEM FINAL caa201221.pdf", "rb") as test_pdf:
    pdf_bytes = test_pdf.read()

In [22]:
import os

from llama_index.node_parser import SimpleNodeParser
from llama_index import VectorStoreIndex

# Service configuration is taken from the environment so that no credential is
# stored in the notebook.
# NOTE(review): an API key was previously hardcoded in this cell; it should be
# treated as exposed and rotated.
endpoint = os.environ.get(
    "AZURE_DOC_INTELLIGENCE_ENDPOINT",
    "https://rsaf-document-intelligence.cognitiveservices.azure.com/",
)
key = os.environ["AZURE_DOC_INTELLIGENCE_KEY"]

# Splits documents into nodes of 512-size chunks without overlap.
node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=0)

# Reader configured to use the "prebuilt-layout" model.
azure_doc_loader = AzureDocIntelligenceReader(
    azure_api_endpoint=endpoint,
    azure_api_key=key,
    model_id="prebuilt-layout",
)

def pdf_to_nodes(file: UploadFile) -> List[Any]:
    """Convert an uploaded PDF into llama_index nodes.

    The file is read with ``azure_doc_loader`` and the resulting documents are
    split into nodes by ``node_parser``.

    Args:
        file: The PDF to process, passed through to
            ``azure_doc_loader.load_data``.

    Returns:
        The nodes produced by ``node_parser.get_nodes_from_documents``, or an
        empty list when the loader yields no documents (for example after a
        failed analysis).
    """
    documents = azure_doc_loader.load_data(file)
    # The loader signals failure with an empty/None result; there is nothing
    # to parse in that case.
    if not documents:
        return []
    return node_parser.get_nodes_from_documents(documents)

In [23]:
# Keep the parsed nodes for later cells and display only a short preview
# rather than the full list.
nodes = pdf_to_nodes(pdf_bytes)
nodes[:3]



[TextNode(id_='8558d6fd-f02c-423e-b542-b17f92a3f196', embedding=None, metadata={'total_pages': 319, 'page': 1}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='faa80fa7-2f39-4c79-a364-57e8115f3624', node_type=None, metadata={'total_pages': 319, 'page': 1}, hash='f9918839cbc66be598c4ab71913b8f8ba164f6dedc3f51a5f76477cdf323cccd')}, hash='f9918839cbc66be598c4ab71913b8f8ba164f6dedc3f51a5f76477cdf323cccd', text='OFFICIAL (CLOSED)\n124 SQUADRON\nREPUBLIC OF SINGAPORE AIR FORCE\nEC120B\nEMPLOYMENT\nMANUAL\ni\nOFFICIAL (CLOSED)', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 TextNode(id_='c1626008-388c-4de4-9253-a8713fbba37b', embedding=None, metadata={'total_pages': 319, 'page': 2}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInf