https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-retrieval-augmented-generation?view=doc-intel-4.0.0

In [1]:
! pip install python-dotenv langchain langchain-community langchain-openai langchainhub openai tiktoken azure-ai-documentintelligence azure-identity azure-search-documents==11.6.0b3

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain
  Using cached langchain-0.2.16-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Using cached langchain_community-0.2.16-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-openai
  Using cached langchain_openai-0.1.23-py3-none-any.whl.metadata (2.6 kB)
Collecting langchainhub
  Using cached langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting openai
  Using cached openai-1.44.0-py3-none-any.whl.metadata (22 kB)
Collecting tiktoken
  Using cached tiktoken-0.7.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting azure-ai-documentintelligence
  Using cached azure_ai_documentintelligence-1.0.0b4-py3-none-any.whl.metadata (48 kB)
Collecting azure-identity
  Using cached azure_identity-1.17.1-py3-none-any.whl.metadata (79 kB)
Collecting azure-search-documents==11.6.0b3
  Using cached azure_search_documents-11.6.0b3-py3-none-any

In [3]:
"""
The environment variables are loaded from the `.env` file in the same directory as this notebook.
"""
import os
from dotenv import load_dotenv

# The return value of load_dotenv() is not needed; bind it to `_` so the cell shows no output.
_ = load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty, naming the variable so
            the missing `.env` entry is easy to spot.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set. Add it to the .env file or export it before running this notebook."
        )
    return value


# Fail fast with a readable message instead of the TypeError that
# `os.environ[key] = None` raises when a variable is missing.
os.environ["AZURE_OPENAI_ENDPOINT"] = _require_env("AZURE_OPENAI_ENDPOINT")
os.environ["AZURE_OPENAI_API_KEY"] = _require_env("AZURE_OPENAI_API_KEY")

# Credentials handed to the Document Intelligence loader further down.
doc_intelligence_endpoint = _require_env("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = _require_env("AZURE_DOCUMENT_INTELLIGENCE_KEY")

In [4]:
from langchain import hub
from langchain_openai import AzureChatOpenAI
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch

from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

In [5]:
# Path of the PDF to analyze, defined once so the existence check and later cells use the same value.
pdf_path = "./data/hyde.pdf"

# Stop early if the input is missing; isfile() also rejects a directory at that path.
# FileNotFoundError is still an Exception subclass, so broad handlers keep working.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"The file {pdf_path} does not exist.")

In [6]:
# Load the PDF with Azure AI Document Intelligence (layout model).
# The loader accepts either file_path or url_path as the document source.
loader = AzureAIDocumentIntelligenceLoader(
    file_path=pdf_path,
    api_endpoint=doc_intelligence_endpoint,
    api_key=doc_intelligence_key,
    api_model="prebuilt-layout",
    api_version="2024-02-29-preview",
)
docs = loader.load()

# Chunk on markdown heading levels 1-3. Each pair is (heading marker, metadata key);
# raise the upper bound of the range to also split on deeper headings.
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Only the first loaded document is split.
docs_string = docs[0].page_content
splits = text_splitter.split_text(docs_string)

print(f"Length of splits: {len(splits)}")

Length of splits: 4


In [7]:
# Inspect the chunks: each one is a Document whose metadata records the header it was found under.
splits

[Document(page_content='Precise Zero-Shot Dense Retrieval without Relevance Labels\n===  \nLuyu Gao \\*\\* Xueguang Ma \\*\\* Jimmy Lin Jamie Callant  \nLanguage Technologies Institute, Carnegie Mellon University  \nDavid R. Cheriton School of Computer Science, University of Waterloo {luyug, callan} @cs.cmu.edu, {x93ma, jimmylin} @uwaterloo.ca'),
 Document(metadata={'Header 1': 'Abstract'}, page_content="arXiv:2212.10496v1 [cs.IR] 20 Dec 2022  \nWhile dense retrieval has been shown effec- tive and efficient across tasks and languages, it remains difficult to create effective fully zero-shot dense retrieval systems when no rel- evance label is available. In this paper, we recognize the difficulty of zero-shot learning and encoding relevance. Instead, we pro- pose to pivot through Hypothetical Document Embeddings (HyDE). Given a query, HyDE first zero-shot instructs an instruction-following language model (e.g. InstructGPT) to gen- erate a hypothetical document. The docu- ment captures r

## Understand the Azure Document Intelligence chunking
### TODO - Explain how this chunking works and why it is good for RAG

In [10]:
# Create the Azure OpenAI embeddings client that will vectorize the chunks.
# (Nothing is embedded or uploaded in this cell; it only builds the client.)
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings

# Connection settings gathered in one mapping, then unpacked into the client.
_embedding_settings = {
    "azure_deployment": "text-embedding-ada-002",
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    "openai_api_version": "2024-02-01",
    "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
}
aoai_embeddings = AzureOpenAIEmbeddings(**_embedding_settings)

In [20]:
# Azure AI Search connection settings: read once and reused below.
# Defaulting to "" keeps the `str` annotations truthful when a variable is unset.
vector_store_address: str = os.getenv("AZURE_SEARCH_ENDPOINT", "")
vector_store_password: str = os.getenv("AZURE_SEARCH_ADMIN_KEY", "")

# Fail fast with a readable message rather than an opaque client error later on.
if not vector_store_address or not vector_store_password:
    raise ValueError(
        "AZURE_SEARCH_ENDPOINT and AZURE_SEARCH_ADMIN_KEY must both be set (for example in the .env file)."
    )

index_name: str = "langchain-aisearch-docintel-demo-index-1"
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=aoai_embeddings.embed_query,
)

# NOTE(review): the saved output of this cell is "BadRequestError: Unsupported data type".
# Presumably it comes from the embedding request; confirm that the configured
# embedding deployment exists and serves an embeddings model before re-running.
# Embed every chunk and upload it to the search index.
vector_store.add_documents(documents=splits)

BadRequestError: Unsupported data type

In [None]:
from pprint import pprint

# Retrieve the chunks most relevant to the question.
# similarity_search_with_relevance_scores returns (Document, score) tuples, so the
# text of the best hit is docs[0][0].page_content rather than docs[0].page_content.
docs = vector_store.similarity_search_with_relevance_scores(
    query="unsupervised dense retriever",  # fixed typo: was "denseretriever"
    k=3,  # return at most three chunks
    score_threshold=0.8,  # drop hits whose relevance score is below 0.8
)

pprint(docs)