https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-retrieval-augmented-generation?view=doc-intel-4.0.0

In [10]:
! pip install python-dotenv langchain langchain-community langchain-openai langchainhub openai tiktoken azure-ai-documentintelligence azure-identity azure-search-documents==11.6.0b3



In [1]:
"""
The environment variables are loaded from the `.env` file in the same directory as this notebook.
"""
import os
from dotenv import load_dotenv

# Load the variables defined in the `.env` file into the process environment.
_ = load_dotenv()

# Both Azure OpenAI variables must be present. The previous self-assignment
# into os.environ was a no-op when they were set and raised an opaque
# "TypeError: str expected, not NoneType" when they were not, so check
# explicitly and report every missing name at once.
_missing_env_vars = [
    name
    for name in ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY")
    if os.getenv(name) is None
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the .env file next to this notebook."
    )

# Azure AI Document Intelligence credentials, consumed by the loader cell below.
doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

In [2]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter



In [3]:
# Path of the PDF to analyze. It is defined once so the existence check and
# every later cell use the same value.
pdf_path = "./data/hyde.pdf"

# Fail early, with a specific exception type, if the input is missing.
# `isfile` also rejects the case where the path exists but is a directory.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"The file {pdf_path} does not exist.")

In [4]:
# Load the document through Azure AI Document Intelligence. The loader accepts
# either `file_path` or `url_path`; this notebook points it at the local PDF.
loader = AzureAIDocumentIntelligenceLoader(
    file_path=pdf_path,
    api_key=doc_intelligence_key,
    api_endpoint=doc_intelligence_endpoint,
    api_model="prebuilt-layout",
    api_version="2024-02-29-preview",
)
docs = loader.load()

# Markdown heading markers that start a new chunk, each paired with the
# metadata key under which that heading's text is recorded on the chunk.
# Levels "####" and "#####" are currently not used as split points.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Only the first loaded document's content is split.
docs_string = docs[0].page_content
splits = text_splitter.split_text(docs_string)

print(f"Length of splits: {len(splits)}")

Length of splits: 23


## Understand the Azure Document Intelligence chunking
### TODO - Provide an explanation here and why it's good for RAG

In [6]:
# Embed the split documents and insert them into the Azure Search vector store
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings

# Embedding client for the Azure OpenAI deployment named
# "text-embedding-ada-002". The endpoint and API key are read from the
# environment rather than written into the notebook.
embedding_endpoint = os.getenv("AZURE_EMBEDDING_ENDPOINT")
aoai_embeddings = AzureOpenAIEmbeddings(
    openai_api_version="2024-02-01",
    azure_deployment="text-embedding-ada-002",
    azure_endpoint=embedding_endpoint,
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

In [7]:
# Azure AI Search connection settings, read from the environment.
vector_store_address: str = os.getenv("AZURE_SEARCH_ENDPOINT")
vector_store_password: str = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# SECURITY: do not print the admin key. Cell output is saved inside the
# notebook file, so printing it leaks the credential to anyone who can read
# the notebook or its version history.
if not vector_store_address:
    raise ValueError(
        "AZURE_SEARCH_ENDPOINT is not set; define it in the .env file next to this notebook."
    )

# Vector store over the named Azure AI Search index; queries are embedded
# with the Azure OpenAI embedding client created earlier.
index_name: str = "langchain-aisearch-docintel-demo-index-1"
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=aoai_embeddings.embed_query,
)

# Upload the markdown chunks. The call is left as the last expression so the
# returned document ids are shown as the cell output.
# NOTE(review): re-running this cell presumably uploads the same chunks again
# under new ids — confirm, and clear the index before a re-run if so.
vector_store.add_documents(documents=splits)

<redacted: Azure AI Search admin key — this value was exposed in saved output and should be rotated>


['MjBjNGY4NWQtMmE1NS00Y2VlLTg4NDUtMDg5YWExOWYwOTg3',
 'MmVkZjhiMDMtOThhYy00ZDg2LWE0NGQtZTM0OTFmODM3MzRk',
 'YjgxZjczMzktOGFjYy00YWI1LWExZDYtZDZmZTc5YmM5MzU5',
 'YjIxNzRhYjUtYTYxZS00ODZiLTgyOWEtODAwNDEwZmQyYzQ2',
 'Mjg0ZGZjYWItNDFhMi00YzFiLWEzZDUtYTQwNGEyYjY2YjI2',
 'MWI0NmNkZjctNmVkNy00YWZkLTg1NTktMTdlZjQzMDg4MDcy',
 'YTFkYzQ5OTctMWI5MS00NTk0LWIzNjgtMGRiYzNlY2FiMTcz',
 'MmUzZGFiY2EtM2M0NC00ZWUxLWI1ZDYtNTQ4ZTFkMTIwNzIw',
 'ZTc2NWU0NTUtZDliMC00ZjdlLWExMjgtNTVjNjBlYzVmMmVj',
 'ZGU3YmRiYTAtNzYyMS00YmJiLTk1YWQtNjlhN2VmMDRmNGJi',
 'Yjg0MzYyMDUtNzdiMS00YTFmLWJiYWYtNTdhNzY5MTVhNjYx',
 'NTZiZjk0MTMtYTIzNi00OGMxLWFmZmUtMDFhYjBiYTFmNTA5',
 'NGUxNzQ1MjItZTI4Mi00ODc5LTgwZGMtMDFmMGRkYzYyYjMy',
 'MDc1NjUyY2QtNDk1NS00Y2RkLWE1MGItNTgwMWMwYmM5YWEw',
 'NTk5ZTdiYjItMmEwZS00ZjBhLWExOWMtNTFhMjE3OTVhMzRh',
 'NzQ0NmVmMjktZThlNS00ZmViLWE2NDEtNjkwMGI1Mjc1Yjgz',
 'N2U0Y2I0M2YtZjk2Yi00MDBlLTliNmQtNWUxZmU1NWVhM2Y2',
 'NTk0NzE4MDctMjM1Yy00NmRiLWIyNjktODc4NWFhODYyNjc0',
 'YTAwM2E5MzUtOTgxYi00NDAyLWEzY2QtYzY2ZmRhYjU4

In [8]:
from pprint import pprint

# Retrieve the chunks most relevant to the question: ask for up to three
# results and pass a relevance-score threshold of 0.8 to the vector store.
retrieval_query = "unsupervised denseretriever"
docs = vector_store.similarity_search_with_relevance_scores(
    query=retrieval_query,
    k=3,
    score_threshold=0.8,
)

# Each printed result is a (Document, score) pair.
pprint(docs)

[(Document(metadata={'id': 'YzA0YjY1YjctOTI2Ny00YTdhLWIxNTYtY2EwYzc1YzMwYWFm', 'Header 1': '1 Introduction'}, page_content='Dense retrieval (Lee et al., 2019; Karpukhin et al., 2020), the method of retrieving documents using semantic embedding similarities, has been shown successful across tasks like web search, question answering, and fact verification. A variety of meth- ods such as negative mining (Xiong et al., 2021; Qu et al., 2021), distillation (Qu et al., 2021; Lin et al., 2021b; Hofstätter et al., 2021) and task-specific  \npre-training (Izacard et al., 2021; Gao and Callan, 2021; Lu et al., 2021; Gao and Callan, 2022; Liu and Shao, 2022) have been proposed to improve the effectiveness of supervised dense retrieval models.  \nOn the other hand, zero-shot dense retrieval still remains difficult. Many recent works consider the alternative transfer learning setup, where the dense retrievers are trained on a high-resource dataset and then evaluated on queries from new tasks. The M