In [30]:
from opentelemetry import trace
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

from utils.utils import(configure_aoai_env,
                         configure_logging,
                         configure_embedding_env,
                         configure_docintell_env,
                         get_credential)

from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.retrievers import AzureAISearchRetriever
from langchain_community.vectorstores.azuresearch import AzureSearch
import os
import atexit
from dotenv import load_dotenv
# Populate os.environ from a local .env file (if one is present) before any
# configuration values are read further down in this cell.
load_dotenv()

from logging import INFO, getLogger

# Notebook-scoped tracer and logger handles; both are keyed on the module name.
tracer = trace.get_tracer(__name__)
logger = getLogger(__name__)

# Azure AI Search configuration (values come from the process environment / .env).
AZURE_AI_SEARCH_SERVICE_ENDPOINT = os.getenv(
    "AZURE_AI_SEARCH_SERVICE_ENDPOINT")
AZURE_AI_SEARCH_API_KEY = os.getenv("AZURE_AI_SEARCH_API_KEY")
AZURE_AI_SEARCH_INDEX_NAME = os.getenv("AZURE_AI_SEARCH_INDEX_NAME")
# The two settings below are read but are not part of the required list.
AZURE_AI_SEARCH_SERVICE_NAME = os.getenv("AZURE_AI_SEARCH_SERVICE_NAME")
INDEX_SEMANTIC_CONFIGURATION_NAME = os.getenv("INDEX_SEMANTIC_CONFIGURATION_NAME")

# Azure OpenAI configuration
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
AZURE_OPENAI_EMBEDDING_ENDPOINT = os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

# Validate environment variables.
# A variable counts as missing when it is unset or set to an empty string.
_required_env_vars = [
    "AZURE_AI_SEARCH_SERVICE_ENDPOINT", "AZURE_AI_SEARCH_API_KEY", "AZURE_AI_SEARCH_INDEX_NAME",
    "AZURE_OPENAI_KEY", "AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "AZURE_OPENAI_EMBEDDING_ENDPOINT", "AZURE_OPENAI_API_VERSION"
]

# Collect every missing variable up front so that a single run reports all of
# them, instead of stopping at the first one and forcing repeated re-runs.
_missing_env_vars = [name for name in _required_env_vars if not os.getenv(name)]

if _missing_env_vars:
    for _missing_name in _missing_env_vars:
        logger.error(f"Environment variable {_missing_name} is not set.")
    # Same exception type as before; the message now lists all missing names.
    raise EnvironmentError(
        "Environment variable(s) not set: " + ", ".join(_missing_env_vars)
    )

# Embeddings client bound to the Azure OpenAI endpoint, deployment, key and
# API version read from the environment earlier in this cell.
_embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=AZURE_OPENAI_EMBEDDING_ENDPOINT,
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    api_key=AZURE_OPENAI_KEY,
    openai_api_version=AZURE_OPENAI_API_VERSION,
)


In [31]:
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

# Alias for the AzureOpenAIEmbeddings client created in the first cell.
embedding_function = _embeddings

# Name of the target search index.
# NOTE(review): `index_name` is assigned again in a later cell before it is
# used to build the vector store - confirm which value is intended.
index_name: str = "langchain-vector-demo-custom"


In [46]:
# Embed a short probe string once and use the length of the returned vector
# as the dimensionality of the vector field declared below.
_embedding_dimensions = len(_embeddings.embed_query("Text"))

# Field definitions passed to AzureSearch(fields=...) when the vector store
# is created.
_fields = [
    # Document key; also filterable.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    # Full-text searchable string fields.
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchableField(
        name="chunk",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Vector field: a collection of single-precision floats sized to the
    # embedding model output and bound to the "myHnswProfile" profile.
    SearchField(
        name="text_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=_embedding_dimensions,
        vector_search_profile_name="myHnswProfile",
    ),
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Filterable (not full-text searchable) string field.
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
]

# Index name used by the vector store created in the next cell.
index_name: str = "vlad-index-test3"

In [47]:
# Build the LangChain AzureSearch vector store and a retriever on top of it.
# Any failure is logged and re-raised as a generic Exception with the original
# error attached as its cause.
try:
    # Create Langchain AzureSearch object
    _vector_search = AzureSearch(
        azure_search_endpoint=AZURE_AI_SEARCH_SERVICE_ENDPOINT,
        azure_search_key=AZURE_AI_SEARCH_API_KEY,
        index_name=index_name,
        embedding_function=_embeddings.embed_query,
        # search_type=search_type,
        semantic_configuration_name="vector-llmops-workshop-index-semantic-configuration",
        # Options handed to the underlying search client.
        additional_search_client_options={"retry_total": 3, "logging_enable": True, "logger": logger},
        fields=_fields,
    )

    # Create retriever object
    # Supported search types: "similarity", "hybrid", "semantic_hybrid"
    retriever = _vector_search.as_retriever(search_type="hybrid", k=3)

except Exception as e:
    logger.error(f"Error during ai search index initialization: {e}")
    # Chain explicitly so the traceback shows the original error as the
    # direct cause of this one.
    raise Exception(f"Error during ai search index initialization: {e}") from e

In [48]:
# Check that the input PDF exists before it is used by the following cells.
pdf_path = "../data_preparation/data/hyde.pdf"
if not os.path.exists(pdf_path):
    # Report the path that was actually checked. FileNotFoundError is a
    # subclass of Exception, so existing `except Exception` handlers still work.
    raise FileNotFoundError(f"The file {pdf_path} does not exist.")


In [49]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Key and endpoint for Document Intelligence come from the project helper.
doc_intelligence_config = configure_docintell_env()

# Load the PDF with the "prebuilt-layout" model.
loader = AzureAIDocumentIntelligenceLoader(
    file_path=pdf_path,
    api_key=doc_intelligence_config["doc_intelligence_key"],
    api_endpoint=doc_intelligence_config["doc_intelligence_endpoint"],
    api_model="prebuilt-layout",
    api_version="2024-02-29-preview",
)
docs = loader.load()

# Split the document into chunks based on markdown headers (levels 1-3);
# each tuple is (markdown prefix, metadata key).
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Only the first loaded document's text is split.
docs_string = docs[0].page_content
splits = text_splitter.split_text(docs_string)
print(f"Length of splits: {len(splits)}")
splits

Length of splits: 23


[Document(metadata={}, page_content='Precise Zero-Shot Dense Retrieval without Relevance Labels\n===  \nLuyu Gao \\*\\* Xueguang Ma \\*\\* Jimmy Lin Jamie Callant  \nLanguage Technologies Institute, Carnegie Mellon University  \nDavid R. Cheriton School of Computer Science, University of Waterloo {luyug, callan} @cs.cmu.edu, {x93ma, jimmylin} @uwaterloo.ca'),
 Document(metadata={'Header 1': 'Abstract'}, page_content="arXiv:2212.10496v1 [cs.IR] 20 Dec 2022  \nWhile dense retrieval has been shown effec- tive and efficient across tasks and languages, it remains difficult to create effective fully zero-shot dense retrieval systems when no rel- evance label is available. In this paper, we recognize the difficulty of zero-shot learning and encoding relevance. Instead, we pro- pose to pivot through Hypothetical Document Embeddings (HyDE). Given a query, HyDE first zero-shot instructs an instruction-following language model (e.g. InstructGPT) to gen- erate a hypothetical document. The docu- me

In [50]:
# Upload the header-based chunks to the index; the cell output is the list of
# ids returned by the vector store.
_vector_search.add_documents(documents=splits)

['MDFjNGE2YTItYzY4MS00MjI0LWI0ZTYtN2FjNTBjMmQ3MGZm',
 'M2RjODc2OTgtNjY2OS00MzdiLThlYzQtMmRhZjE1YTIyNjJh',
 'MDllYzVjODUtOWMxYy00NzE5LWEwNWItM2I4NDE3NGU5MDk0',
 'MTRmZWQ3MGEtZDU3MS00NjIzLWEyZDEtZjFkZjUzNDAyNDQ1',
 'ZWY1MTZiNTUtZjJmNC00M2FmLTkxN2EtZGRmNGU2MmNjNTI3',
 'NzU0YTM3Y2UtZTkwMy00YmIxLTlkZTktNzg1ZWQwMGYyOTcw',
 'ZmNkOGY4ZTktYTQ3ZC00MjMyLThjNDktNWFkNTMwZmY2ZWM4',
 'Nzg1Yzc3MTgtYjc0NS00YmZhLWEzNjEtZTVhYzhiMzA5ZGM2',
 'YWRlMjIxODctMjUzYi00MTE3LTg3YmMtZTY0ZjJmNzZhYWM4',
 'YmZkMzM0NmEtYmM4Ni00OGMxLTgwNmUtZmI2OGVlNzBiMTcx',
 'YzM3Y2E4MTQtZTk1NS00NDVjLTgwODktMzcwYzU3NTQ1YTcx',
 'OTczODJjODItMmFiYy00N2IwLTk3ZTItZWZiOTUwOTYwYzZl',
 'NGU5Yjg5MTEtNmVmNi00M2U2LWIwOTYtN2Q2MmFiZDcxZTAw',
 'NzYwNTY1NWQtMmQ2Yy00NmM3LWFkZWQtMDY1OTJlNGE2ZmQw',
 'OTAxMzkxYTktY2Q5MC00OGJkLWFiYzAtNzlhMDdiZjRhNjdj',
 'MzY3NTZhOWUtYTVlMi00NmFiLTlkZjgtNDk2MDI5YjM4NThi',
 'NzdmMmExMzItYTZkMy00YmYxLThkZjMtY2MxYjExMjNjZjRk',
 'OWRlNmI2ODQtNzI0YS00MDZjLWE4YjUtOGY5ZGUzMGZjMDdh',
 'NWVlODlhZDgtZGMwMS00NDU5LTg5MjAtYTlmODQ3OWQ1

In [51]:
# Similarity search over the indexed chunks using the store's default settings.
_vector_search.similarity_search(query="challenges of multilingual setup")

[Document(metadata={'id': 'OTczODJjODItMmFiYy00N2IwLTk3ZTItZWZiOTUwOTYwYzZl', 'Header 1': '4 Experiments', 'Header 2': '4.4 Multilingual Retrieval'}, page_content='Multilingual setup poses several additional chal- lenges to HyDE. The small-sized contrastive en- coder gets saturated as the number of languages scales (Conneau et al., 2020; Izacard et al., 2021). Meanwhile, our generative LLM faces an opposite issue: with languages of not as high resource as English or French, the high capacity LLM can get under-trained (Hoffmann et al., 2022).  \nNevertheless, in Table 3, we still find HyDE able to improve the mContriever model. It can outperform non-Contriever models fine-tuned on and transferred from MS-MARCO. On the other hand, we do observe some margins between HyDE and fine-tuned mContrieverFT. Since HyDE and mContrieverFT use similar contrastive encoders, we hypothesize this is because the non-English lan- guages we considered are under-trained in both pre-training and instruction 

In [54]:
# Same query, run through the store's semantic hybrid search.
_vector_search.semantic_hybrid_search(query="challenges of multilingual setup")

[Document(metadata={'id': 'OTczODJjODItMmFiYy00N2IwLTk3ZTItZWZiOTUwOTYwYzZl', 'Header 1': '4 Experiments', 'Header 2': '4.4 Multilingual Retrieval', 'captions': {'text': 'Multilingual setup poses several additional chal- lenges to HyDE. The small-sized contrastive en- coder gets saturated as the number of languages scales (Conneau et al., 2020; Izacard et al., 2021). Meanwhile, our generative LLM faces an opposite issue: with languages of not as high resource as English or French, the high capacity LLM can get.', 'highlights': 'Multilingual setup poses several additional chal- lenges to HyDE. The small-sized contrastive en- coder gets saturated as the number of languages scales (Conneau et al., 2020; Izacard et al., 2021). Meanwhile, our<em> generative LLM </em>faces an opposite issue: with languages of not as high resource as English or French, the high capacity LLM can get.'}, 'answers': ''}, page_content='Multilingual setup poses several additional chal- lenges to HyDE. The small-si

In [62]:
# Semantic hybrid search returning (document, score) pairs; with
# score_type="reranker_score", res[0][1] is the reranker score of the top hit.
res = _vector_search.semantic_hybrid_search_with_score(
    query="challenges of multilingual setup",
    score_type="reranker_score",
)

res

[(Document(metadata={'id': 'OTczODJjODItMmFiYy00N2IwLTk3ZTItZWZiOTUwOTYwYzZl', 'Header 1': '4 Experiments', 'Header 2': '4.4 Multilingual Retrieval', 'captions': {'text': 'Multilingual setup poses several additional chal- lenges to HyDE. The small-sized contrastive en- coder gets saturated as the number of languages scales (Conneau et al., 2020; Izacard et al., 2021). Meanwhile, our generative LLM faces an opposite issue: with languages of not as high resource as English or French, the high capacity LLM can get.', 'highlights': 'Multilingual setup poses several additional chal- lenges to HyDE. The small-sized contrastive en- coder gets saturated as the number of languages scales (Conneau et al., 2020; Izacard et al., 2021). Meanwhile, our<em> generative LLM </em>faces an opposite issue: with languages of not as high resource as English or French, the high capacity LLM can get.'}, 'answers': ''}, page_content='Multilingual setup poses several additional chal- lenges to HyDE. The small-s

In [3]:
# Get the directory of the project root
import os

# Absolute path two levels above the current working directory.
project_root = os.path.abspath(os.path.join(os.curdir, os.pardir, os.pardir))
print(f"project_root= {project_root}")

project_root= /Users/vladfeigin/myprojects/dai-demos/aidemos


In [6]:
# Absolute path of the parent of the current working directory.
project_root = os.path.abspath(os.pardir)
print(f"project_root= {project_root}")

project_root= /Users/vladfeigin/myprojects/dai-demos/aidemos/llmops
