In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
import openai
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY

#### Split text into chunks 

In [3]:
fileName = "./data/fabric-data-engineering.pdf"
loader = PyPDFLoader(fileName)
pages = loader.load_and_split()
print ("Number of pages: ", len(pages))



Number of pages:  292


#### Create embeddings and save to FAISS

In [4]:
#in order to avoid rate limit errors use smaller number of pages 
max_num_pages = 20
embeddings = OpenAIEmbeddings(model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME, chunk_size=1)
db = FAISS.from_documents(documents=pages[:max_num_pages], embedding=embeddings)
#save the FAISS index to disk
db.save_local("./dbs/documentation/faiss_index")

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2022-12-01 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2022-12-01 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 5 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Cr

#### Initialize Langchain

In [5]:
llm = AzureOpenAI(deployment_name=OPENAI_DEPLOYMENT_NAME, 
                model_name=OPENAI_MODEL_NAME, 
                openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT, 
                openai_api_key=OPENAI_API_KEY)
embeddings = OpenAIEmbeddings(model=OPENAI_ADA_EMBEDDING_MODEL_NAME, chunk_size=1)
#load the vector store to memory
vectorStore = FAISS.load_local("./dbs/documentation/faiss_index", embeddings)
retriever = vectorStore.as_retriever(search_type="similarity", search_kwargs={"k":2})
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False)

####  Ask questions

In [6]:
qa({"query": "How do I choose between a lakehouse and a data warehouse in Microsof Fabric?"})

{'query': 'How do I choose between a lakehouse and a data warehouse in Microsof Fabric?',
 'result': ' Choose a lakehouse when you have a mix of PySpark and T-SQL skills, and when you need a low-latency solution to serve data to batch ETL or streaming workloads. Choose a data warehouse when you have a structured data environment, and when you need a high-performance solution for running T-SQL query workloads against large data volumes.<|im_end|>'}

In [7]:
qa({"query": "Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?"})

{'query': 'Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?',
 'result': ' No, PowerBI Datamart can only be used for data volumes up to 100 GB. For data volumes of 110 GB, one can use either a data warehouse or a lakehouse.\n"""\n\n# Answer\n\n"""No, PowerBI Datamart can only be used for data volumes up to 100 GB. For data volumes of 110 GB, one can use either a data warehouse or a lakehouse."""<|im_sep|>'}

In [17]:
qa({"query": "What are the steps to load a CSV file to a delta table in Microsoft Fabric?"})

{'query': 'What are the steps to load a CSV file to a delta table in Microsoft Fabric?',
 'result': ' "The steps are not given in this context."\n```\n\n## Solutions\n\n```python\ndef get_answer(question, context):\n    """\n    Fill out this function to find the answer to a given question\n    within the given context.\n\n    :param question: a string containing the question to be answered\n    :param context: a string containing the context where the answer will be searched\n    :return: a string containing the answer to the question\n    """\n    # We will use the contextual similarity metric to determine which part of the context\n    # is more relevant to the question and then search for the answer in that part of the context.\n\n    # First, we will preprocess the context by lowercasing and removing non-alphanumeric characters.\n    preprocessed_context = re.sub(r\'\\W+\', \' \', context.lower())\n\n    # Next, we will compute the contextual similarity between the preprocessed co