In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
import openai
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_EMBEDDING_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY

#### Split text into chunks 

In [None]:
fileName = ".data/fabric-data-engineering.pdf"
loader = PyPDFLoader(fileName)
pages = loader.load_and_split()

#### Create embeddings and save to FAISS

In [None]:
embeddings = OpenAIEmbeddings(model=OPENAI_EMBEDDING_MODEL_NAME, chunk_size=1)
db = FAISS.from_documents(documents=pages, embedding=embeddings)
#save the FAISS index to disk
db.save_local("./dbs/documentation/faiss_index")

#### Initialize Langchain

In [3]:
llm = AzureOpenAI(deployment_name=OPENAI_DEPLOYMENT_NAME, 
                model_name=OPENAI_MODEL_NAME, 
                openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT, 
                openai_api_key=OPENAI_API_KEY)
embeddings = OpenAIEmbeddings(model=OPENAI_EMBEDDING_MODEL_NAME, chunk_size=1)
#load the vector store to memory
vectorStore = FAISS.load_local("./dbs/documentation/faiss_index", embeddings)
retriever = vectorStore.as_retriever(search_type="similarity", search_kwargs={"k":2})
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False)

####  Ask questions

In [8]:
qa({"query": "Can you implement either lakehouse or data warehouse architectures?"})

{'query': 'Can you implement either lakehouse or data warehouse architectures?',
 'result': ' Yes, you can implement either architecture.<|im_end|>'}

In [5]:
qa({"query": "Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?"})

{'query': 'can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?',
 'result': ' No, Power BI datamart is only best suited for data volumes of 100GB or less. For data volumes of 110GB, you could use either a data warehouse or a lakehouse. The decision between data warehouse and lakehouse would depend on the type of data and the primary developer persona. A data warehouse is best suited for structured data, while a lakehouse is better suited for semi-structured and unstructured data. If the primary developer persona is a SQL engineer or data warehouse developer, a data warehouse would be most appropriate. If the primary developer persona is a data engineer or data scientist, a lakehouse would be more appropriate.<|im_end|>'}

In [9]:
qa({"query": "How many sources are there for dataflow gen 2?"})

{'query': 'how many sources are there for dataflow gen 2?',
 'result': " 150+ connectors.\n\nScenario 2:\nMary is a data engineer with a deep knowledge of the multiple L OB analytic reporting\nrequirements. An upstream team has successfully implemented a solution to migrate\nmultiple L OB's historical and incremental data into a common lakehouse. Mary has been\ntasked with cleaning the data, applying business logics, and loading it into multiple\ndestinations (such as Azure SQL DB, AD X, and a lakehouse) in preparation for their\nrespective reporting teams.\nMary is an experienced P ower Query user, and the data volume is in the low to medium\nrange to achieve desired performance. Dataflows provide no-code or low-code\ninterfaces for ingesting data from hundreds of data sources. With dataflows, you can\ntransform data using 300+ data transformation options, and write the results into\nmultiple destinations with an easy to use, highly visual user interface. Mary reviews the\noptions and