### **Load environment variables from the .env file**

In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
#from langchain.llms import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
import openai
from dotenv import load_dotenv
import os

def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set, naming the missing variable
            (assigning ``None`` into ``os.environ`` would otherwise fail with an
            opaque ``TypeError``).
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; define it in your .env file.")
    return value


# Pull the key/value pairs from the .env file into the process environment.
load_dotenv()

# Settings that must be present: they are written back into os.environ below,
# which only accepts strings.
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = _require_env("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_VERSION = _require_env("OPENAI_DEPLOYMENT_VERSION")

# env variables that are used by LangChain
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['OPENAI_API_TYPE'] = "azure"
os.environ['OPENAI_API_VERSION'] = OPENAI_DEPLOYMENT_VERSION
os.environ['OPENAI_API_BASE'] = OPENAI_DEPLOYMENT_ENDPOINT

# Deployment / model names consumed by the cells below (left as None when unset).
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Configure OpenAI API (module-level settings of the openai SDK, used by the
# direct ChatCompletion call later in the notebook)
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY

#### **Run ONLY ONCE to create the embeddings - Split text into chunks** 

In [2]:
# Location of the source PDF, relative to the notebook's working directory.
fileName = "./data/fabric-data-engineering.pdf"

# Read the PDF and split it into documents in a single step.
loader = PyPDFLoader(file_path=fileName)
pages = loader.load_and_split()

# NOTE(review): this counts the documents returned by load_and_split(), which
# may differ from the PDF's physical page count - confirm before relying on it.
print("Number of pages: ", len(pages))

Number of pages:  292


#### **Run ONLY ONCE to create the embeddings - Create embeddings and save to FAISS**

In [3]:
# Embedding client for Azure OpenAI: `deployment` selects the Azure deployment
# to call and `model` names the underlying embedding model. Passing both keeps
# this cell consistent with the cell that later reloads the index.
# chunk_size=1 sends one text per embedding request
# (presumably to respect the Azure batch limit - TODO confirm).
embeddings = OpenAIEmbeddings(
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_ADA_EMBEDDING_MODEL_NAME,
    chunk_size=1)

# Embed every document in `pages` and build the FAISS index from the vectors.
db = FAISS.from_documents(documents=pages, embedding=embeddings)
# save the FAISS index to disk
db.save_local("./dbs/documentation/faiss_index")

#### **Initialize LLM**

In [4]:
# Chat model client shared by the retrieval chain built further down.
# Endpoint and key come from the values loaded from .env in the first cell.
llm = AzureChatOpenAI(
    openai_api_base=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_key=OPENAI_API_KEY,
    deployment_name=OPENAI_DEPLOYMENT_NAME,
    model_name=OPENAI_MODEL_NAME,
)

#### **Query the LLM directly WITHOUT your data - get wrong answers**

In [5]:
from IPython.display import display, HTML, JSON, Markdown


# Prompt sent straight to the chat deployment, with no retrieved context.
messages = [
    {"role": "system", "content": "You are a HELPFUL assistant answering users questions. Answer in a clear and concise manner."},
    {"role": "user", "content": "Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?"},
]

# Call the Azure deployment through the openai SDK configured in the first cell.
answer = openai.ChatCompletion.create(engine=OPENAI_DEPLOYMENT_NAME,
                                      messages=messages)

# Render the reply as Markdown rather than raw HTML: the model returns plain
# text / Markdown, and HTML() would collapse its line breaks and interpret any
# "<...>" in the answer as markup.
display(Markdown(answer.choices[0].message.content))

#### **Initialize retrieval API WITH your data - get the right answers**

In [6]:
# Embedding client used to embed incoming questions. It should target the same
# Azure deployment and model that produced the stored vectors, so both the
# deployment and the model name are passed explicitly.
embeddings = OpenAIEmbeddings(
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_ADA_EMBEDDING_MODEL_NAME,
    chunk_size=1)
# load the vector store to memory
# NOTE(review): LangChain's FAISS.load_local appears to unpickle the stored
# docstore - only load index folders you created yourself.
vectorStore = FAISS.load_local("./dbs/documentation/faiss_index", embeddings)
# Similarity search returning the 2 most similar vectors/documents per query.
retriever = vectorStore.as_retriever(
    search_type="similarity", search_kwargs={"k": 2})
# "stuff" chain over the retriever; source documents are not returned with the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False)

#### **Ask questions**

In [7]:
# Same datamart question as the direct chat call, this time through the retrieval chain.
qa(
    dict(
        query="Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?"
    )
)

{'query': 'Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?',
 'result': 'According to the Microsoft Fabric decision guide, Power BI Datamart is designed for data volumes of up to 100 GB. However, you could consider using a Data Warehouse or a Lakehouse for your 110 GB data volume. Both options support structured data and have unlimited data volume capacity. Data Warehouse is suitable for SQL developers, while Lakehouse is ideal for data engineers and data scientists who prefer to use SQL and Spark.'}

In [8]:
# Second question for the retrieval chain: a how-to about CSV and delta tables.
qa(
    dict(
        query="What are the steps to load a CSV file to a delta table in Microsoft Fabric?"
    )
)

{'query': 'What are the steps to load a CSV file to a delta table in Microsoft Fabric?',
 'result': 'Here are the steps to load a CSV file to a delta table in Microsoft Fabric:\n\n1. Open the Synapse Data Engineering experience in Microsoft Fabric.\n2. Ensure that you are in the desired workspace or create/select one.\n3. Click on the Lakehouse icon under the New section in the main page.\n4. Upload your CSV file to the Lakehouse.\n5. Convert the file to a Delta table.\n6. Generate a dataset and create a Power BI report.\n\nPlease note that Microsoft Fabric is currently in preview, and the information provided may be subject to change.'}