## Environment Setup

In [None]:
from platform import python_version
# Show the version of the Python interpreter the kernel is running on
print(python_version())

In [None]:
!pip install -r requirements.txt

## 1. Uploading the PDF Files

In [None]:
import os
from IPython.display import display
from ipywidgets import widgets

# Upload widget: restricted to .pdf files, several files may be picked at once
upload_button = widgets.FileUpload(accept=".pdf", multiple=True)

# Output area; the upload handler prints its messages inside it
output = widgets.Output()

# Create function to handle file upload
def _iter_uploaded_files(uploaded):
    """Yield (filename, content) pairs from a FileUpload value.

    Accepts a dict keyed by filename (the ipywidgets 7.x format) as well as
    a sequence of per-file dicts carrying a 'name' key (the 8.x format).
    """
    if isinstance(uploaded, dict):
        for filename, file in uploaded.items():
            yield filename, file['content']
    else:
        for file in uploaded:
            yield file['name'], file['content']


def on_file_upload(change):
    """Save every uploaded file into the "sou" folder, then reset the widget.

    Args:
        change: Change notification passed by ``observe``. It is not used;
            the files are read from ``upload_button.value``.
    """
    uploaded = upload_button.value
    if not uploaded:
        # Nothing to save (also reached when the value is reset below)
        return

    with output:
        # The target folder may not exist yet on a fresh checkout
        os.makedirs("sou", exist_ok=True)
        for filename, content in _iter_uploaded_files(uploaded):
            # Keep only the final path component so a name containing
            # directory parts cannot write outside the target folder
            safe_name = os.path.basename(filename)
            if not safe_name:
                print(f"Skipped upload with invalid name: {filename!r}")
                continue
            with open(os.path.join("sou", safe_name), "wb") as f:
                f.write(content)
            print(f"Successfully saved: {safe_name}")

    # Clear the value of the upload button to prevent repeat calls
    if isinstance(uploaded, dict):
        uploaded.clear()
        upload_button._counter = 0
    else:
        # Tuple-style values cannot be cleared in place; assign an empty one
        upload_button.value = ()

# Run the handler each time the widget's "value" trait changes
upload_button.observe(on_file_upload, names="value")

# Show the upload control followed by the output area
display(upload_button, output)


## 2. Indexing and Embedding the PDF Files into ChromaDB

In [None]:
import chromadb
import os
from chromadb.config import Settings
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings 
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

def loadstore(val):
    """Return ``val`` prefixed with the greeting "hello "."""
    greeting = "hello " + val
    return greeting

def getfromstore(collection_name="tdocsfolder", persist_directory="db/"):
    """Open a persisted Chroma collection as a LangChain vector store.

    Args:
        collection_name: Name of the Chroma collection to open.
        persist_directory: Folder holding the persisted Chroma data.
            Defaults to "db/", the location that used to be hard-coded
            here, and matches the parameter of the ``addtostore*`` and
            ``deletestore`` functions.

    Returns:
        A ``Chroma`` store that embeds with "all-MiniLM-L6-v2".
    """
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )
    return store

def addtostorepdf(folder_name, collection_name='db', persist_directory="db/"):
    """Index the PDF files of a folder into a persisted Chroma collection.

    Args:
        folder_name: Folder to load PDFs from ("/" is appended to it).
        collection_name: Chroma collection that receives the chunks.
        persist_directory: Folder in which Chroma persists its data.

    Returns:
        The ``Chroma`` store built from the chunks, after ``persist()`` has
        been called on it.
    """
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    pdf_pages = PyPDFDirectoryLoader(folder_name + "/").load()

    # Split with chunk_size=1000 and no overlap between consecutive chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=[" ", ",", "\n"],
    )
    chunks = splitter.split_documents(pdf_pages)

    vector_store = Chroma.from_documents(
        chunks,
        embedding=embedder,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )
    vector_store.persist()
    return vector_store

def addtostoretxt(folder_name, collection_name='db', persist_directory="db/"):
    """Index the .txt files of a folder into a persisted Chroma collection.

    Args:
        folder_name: Folder searched with the "**/*.txt" glob ("/" is
            appended to it).
        collection_name: Chroma collection that receives the chunks.
        persist_directory: Folder in which Chroma persists its data.

    Returns:
        The ``Chroma`` store built from the chunks, after ``persist()`` has
        been called on it.
    """
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    txt_loader = DirectoryLoader(
        folder_name + "/",
        glob="**/*.txt",
        loader_cls=TextLoader,
        silent_errors=True,
    )
    txt_pages = txt_loader.load()

    # Split with chunk_size=500 and chunk_overlap=20
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    chunks = splitter.split_documents(txt_pages)

    vector_store = Chroma.from_documents(
        chunks,
        embedding=embedder,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )
    vector_store.persist()
    return vector_store

def deletestore(collection_name='db', persist_directory="db/"):
    """Delete a Chroma collection, reset the client and remove persisted files.

    Args:
        collection_name: Name of the collection to delete.
        persist_directory: Folder holding the persisted Chroma data. The
            "index" folder and the two parquet files are removed from this
            folder (previously the paths were hard-coded to "db/").

    Returns:
        Whatever ``client.reset()`` returns.
    """
    import shutil  # local import: only this cleanup needs it

    client = chromadb.Client(Settings(persist_directory=persist_directory))
    try:
        client.delete_collection(collection_name)
    except Exception:
        # Best effort: a missing collection must not stop the cleanup, but
        # KeyboardInterrupt/SystemExit are no longer swallowed
        print("Has the collection been already deleted?")
    val = client.reset()

    # Attempt every removal independently so one missing item does not
    # prevent the others from being cleaned up
    cleanup_failed = False
    try:
        # rmtree also removes a non-empty index folder, unlike os.rmdir
        shutil.rmtree(os.path.join(persist_directory, "index"))
    except OSError:
        cleanup_failed = True
    for parquet_name in ("chroma-collections.parquet", "chroma-embeddings.parquet"):
        try:
            os.remove(os.path.join(persist_directory, parquet_name))
        except OSError:
            cleanup_failed = True

    if cleanup_failed:
        print("Have the files been cleanedup already?")
    return val

In [None]:
print("Indexing content and metadata into ChromaDB Store")

# Build the "sou_coll" collection from the PDFs in the "sou" folder
store2 = addtostorepdf(
    folder_name="sou",
    collection_name="sou_coll",
    persist_directory="db/",
)
print(store2)

# Open the same collection again through getfromstore
store = getfromstore(collection_name="sou_coll")
print(store)

## 3. Querying the ChromaDB Vector Store

In [1]:
# Import required packages
import ipywidgets as widgets
from IPython.display import display
from common.funs import getfromstore

# Define a function to get details from the store
def start_capture():
    """Open the "sou_coll" collection and print what its ``get()`` returns."""
    collection_store = getfromstore(collection_name="sou_coll")
    contents = collection_store.get()
    print(contents)  # Display details in Jupyter output

# Create the button widget, labelled "Fetch Details"
fetch_button = widgets.Button(description="Fetch Details")

# Define what happens when the button is clicked
def on_button_clicked(b):
    """Click handler: run ``start_capture``; the argument ``b`` is unused."""
    start_capture()

# Register on_button_clicked as the button's click handler
fetch_button.on_click(on_button_clicked)

# Render the button in the cell output
display(fetch_button)


Button(description='Fetch Details', style=ButtonStyle())

Using embedded DuckDB with persistence: data will be stored in: db/


{'ids': ['c87582f5-f9f3-11ed-978e-e20af6928467', 'c6f16a7c-fa14-11ed-866f-e20af6928467', 'b8e0bb04-f94e-11ed-9f6c-e20af6928467', 'b8e0bb05-f94e-11ed-86b3-e20af6928467', 'b8e096de-f94e-11ed-a43f-e20af6928467', 'b8e096df-f94e-11ed-89d1-e20af6928467', 'b8e096e0-f94e-11ed-80bf-e20af6928467', 'b8e096e1-f94e-11ed-a0c0-e20af6928467', 'b8e096e2-f94e-11ed-bef2-e20af6928467', 'b8e096e3-f94e-11ed-88a3-e20af6928467', 'b8e096e4-f94e-11ed-821d-e20af6928467', 'b8e096e5-f94e-11ed-8836-e20af6928467', 'b8e096e6-f94e-11ed-9a41-e20af6928467', 'b8e096e7-f94e-11ed-8cdb-e20af6928467', 'b8e096e8-f94e-11ed-9418-e20af6928467', 'b8e096e9-f94e-11ed-9731-e20af6928467', 'b8e096ea-f94e-11ed-b097-e20af6928467', 'b8e096eb-f94e-11ed-92a2-e20af6928467', 'b8e096ec-f94e-11ed-9659-e20af6928467', 'b8e096ed-f94e-11ed-9a02-e20af6928467', 'b8e096ee-f94e-11ed-b4c4-e20af6928467', 'b8e096ef-f94e-11ed-a073-e20af6928467', 'b8e096f0-f94e-11ed-b661-e20af6928467', 'b8e096f1-f94e-11ed-bf99-e20af6928467', 'b8e096f2-f94e-11ed-a898-e20af6

Using embedded DuckDB with persistence: data will be stored in: db/


{'ids': ['c87582f5-f9f3-11ed-978e-e20af6928467', 'c6f16a7c-fa14-11ed-866f-e20af6928467', 'b8e0bb04-f94e-11ed-9f6c-e20af6928467', 'b8e0bb05-f94e-11ed-86b3-e20af6928467', 'b8e096de-f94e-11ed-a43f-e20af6928467', 'b8e096df-f94e-11ed-89d1-e20af6928467', 'b8e096e0-f94e-11ed-80bf-e20af6928467', 'b8e096e1-f94e-11ed-a0c0-e20af6928467', 'b8e096e2-f94e-11ed-bef2-e20af6928467', 'b8e096e3-f94e-11ed-88a3-e20af6928467', 'b8e096e4-f94e-11ed-821d-e20af6928467', 'b8e096e5-f94e-11ed-8836-e20af6928467', 'b8e096e6-f94e-11ed-9a41-e20af6928467', 'b8e096e7-f94e-11ed-8cdb-e20af6928467', 'b8e096e8-f94e-11ed-9418-e20af6928467', 'b8e096e9-f94e-11ed-9731-e20af6928467', 'b8e096ea-f94e-11ed-b097-e20af6928467', 'b8e096eb-f94e-11ed-92a2-e20af6928467', 'b8e096ec-f94e-11ed-9659-e20af6928467', 'b8e096ed-f94e-11ed-9a02-e20af6928467', 'b8e096ee-f94e-11ed-b4c4-e20af6928467', 'b8e096ef-f94e-11ed-a073-e20af6928467', 'b8e096f0-f94e-11ed-b661-e20af6928467', 'b8e096f1-f94e-11ed-bf99-e20af6928467', 'b8e096f2-f94e-11ed-a898-e20af6

## 4. QA with Azure OpenAI

In [None]:
# Import required packages
import os
from dotenv import load_dotenv
from langchain.llms import AzureOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo
)
from common.funs import getfromstore

# Read variables from the default .env file into the environment
load_dotenv()

# Azure OpenAI connection settings read through the OPENAI_* variables
os.environ.update({
    "OPENAI_API_TYPE": "azure",
    "OPENAI_API_VERSION": "2022-12-01",
    "OPENAI_API_BASE": "https://cresen-open-ai.openai.azure.com/",
})

# The key itself comes from AZ_OPENAI_API_KEY (raises KeyError when unset)
os.environ["OPENAI_API_KEY"] = os.environ["AZ_OPENAI_API_KEY"]

# Azure OpenAI LLM, using the "cresen-gpt-35-turbo" deployment
llm = AzureOpenAI(
    openai_api_base=os.environ["OPENAI_API_BASE"],
    model="gpt-35-turbo",
    temperature=0.1,
    verbose=True,
    deployment_name="cresen-gpt-35-turbo",
)

# Vector store backed by the "sou_coll" collection
store = getfromstore(collection_name="sou_coll")

# Name and description under which the store is handed to the toolkit
vectorstore_info = VectorStoreInfo(name="sou", description="sou folder", vectorstore=store)

# Wrap the store description in a LangChain toolkit
toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)

# Agent built from the LLM and the toolkit
agent_executor = create_vectorstore_agent(llm=llm, toolkit=toolkit, verbose=True)

# Get user input
prompt = input('Input your prompt here: ')

# Pass the prompt to the LLM
try:
    response = agent_executor.run(prompt)
except Exception as exc:
    # Catch Exception rather than everything, so that interrupting the
    # kernel (KeyboardInterrupt) still stops the cell. Reset `response` so
    # a value from an earlier run is not mistaken for this prompt's answer.
    response = None
    print("That was a difficult question!  Can you please try again with rephrasing it a bit?")
    print(f"(details: {type(exc).__name__}: {exc})")
else:
    # Print the response
    print(response)

# Find the relevant pages
search = store.similarity_search_with_score(prompt)

# Print every result with its source, page and similarity score
print("This information was found in:")
for document, score in search:
    try:
        metadata = document.metadata
        source = metadata['source']
    except (AttributeError, KeyError, TypeError):
        # Report the result that lacks source details and keep listing the
        # remaining ones instead of aborting the whole loop
        print("unable to get source document detail")
        continue
    # Results without a 'page' entry get the "txt snippets" label
    # (presumably chunks loaded from .txt files; confirm against the loaders)
    page_num = metadata.get('page', "txt snippets")
    print("Source: ", source, " - Page: ", page_num, "; Similarity Score: ", score)
    print(document.page_content)


In [None]:
# Run the agent on the current `prompt` and keep the answer in `response`
response = agent_executor.run(prompt)

In [None]:
from langchain.vectorstores import Chroma
from IPython.display import display
import ipywidgets as widgets
from langchain.chains import ConversationalRetrievalChain

# Azure OpenAI LLM for the chat, using the "cresen-gpt-35-turbo" deployment
llm = AzureOpenAI(
    openai_api_base=os.environ["OPENAI_API_BASE"],
    model="gpt-35-turbo",
    temperature=0.1,
    verbose=True,
    deployment_name="cresen-gpt-35-turbo",
)

# Vector store backed by the "sou_coll" collection
store = getfromstore(collection_name="sou_coll")

# Similarity-search retriever over the store, configured with k=4
retriever = store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

# Conversational question-answering chain on top of the retriever
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

# (question, answer) pairs of the conversation so far
chat_history = list()

def on_submit(_):
    """Answer the question typed into ``input_box`` and render the exchange.

    Reads and clears the text box, sends the question together with
    ``chat_history`` to ``qa``, records the (question, answer) pair and
    displays both as HTML. Typing 'exit' only prints a goodbye message.

    Args:
        _: Argument passed by the widget's submit callback; unused.
    """
    import html  # local import: only needed to escape text for widgets.HTML

    query = input_box.value.strip()
    input_box.value = ""

    if not query:
        # Ignore blank submissions instead of sending an empty question
        return

    if query.lower() == 'exit':
        print("Thanks for the chat!")
        return

    # Query the chain with the question and the conversation so far
    result = qa({"question": query, "chat_history": chat_history})
    answer = result['answer']

    chat_history.append((query, answer))

    # Both texts are interpolated into HTML markup; escape them so neither
    # the user's input nor the model's answer is interpreted as markup
    display(widgets.HTML(f'<b>User:</b> {html.escape(query)}'))
    display(widgets.HTML(f'<b><font color="Orange">Chatbot:</font></b> {html.escape(str(answer))}'))

print("Chat with your data. Type 'exit' to stop")

# Text field for the questions; its submit callback is on_submit
input_box = widgets.Text(placeholder="Please enter your question:")
input_box.on_submit(on_submit)

# Render the text field in the cell output
display(input_box)


In [None]:
# Report only whether the key is configured: cell outputs are saved with the
# notebook, so printing the key itself would leak the secret to every reader
print("AZ_OPENAI_API_KEY is set" if os.environ.get('AZ_OPENAI_API_KEY') else "AZ_OPENAI_API_KEY is not set")

In [None]:
# Show the endpoint currently stored in OPENAI_API_BASE
print(os.environ['OPENAI_API_BASE'])

In [None]:
# Run the agent on the current `prompt`; errors are not caught here
response = agent_executor.run(prompt)
# Print the response
print(response)

In [None]:
# Call the LLM directly with a fixed question (no agent or retriever involved)
llm("What is earth's mass ?")