<img src="./images/logo.svg" alt="lakeFS logo" width=300/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<img src="./images/langchain.jpeg" alt="LangChain logo" width=300/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<img src="./images/openai-lockup-black.svg" alt="OpenAI logo" width=250/>

# Build an AI Agent by using lakeFS, LangChain and OpenAI

## Config

### lakeFS endpoint and credentials

Change these if you are using a lakeFS server other than the one provided in the samples repo.

In [None]:
# Connection settings for the lakeFS server. These values are passed to the
# LangChain LakeFSLoader and exported as LAKECTL_* environment variables
# further down in this notebook.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io' 
lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Storage Information

If you're not using the lakeFS server provided in the samples repo, change the Storage Namespace to a location in the bucket you've configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [None]:
# Base storage location; the repository's own namespace is built later as
# f"{storageNamespace}/{repo_name}".
storageNamespace = 's3://example' # e.g. "s3://bucket"

### OpenAI API Key
##### If you do not have an API key then create a free OpenAI account and API key here: https://platform.openai.com/api-keys

In [None]:
# NOTE(review): this looks like a placeholder; replace it with a real OpenAI
# API key before running. It is passed to OpenAIEmbeddings and to the OpenAI LLM.
openai_api_key = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"

## Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository this notebook creates and reads from.
repo_name = "ai-agent-repo"

### Versioning Information 

In [None]:
# Default branch of the repository; both version branches are created from it
# and merged back into it.
sourceBranch = "main"
# One branch per version of the document.
version1Branch = "version1"
version2Branch = "version2"
# Object name of the document in lakeFS; also the file name read from
# /data/<branch>/ when uploading.
documentName = "lakeFS Brochure.pdf"

### Import libraries

In [None]:
import os
import lakefs
from assets.lakefs_demo import print_commit, print_diff

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.document_loaders import LakeFSLoader

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores.faiss import FAISS
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool, AgentType
from langchain.agents import AgentExecutor

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

### Create a function to load documents from lakeFS repository by using an [official lakeFS document loader for LangChain](https://python.langchain.com/docs/integrations/document_loaders/lakefs)
##### Split documents into smaller chunks, convert documents into OpenAI embeddings and store them in an in-memory vector database (Meta’s [FAISS](https://ai.meta.com/tools/faiss/))

In [None]:
def load_document(repo: str, ref: str, path: str) -> FAISS:
    """Load a document from lakeFS and index it in an in-memory FAISS store.

    Args:
        repo: Name of the lakeFS repository to read from.
        ref: lakeFS reference to read at (this notebook passes branch names).
        path: Path of the object inside the repository.

    Returns:
        A FAISS vector store built from the document's chunks, embedded with
        OpenAI embeddings.
    """
    loader = LakeFSLoader(
        lakefs_endpoint=lakefsEndPoint,
        lakefs_access_key=lakefsAccessKey,
        lakefs_secret_key=lakefsSecretKey,
    )
    loader.set_repo(repo)
    loader.set_ref(ref)
    loader.set_path(path)
    raw_documents = loader.load()

    # Split into chunks of up to 1000 characters with a 100-character overlap.
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = chunker.split_documents(raw_documents)

    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_documents(chunks, embedding=embeddings)

### The setup_qa_agent function sets up LangChain's RetrievalQA chain, where a retriever is used to fetch the most relevant text from the document based on a query. 

In [None]:
def setup_qa_agent(vector_store):
    """Build a zero-shot ReAct agent whose only tool is a RetrievalQA chain.

    Args:
        vector_store: Vector store holding the document chunks; its retriever
            feeds the QA chain.

    Returns:
        The agent created by ``initialize_agent`` (verbose output enabled).
    """
    doc_retriever = vector_store.as_retriever()

    # The same deterministic (temperature=0) model drives both the QA chain
    # and the agent.
    language_model = OpenAI(
        model='gpt-3.5-turbo-instruct',
        temperature=0,
        openai_api_key=openai_api_key,
    )

    # "stuff" chain type is used to combine the retrieved text for the model.
    retrieval_chain = RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=doc_retriever,
    )

    document_qa_tool = Tool(
        name="DocumentQA",
        func=retrieval_chain.run,
        description="Use this tool to answer questions based on the content of the document.",
    )

    return initialize_agent(
        tools=[document_qa_tool],
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        llm=language_model,
        verbose=True,
    )

### The run_agent function is where everything comes together. You provide the vector store and the question, and it will return the answer based on the document's content.

In [None]:
def run_agent(db: FAISS, question: str):
    """Answer a question using an agent built over the given vector store.

    Args:
        db: Vector store containing the document chunks.
        question: Question to put to the agent.

    Returns:
        Whatever the agent's ``run`` call returns for the question.
    """
    # A fresh agent is built on every call, over the store passed in.
    return setup_qa_agent(db).run(question)

### Set environment variables

In [None]:
# Export the lakeFS endpoint and credentials as LAKECTL_* environment variables.
os.environ.update({
    "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
    "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
    "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
})

### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Verify the lakeFS endpoint and credentials by asking the server for its version.
print("Verifying lakeFS credentials…")
try:
    v = lakefs.client.Client().version
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report the cause
    # instead of hiding it. The notebook deliberately keeps running.
    print(f"🛑 failed to get lakeFS version: {err}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

### Define lakeFS Repository

In [None]:
# Create the repository under <storageNamespace>/<repo_name>.
# NOTE(review): exist_ok=True presumably reuses an existing repository of the
# same name instead of failing — confirm against the lakeFS SDK docs.
repo = lakefs.Repository(repo_name).create(
    storage_namespace=f"{storageNamespace}/{repo_name}",
    default_branch=sourceBranch,
    exist_ok=True,
)
# Handle to the default branch; the version branches are merged into it later.
branchMain = repo.branch(sourceBranch)
print(repo)

# Main demo starts here 🚦 👇🏻

### Create version1 branch

In [None]:
# Create the version1 branch from the source branch and show the commit it points at.
branchVersion1 = repo.branch(version1Branch).create(
    source_reference=sourceBranch,
    exist_ok=True,
)
print(
    f"{version1Branch} ref:",
    branchVersion1.get_commit().id,
)

### Upload [lakeFS Brochure.pdf](<./data/version1/lakeFS Brochure.pdf>) document to version1 branch

In [None]:
# Read the version1 document from local disk and upload it to the version1 branch.
# A context manager closes the file handle, which the bare open(...).read() did not.
with open(f"/data/{version1Branch}/{documentName}", 'rb') as source_file:
    contentToUpload = source_file.read()
print(branchVersion1.object(documentName).upload(data=contentToUpload, mode='wb', pre_sign=False))

### Commit changes and attach some metadata

In [None]:
# Commit the upload on the version1 branch, tagging the commit with its version.
ref = branchVersion1.commit(
    message='Uploaded lakeFS Brochure',
    metadata={'version': 'version1'},
)
print_commit(ref.get_commit())

### Load [lakeFS Brochure.pdf](<./data/version1/lakeFS Brochure.pdf>) (version 1) document to vector database

In [None]:
# Build the vector store from the document as it exists on the version1 branch.
db = load_document(repo=repo_name, ref=version1Branch, path=documentName)

### Let's ask a question

In [None]:
# Ask the agent a question against the version1 document.
answer = run_agent(db=db, question='why should I use lakeFS')
print(answer)

### Merge version1 branch to main

In [None]:
# Merge the version1 branch into the main branch and print the merge result.
res = branchVersion1.merge_into(branchMain)
print(res)

### Create version2 branch

In [None]:
# Create the version2 branch from the source branch and show the commit it points at.
branchVersion2 = repo.branch(version2Branch).create(
    source_reference=sourceBranch,
    exist_ok=True,
)
print(
    f"{version2Branch} ref:",
    branchVersion2.get_commit().id,
)

### Upload 2nd version of the [lakeFS Brochure.pdf](<./data/version2/lakeFS Brochure.pdf>) document

In [None]:
# Read the version2 document from local disk and upload it to the version2 branch.
# A context manager closes the file handle, which the bare open(...).read() did not.
with open(f"/data/{version2Branch}/{documentName}", 'rb') as source_file:
    contentToUpload = source_file.read()
print(branchVersion2.object(documentName).upload(data=contentToUpload, mode='wb', pre_sign=False))

### Commit changes and attach some metadata

In [None]:
# Commit the upload on the version2 branch, tagging the commit with its version.
ref = branchVersion2.commit(
    message='Uploaded lakeFS Brochure',
    metadata={'version': 'version2'},
)
print_commit(ref.get_commit())

### Load [lakeFS Brochure.pdf](<./data/version2/lakeFS Brochure.pdf>) (version 2) document to vector database

In [None]:
# Rebuild the vector store from the document as it exists on the version2 branch.
db = load_document(repo=repo_name, ref=version2Branch, path=documentName)

### Ask the same question by using version2 document

In [None]:
# Ask the same question, now against the version2 document.
answer = run_agent(db=db, question='why should I use lakeFS')
print(answer)

### Merge version2 branch to main

In [None]:
# Merge the version2 branch into the main branch and print the merge result.
res = branchVersion2.merge_into(branchMain)
print(res)

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack