In [5]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import os

api_key = ""
os.environ["OPENAI_API_KEY"] = api_key

In [10]:
loader = DirectoryLoader('datasets', glob="*", show_progress=True)

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=500,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

embed_model = OpenAIEmbeddings(api_key=api_key,
                                  model='text-embedding-3-small')

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

prompt = hub.pull("rlm/rag-prompt")
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [12]:
def merge_docs(retrieved_docs):
    return "\n\n".join([d.page_content for d in retrieved_docs])
docs = loader.load()
documents = text_splitter.split_documents(docs)
vector_index = FAISS.from_documents(documents, embed_model)
retriever = vector_index.as_retriever(search_type="mmr")


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 19.21it/s][A


In [14]:
chain = RunnableParallel(context=retriever, question=RunnablePassthrough()) \
        | prompt
chain.invoke("Tell me something about sora, developed by OpenAI")

ChatPromptValue(messages=[HumanMessage(content='You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise.\nQuestion: Tell me something about sora, developed by OpenAI \nContext: [Document(metadata={\'source\': \'datasets/buffett_quotes.txt\'}, page_content="Warren Buffett: The smarter the journalists are, the better off society is. For to a degree, people read the press to inform themselves - and the better the teacher, the better the student body.\\n\\nWarren Buffett: The investor of today does not profit from yesterday\'s growth.\\n\\nWarren Buffett: We always live in an uncertain world. What is certain is that the United States will go forward over time."), Document(metadata={\'source\': \'datasets/buffett_quotes.txt\'}, page_content="Warren Buffett: We\'ve used up a lot of bullets. And we talk about sti

In [15]:
chain = RunnableParallel(context=retriever, question=RunnablePassthrough()) \
        | prompt \
        | llm \
        | StrOutputParser()
chain.invoke("Tell me something about sora, developed by OpenAI")

"I don't know."