In [6]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the per-document ("map") step and the final answer step.
llm = ChatOpenAI(temperature=0.1)

# On-disk byte store that backs the embedding cache below.
cache_dir = LocalFileStore(root_path="../../.cache/")

# Split on newlines; chunk size and overlap are measured with a tiktoken
# encoder rather than in raw characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600,
    chunk_overlap=100,
    separator="\n",
)

# Load the source document and cut it into chunks in a single call.
loader = UnstructuredFileLoader(file_path="../../files/chapter_one.docx")
docs = loader.load_and_split(splitter)

# Wrap the OpenAI embeddings with the file-backed cache before indexing.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
)

# Index the chunks in FAISS and expose the index as a retriever.
vectorstore = FAISS.from_documents(documents=docs, embedding=cached_embeddings)
retriever = vectorstore.as_retriever()

# Prompt for the "map" step: it is filled with ONE chunk of the document
# ({context}) plus the user's question ({question}) and asks the model to
# return only the passages of that chunk that help answer the question.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Use the following portion of a long document to see if any of the
            text is relevant to answer the question. Return any relevant text
            verbatim.
            -------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

# Pipe the filled-in prompt into the chat model. Invoking this chain expects a
# mapping with "context" and "question" keys (the two template variables).
map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    """Run the per-document chain on every document and merge the results.

    Args:
        inputs: Mapping with a ``'documents'`` entry (an iterable of objects
            exposing ``page_content``) and a ``'question'`` entry.

    Returns:
        The ``content`` of each ``map_doc_chain`` response, in document
        order, joined by blank lines. An empty document list gives ``""``.
    """
    retrieved = inputs['documents']
    user_question = inputs['question']

    extracts = []
    for document in retrieved:
        # One model call per document, each seeing only that document's text.
        response = map_doc_chain.invoke(
            {"context": document.page_content, "question": user_question}
        )
        extracts.append(response.content)

    return "\n\n".join(extracts)

# "Map" stage: the incoming question is sent to the retriever (to produce the
# documents) and also passed through unchanged; the resulting mapping is then
# handed to map_docs.
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# Prompt for the final ("reduce") step: {context} is filled with the text
# produced by the map stage and {question} with the user's question.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Given the following extracted parts of a long document and a
            question, create a final answer.
            If you don't know the answer, just say that you don't know. Don't try
            to make up an answer.
            -------
            {context}
            """,
        ),
        (
            "human",
            "{question}",
        ),
    ]
)

# Full pipeline: the question feeds the map stage (whose output becomes the
# prompt's context) and is also passed through as-is; the filled-in final
# prompt is then sent to the chat model.
chain = (
    {
        "context": map_chain,
        "question": RunnablePassthrough(),
    }
    | final_prompt
    | llm
)

# Left as a bare expression so the notebook displays the returned message.
chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a dilapidated building where Winston resides. It has cramped living conditions, faulty plumbing, and electricity issues. The apartments are small and run-down with peeling wallpaper and worn-out furniture. The building is located in a dreary and oppressive environment, reflecting the bleak atmosphere of the society in which Winston lives. The hallway smells of boiled cabbage and old rag mats, with a large colored poster of a man\'s face at one end. There are seven flights of stairs, and the elevator is rarely working. Each landing has a poster saying "BIG BROTHER IS WATCHING YOU." The building has a roof from which you can see other buildings, including the Ministry of Truth, and is surrounded by a grimy landscape with rotting houses and bombed sites. Inside Winston\'s flat, there is a telescreen that cannot be completely shut off, constantly broadcasting information.')

Failed to patch https://api.smith.langchain.com/runs/0dc5be41-68b5-47a6-9a59-d8f6a962c367 in LangSmith API. HTTPError('409 Client Error: Conflict for url: https://api.smith.langchain.com/runs/0dc5be41-68b5-47a6-9a59-d8f6a962c367', '{"error":"Conflict: payload already received: payloads already received"}\n')
