In [4]:
from langchain.chat_models.openai import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the per-document extraction step and the final
# answering step defined further down in this cell.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.1,
)
# Text splitter created through the tiktoken-encoder constructor; it breaks
# on newlines with chunk_size=600 and chunk_overlap=100.
# NOTE(review): sizes are presumably counted in tokens rather than
# characters because of from_tiktoken_encoder -- confirm in the LangChain docs.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
# Load the source document from disk and split it into chunks in one call.
loader = UnstructuredFileLoader("./files/chapter_one.docx")
docs = loader.load_and_split(text_splitter=splitter)
# Wrap the OpenAI embeddings with a byte store rooted at ./.cache/ so the
# vector store below is built through the cache-backed embedder.
embeddings = OpenAIEmbeddings()
cache_dir = LocalFileStore("./.cache/")
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)
# Index every chunk in a FAISS vector store using the cache-backed embedder.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Retriever with default settings; used as the "documents" source of map_chain.
retriever = vectorstore.as_retriever()

# "Map" step prompt: the system message carries one document chunk in
# {context} and asks for the relevant passages verbatim (or '' when nothing
# is relevant); the human message carries the user's {question}.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Use the following portion of a long document to see if any of the 
            text is relevant to answer the question. Return any relevant text 
            verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)
# Prompt piped into the chat model; invoked once per document by map_docs.
map_doc_chain = map_doc_prompt | llm


def map_docs(inputs):
    """Extract the relevant text from each document and merge the results.

    ``inputs`` is a mapping with two entries: ``"documents"``, an iterable
    of objects exposing ``page_content``, and ``"question"``, the user's
    question. Every document is sent through ``map_doc_chain`` together
    with the question, and the ``content`` of each reply is kept in
    document order.

    Returns the collected replies as one string, separated by blank lines
    (an empty string when there are no documents).
    """
    retrieved = inputs["documents"]
    query = inputs["question"]

    extracts = []
    for document in retrieved:
        # One chain call per document: the chunk is the context, the query
        # is the same for all of them.
        reply = map_doc_chain.invoke(
            {"context": document.page_content, "question": query}
        )
        extracts.append(reply.content)

    return "\n\n".join(extracts)


# "Map" stage: the incoming question is handed to the retriever (result
# stored under "documents") and also passed through unchanged (under
# "question"); the resulting dict is then processed by map_docs.
map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | RunnableLambda(map_docs)
# "Reduce" stage prompt: {context} receives the merged extracts produced by
# map_chain and {question} receives the original question.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Given the following extracted parts of a long document and a 
            question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try 
            to make up an answer.
            ------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)
# Full pipeline: build the context with map_chain, fill final_prompt, and
# ask the chat model for the final answer.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)
# Run the pipeline on a sample question and print the model's reply text.
result = chain.invoke("Describe Victory Mansions")
print(result.content)

Victory Mansions is a residential building located in London, the chief city of Airstrip One. It has glass doors and a hallway that smells of boiled cabbage and old rag mats. The building has seven flights of stairs, with Winston Smith's flat being on the seventh floor. A large colored poster of a man with a heavy black mustache is displayed at one end of the hallway. The building is equipped with a telescreen, and the electricity is cut off during daylight hours as part of the economy drive for Hate Week. From the roof of Victory Mansions, one can see all four Ministries of the government simultaneously: the Ministry of Truth, Ministry of Peace, Ministry of Love, and Ministry of Plenty. The telescreen in Winston's living-room is placed in an unusual position, opposite the window, and there is a shallow alcove likely intended for bookshelves.
