In [1]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import load_prompt


from src.process import process_arxiv_paper_from_url



In [2]:
# URL of the arXiv abstract page for the paper this notebook works on.
paper_url = "https://arxiv.org/abs/2204.08387"

In [3]:
# Process the paper at `paper_url` with the project helper. The call returns a
# pair; only the first element is kept (later cells iterate it with `.items()`
# as section -> text) and the second element is discarded.
tex_content, _ = process_arxiv_paper_from_url(paper_url)

2024-02-18 14:40:50.462 
  command:

    streamlit run /home/david/projects/paperlight/.venv/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-02-18 14:40:50.463 No runtime found, using MemoryCacheStorageManager




In [None]:
# Dump every section: its name, a 20-dash rule, its text, then blank lines.
# A single print call with sep="\n" emits exactly what four separate prints would.
for section, content in tex_content.items():
    print(section, "-" * 20, content, "\n", sep="\n")

In [None]:
# One Document per paper section; the section name travels in the metadata so
# it can be read back from retrieved chunks later.
docs = [
    Document(metadata={"section": name}, page_content=body)
    for name, body in tex_content.items()
]

In [None]:
# Splitter used to break section texts into overlapping chunks.
# NOTE(review): chunk_size/chunk_overlap are presumably measured in characters
# (the splitter's default length function) — confirm against the library docs.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2024, chunk_overlap=256)

In [None]:
# Split the per-section documents into chunks with the splitter configured above.
splits = text_splitter.split_documents(docs)

In [None]:
# Build a Chroma vector store from the chunks, embedding them with
# OpenAIEmbeddings, and expose the store as a retriever for use in the chain.
vectorstore = Chroma.from_documents(splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

In [None]:
# Demonstrate similarity search: ask the vector store for the 3 chunks closest
# to a free-text question.
# NOTE(review): `simmilar_docs` is misspelled ("similar") and is never displayed
# in this cell; the name is left unchanged in case other cells reference it.
question = "is there an email i can ask for help"
simmilar_docs = vectorstore.similarity_search(question, k=3)

In [None]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    page_texts = [entry.page_content for entry in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [None]:
def unique_values_dict(list_):
    """Return the distinct items of ``list_`` in first-seen order.

    Items are used as dictionary keys, so they must be hashable.

    Args:
        list_: iterable of hashable values.

    Returns:
        A new list with duplicates removed, preserving original order.
    """
    first_seen = {}
    for item in list_:
        # setdefault keeps the first occurrence and ignores later duplicates.
        first_seen.setdefault(item, None)
    return [*first_seen]

In [None]:
def format_section_metadata(docs):
    """Build the one-line summary naming the sections the documents came from.

    Each document's ``metadata["section"]`` is read (falling back to
    "Unknown" when the key is absent), duplicates are removed via
    ``unique_values_dict``, and the names are comma-joined after a fixed prefix.

    Args:
        docs: iterable of objects exposing a ``metadata`` mapping.

    Returns:
        The summary string.
    """
    section_names = []
    for doc in docs:
        section_names.append(doc.metadata.get("section", "Unknown"))
    distinct_sections = unique_values_dict(section_names)
    prefix = "Information was retrieved from following sections: "
    return prefix + ", ".join(distinct_sections)

In [None]:
# Chat model that generates the answers; temperature=0 asks for the least
# random sampling the model offers.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [None]:
# Load the prompt template from a YAML file. The path is relative, so this
# presumably resolves against the kernel's working directory — confirm when
# running the notebook from a different folder.
prompt = load_prompt("prompts/pb.yaml")

In [None]:
from operator import itemgetter

In [None]:
# Answer-generation sub-chain. Its input is a dict that already contains
# "raw_context" (the retrieved documents); `assign` adds a "context" key holding
# their concatenated text, and the enriched dict is piped through the prompt,
# the chat model, and the string output parser.
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=(lambda x: format_docs(x["raw_context"])),
    )
    | prompt
    | llm
    | StrOutputParser()
)

# Full chain: the input is handed both to the retriever ("raw_context") and
# through unchanged ("question"); `assign` then adds "answer" (from the
# sub-chain above) and "sources" (a summary of the retrieved documents'
# section metadata).
rag_chain_with_source = RunnableParallel(
    {
        "raw_context": retriever,
        "question": RunnablePassthrough(),
    }
).assign(
    answer=rag_chain_from_docs,
    sources=(lambda x: format_section_metadata(x["raw_context"])),
)

In [None]:
# Run the full chain once (non-streaming); being the last expression, the
# result is displayed as the cell output.
rag_chain_with_source.invoke("What is this paper about?")

In [None]:
# Stream the answer to stdout as it is generated, then print the sources line.
# `source` is reset before the loop so a value left over from an earlier cell
# run cannot be printed by mistake, and the final print is guarded so a stream
# that never yields "sources" no longer raises NameError.
source = None
for chunk in rag_chain_with_source.stream("What is this paper about?"):
    if chunk.get("sources"):
        source = chunk["sources"]
    if chunk.get("answer"):
        print(chunk["answer"], end="", flush=True)
if source is None:
    print()  # still terminate the streamed answer line
else:
    print("\n" + source)

In [4]:
from src.qa_chain import get_qa_chain, _tex_to_splits
from src.display import streamify_qa_response

In [5]:
# Build the QA chain from the processed paper using the project helper
# imported from src.qa_chain.
qa_chain = get_qa_chain(tex_content)

In [14]:
import time


In [None]:
def print_qa_response(chain, question):
    """
    Stream a QA chain's answer to stdout, then print its sources line.

    Args:
    - chain: object whose ``stream(question)`` yields dict-like chunks that
      may carry an "answer" and/or a "sources" entry.
    - question: the input forwarded to ``chain.stream``.

    Returns:
    - None: everything is written to stdout.
    """
    # Start from a known state so a stream without "sources" cannot raise
    # NameError at the final print.
    source = None
    for chunk in chain.stream(question):
        sources_part = chunk.get("sources")
        if sources_part:
            # The last non-empty "sources" value wins.
            source = sources_part
        answer_part = chunk.get("answer")
        if answer_part:
            # Emit answer fragments back-to-back so they read as one text.
            print(answer_part, end="", flush=True)
    if source is None:
        print()  # still terminate the streamed answer line
    else:
        print("\n" + source)

In [15]:
# Print each piece yielded by `streamify_qa_response` as it arrives, with no
# separator between pieces (end="") and a flush after every piece.
for chunk in streamify_qa_response(qa_chain, "What is this paper about?"):
    print(chunk, end="", flush=True)

The paper is about LayoutLMv3, a multimodal Transformer model designed for Document AI tasks that uses unified text and image masking pre-training objectives to learn multimodal representations. LayoutLMv3 does not rely on pre-trained CNN or Faster R-CNN backbones, achieving generality and superiority for both text-centric and image-centric Document AI tasks. The paper also discusses the effect of linear image embeddings and different pre-training objectives on the model's performance.

Information was retrieved from following sections: Conclusion and Future Work, Experiments, Introduction

In [None]:
# Run the QA chain once (non-streaming); being the last expression, the result
# is displayed as the cell output.
qa_chain.invoke("What is this paper about?")

In [None]:
# Stream the answer to stdout as it is generated, then print the sources line.
# `source` is reset before the loop so a value left over from an earlier cell
# run cannot be printed by mistake, and the final print is guarded so a stream
# that never yields "sources" no longer raises NameError.
source = None
for chunk in qa_chain.stream("What is this paper about?"):
    if chunk.get("sources"):
        source = chunk["sources"]
    if chunk.get("answer"):
        print(chunk["answer"], end="", flush=True)
if source is None:
    print()  # still terminate the streamed answer line
else:
    print("\n" + source)

In [None]:
# Print every raw chunk yielded by the stream, one per line, to inspect what
# the chain emits while streaming.
for chunk in qa_chain.stream("What is this paper about?"):
    print(chunk)