In [None]:
%pip install -qU langchain langchain-ollama langchain-community pypdf

In [12]:
from langchain_ollama import ChatOllama
from langchain_core.runnables import RunnableParallel, RunnablePassthrough


In [13]:
# RunnableParallel feeds the same input to every named branch and gathers the
# branch outputs into a single dict keyed by branch name.
runnable = RunnableParallel(
    {
        "passed": RunnablePassthrough(),  # forwards the input as-is
        "modified": lambda payload: payload["num"] + 1,  # reads "num" and adds 1
    }
)

# Run both branches on a sample input and show the combined result
result = runnable.invoke({"num": 1})
print(result)


{'passed': {'num': 1}, 'modified': 2}


In [14]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [15]:
# Location of the sample PDF -- change this to point at your own file
pdf_path = "../00-example_data/layout-parser-paper.pdf"

# Splitter settings: chunks of at most 1000 characters, with a 20-character
# overlap between neighbouring chunks so context is not cut off abruptly
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

# Read the PDF into a list of document objects
loader = PyPDFLoader(pdf_path)
pdf_docs = loader.load()
print(f"Loaded {len(pdf_docs)} documents from the file.")

# Break the loaded documents into smaller chunks
documents = text_splitter.split_documents(pdf_docs)
print(f"Total split documents: {len(documents)}")


Loaded 16 documents from the file.
Total split documents: 53


In [16]:
# Chat prompt with two placeholders:
#   {context}  - the text the answer must be grounded in
#   {question} - the user's question
# The {question} placeholder is required: without it the template only consumes
# "context", and the question passed into the chain never reaches the model.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

# Build the prompt object; its input variables are inferred from the placeholders
prompt = ChatPromptTemplate.from_template(template)

# Chat model served by Ollama (uses the "llama3.2:latest" model tag)
model = ChatOllama(model="llama3.2:latest")


In [17]:
from langchain_core.runnables import RunnableLambda

# Stand-in "retriever": always hands back the first three chunks
def retrieve_documents(query):
    """Return the text of the first three entries of ``documents``, newline-joined.

    ``query`` is accepted so the function can be placed in a chain that passes
    the question in, but it is not consulted -- no relevance filtering is done.
    """
    leading_chunks = documents[:3]
    chunk_texts = (chunk.page_content for chunk in leading_chunks)
    return "\n".join(chunk_texts)

# Wrap the plain function so it can be composed with other runnables
retriever_runnable = RunnableLambda(retrieve_documents)


In [18]:

# Pipeline: fan the incoming question out into a {"context", "question"} dict,
# render the prompt from it, call the model, and reduce the reply to a string.
retrieval_chain = (
    RunnableParallel(
        context=retriever_runnable,  # context text produced by the retriever
        question=RunnablePassthrough(),  # the question itself, unchanged
    )
    | prompt
    | model
    | StrOutputParser()
)

# Ask a question and display the model's answer
result = retrieval_chain.invoke("What is discussed in this document?")
print(result)


This document discusses the development and introduction of a unified toolkit called LayoutParser for Deep Learning based Document Image Analysis (DIA). It aims to simplify the reuse of innovations in DIA research and applications, filling a gap in existing toolkits. The paper presents an open-source library with simple interfaces for applying and customizing deep learning models for layout detection, character recognition, and other document processing tasks.
