In [2]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.1 MB 4.6 MB/s eta 0:00:03
   ------- -------------------------------- 2.1/11.1 MB 5.1 MB/s eta 0:00:02
   --------- ------------------------------ 2.6/11.1 MB 4.6

In [4]:
# Cell 1: Imports
import os
from typing import List, Dict, TypedDict

import pandas as pd
from IPython.display import display, Markdown, HTML
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.retrievers import MultiVectorRetriever
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langgraph.graph import StateGraph, END

In [5]:
# ---- Load PDF ----
# Relative path: the PDF is resolved against the notebook's working directory.
pdf_path = "Sample_Loan.pdf"
loader = PyPDFLoader(pdf_path)
# `docs` feeds both the chunking step and the summarisation graph further down.
# NOTE(review): PyPDFLoader presumably returns one Document per PDF page — confirm.
docs = loader.load()


In [6]:
# ---- Chunk and Embed ----
# Split the loaded documents into overlapping chunks for retrieval. The overlap
# keeps a sentence that straddles a chunk boundary visible in both neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

# Embedding model used to vectorise the chunks, and the directory in which the
# Chroma collection is persisted.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
persist_dir = "chroma_loan_summary"
# exist_ok=True makes the call idempotent and removes the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(persist_dir, exist_ok=True)

In [8]:
# Build (or refresh) the persisted Chroma collection.
#
# Each chunk gets a deterministic id, so re-running this cell overwrites the
# existing entries instead of appending a second copy of every chunk under a new
# random id (which would make the retriever return duplicated passages).
# Entries written by earlier runs with random ids, or left over from a PDF that
# produced more chunks, are NOT removed: delete `persist_dir` once by hand if the
# directory already holds such data.
chunk_ids = [f"{pdf_path}-chunk-{i}" for i in range(len(chunks))]

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    ids=chunk_ids,
    persist_directory=persist_dir,
)
vectorstore.persist()

In [9]:
# ---- Question Answering ----
# Ollama-served model; the same `llm` object backs the Q&A chain and the
# map/reduce summarisation chains defined later in the notebook.
llm = Ollama(model="llama3.2")

# Retriever over the Chroma store using the "mmr" search type.
# NOTE(review): presumably 20 candidates are fetched (fetch_k) and 8 are returned (k) — confirm.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 8, "fetch_k": 20})

In [10]:
def query_rag_latest(query: str) -> str:
    """Answer ``query`` from the passages the retriever returns for it.

    The retrieved passages are joined into one context string, inserted into a
    fixed prompt together with the question, and sent to the module-level ``llm``.

    Args:
        query: Natural-language question about the loaded document.

    Returns:
        The text of the model's answer.
    """
    # `invoke` is the Runnable entry point of a retriever; it replaces the
    # deprecated `get_relevant_documents`, which emits a deprecation warning.
    retrieved_docs = retriever.invoke(query)
    combined_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = PromptTemplate.from_template(
        "Use the context below to answer the question accurately.\n\nContext:\n{context}\n\nQuestion: {question}"
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.invoke({"context": combined_content, "question": query})["text"]

# ---- Questions ----
# Fixed set of questions asked against the document; each one is passed to
# `query_rag_latest` and becomes a row of the Q&A table.
questions = [
    "What is the loan amount?",
    "Who is the borrower?",
    "What is the interest rate?",
    "When is the loan due?",
    "Are there any late payment penalties?",
]

In [11]:
# ---- Get Answers ----
print("🔍 Answering predefined questions...\n")

# One record per question, in the order the questions are listed.
qa_results = [
    {"Question": question, "Answer": query_rag_latest(question)}
    for question in questions
]

# Tabulate the records and show them with the notebook's rich display.
df_qa = pd.DataFrame(qa_results)
display(df_qa)

🔍 Answering predefined questions...



Unnamed: 0,Question,Answer
0,What is the loan amount?,"The loan amount is $27,500.00."
1,Who is the borrower?,"According to the provided context, the borrowe..."
2,What is the interest rate?,The interest rate specified in the document is...
3,When is the loan due?,The loan agreement does not specify a specific...
4,Are there any late payment penalties?,"Yes, there is a 2% penalty for any late paymen..."


In [12]:


# ---- LangGraph Summary ----

# Define state
class State(TypedDict):
    """State dictionary passed between the nodes of the summarisation graph."""

    # Documents still waiting to be summarised; `map_node` takes them from the front.
    docs: List[Document]
    # Summaries produced so far; `map_node` adds one entry per processed document.
    accumulated_summaries: List[str]
    # Not read or written by any node visible in this notebook.
    current_summary: str
    # Consolidated summary written by `reduce_node`.
    final_summary: str

# Map stage: prompt and chain that summarise the text of a single document.
map_prompt = PromptTemplate.from_template(
    "Write an excellent summary of the following, "
    "covering every critical point:\n\n"
    "{context}"
)
map_chain = LLMChain(prompt=map_prompt, llm=llm)

# Reduce stage: prompt and chain that merge the per-document summaries.
reduce_prompt = PromptTemplate.from_template(
    "The following is a set of summaries:\n"
    "{summaries}\n"
    "Take these and distill it into a final, "
    "consolidated summary of the main themes."
)
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

def map_node(state: State) -> State:
    """Summarise the first pending document and move it out of the queue.

    Args:
        state: Graph state; ``state["docs"]`` must contain at least one document.

    Returns:
        A new state mapping in which the first document has been removed from
        ``docs`` and its summary appended to ``accumulated_summaries``. The
        incoming ``state`` and its lists are left untouched.

    Raises:
        IndexError: If ``state["docs"]`` is empty.
    """
    print("🧩 Map step")
    pending_docs = state["docs"]
    doc = pending_docs[0]
    result = map_chain.invoke({"context": doc.page_content})
    # Build fresh lists instead of pop()/append() on the caller's objects, so the
    # state handed in (e.g. the initial state) is never modified behind its back.
    return {
        **state,
        "docs": pending_docs[1:],
        "accumulated_summaries": [*state["accumulated_summaries"], result["text"]],
    }

def reduce_node(state: State) -> State:
    """Merge the accumulated per-document summaries into one final summary.

    Args:
        state: Graph state holding ``accumulated_summaries``.

    Returns:
        A new state mapping with ``final_summary`` set. When there are no
        accumulated summaries, ``final_summary`` is the empty string and the
        model is not called. The incoming ``state`` is left untouched.
    """
    print("\n🧪 Reduce step")
    summaries = state["accumulated_summaries"]
    if not summaries:
        # Nothing was summarised (no documents): asking the model to consolidate
        # an empty list would only invite an invented summary.
        return {**state, "final_summary": ""}
    result = reduce_chain.invoke({"summaries": "\n".join(summaries)})
    return {**state, "final_summary": result["text"]}

def should_continue(state: State) -> str:
    """Return the name of the next node: "map" while documents remain, else "reduce"."""
    if state["docs"]:
        return "map"
    return "reduce"

# Wire the graph. `should_continue` is used as the router both at the entry point
# and after every "map" step, so "map" repeats until no documents are left and
# only then hands over to "reduce". (A fixed map -> reduce edge would summarise
# just the first document and silently ignore the rest.)
graph_builder = StateGraph(State)
graph_builder.add_node("map", map_node)
graph_builder.add_node("reduce", reduce_node)
graph_builder.set_conditional_entry_point(
    should_continue, {"map": "map", "reduce": "reduce"}
)
graph_builder.add_conditional_edges(
    "map", should_continue, {"map": "map", "reduce": "reduce"}
)
graph_builder.add_edge("reduce", END)
graph = graph_builder.compile()

initial_state: State = {
    # Copy the list so the graph run works on its own queue of documents.
    "docs": list(docs),
    "accumulated_summaries": [],
    "current_summary": "",
    "final_summary": "",
}

print("\n🚀 Starting summarization...\n")
# Every document costs one graph step plus one for "reduce"; raise the step
# limit accordingly so long PDFs do not hit the default limit of 25 steps.
final_state = graph.invoke(
    initial_state,
    config={"recursion_limit": max(25, len(initial_state["docs"]) + 5)},
)

# ---- Show Final Summary ----
print("\n✅ Final Summary Output:\n")
print(final_state["final_summary"])



🚀 Starting summarization...

🧩 Map step

🧪 Reduce step

✅ Final Summary Output:

Here is a consolidated summary of the Loan Agreement and Promissory Note:

**Loan Agreement and Promissory Note Summary**

Wharton Capital, LLC (the "Lender") has entered into a loan agreement with Sanguine Corp (the "Borrower") on July 1st, 2010. The key terms include:

* A principal amount of $27,500.00
* An interest rate of 7% per annum
* Three separate payments to be made according to a specified schedule

The Borrower agrees to repay the loan amount with interest and is bound by the terms of this agreement, which is governed by the laws of Nevada.


In [13]:
# ---- Final Output Display ----
# `display`, `HTML` and `Markdown` come from IPython.display (imported in the
# imports cell at the top of the notebook).

# Q&A table as HTML. The cell values are model output derived from the PDF, so
# pandas' default HTML escaping is left on rather than injecting them as markup.
qa_table_html = df_qa.to_html(index=False)
combined_html = f"""
<h2>📋 Extracted Answers</h2>
{qa_table_html}
"""

# The summary is Markdown (headings, **bold**, bullet lists) and has to go through
# the Markdown renderer; placed inside an HTML string it is shown as literal
# "###" and "**". Dollar signs are escaped so that currency amounts are not
# interpreted as the delimiters of a LaTeX formula.
summary_text = final_state["final_summary"].replace("$", "\\$")
final_summary_md = f"""
### 📝 Final Summary

{summary_text}
"""

display(HTML(combined_html), Markdown(final_summary_md))


Question,Answer
What is the loan amount?,"The loan amount is $27,500.00."
Who is the borrower?,"According to the provided context, the borrower (BORROWER) in this loan agreement and promissory note is SANGUINE CORP, a corporation organized under the laws of the State of Nevada."
What is the interest rate?,The interest rate specified in the document is 7% per annum.
When is the loan due?,"The loan agreement does not specify a specific date for repayment of the entire loan amount. However, it outlines a schedule of five payments:\n\n1. $7,500.00 on or before October 1, 2010\n2. $7,500.00 on or before November 25, 2010\n3. $7,500.00 on or before January 15, 2011\n4. $5,000.00 on or before March 1, 2011\n\nThese payments are part of the loan repayment terms, and the final payment will include interest."
Are there any late payment penalties?,"Yes, there is a 2% penalty for any late payment computed upon the amount of any principal and accrued interest whose payment to LENDER is overdue under this loan agreement."
