https://python.langchain.com/docs/tutorials/rag/#loading-documents 

In [1]:
%pip install langchain

Collecting langchainNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for langchain from https://files.pythonhosted.org/packages/f6/d5/4861816a95b2f6993f1360cfb605aacb015506ee2090433a71de9cca8477/langchain-0.3.27-py3-none-any.whl.metadata
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Obtaining dependency information for langchain-core<1.0.0,>=0.3.72 from https://files.pythonhosted.org/packages/fb/42/0d0221cce6f168f644d7d96cb6c87c4e42fc55d2941da7a36e970e3ab8ab/langchain_core-0.3.75-py3-none-any.whl.metadata
  Downloading langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.9 (from langchain)
  Obtaining dependency information for langchain-text-splitters<1.0.0,>=0.3.9 from https://files.pythonhosted.org/packages/4c/dc/d64c9990f6aeb209e8d47b34ebaa2b787f3e4c10c99b8a5568a10beda449/langchain_text_splitters-0.3

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-server 1.23.4 requires anyio<4,>=3.1.0, but you have anyio 4.10.0 which is incompatible.


In [3]:
%pip install langchain-text-splitters langchain-community langgraph -q

Note: you may need to restart the kernel to use updated packages.


In [8]:
%pip install pypdf

Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.0.0-py3-none-any.whl (310 kB)
Installing collected packages: pypdf
Successfully installed pypdf-6.0.0
Note: you may need to restart the kernel to use updated packages.


In [21]:
%pip install -qU langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-win_amd64.whl.metadata (5.2 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-win_amd64.whl (18.2 MB)
   ---------------------------------------- 0.0/18.2 MB ? eta -:--:--
   ----------------------- ---------------- 10.7/18.2 MB 67.2 MB/s eta 0:00:01
   ---------------------------------------- 18.2/18.2 MB 57.4 MB/s  0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
Note: you may need to restart the kernel to use updated packages.


In [59]:
from langchain import hub
from langchain_community.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.vectorstores import InMemoryVectorStore
from typing_extensions import List,TypedDict
from langchain_core.documents import Document

In [3]:
import os

In [1]:
import faiss

In [None]:
from dotenv import load_dotenv
load_dotenv()

In [4]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model = 'text-embedding-3-large')

In [5]:
embedding_dim = len(embeddings.embed_query('Hello HSBC'))
index = faiss.IndexFlatL2(embedding_dim)

In [60]:
vector_store = InMemoryVectorStore(embeddings)

# Loading the data

In [61]:
loader = DirectoryLoader("docs/", glob="*.pdf", loader_cls = PyPDFLoader)
docs = loader.load()
len(docs)

17

In [62]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)
len(all_splits)

64

In [74]:
all_splits

[Document(metadata={'producer': 'macOS Versione 12.1 (Build 21C52) Quartz PDFContext, AppendMode 1.1', 'creator': 'PyPDF', 'creationdate': "D:20210316191726Z00'00'", 'author': 'kpoll', 'moddate': "D:20220515113647Z00'00'", 'title': 'Microsoft Word - PURE Fraud and Cyber Defense Coverage Endorsement', 'source': 'docs\\form_1.pdf', 'total_pages': 17, 'page': 0, 'page_label': '1', 'start_index': 0}, page_content='Fraud and Cyber Defense Coverage \nThis endorsement changes the policy. Please read it carefully. \n \nPHVH-END-GEN-029 (03/2021)  Page 1  \n \n \nThis endorsement amends your High Value Homeowners Policy (“the Policy”) to add Fraud \nand Cyber Defense Coverage, subject to its own terms, definitions, exclusions, limits and \nconditions, as set forth herein. The Limits of Insurance shown in the Fraud and Cyber Defense \nCoverage Schedule (“the Schedule”) are separate from, and in addition to, the coverage limit \nunder the Policy. All amounts paid under this endorsement will reduc

In [63]:
_ = vector_store.add_documents(documents=all_splits)

In [66]:
prompt = hub.pull('rlm/rag-prompt')

In [67]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [68]:
print(prompt.messages[0].prompt.template)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:


In [69]:
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

In [70]:
from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [71]:
from langgraph.graph import START, StateGraph

In [72]:
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}


# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()
    

In [None]:
graph.invoke({'question':"Is Cyber Extortion Coverage considered as Inclusions or Exclusion?"})

{'question': 'Is Cyber Extortion Coverage considered as Inclusions or Exclusion?',
 'context': [Document(id='9adc2750-8ddc-4c2e-9863-b3f26c923f44', metadata={'producer': 'macOS Versione 12.1 (Build 21C52) Quartz PDFContext, AppendMode 1.1', 'creator': 'PyPDF', 'creationdate': "D:20210316191726Z00'00'", 'author': 'kpoll', 'moddate': "D:20220515113647Z00'00'", 'title': 'Microsoft Word - PURE Fraud and Cyber Defense Coverage Endorsement', 'source': 'docs\\form_1.pdf', 'total_pages': 17, 'page': 7, 'page_label': '8', 'start_index': 757}, page_content='AGREEMENTS AND EXCLUSIONS \n1.  Data Recovery and System Restoration Coverage \nSubject to the applicable Deductible and Limit of Insurance, we will reimburse an insured \nfor data recovery costs and system restoration costs directly resulting from a cyber \nattack, but only if all of the following conditions are met: \na. the insured first discovers the cyber attack during the endorsement period; \nb. the insured or the insured’s representat