In [18]:
!pip install langchain langchain-community langchain-openai langchain-core langchain-tavily faiss-cpu pymupdf -q

In [19]:
import os
from getpass import getpass

os.environ["OPENAI_API_KEY"] = getpass("Enter OPENAI API KEY: ")
os.environ["TAVILY_API_KEY"] = getpass("Enter TAVILY API KEY: ")

Enter OPENAI API KEY: ··········
Enter TAVILY API KEY: ··········


In [20]:
from google.colab import files
import tempfile
import os

# upload the pdf file
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

# get the file path and store in temporary location
file_path = os.path.join(tempfile.gettempdir(), file_name)
with open(file_path, "wb") as f:
        f.write(uploaded[file_name])

Saving ToK guide.pdf to ToK guide (1).pdf


In [21]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_tavily import TavilySearch
from langchain_text_splitters import RecursiveCharacterTextSplitter

embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
llm = ChatOpenAI(model="gpt-5-mini-2025-08-07", temperature=0)
tool = TavilySearch(max_results=3, topic="general")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [22]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [23]:
from langchain_core.prompts import ChatPromptTemplate

answer_determination_prompt = ChatPromptTemplate.from_template("""
You are an AI assistant tasked with determining if the provided context from a PDF contains sufficient information to answer a user's question.

Context from PDF: {context}

User Question: {question}

First, carefully analyze if the context provides adequate information to answer the question.

If the context contains sufficient information to answer the question, respond with a complete and accurate answer based ONLY on the provided context.

If the context does NOT contain sufficient information to fully answer the question, respond with exactly: "[NEED_WEB_SEARCH]"

Your response:
""")

web_search_prompt = ChatPromptTemplate.from_template("""
You are an AI assistant helping a user with their question.

User Question: {question}

Web Search Results: {web_results}

Using the web search results, provide a comprehensive and accurate answer to the user's question.
Make sure to cite sources from the search results where appropriate.
""")

In [24]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS

# --- PDF Processing Function ---
def process_pdf(file_path):
    loader = PyMuPDFLoader(file_path)
    docs = loader.load()
    chunks = text_splitter.split_documents(docs)
    vector_store = FAISS.from_documents(chunks, embedding_model)
    return vector_store

In [25]:
vector_store = process_pdf(file_path)

In [26]:
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [27]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

determination_chain = (
  {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
  }
  | answer_determination_prompt
  | llm
  | StrOutputParser()
)

web_search_chain = (
  {
    "question": RunnablePassthrough(),
    "web_results": lambda x: tool.invoke({"query": x})
  }
  | web_search_prompt
  | llm
  | StrOutputParser()
)

In [28]:
def agent(question):
  pdf_response = determination_chain.invoke(question)
  if "[NEED_WEB_SEARCH]" in pdf_response:
    print("\n\u2139\ufe0f Info not found in PDF. Searching the web...")
    return web_search_chain.invoke(question)
  else:
    return pdf_response

In [29]:
# Ask questions interactively
while True:
  query = input("\nAsk a question about your PDF (or type 'exit'): ")
  if query.lower() == 'exit':
    break
  answer = agent(query)
  print("\n✉️ Answer:", answer)


✉️ Answer: The document is the International Baccalaureate (IB) Diploma Programme Theory of Knowledge (TOK) guide. It is intended to guide the planning, teaching and assessment of the DP TOK course, with TOK teachers as the primary audience (and also to inform students and parents). It sits within the DP core, aligns with the IB mission, and is available via the programme resource centre for first assessment in 2022.

✉️ Answer: The 12 TOK concepts are: evidence, certainty, truth, interpretation, power, justification, explanation, objectivity, perspective, culture, values, and responsibility.

ℹ️ Info not found in PDF. Searching the web...

✉️ Answer: Brief answer
- In IB DP Theory of Knowledge (TOK), "evidence" is a central concept used to evaluate knowledge claims: what counts as evidence, how strong it is, how it supports or undermines claims, and how it varies across Areas of Knowledge (AOKs) and Ways of Knowing (WOKs). (See TOK overview: IBO) (See TOK resources: Blen).  

What th

KeyboardInterrupt: Interrupted by user