In [4]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema.document import Document
from langchain.document_loaders import TextLoader
import os
from dotenv import load_dotenv

In [5]:
# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# Fail fast with a readable message. The previous self-assignment
# (os.environ[k] = os.getenv(k)) was a no-op when the key existed and raised
# an opaque "TypeError: str expected, not NoneType" when it did not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file next to this "
        "notebook or export it in the environment before starting the kernel."
    )

In [1]:
import requests
from langchain.document_loaders import PyPDFLoader

def download_pdf(url, local_path, timeout=30):
    """Download the resource at ``url`` and write its bytes to ``local_path``.

    Args:
        url: Address of the file to fetch.
        local_path: Destination path; opened in ``'wb'`` mode, so an existing
            file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Stop here on an error status; otherwise the body of an error page would
    # be saved to disk and only fail later, inside the PDF parser.
    response.raise_for_status()
    with open(local_path, 'wb') as f:
        f.write(response.content)

# Example: fetch one paper and keep a local copy to parse
url = "https://openreview.net/pdf?id=VtmBAGCN7o"
local_pdf = "temp.pdf"

download_pdf(url=url, local_path=local_pdf)

# Parse the saved PDF with LangChain; `docs` is consumed by the splitting step
loader = PyPDFLoader(local_pdf)
docs = loader.load()

# Optional: clean up after use
# os.remove(local_pdf)


In [6]:
# 1. Split the loaded documents into overlapping chunks
# (to index a plain-text file instead, build `docs` with
#  TextLoader("data.txt").load() before running this cell)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
)
split_docs = splitter.split_documents(documents=docs)


In [7]:
# 2. Embed and persist vector store
persist_dir = "./db"

# Give every chunk a deterministic id so this cell is idempotent. Without
# explicit ids each run stores the chunks under newly generated ids, so
# re-running the cell keeps appending full copies of the document to the
# on-disk collection and the retriever's top-k fills up with duplicates.
# NOTE: copies already written by earlier runs are not removed, and if the
# source document changes to fewer chunks the surplus old ones remain --
# delete `persist_dir` to rebuild from scratch in either case.
chunk_ids = [f"chunk-{i}" for i in range(len(split_docs))]

db = Chroma.from_documents(
    split_docs,
    embedding=OpenAIEmbeddings(),
    ids=chunk_ids,
    persist_directory=persist_dir,
)
# NOTE(review): with recent Chroma versions data is presumably written to
# `persist_dir` automatically and this call only emits a deprecation warning;
# kept for older installations -- confirm against the installed version.
db.persist()

# Return the 6 most similar chunks per query.
retriever = db.as_retriever(search_kwargs={"k": 6})

  db = Chroma.from_documents(split_docs, embedding=OpenAIEmbeddings(), persist_directory=persist_dir)
  db.persist()


In [None]:

# 3. Setup LLMs and prompts
# temperature=0 is passed so sampling randomness is minimised.
llm = ChatOpenAI(temperature=0)

# Template asking the model to turn the user's question into JSON metadata;
# {query} is filled in with the question text.
tag_template = "\n    Extract structured metadata as JSON from the question below:\n\n{query}"
tag_prompt = ChatPromptTemplate.from_template(tag_template)

# Prompt for the answering step: {context} is filled with the retrieved text
# and {input} with the user's question. A stray double quote that used to
# trail the second sentence (and was sent to the model verbatim) is removed.
qa_prompt = ChatPromptTemplate.from_template(
    """Use only the context below to answer the question.
If the answer is not directly in the context, use reasoning to infer a likely but informed answer.

Context:
{context}

Question: {input}
"""
)

In [13]:
def ask_question(query: str) -> str:
    """Answer ``query`` using chunks retrieved from the vector store.

    The function runs three steps against the module-level ``llm``,
    ``retriever``, ``tag_prompt`` and ``qa_prompt`` objects:

    1. Ask the LLM to extract tags from the question (printed, not used
       further).
    2. Retrieve the chunks most relevant to the question.
    3. Ask the LLM to answer the question with those chunks as context.

    Args:
        query: The natural-language question.

    Returns:
        The text content of the model's answer, which is also printed.
    """
    # Tag extraction. `.invoke` replaces the deprecated direct call `llm(...)`.
    tag_response = llm.invoke(tag_prompt.format_messages(query=query))
    tags_text = tag_response.content.strip()

    # You can parse tags_text as JSON if you want structured tags
    print("Extracted tags:", tags_text)

    # Retrieve docs. `.invoke` replaces the deprecated `get_relevant_documents`.
    relevant_docs = retriever.invoke(query)
    combined_context = "\n".join(doc.page_content for doc in relevant_docs)

    # QA over the retrieved context
    qa_messages = qa_prompt.format_messages(input=query, context=combined_context)
    qa_response = llm.invoke(qa_messages)

    print("Answer:", qa_response.content)
    return qa_response.content

if __name__ == "__main__":
    # Demo run: ask for the works cited by the indexed article.
    question = (
        "can you list down all the papers refer in the article "
        "with author name and name of paper"
    )
    answer = ask_question(query=question)
    print("Final answer:", answer)

Extracted tags: {
  "company": null,
  "year": null
}
Answer: Based on the context provided, it is not possible to list down all the papers referred to in the article with author names and paper titles. The context mentions a long list of names of individuals who reviewed and edited the paper, contributed to the evaluation metric design, helped with experiments, and more. However, specific paper titles and author names are not mentioned in the context.
Final answer: Based on the context provided, it is not possible to list down all the papers referred to in the article with author names and paper titles. The context mentions a long list of names of individuals who reviewed and edited the paper, contributed to the evaluation metric design, helped with experiments, and more. However, specific paper titles and author names are not mentioned in the context.
