# PDF RAG

In [1]:
from dotenv import load_dotenv
# Load key=value pairs from a `.env` file into the process environment.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI clients
# created in later cells — confirm against the project's .env file.
load_dotenv()

True

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Source document for the RAG pipeline; the path is relative to the
# notebook's working directory.
file_path = "genai-principles.pdf"

# Parse the PDF into a list of documents and report how many were produced.
loader = PyPDFLoader(file_path=file_path)
docs = loader.load()

print(len(docs))

12


In [6]:
# Peek at the first loaded document: the opening 200 characters of its text,
# then its metadata dict, each on its own line.
print(docs[0].page_content[:200], docs[0].metadata, sep="\n")

 
Karan Singh, Assistant Professor of Operations Research 
Principles of Generative AI 
A Technical Introduction 
Generative artificial intelligence (GenAI) tools are an emerging class of new-age arti
{'source': 'genai-principles.pdf', 'page': 0}


### Load model

In [7]:
from langchain_openai import ChatOpenAI
# Chat model that generates the final answers.
# temperature=0 asks for the least-random decoding so that re-running the
# notebook yields answers as stable as the API allows; without it the answer
# is sampled and can differ on every run.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

### In-memory vector store

In [10]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded documents into overlapping chunks (chunk_size=1000 with
# chunk_overlap=200) so each piece is small enough to embed and retrieve.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI embeddings and index them in an in-memory store.
vector_store = InMemoryVectorStore.from_documents(splits, OpenAIEmbeddings())

# Retriever view over the store; this is what the RAG chain queries.
retriever = vector_store.as_retriever()

In [24]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the QA chain: the behavioural instructions, a blank
# line, then the `{context}` template placeholder.
system_prompt = "\n\n".join(
    [
        (
            "You are an assistant for question-answering tasks. Use the "
            "following pieces of retrieved context to answer the question. "
            "If you don't know the answer, say that you don't know. Use "
            "three sentences maximum and keep the answer concise."
        ),
        "{context}",
    ]
)

# Two-message chat template: the system instructions first, then the user's
# question, which is supplied at run time through the `{input}` variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Answer-generation step: passes documents to the LLM via the prompt.
qa_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: retrieve documents for the input, then run the QA step.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=qa_chain,
)

# Ask a question; the bare `results` expression displays the whole result dict.
results = rag_chain.invoke(
    {"input": "What images are present in this document?"}
)
results

{'input': 'What images are present in this document?',
 'context': [Document(id='98132e5e-0879-4ac7-b889-087a5beed791', metadata={'source': 'genai-principles.pdf', 'page': 1}, page_content='product deployment, for example, by Adobe for creating visual content and by Github as a \nprogramming assistance tool.   \n 2\nFigure 2: An image-\nbased GenAI model, \nMidjourney’s response to \nthe prompt — \n“Businessman in Tokyo \namidst rush hour, his \ngaze fixed ahead, \nsurrounded by a sea of \nblack umbrellas.”\nFigure 3: Based on a code-based GenAI model, OpenAI Codex, \nGithub Copilot is a commercial tool that can generate functional \ncode from specifications given as natural language. Reportedly, as \nof June 2023, it served over a million users.'),
  Document(id='1b8c1368-0d41-457d-b66d-63fff9fe0709', metadata={'source': 'genai-principles.pdf', 'page': 6}, page_content='representations. This permits us to (a) train a deep net to separate images of cats and dogs on \na large dataset an

In [25]:
# List the top-level fields returned by the chain.
results.keys()

dict_keys(['input', 'context', 'answer'])

In [26]:
# Show only the generated answer text from the chain result.
print(results["answer"])

The document contains an image generated by Midjourney in response to the prompt “Businessman in Tokyo amidst rush hour, his gaze fixed ahead, surrounded by a sea of black umbrellas.”
