# Q&A chatbot RAG playbook

Get OpenAI key from `.env ` file

In [1]:
import os
from dotenv import load_dotenv

load_dotenv(".env")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Setting up OpenAI model

Define LLM model that we will use

In [2]:
from langchain_openai import ChatOpenAI

model = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

Test model by asking simple question

In [3]:
model.invoke("What is the capital of France?")

AIMessage(content='The capital of France is Paris.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 7, 'prompt_tokens': 14, 'total_tokens': 21, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-157e6460-afa9-477e-a15e-58fa5726d584-0', usage_metadata={'input_tokens': 14, 'output_tokens': 7, 'total_tokens': 21, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

Using langchain parser

In [4]:
from langchain_core.output_parsers import StrOutputParser

output_parser = StrOutputParser()

chain = model | output_parser

chain.invoke("Who is the CEO of Tesla?")

'Elon Musk is the CEO of Tesla.'

# PDF to text coversion

Using pypdf to read pdf files and following instructions from https://python.langchain.com/docs/how_to/document_loader_pdf/

In [5]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("files/layout-parser-paper.pdf")
pages = []
async for page in loader.alazy_load():
    pages.append(page)
    
print(f"{pages[0].metadata}\n")
print(pages[0].page_content)

{'source': 'files/layout-parser-paper.pdf', 'page': 0}

LayoutParser : A Uniﬁed Toolkit for Deep
Learning Based Document Image Analysis
Zejiang Shen1(  ), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain
Lee4, Jacob Carlson3, and Weining Li5
1Allen Institute for AI
shannons@allenai.org
2Brown University
ruochen zhang@brown.edu
3Harvard University
{melissadell,jacob carlson }@fas.harvard.edu
4University of Washington
bcgl@cs.washington.edu
5University of Waterloo
w422li@uwaterloo.ca
Abstract. Recent advances in document image analysis (DIA) have been
primarily driven by the application of neural networks. Ideally, research
outcomes could be easily deployed in production and extended for further
investigation. However, various factors like loosely organized codebases
and sophisticated model conﬁgurations complicate the easy reuse of im-
portant innovations by a wide audience. Though there have been on-going
eﬀorts to improve reusability and simplify deep learning (DL) model
develo

# Spliting PDF text

Since we can't send entire PDF as the context, we will split the PDF text in different chunks

In [6]:
text_document = "\n".join([page.page_content for page in pages])
print(len(pages))
print(len(text_document))

16
42533


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_text(text_document)

print(len(documents))
print(documents[10])


53
paddleOCR12usually do not come with comprehensive functionalities for other
DIA tasks like layout analysis.
Recent years have also seen numerous eﬀorts to create libraries for promoting
reproducibility and reusability in the ﬁeld of DL. Libraries like Dectectron2 [ 35],
6The number shown is obtained by specifying the search type as ‘code’.
7https://ocr-d.de/en/about
8https://github.com/BobLd/DocumentLayoutAnalysis
9https://github.com/leonlulu/DeepLayout
10https://github.com/hpanwar08/detectron2
11https://github.com/JaidedAI/EasyOCR
12https://github.com/PaddlePaddle/PaddleOCR
4 Z. Shen et al.
Efficient Data AnnotationC u s t o m i z e d  M o d e l  T r a i n i n gModel Cust omizationDI A Model HubDI A Pipeline SharingCommunity PlatformLa y out Detection ModelsDocument Images 
T h e  C o r e  L a y o u t P a r s e r  L i b r a r yOCR ModuleSt or age & VisualizationLa y out Data Structur e
Fig. 1: The overall architecture of LayoutParser . For an input document image,


# Create prompt template

Provide model some context and question

In [8]:
from langchain.prompts import ChatPromptTemplate

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt.format(context="Mary's sister is Susana", question="Who is Mary's sister?")

'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Mary\'s sister is Susana\n\nQuestion: Who is Mary\'s sister?\n'

Chain the prompt with model and the parser

In [9]:
chain = prompt | model | output_parser

chain.invoke({
    "context": "Mary's sister is Susana",
    "question": "Who is Mary's sister?"
})

'Susana'

# Generate OpenAI embeddings

Generating embeddings for arbitary text to test it

In [10]:
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
embedded_query = embeddings.embed_query("Who is Mary's sister?")

print(f"Embedding length: {len(embedded_query)}")
print(embedded_query[:5])

Embedding length: 1536
[-0.0013731546932831407, -0.034482136368751526, -0.011498215608298779, 0.0012331805191934109, -0.0261743925511837]


Generating embedding for different senteces

In [11]:
sentence1 = embeddings.embed_query("Mary's sister is Susana")
sentence2 = embeddings.embed_query("Pedro's mother is a teacher")

Using cosine similarity to calculate similarities

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

query_s1_similarity = cosine_similarity([embedded_query], [sentence1])[0][0]
query_s2_similarity = cosine_similarity([embedded_query], [sentence2])[0][0]

print(f"Similarity between query and sentence1: {query_s1_similarity}")
print(f"Similarity between query and sentence2: {query_s2_similarity}")

Similarity between query and sentence1: 0.9173394718348283
Similarity between query and sentence2: 0.7680513114191245


# Setup Vector Store

1. Run docker using `docker run --name pgvector-container -e POSTGRES_USER=langchain -e POSTGRES_PASSWORD=langchain -e POSTGRES_DB=langchain -p 6024:5432 -d pgvector/pgvector:pg16`
1. Connect to postgres pgvector db

In [13]:
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

# See docker command above to launch a postgres instance with pgvector enabled.
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"  # Uses psycopg3!
collection_name = "my_docs"


vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [14]:
vector_store.add_texts([
    
        "Mary's sister is Susana",
        "John and Tommy are brothers",
        "Patricia likes white cars",
        "Pedro's mother is a teacher",
        "Lucia drives an Audi",
        "Mary has two siblings",
])

['c1602e21-5e10-4009-8fcf-1ef2e98eace3',
 '3465c4f7-0e7d-4ef5-8483-5d1b07c68338',
 '7be06c44-7297-47b4-befe-828d2caf4c6d',
 '771d8343-49da-4b21-bcac-67a07d0a5fb9',
 '7c87704d-6c75-4aee-9ecd-694dd28d2a8e',
 'e20d715b-e012-4531-9169-2966c4b8310e']

Searching for similar content

In [15]:
vector_store.similarity_search_with_score(query="Who is Mary's sister?", k=3)

[(Document(id='1567ab20-d8a5-4ab9-8a94-5631c8815e3c', metadata={}, page_content="Mary's sister is Susana"),
  0.0826606006633015),
 (Document(id='c1602e21-5e10-4009-8fcf-1ef2e98eace3', metadata={}, page_content="Mary's sister is Susana"),
  0.0826606006633015),
 (Document(id='937fb56e-016e-4f0c-acf5-e1efcbad3777', metadata={}, page_content='Mary has two siblings'),
  0.09552745303019272)]

Using with retriever

In [16]:
retrivier1 = vector_store.as_retriever(search_kwargs={"k": 3})

retrivier1.invoke("Who is Mary's sister?")

[Document(id='1567ab20-d8a5-4ab9-8a94-5631c8815e3c', metadata={}, page_content="Mary's sister is Susana"),
 Document(id='c1602e21-5e10-4009-8fcf-1ef2e98eace3', metadata={}, page_content="Mary's sister is Susana"),
 Document(id='937fb56e-016e-4f0c-acf5-e1efcbad3777', metadata={}, page_content='Mary has two siblings')]

Setting up chain

In [17]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(
    context=retrivier1,
    question = RunnablePassthrough()
)

setup.invoke("Who")

{'context': [Document(id='df146bc4-d600-4cbd-ba28-f77d4bf1d197', metadata={}, page_content='John and Tommy are brothers'),
  Document(id='3465c4f7-0e7d-4ef5-8483-5d1b07c68338', metadata={}, page_content='John and Tommy are brothers'),
  Document(id='937fb56e-016e-4f0c-acf5-e1efcbad3777', metadata={}, page_content='Mary has two siblings')],
 'question': 'Who'}

In [18]:
chain = setup | prompt | model | output_parser

chain.invoke("Who is Mary's sister?")

"Mary's sister is Susana."

# Connecting the dots

In [19]:
# Processing pdf file

from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("files/monopoly.pdf")

pages = []
async for page in loader.alazy_load():
    pages.append(page)
    
print(f"{pages[0].metadata}\n")
# print(len(pages[0].page_content))
print(f"Total pages = {len(pages)}")
# print(f"Text Sample: \n {pages[0].page_content[:100]}")
# print(pages[0])

{'source': 'files/monopoly.pdf', 'page': 0}

Total pages = 8


In [20]:
# Splitting pages into chunks

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_documents(documents):
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    ).split_documents(documents)

docs = split_documents(pages)

# print(docs[0])
print(f"Total docs = {len(docs)}")


Total docs = 23


In [21]:
# Creating OpenAI embeddings

import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

model = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

embeddings = OpenAIEmbeddings()

In [22]:
# Creating vector store

from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

# See docker command above to launch a postgres instance with pgvector enabled.
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"  # Uses psycopg3!
collection_name = "monopoly_docs"


monopoly_vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [23]:
# Adding docs to vector store

d =monopoly_vector_store.add_documents(docs)
print(len(d))

23


In [24]:
# Creating steps for the chain
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

output_parser = StrOutputParser()

retriver = monopoly_vector_store.as_retriever(search_kwargs={"k": 3})

setup = RunnableParallel(
    context=retriver,
    question = RunnablePassthrough()
)

setup.invoke("Who wins the Monopoly?")

{'context': [Document(id='03796ec7-152e-4711-a7e7-5b030e519008', metadata={'page': 1, 'source': 'files/monopoly.pdf'}, page_content='Bus: This lets you "get off the bus early." Look at the two white \ndice. You can move the value of one die, the other die, or the \nsum of both dice. So if you rolled a 1 and a 5, you can move \n1 space, 5 spaces, or 6 spaces: \\t\'s your choice. \nMr. Monopoly: First, move the sum of the two white dice \nand resolve the space you land on (such as drawing a card, \nbuying the property, paying rent, etc.). Then, one of two \nthings will happen depending on whether or not there is still \nproperty in the bank. \nYES, there is property in the bank -Advance to the NEXT \nproperty that the bank still holds and buy it if you wish. If you \ndon\'t want to buy this property, move to the space anyway \nand put the property up for auction. \nNO, there are no more properties in the bank - Advance to the \nNOCT property on which you will owe another player money. \n

In [25]:
# Creating chain

chain = setup | prompt | model | output_parser

chain.invoke("How to play Monopoly?")

"To play Monopoly, you can choose to play by the classic rules for buying, renting, and selling properties or use the Speed Die to get into the action faster. When starting the game, hand out an extra $1,000 to each player, and do not use the Speed Die until you've landed on or passed over a certain point."

In [26]:
# I don't know case

# chain.invoke("What is the meaning of life?")
chain.invoke("Who is Mary's sister?")

"I don't know."

In [27]:
chain.invoke("How to win the Monopoly?")

'To win Monopoly, you typically need to bankrupt your opponents by acquiring properties, charging rent, and making strategic decisions on buying and selling properties. Additionally, managing your money wisely, making smart trades, and investing in houses and hotels can also help you secure victory in the game.'

In [28]:
emb = OpenAIEmbeddings(model='text-embedding-ada-002')

emb_q = emb.embed_query("Who is Mary's sister?")

print(f"Embedding length: {len(emb_q)}")

Embedding length: 1536
