In [1]:
print("all ok")

all ok


### Keyword Search

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
# Toy corpus for the keyword-search demo: four short sentences that are
# cleaned and TF-IDF-vectorised in the cells below.
documents=[
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [4]:
query="keyword-based search"

In [5]:
import re
def remove_punc(text):
    """Return ``text`` lower-cased with punctuation stripped.

    Every character that is neither a word character nor whitespace is
    deleted outright (not replaced by a space), so hyphenated terms collapse
    into a single token, e.g. ``"keyword-based"`` becomes ``"keywordbased"``.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

In [6]:
# Normalise every corpus entry with remove_punc and display the cleaned list.
preprocessed_doc = list(map(remove_punc, documents))
preprocessed_doc

['this is a list which containing sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [7]:
# Show the cleaned corpus one document per line.
print("Preprocessed documents:")
for doc in preprocessed_doc:
    print(doc)

Preprocessed documents:
this is a list which containing sample documents
keywords are important for keywordbased search
document analysis involves extracting keywords
keywordbased search relies on sparse embeddings


In [8]:
# Clean the query with the same function used for the corpus so both sides
# share one token form ("keyword-based" -> "keywordbased").
print("Preprocessed query:")
preprocessed_query=remove_punc(query)
print(preprocessed_query)

Preprocessed query:
keywordbased search


In [9]:
# Fit a TF-IDF vocabulary on the cleaned corpus and encode each document as
# one row of the resulting matrix (densified with .toarray() in later cells).
vector=TfidfVectorizer()
doc_vector=vector.fit_transform(preprocessed_doc)

In [10]:
doc_vector.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [11]:
doc_vector.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [12]:
# Encode the query with the vectorizer already fitted on the corpus
# (transform, not fit_transform) so its columns line up with doc_vector.
query_vector=vector.transform([preprocessed_query])
query_vector.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [13]:
# Cosine similarity of every document row against the single query row;
# the result has one row per document and a single column.
similarity=cosine_similarity(doc_vector,query_vector)

In [14]:
similarity

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [15]:
np.argsort(similarity,axis=0)

array([[0],
       [2],
       [3],
       [1]], dtype=int64)

In [16]:
# argsort orders document indices by ascending score; [::-1] reverses that to
# best-first and flatten() turns the column of indices into a 1-D array.
reranked_indices=np.argsort(similarity,axis=0)[::-1].flatten()

In [17]:
reranked_indices

array([1, 3, 2, 0], dtype=int64)

In [18]:
# Map the ranked indices back to the original (un-cleaned) sentences.
ranked_documents=[documents[i] for i in reranked_indices]
ranked_documents

['Keywords are important for keyword-based search.',
 'Keyword-based search relies on sparse embeddings.',
 'Document analysis involves extracting keywords.',
 'This is a list which containing sample documents.']

In [19]:
# Print the ranked documents best-first, numbering them from 1.
# enumerate(start=1) supplies the rank directly and the loop variable is used
# instead of indexing back into the list.
for rank, doc in enumerate(ranked_documents, start=1):
    print(f"Rank-{rank}: {doc}")

Rank-1: Keywords are important for keyword-based search.
Rank-2: Keyword-based search relies on sparse embeddings.
Rank-3: Document analysis involves extracting keywords.
Rank-4: This is a list which containing sample documents.


### Vector Search

In [20]:
# Hand-written 6-dimensional vectors, one row per document, used as
# illustrative stand-ins for real document embeddings in this section.
document_embedding=np.array([
    [0.524,0.364,0.452,0.247,0.854,0.451],
    [0.542,0.125,0.478,0.457,0.247,0.364],
    [0.142,0.257,0.542,0.654,0.985,0.845],
    [0.254,0.358,0.247,0.365,0.245,0.524]
])

In [21]:
query_embedding=np.array([[0.254,0.478,0.654,0.358,0.542,0.245]])

In [22]:
# Cosine similarity between each row of document_embedding and the query embedding.
# NOTE: the name is spelled `similariry_emb` (sic); the re-ranking cell that
# follows reads it under this spelling, so rename both places together.
similariry_emb=cosine_similarity(document_embedding,query_embedding)
similariry_emb

array([[0.9101004 ],
       [0.84324332],
       [0.85976582],
       [0.84701513]])

In [23]:
# Best-first ordering of the embedding similarity scores (ascending argsort, reversed).
# NOTE: this rebinds reranked_indices, which previously held the keyword-search ranking.
reranked_indices=np.argsort(similariry_emb,axis=0)[::-1].flatten()

In [24]:
reranked_indices

array([0, 2, 3, 1], dtype=int64)

In [25]:
# Reorder the embedding rows best-first.
# NOTE: this rebinds ranked_documents, which earlier held the ranked text
# documents, to a list of embedding vectors.
ranked_documents=[document_embedding[i] for i in reranked_indices]
ranked_documents

[array([0.524, 0.364, 0.452, 0.247, 0.854, 0.451]),
 array([0.142, 0.257, 0.542, 0.654, 0.985, 0.845]),
 array([0.254, 0.358, 0.247, 0.365, 0.245, 0.524]),
 array([0.542, 0.125, 0.478, 0.457, 0.247, 0.364])]

In [26]:
from langchain_community.document_loaders import PyPDFLoader

In [27]:
# Load the source PDF into a list of Document objects.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run elsewhere without editing it — consider a
# configurable data directory.
loader=PyPDFLoader(r"F:\Tapas\Learning\GenAI\data\2501.09136v3.pdf")
docs=loader.load()

In [28]:
len(docs)

39

In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Splitter configured for chunks of size 500 with an overlap of 50
# (units are presumably characters — confirm against the splitter's length function).
spliter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)

In [30]:
# Split the loaded documents into chunks and report how many were produced.
splited_docs=spliter.split_documents(docs)
print(len(splited_docs))

250


In [31]:
splited_docs[0]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-05T01:26:00+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-05T01:26:00+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'F:\\Tapas\\Learning\\GenAI\\data\\2501.09136v3.pdf', 'total_pages': 39, 'page': 0, 'page_label': '1'}, page_content='AGENTIC RETRIEVAL -AUGMENTED GENERATION : A S URVEY ON\nAGENTIC RAG\nAditi Singh\nDepartment of Computer Science\nCleveland State University\nCleveland, OH, USA\na.singh22@csuohio.edu\nAbul Ehtesham\nThe Davey Tree Expert Company\nKent, OH, USA\nabul.ehtesham@davey.com\nSaket Kumar\nThe MathWorks Inc\nNatick, MA, USA\nsaketk@mathworks.com\nTala Talaei Khoei\nKhoury College of Computer Science\nRoux Institute at Northeastern University\nPortland, ME, USA\nt.talaeikhoei@northeastern.edu\nABSTRACT')

In [32]:
from langchain_openai.embeddings import OpenAIEmbeddings
import os
from dotenv import load_dotenv

# Load variables from a local .env file *before* reading the key, so this cell
# also works on a fresh kernel where the key lives only in .env.
# load_dotenv() leaves variables that are already set in the environment untouched.
load_dotenv()
openai_api=os.getenv("OPENAI_API_KEY")
if not openai_api:
    # Fail here with a clear message instead of a later authentication error.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )

# Embedding model used for both the vector store and the query embeddings.
embedding_model=OpenAIEmbeddings(model="text-embedding-ada-002",api_key=openai_api)

In [33]:
len(embedding_model.embed_query("Hello How are you?"))

1536

In [34]:
from langchain_chroma import Chroma

In [35]:
# Chroma collection named "documents_data" that embeds texts with embedding_model.
# NOTE(review): no persist directory is passed, so the store is presumably
# in-memory only — confirm.
vector_store=Chroma(collection_name="documents_data",embedding_function=embedding_model)

In [36]:
# Add the chunks to the collection; the returned list holds the ids assigned to them.
# NOTE(review): re-running this cell presumably inserts the same chunks again
# under new ids — confirm before re-executing.
vector_store.add_documents(splited_docs)

['552c3aff-b7a5-46b7-8200-8452bf84afe3',
 'c5831dff-61b3-4a01-9f04-a877843aa272',
 'e7a4a243-f19f-4f6d-aa6e-39bca45f5dfb',
 '88817a65-4ef1-4a05-ab42-1e5a1a63dcb3',
 'e66f2041-f66e-4eb2-9af3-83dacf9a7d98',
 '35248b46-282a-4fe4-9ce8-34bc0bbb42bf',
 '72a3612b-9bdb-4eb1-a2df-6ba1dd2c34ae',
 '5effb0e2-c527-4e2e-8a46-4348bfdf9366',
 '099b6c2f-34a7-477a-a6d6-41e990baa19f',
 '87285047-8f69-4cbe-ba02-8dbcd91180e7',
 '989261ee-6e42-48c3-ba72-69d00229808f',
 'aec7267e-3a77-4b65-b500-c7a95a8fed2a',
 '6d3f1c10-6f71-41bf-99e1-bc342f4759ce',
 '3b73ac18-909e-4465-890c-0e3894a69fc7',
 'eb446106-9ae9-4f33-9078-d9cc965cb7d9',
 '84f3b04f-7f45-4a23-919d-4b6fc2e5e18c',
 '5a7433cf-8c3f-4b6d-9d33-af6d36083bb7',
 '5ec446d4-d245-46ef-815d-9209f9a7d0be',
 '1c25aa77-cd24-44e2-871b-913cea6ca779',
 '29326bbf-6dea-4cea-8b6d-dc2b0a4f0c4c',
 '097f6143-e366-441a-aa27-bd7d0337ef18',
 'cfa9aa6f-cb77-484d-b79a-41ac71cae1d3',
 '2369c006-d3a6-49e0-b0b3-eeec2e8576bf',
 '98213ee9-1fce-478f-b788-2d09ad9cac5c',
 '76b30408-59e1-

In [37]:
# Vector-similarity retriever over the Chroma collection, limited to the top 3 hits.
retriever=vector_store.as_retriever(search_kwargs={"k":3})

In [38]:
from langchain.retrievers import BM25Retriever,EnsembleRetriever

In [39]:
# BM25 (keyword) retriever built over the same chunks that were added to the vector store.
keyword_retriever=BM25Retriever.from_documents(splited_docs)

In [40]:
# Return 3 hits, matching the k configured for the vector retriever.
keyword_retriever.k=3

In [41]:
# Hybrid retriever combining the vector and BM25 retrievers.
# Weights are given in the same order as `retrievers`: 0.3 for the vector
# retriever and 0.7 for the keyword retriever.
ble_retriever=EnsembleRetriever(retrievers=[retriever,keyword_retriever],weights=[0.3,0.7])

In [42]:
ble_retriever.invoke("what is rag?")

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-05T01:26:00+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-05T01:26:00+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'F:\\Tapas\\Learning\\GenAI\\data\\2501.09136v3.pdf', 'total_pages': 39, 'page': 34, 'page_label': '35'}, page_content='Reflexion: Language agents with verbal reinforcement learning, 2023.\n[29] Taicheng Guo, Xiuying Chen, Yaqi Wang, Ruidi Chang, Shichao Pei, Nitesh V . Chawla, Olaf Wiest, and\nXiangliang Zhang. Large language model based multi-agents: A survey of progress and challenges, 2024.\n[30] Weaviate Blog. What is agentic rag? https://weaviate.io/blog/what-is-agentic-rag#:~:text=is%\n20Agentic%20RAG%3F-,%E2%80%8B,of%20the%20non%2Dagentic%20pipeline. Accessed: 2025-01-14.'),
 Document(metadata={'producer': 'pdfTeX-1.4

In [43]:
retriever.invoke("What is rag?")

[Document(id='5ec446d4-d245-46ef-815d-9209f9a7d0be', metadata={'moddate': '2025-02-05T01:26:00+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'page_label': '2', 'total_pages': 39, 'subject': '', 'trapped': '/False', 'keywords': '', 'creationdate': '2025-02-05T01:26:00+00:00', 'page': 1, 'creator': 'LaTeX with hyperref', 'source': 'F:\\Tapas\\Learning\\GenAI\\data\\2501.09136v3.pdf', 'producer': 'pdfTeX-1.40.25', 'author': '', 'title': ''}, page_content='Retrieval-Augmented Generation (RAG) represents a significant advancement in the field of artificial intelligence,\ncombining the generative capabilities of Large Language Models (LLMs) with real-time data retrieval. While LLMs\nhave demonstrated remarkable capabilities in natural language processing, their reliance on static pre-trained data\noften results in outdated or incomplete responses. RAG addresses this limitation by dynamically retrieving relevant'),
 Docume

In [44]:
keyword_retriever.invoke("What is rag?")

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-05T01:26:00+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-05T01:26:00+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'F:\\Tapas\\Learning\\GenAI\\data\\2501.09136v3.pdf', 'total_pages': 39, 'page': 34, 'page_label': '35'}, page_content='Reflexion: Language agents with verbal reinforcement learning, 2023.\n[29] Taicheng Guo, Xiuying Chen, Yaqi Wang, Ruidi Chang, Shichao Pei, Nitesh V . Chawla, Olaf Wiest, and\nXiangliang Zhang. Large language model based multi-agents: A survey of progress and challenges, 2024.\n[30] Weaviate Blog. What is agentic rag? https://weaviate.io/blog/what-is-agentic-rag#:~:text=is%\n20Agentic%20RAG%3F-,%E2%80%8B,of%20the%20non%2Dagentic%20pipeline. Accessed: 2025-01-14.'),
 Document(metadata={'producer': 'pdfTeX-1.4

In [45]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt for the RAG chains: a single human message with two placeholders,
# {question} (the user's query) and {context} (whatever the retriever returns).
message = """
Answer the question below using only the context provided.

{question}

Context:
{context}
"""

prompt_template = ChatPromptTemplate.from_messages([("human", message)])

In [46]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a clear message when the key is absent: assigning a missing
# (None) value into os.environ would raise an opaque TypeError, and when the
# key is present no re-assignment is needed.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )

In [47]:
from langchain_openai import ChatOpenAI
llm=ChatOpenAI(model='gpt-4o')

In [48]:
from langchain_core.runnables import RunnablePassthrough

In [49]:
from langchain_core.output_parsers import StrOutputParser

In [51]:
# Hybrid RAG chain: the incoming question is sent to ble_retriever to fill
# {context} and passed through unchanged as {question}; the filled prompt goes
# to the LLM and StrOutputParser turns the reply into a plain string.
# NOTE(review): the retriever output is placed into the prompt as-is —
# presumably the Document objects are stringified including their metadata;
# confirm whether a formatting step is wanted.
rag_hybrid_chain = (
    {"context": ble_retriever, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

In [52]:
response=rag_hybrid_chain.invoke("What is RAG?")
response

'RAG, or Retrieval-Augmented Generation, represents a significant advancement in artificial intelligence. It combines the generative capabilities of Large Language Models (LLMs) with real-time data retrieval. While LLMs are capable in natural language processing, they often rely on static pre-trained data, which can result in outdated or incomplete responses. RAG addresses this limitation by dynamically retrieving relevant data to enhance the quality and relevance of the generated responses.'

In [53]:
print(response)

RAG, or Retrieval-Augmented Generation, represents a significant advancement in artificial intelligence. It combines the generative capabilities of Large Language Models (LLMs) with real-time data retrieval. While LLMs are capable in natural language processing, they often rely on static pre-trained data, which can result in outdated or incomplete responses. RAG addresses this limitation by dynamically retrieving relevant data to enhance the quality and relevance of the generated responses.


In [54]:
# Vector-only RAG chain: same pipeline as the hybrid chain, but {context} is
# filled by the Chroma similarity retriever alone.
rag_vector_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

In [55]:
response=rag_vector_chain.invoke("What is RAG?")
response

'RAG stands for Retrieval-Augmented Generation. It represents a significant advancement in artificial intelligence that combines the generative capabilities of Large Language Models (LLMs) with real-time data retrieval. This approach addresses the limitation of LLMs that rely on static pre-trained data, which can result in outdated or incomplete responses by dynamically retrieving relevant information.'

In [56]:
# Keyword-only RAG chain: same pipeline as the hybrid chain, but {context} is
# filled by the BM25 retriever alone.
rag_keyword_chain = (
    {"context": keyword_retriever, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

In [57]:
response=rag_keyword_chain.invoke("What is RAG?")
response

'RAG in the provided context stands for "Retrieval-Augmented Generation." It is a paradigm that has evolved to address complex real-world applications requiring contextual accuracy, scalability, and multi-step reasoning.'

## Vector search retriever response:
'RAG stands for Retrieval-Augmented Generation. It represents a significant advancement in artificial '
'intelligence that combines the generative capabilities of Large Language Models (LLMs) with real-time '
'data retrieval. This approach addresses the limitation of LLMs that rely on static pre-trained data, '
'which can result in outdated or incomplete responses by dynamically retrieving relevant information.'


## Hybrid search retriever response:
'RAG, or Retrieval-Augmented Generation, represents a significant advancement in artificial'
' intelligence. It combines the generative capabilities of Large Language Models (LLMs) with '
'real-time data retrieval. While LLMs are capable in natural language processing, they often '
'rely on static pre-trained data, which can result in outdated or incomplete responses. '
'RAG addresses this limitation by dynamically retrieving relevant data to enhance the quality '
'and relevance of the generated responses.'


## Keyword search retriever response:
'RAG in the provided context stands for "Retrieval-Augmented Generation." It is a paradigm that '
'has evolved to address complex real-world applications requiring contextual accuracy, scalability, '
'and multi-step reasoning.'


In [60]:
response=rag_hybrid_chain.invoke("What is Abstractive Question Answering?")
response

'The context provided does not contain any information regarding "Abstractive Question Answering." Therefore, I cannot answer the question based on the available content.'