# Getting data

In [1]:
import langchain as lc
print("LangChain version:", lc.__version__)
!python -V

LangChain version: 1.2.6
Python 3.11.0


In [2]:
from datasets import load_dataset

data = load_dataset("jamescalam/ai-arxiv-chunked", split="train")
data

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 41584
})

In [3]:
data.to_pandas().head()

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1910.01108,0,"DistilBERT, a distilled version of BERT: small...",1910.01108,"DistilBERT, a distilled version of BERT: small...",As Transfer Learning from large-scale pre-trai...,http://arxiv.org/pdf/1910.01108,"[Victor Sanh, Lysandre Debut, Julien Chaumond,...",[cs.CL],February 2020 - Revision: fix bug in evaluatio...,,cs.CL,20191002,20200301,[{'id': '1910.01108'}]
1,1910.01108,1,"loss combining language modeling, distillation...",1910.01108,"DistilBERT, a distilled version of BERT: small...",As Transfer Learning from large-scale pre-trai...,http://arxiv.org/pdf/1910.01108,"[Victor Sanh, Lysandre Debut, Julien Chaumond,...",[cs.CL],February 2020 - Revision: fix bug in evaluatio...,,cs.CL,20191002,20200301,[{'id': '1910.01108'}]
2,1910.01108,2,in real-time has the potential to enable novel...,1910.01108,"DistilBERT, a distilled version of BERT: small...",As Transfer Learning from large-scale pre-trai...,http://arxiv.org/pdf/1910.01108,"[Victor Sanh, Lysandre Debut, Julien Chaumond,...",[cs.CL],February 2020 - Revision: fix bug in evaluatio...,,cs.CL,20191002,20200301,[{'id': '1910.01108'}]
3,1910.01108,3,through distillation via the supervision of a ...,1910.01108,"DistilBERT, a distilled version of BERT: small...",As Transfer Learning from large-scale pre-trai...,http://arxiv.org/pdf/1910.01108,"[Victor Sanh, Lysandre Debut, Julien Chaumond,...",[cs.CL],February 2020 - Revision: fix bug in evaluatio...,,cs.CL,20191002,20200301,[{'id': '1910.01108'}]
4,1910.01108,4,generalization capabilities of the model and h...,1910.01108,"DistilBERT, a distilled version of BERT: small...",As Transfer Learning from large-scale pre-trai...,http://arxiv.org/pdf/1910.01108,"[Victor Sanh, Lysandre Debut, Julien Chaumond,...",[cs.CL],February 2020 - Revision: fix bug in evaluatio...,,cs.CL,20191002,20200301,[{'id': '1910.01108'}]


In [4]:
from langchain_classic.docstore.document import Document

docs = []

for row in data:
    doc = Document(
        page_content=row["chunk"],
        metadata={
            "title": row["title"],
            "source": row["source"],
            "id": row["id"],
            "chunk-id": row["chunk-id"],
            "text": row["chunk"]
        }
    )
    docs.append(doc)

In [5]:
docs[0]

Document(metadata={'title': 'DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter', 'source': 'http://arxiv.org/pdf/1910.01108', 'id': '1910.01108', 'chunk-id': '0', 'text': 'DistilBERT, a distilled version of BERT: smaller,\nfaster, cheaper and lighter\nVictor SANH, Lysandre DEBUT, Julien CHAUMOND, Thomas WOLF\nHugging Face\n{victor,lysandre,julien,thomas}@huggingface.co\nAbstract\nAs Transfer Learning from large-scale pre-trained models becomes more prevalent\nin Natural Language Processing (NLP), operating these large models in on-theedge and/or under constrained computational training or inference budgets remains\nchallenging. In this work, we propose a method to pre-train a smaller generalpurpose language representation model, called DistilBERT, which can then be ﬁnetuned with good performances on a wide range of tasks like its larger counterparts.\nWhile most prior work investigated the use of distillation for building task-speciﬁc\nmodels, we leverage kn

# Model Setup

In [6]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    task="text-generation",
    max_new_tokens=512,
    temperature=0.1,
)
chat_llm = ChatHuggingFace(llm=llm)

# Embedding and Vector DB Setup

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

prompt = "Represent this sentence for searching relevant passages: "
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
query_encode_kwargs = {"prompt": prompt}

embed = HuggingFaceEmbeddings(
    model_name=model_name, 
    model_kwargs=model_kwargs, 
    encode_kwargs=encode_kwargs,
    query_encode_kwargs=query_encode_kwargs
)

2026-02-26 07:32:39.912969: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
from pinecone import Pinecone

pc = Pinecone()

In [9]:
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [10]:
import time

index_name = "langchain-multi-query-demo"
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384,  # dimensionality of BAAI/bge-small-en
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [11]:
len(docs)

41584

In [12]:
from tqdm.auto import tqdm

batch_size = 100
docs = docs[:20000]  # for demo purposes, only use 20000 records

for i in tqdm(range(0, len(docs), batch_size)):
    i_end = min(len(docs), i+batch_size)
    docs_batch = docs[i:i_end]
    # get IDs
    ids = [f"{doc.metadata['id']}-{doc.metadata['chunk-id']}" for doc in docs_batch]
    # get text and embed
    texts = [d.page_content for d in docs_batch]
    embeds = embed.embed_documents(texts=texts)
    # get metadata
    metadata = [d.metadata for d in docs_batch]
    to_upsert = zip(ids, embeds, metadata)
    index.upsert(vectors=to_upsert)

  0%|          | 0/200 [00:00<?, ?it/s]

# Multi-Query with LangChain

In [13]:
from langchain_pinecone import PineconeVectorStore

# initialize the vector store object
vectorstore = PineconeVectorStore(index=index, embedding=embed)

In [14]:
from langchain_classic.retrievers.multi_query import MultiQueryRetriever

retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=chat_llm
)

In [15]:
# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain_classic.retrievers.multi_query").setLevel(logging.INFO)

In [16]:
question = "tell me about llama 2?"

queried_docs = retriever.invoke(question)
len(queried_docs)

INFO:langchain_classic.retrievers.multi_query:Generated queries: ['Version 1: ', 'What are the key characteristics and features of Llama 2?', 'This version of the question focuses on retrieving documents that describe the essential attributes and capabilities of Llama 2, allowing the user to gain a deeper understanding of its capabilities.', 'Version 2: ', 'Can you provide information about the improvements and advancements made in Llama 2 compared to its predecessor?', 'This version of the question targets documents that highlight the differences and enhancements between Llama 2 and its previous version, enabling the user to identify the new features and capabilities.', 'Version 3: ', 'What are the applications and use cases where Llama 2 can be effectively utilized, and what are its potential benefits in those areas?', 'This version of the question aims to retrieve documents that showcase the practical applications and potential benefits of Llama 2, allowing the user to explore its r

23

In [17]:
queried_docs

[Document(id='2010.07079-76', metadata={'chunk-id': '76', 'id': '2010.07079', 'source': 'http://arxiv.org/pdf/2010.07079', 'title': 'Recipes for Safety in Open-domain Chatbots'}, page_content='1.0 0.1% 0.4% 83.4% 0.1% 0.4% 2.3% 0.187\nBST 2.7B Non-Sequitur (FT) 0.1 1.3% 7.5% 0.2% 0.1% 0.5% 0% 0.186\n0.3 0.9% 5.6% 12.6% 0.1% 0.7% 0% 0.188\n0.5 0.9% 3.3% 29.3% 0.1% 0.7% 0.1% 0.187\n1.0 0.6% 2.1% 49.1% 0.1% 0.7% 0.2% 0.186\n1.5\x030.2% 0.9% 66.1% 0.2% 0.9% 0.2% 0.187\nTable 11: Automatic Safety Metrics for baked-in models , varying the parameter that controls how often safe\nresponses ﬁre. We report the % of the time those responses are produced for different hyperparameter choices\n(Safe%). The models marked with\x03were chosen for human evaluations.\nModel Non-Seq%\nTwo-stage models with classiﬁers\nBST 2.7B + Multi-Turn Safety Cl. 4.9'),
 Document(id='2211.09110-279', metadata={'chunk-id': '279', 'id': '2211.09110', 'source': 'http://arxiv.org/pdf/2211.09110', 'title': 'Holistic Evalua

# Adding the Generation in RAG

In [18]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Create prompt template with source formatting
template = """You are a helpful assistant who answers user queries using the
    contexts provided. If the question cannot be answered using the information
    provided say "I don't know".

    Contexts:
    {contexts}

    Question: {query}"""

prompt = ChatPromptTemplate.from_template(template)

# Create LCEL chain
retrieval_chain = (
    {
        "query": lambda x: x["query"],
        "contexts": lambda x: "\n---\n".join([d.page_content for d in retriever.invoke(x["query"])])
    }
    | prompt
    | chat_llm
    | StrOutputParser()
)

print("Query: ", question)

response = retrieval_chain.invoke({"query": question})
print("Response: ", response)

Query:  tell me about llama 2?


INFO:langchain_classic.retrievers.multi_query:Generated queries: ['Version 1: ', 'What are the key characteristics and features of Llama 2?', 'This version of the question focuses on retrieving documents that describe the essential attributes and capabilities of Llama 2, rather than just its name. This can help to overcome the limitation of distance-based similarity search, which may not always return relevant results if the query is too specific or contains typos.', 'Version 2: ', 'Can you provide information about the second generation of Llama models?', "This version of the question adds more context to the original query, specifying that it's about the second generation of Llama models. This can help to disambiguate the query and retrieve documents that are more relevant to the specific topic.", 'Version 3: ', 'What are the differences and improvements between Llama 2 and its predecessor?', 'This version of the question focuses on the comparison between Llama 2 and its previous ver

Response:  I don't know.


In [19]:
pc.delete_index(index_name)