# Getting the data for our knowledge base

In [1]:
import platform

import langchain as lc
import pinecone

# The pinecone module is intentionally not aliased as `pc`: a later cell binds
# `pc` to the Pinecone client instance, and sharing the name would make `pc`
# mean two different things depending on which cell ran last.
print("LangChain version:", lc.__version__)
print("Pinecone version:", pinecone.__version__)
# Report the interpreter running this kernel; a shell `python -V` can resolve
# to a different executable on PATH.
print("Python", platform.python_version())

LangChain version: 1.2.6
Pinecone version: 7.3.0
Python 3.11.0


In [2]:
from datasets import load_dataset

# load the "train" split of the aurelio-ai/reddit-finance dataset and display
# its summary (feature names and row count)
data = load_dataset("aurelio-ai/reddit-finance", split="train")
data

Dataset({
    features: ['id', 'subreddit', 'title', 'selftext'],
    num_rows: 107
})

In [3]:
# inspect the first record to see what a single post looks like
data[0]

{'id': '1j0w73o',
 'subreddit': 'stocks',
 'title': 'Rate My Portfolio - r/Stocks Quarterly Thread March 2025',
 'selftext': "Please use this thread to discuss your portfolio, learn of other stock tickers &amp; portfolios like [Warren Buffet's](https://buffett.online/en/portfolio/), and help out users by giving constructive criticism.\n\nWhy quarterly?  Public companies report earnings quarterly; many investors take this as an opportunity to rebalance their portfolios.  We highly recommend you do some reading:  Check out our wiki's list of [relevant posts &amp; book recommendations.](https://www.reddit.com/r/stocks/wiki/index/#wiki_relevant_posts.2C_books.2C_wiki_recommendations)\n\nYou can find stocks on your own by using a scanner like your broker's or [Finviz.](https://finviz.com/screener.ashx)  To help further, here's a list of [relevant websites.](https://www.reddit.com/r/stocks/wiki/index/#wiki_relevant_websites.2Fapps)\n\nIf you don't have a broker yet, see our [list of brokers]

# Creating Chunks

In [4]:
import tiktoken

# The notebook's standard model is gpt-4.1-mini, but tiktoken does not support
# gpt-4.1. The 4.1 and 4o models share the same underlying tokenizer, so we
# look up the encoding for gpt-4o instead.
tokenizer = tiktoken.encoding_for_model("gpt-4o")

In [5]:
# show which encoding tiktoken resolved for the model requested above
tokenizer.name

'o200k_base'

In [6]:
def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the notebook's `tokenizer`.

    `disallowed_special=()` is forwarded to `tokenizer.encode`; per tiktoken's
    documentation this lets text that happens to contain special-token strings
    be encoded as ordinary text instead of raising an error.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


# quick sanity check of the length function on a short sentence
tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

27

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk sizes and overlap are measured with tiktoken_len, i.e. in tokens
# rather than characters. Separators are listed from coarsest (paragraph
# break) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=300,
    chunk_overlap=20,
)

2026-02-12 07:31:53.892385: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# split the body text of one post (record 5) to see what the splitter produces
chunks = text_splitter.split_text(data[5]['selftext'])
chunks

['Alphabet\xa0reported Thursday that Waymo, its autonomous vehicle unit, is now delivering more than 250,000 paid robotaxi rides per week in the U.S.\n\nCEO Sundar Pichai said Waymo has options in terms of “business models across geographies,” and the robotaxi company is building partnerships with ride-hailing app Uber, automakers and operations and maintenance businesses that tend to its vehicle fleets.\n\n“We can’t possibly do it all ourselves,” said Pichai on a call with analysts for Alphabet’s\xa0first-quarter earnings.\xa0\n\nPichai noted that Waymo has not entirely defined its long-term business model, and there is “future optionality around personal ownership” of vehicles equipped with Waymo’s self-driving technology. The company is also exploring the ways it can scale up its operations, he said.\n\nThe 250,000 paid rides per week are up from 200,000 in February, before Waymo opened in\xa0Austin\xa0and expanded in the\xa0San Francisco Bay Area\xa0in March.\xa0\n\nWaymo, which is

In [9]:
# token counts of the first two chunks, to compare against the splitter's
# chunk_size of 300
tiktoken_len(chunks[0]), tiktoken_len(chunks[1])

(279, 291)

# Creating Embeddings

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings

# Instruction prepended to queries (not to documents) before embedding.
# Named `query_instruction` rather than `prompt` so it is not confused with
# the chat prompt template built later in the notebook.
query_instruction = "Represent this sentence for searching relevant passages: "
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
# When query_encode_kwargs is non-empty it is used *instead of* encode_kwargs
# for embed_query, so the document-side settings (normalisation) are repeated
# here explicitly. This keeps query and document vectors on the same scale.
query_encode_kwargs = {**encode_kwargs, "prompt": query_instruction}

embed = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_encode_kwargs=query_encode_kwargs,
)

In [11]:
# two short sample strings to check the embedding model end to end
texts = [
    "this is the first chunk of text",
    "then another second chunk of text is here",
]

# embed them and show (number of vectors, length of the first vector)
res = embed.embed_documents(texts)
len(res), len(res[0])

(2, 384)

# Vector Database

In [12]:
from pinecone import Pinecone

# create the Pinecone client; no api_key is passed here, so it is presumably
# picked up from the environment (e.g. PINECONE_API_KEY) — TODO confirm
pc = Pinecone()

In [13]:
from pinecone import AwsRegion, CloudProvider, Metric, ServerlessSpec

index_name = 'langchain-retrieval-augmentation'

# check if index already exists (it shouldn't if this is first time)
if not pc.has_index(name=index_name):
    # if does not exist, create index
    pc.create_index(
        name=index_name,
        dimension=384,  # embedding size of BAAI/bge-small-en (see the (2, 384) check above)
        metric=Metric.DOTPRODUCT,
        spec=ServerlessSpec(
            cloud=CloudProvider.AWS,
            region=AwsRegion.US_EAST_1
        )
    )

# connect to index
index = pc.Index(name=index_name)
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

## Indexing

In [14]:
from tqdm.auto import tqdm
from uuid import uuid4

# Soft threshold: all chunks of a record are added together, so a batch can
# grow somewhat past this number before it is flushed.
batch_limit = 100


def upsert_batch(batch_texts, batch_metadatas):
    """Embed one batch of chunks and upsert the vectors into the index.

    Every chunk is given a fresh random UUID as its vector id, paired with its
    embedding and its metadata dict.
    """
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    # materialise the (id, values, metadata) tuples as a list rather than
    # handing a one-shot zip iterator to the client
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


texts = []
metadatas = []

for record in tqdm(data):
    # first get metadata fields for this record
    url = f"https://reddit.com/r/{record['subreddit']}/comments/{record['id']}"
    metadata = {
        'thread_id': str(record['id']),
        'source': url,
        'subreddit': record['subreddit']
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['selftext'])

    # create individual metadata dicts for each chunk; the chunk text itself
    # is stored under "text" alongside its position within the record
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]

    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    # once the batch_limit is reached, embed + upsert and start a new batch
    if len(texts) >= batch_limit:
        upsert_batch(texts, metadatas)
        texts = []
        metadatas = []

# flush whatever is left over after the last record
if texts:
    upsert_batch(texts, metadatas)

  0%|          | 0/107 [00:00<?, ?it/s]

In [15]:
# view the index stats again now that the chunks have been upserted
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {'': {'vector_count': 170}},
 'total_vector_count': 170,
 'vector_type': 'dense'}

## Creating a Vector Store and Querying

In [16]:
from langchain_pinecone import PineconeVectorStore

# initialize the vector store object
# NOTE(review): no text_key is passed, so this presumably relies on the
# library default matching the "text" metadata field written during
# indexing — confirm against the langchain_pinecone docs
vectorstore = PineconeVectorStore(index=index, embedding=embed)

In [17]:
# question used to test retrieval against the index
query = "how many robotaxi rides did waymo report in the US?"

# run a similarity search that also returns a relevance score per document
vectorstore.similarity_search_with_relevance_scores(
    query,  # our search query
    k=6 # return the 6 most relevant docs
)

[(Document(id='4ba3c603-272d-45cf-a52b-9444ae7c1d7b', metadata={'chunk': 0.0, 'source': 'https://reddit.com/r/stocks/comments/1k7782l', 'subreddit': 'stocks', 'thread_id': '1k7782l'}, page_content='Alphabet\xa0reported Thursday that Waymo, its autonomous vehicle unit, is now delivering more than 250,000 paid robotaxi rides per week in the U.S.\n\nCEO Sundar Pichai said Waymo has options in terms of “business models across geographies,” and the robotaxi company is building partnerships with ride-hailing app Uber, automakers and operations and maintenance businesses that tend to its vehicle fleets.\n\n“We can’t possibly do it all ourselves,” said Pichai on a call with analysts for Alphabet’s\xa0first-quarter earnings.\xa0\n\nPichai noted that Waymo has not entirely defined its long-term business model, and there is “future optionality around personal ownership” of vehicles equipped with Waymo’s self-driving technology. The company is also exploring the ways it can scale up its operations

# Retrieval Augmented Generation

In [18]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Use HuggingFaceEndpoint - automatic routing based on availability
# NOTE(review): no token is passed here, so credentials are presumably read
# from the environment — confirm before running on a fresh machine
llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    task="text-generation",
    max_new_tokens=300,
    temperature=0.7,
)

chat_llm = ChatHuggingFace(llm=llm)


def format_docs(docs):
    """Render retrieved documents as plain text, each labelled with its source.

    Every document becomes a "Source: <url>" line followed by its text, and
    documents are separated by a blank line. The URL is read from the
    "source" metadata field written during indexing; "unknown" is used when
    a document has no such field.
    """
    return "\n\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n{doc.page_content}"
        for doc in docs
    )


# Create prompt template with source formatting
template = """Answer the question based on the following context. Include the source URLs in your answer.

{context}

Question: {question}

Answer:"""

prompt = ChatPromptTemplate.from_template(template)

# Create LCEL chain: the incoming query string is sent to the retriever, the
# retrieved documents are formatted into readable context (rather than the
# raw Document list being stringified into the prompt), and the same query is
# passed through unchanged as the question.
retrieval_chain = (
    {
        "context": vectorstore.as_retriever() | format_docs,
        "question": lambda x: x,
    }
    | prompt
    | chat_llm
    | StrOutputParser()
)

print("Query: ", query)

response = retrieval_chain.invoke(query)

print("Response: ", response)

Query:  how many robotaxi rides did waymo report in the US?
Response:  According to the provided context, Waymo reported delivering more than 250,000 paid robotaxi rides per week in the U.S.

Source: 
https://www.cnbc.com/2025/04/24/waymo-reports-250000-paid-robotaxi-rides-per-week-in-us.html


In [19]:
# delete index, if you want to start fresh
index_name = 'langchain-retrieval-augmentation'
# guard the call so re-running this cell after the index is gone is a no-op
# instead of an error
if pc.has_index(name=index_name):
    pc.delete_index(name=index_name)