In [4]:
!pip install --upgrade --quiet pinecone-client pinecone pinecone-text pinecone-notebooks "numpy>=2.0.0,<3.0.0"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [23]:
!pip install --upgrade --quiet langchain-huggingface sentence-transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [10]:
from tqdm.auto import tqdm as notebook_tqdm

In [12]:
import streamlit as st

# Read the Pinecone API key from Streamlit's secrets mapping instead of hardcoding it.
# NOTE(review): presumably this relies on a local Streamlit secrets file that defines
# PINECONE_KEY — confirm it is available wherever this notebook is run.
api_key = st.secrets["PINECONE_KEY"]

In [13]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [17]:
from pinecone import Pinecone, ServerlessSpec
# Name of the Pinecone index that the following cells create (if needed) and open.
index_name = "langchain-hybrid-search"

### Initialize Pinecone client

In [14]:
# Build the Pinecone client, authenticating with the API key read earlier.
pc = Pinecone(api_key=api_key)

### Create the index if it does not already exist

In [18]:
# Create the index only when no index with this name exists yet.
# list_indexes() yields index description objects rather than plain strings, so the
# membership test has to run against .names(); comparing the name with the raw
# listing never matches and would re-issue create_index on every run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # size of the dense vectors stored in this index
        metric="dotproduct",  # sparse values are supported only for dotproduct
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [22]:
# Open a handle to the index; it may have been created just now or on an earlier run,
# so the message says "ready" rather than "created".
index = pc.Index(index_name)
# Interpolate only the name: formatting `index_name, index` would print a tuple
# that includes the handle's raw object repr.
print(f"Index {index_name!r} is ready.")
# Last expression: the index statistics are shown as the cell output.
index.describe_index_stats()

Index ('langchain-hybrid-search', <pinecone.data.index.Index object at 0x717b7a8ba240>) created.


{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [None]:
from pinecone_text.sparse import BM25Encoder  # sparse (keyword-based) vectorization

# BM25 encoder for the sparse half of hybrid search.
# default() is a factory exposed on the class, so it is called on BM25Encoder
# directly instead of on a throwaway BM25Encoder() instance.
bm25_encoder = BM25Encoder.default()

In [28]:
# Sanity check that the encoder object exists; this prints its default object repr.
print(bm25_encoder)

<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x717b0bc079e0>


In [39]:
import nltk

# Fetch the "punkt_tab" NLTK resource with download logging suppressed.
# NOTE(review): presumably required by the tokenizer behind the BM25 encoder — confirm.
nltk.download("punkt_tab", quiet=True)


True

In [40]:

# Small demo corpus used both to fit the sparse encoder and to populate the index.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "To be or not to be, that is the question.",
    "All that glitters is not gold.",
    "The only thing we have to fear is fear itself.",
]

# File that holds the fitted BM25 parameters.
bm25_params_path = "bm25_values.json"

# Fit the BM25 encoder on the demo corpus.
bm25_encoder.fit(sentences)

# Persist the fitted parameters as JSON ...
bm25_encoder.dump(bm25_params_path)

# ... and read them back; as the last expression, load()'s return value is the cell output.
bm25_encoder.load(bm25_params_path)

100%|██████████| 5/5 [00:00<00:00, 209.55it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x717b0bc079e0>

In [43]:
from langchain_huggingface import HuggingFaceEmbeddings

# Dense side: LangChain embeddings wrapper around a sentence-transformers model.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Hybrid retriever: dense embeddings + BM25 sparse encoder over the Pinecone index.
retriever = PineconeHybridSearchRetriever(
    index=index,
    sparse_encoder=bm25_encoder,
    embeddings=hf_embeddings,
)

# Display the configured retriever as the cell output.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x717b0bc079e0>, index=<pinecone.data.index.Index object at 0x717b7a8ba240>)

In [44]:
# Add the demo sentences through the retriever.
# NOTE(review): presumably this encodes each text (dense + sparse) and upserts it
# into the Pinecone index — confirm against the retriever's documentation.
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:01<00:00,  1.46s/it]


In [None]:
# Run a hybrid query; the returned Document list (with a score in each metadata)
# is shown as the cell output.
retriever.invoke(
    "What is the only thing we have to fear?",
    # NOTE(review): result-count / weighting options are not passed here; the
    # keyword names previously sketched in this spot (k, sparse_k, dense_k,
    # hybrid_k, include_metadata) were never verified against the retriever
    # API — confirm the supported settings before adding any.
)

[Document(metadata={'score': 0.733545}, page_content='The only thing we have to fear is fear itself.'),
 Document(metadata={'score': 0.147580177}, page_content='To be or not to be, that is the question.'),
 Document(metadata={'score': 0.0415346622}, page_content='A journey of a thousand miles begins with a single step.'),
 Document(metadata={'score': 0.032869257}, page_content='The quick brown fox jumps over the lazy dog.')]