# Hybrid Search walkthrough for better LLM RAG Retrieval

In [1]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


# Keyword Search

In [2]:
from rank_bm25 import BM25Okapi

corpus = [
    "The cat, commonly referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal.",
    "The dog is a domesticated descendant of the wolf.",
    "Humans are the most common and widespread species of primate, and the last surviving species of the genus Homo.",
    "The scientific name Felis catus was proposed by Carl Linnaeus in 1758"
]
tokenized_corpus = [doc.split(" ") for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

In [3]:
query = "The cat"
tokenized_query = query.split(" ")

tokenized_query

['The', 'cat']

In [4]:
doc_scores = bm25.get_scores(tokenized_query)

# scores for documents 1, 2, 3, and 4
print(doc_scores)

[0.92061135 0.20898199 0.         0.18788848]


### We can try with another query.

In [5]:
query_2 = "The dog"
tokenized_query_2 = query_2.split(" ")

tokenized_query_2

['The', 'dog']

In [6]:
doc_scores = bm25.get_scores(tokenized_query_2)

# scores for documents 1, 2, 3, and 4
print(doc_scores)

[0.15633023 1.23067172 0.         0.18788848]


# Semantic Similiarity Search

In [7]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [8]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
2024-08-17 17:34:01.541425: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-17 17:34:01.541568: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-17 17:34:01.696925: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# The documents to encode
corpus = [
    "The cat, commonly referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal.",
    "The dog is a domesticated descendant of the wolf.",
    "Humans are the most common and widespread species of primate, and the last surviving species of the genus Homo.",
    "The scientific name Felis catus was proposed by Carl Linnaeus in 1758"
]

# Calculate embeddings by calling model.encode()
document_embeddings = model.encode(corpus)

# Sanity check
print(document_embeddings.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(4, 384)


In [10]:
query = "The cat"
query_embedding = model.encode(query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
from sentence_transformers.util import cos_sim

# Compute cosine_similarity between documents and query
scores = cos_sim(document_embeddings, query_embedding)
scores

tensor([[0.5716],
        [0.2904],
        [0.0942],
        [0.3157]])

### Rerun the query with the word 'feline'

In [12]:
query_embedding = model.encode("feline")

scores = cos_sim(document_embeddings, query_embedding)

print(scores)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[0.4007],
        [0.3837],
        [0.0966],
        [0.3804]])


# Hybrid Search

## Combining Keyword and Semantic Searching

We can combine the results of BM25 and cosine similarity using Reciprocal Rank Fusion (RRF). RRF is a simple algorithm for combining the rankings of different scoring functions

In [13]:
corpus = [
    "The cat, commonly referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal.",
    "The dog is a domesticated descendant of the wolf.",
    "Humans are the most common and widespread species of primate, and the last surviving species of the genus Homo.",
    "The scientific name Felis catus was proposed by Carl Linnaeus in 1758",
]
query = "The cat"

bm25_ranking = [1, 2, 4, 3] # scores = [0.92932018 0.21121974 0. 0.1901173]
cosine_ranking = [1, 3, 4, 2] # scores = [0.5716, 0.2904, 0.0942, 0.3157]

## First, let’s define functions for RRF and a helper function to convert float scores to int rankings.

In [14]:
import numpy as np

def scores_to_ranking(scores: list[float]) -> list[int]:
    """Convert float scores into int rankings (rank 1 is the best)"""
    return np.argsort(scores)[::-1] + 1


def rrf(keyword_rank: int, semantic_rank: int) -> float:
    """Combine keyword rank and semantic rank into a hybrid score."""
    k = 60
    rrf_score = 1 / (k + keyword_rank) + 1 / (k + semantic_rank)
    return rrf_score

In [15]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def hybrid_search(
    query: str, corpus: list[str], encoder_model: SentenceTransformer
) -> list[int]:
    # bm25
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    tokenized_query = query.split(" ")
    print(f"tokenized query: {tokenized_query}")
    bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(tokenized_query)
    print(f"bm25 scores: {bm25_scores}")
    bm25_ranking = scores_to_ranking(bm25_scores)
    print(f"bm25 ranking: {bm25_ranking}")

    # embeddings
    document_embeddings = model.encode(corpus)
    query_embedding = model.encode(query)
    cos_sim_scores = cos_sim(document_embeddings, query_embedding).flatten().tolist()
    print(f"cosine sim scores: {cos_sim_scores}")
    cos_sim_ranking = scores_to_ranking(cos_sim_scores)
    print(f"cosine sim ranking: {cos_sim_ranking}")

    # combine rankings into RRF scores
    hybrid_scores = []
    for i, doc in enumerate(corpus):
        document_ranking = rrf(bm25_ranking[i], cos_sim_ranking[i])
        print(f"Document {i} has the rrf score {document_ranking}")
        hybrid_scores.append(document_ranking)

    # convert RRF scores into final rankings
    hybrid_ranking = scores_to_ranking(hybrid_scores)
    return hybrid_ranking

In [16]:
hybrid_ranking = hybrid_search(
    query="What is the scientifc name for cats?", corpus=corpus, encoder_model=model
)
print(hybrid_ranking)

tokenized query: ['What', 'is', 'the', 'scientifc', 'name', 'for', 'cats?']
bm25 scores: [0.15633023 0.20898199 0.26805423 0.9185659 ]
bm25 ranking: [4 3 2 1]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cosine sim scores: [0.5001032948493958, 0.3556439280509949, 0.1755552589893341, 0.5748380422592163]
cosine sim ranking: [4 1 2 3]
Document 0 has the rrf score 0.03125
Document 1 has the rrf score 0.032266458495966696
Document 2 has the rrf score 0.03225806451612903
Document 3 has the rrf score 0.032266458495966696
[4 2 3 1]


You'll see the ranking of documents from first (best rank) to last (worst rank) based on the query.