Example of doing Vector Search in Redis
Create an account with the Redis Cloud service and obtain the credentials (hostname, port, and password) needed to connect.
Create a database. The free tier provides one database.

In [None]:
# Install redis, pandas, sentence-transformers, tabulate. This may take a few minutes.
!pip install redis pandas sentence-transformers tabulate

In [None]:
# Prepare to prompt for input
from getpass import getpass

In [None]:
# Prompt for the hostname of the Redis DB (the typed value is not echoed)
redisdb = getpass(prompt='Enter Redis DB hostname: ')

In [None]:
# Prompt for the password of the Redis DB (the typed value is not echoed)
redispass = getpass(prompt='Enter Redis DB password: ')

In [None]:
# Enter port for the Redis DB
# Convert to int straight away: a port is numeric, and a typo then fails here
# with a ValueError instead of later when the connection is opened.
redisport = int(getpass('Enter Redis DB port: '))

In [None]:
# Import the required libraries. This may take a few minutes.
import json
import time

import numpy as np
import pandas as pd
import redis
import requests
from redis.commands.search.field import (
    NumericField,
    TagField,
    TextField,
    VectorField,
)
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from sentence_transformers import SentenceTransformer

In [None]:
# Load the example json "bikes" dataset
url = "https://raw.githubusercontent.com/bsbodden/redis_vss_getting_started/main/data/bikes.json"
# Bound the wait so a stalled connection cannot hang the notebook indefinitely
response = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than failing later while decoding an error page as JSON
response.raise_for_status()
bikes = response.json()

In [None]:
# Take a look at the first record
# print() renders the indentation and line breaks; a bare expression would
# display the string repr with escaped "\n" sequences instead.
print(json.dumps(bikes[0], indent=2))

In [None]:
# Specify the pre-trained model to use to generate embeddings. This may take a few minutes.
# SentenceTransformer is already imported together with the other libraries.
embedder = SentenceTransformer('msmarco-distilbert-base-v4')

In [None]:
# Open a connection to the Redis DB and ping it to confirm it is reachable
r = redis.Redis(
    host=redisdb,
    port=redisport,
    password=redispass,
    decode_responses=True,  # return str instead of bytes
)
res = r.ping()
print(res)


In [None]:
# Store every bike as a JSON document, batching all writes in a single pipeline.
# Keys are numbered from 1 and zero-padded to three digits, e.g. "bikes:001".
pipeline = r.pipeline()
for i, bike in enumerate(bikes, start=1):
    pipeline.json().set(name=f"bikes:{i:03}", path="$", obj=bike)
res = pipeline.execute()

# Spot-check one of the stored records
res = r.json().get("bikes:010", "$.model")
print(res)


In [None]:
# Gather the Redis keys of the bikes dataset, in sorted order
keys = r.keys("bikes:*")
keys.sort()

In [None]:
# Fetch the description of every bike and encode it as an embedding vector
descriptions = []
for matches in r.json().mget(keys, "$.description"):
    # Each entry is the list of matches for one key; flatten them into one list
    descriptions.extend(matches)
encoded = embedder.encode(descriptions)
embeddings = encoded.astype(np.float32).tolist()

# Record the length of the embedding vectors
VECTOR_DIMENSION = len(embeddings[0])
print(f"Embedding dimension: {VECTOR_DIMENSION}")


In [None]:
# Attach each embedding to its bike document under "$.description_embeddings",
# sending all updates in one pipeline
pipeline = r.pipeline()
for key, embedding in zip(keys, embeddings):
    pipeline.json().set(name=key, path="$.description_embeddings", obj=embedding)
pipeline.execute()

In [None]:
# Spot-check one document to confirm its embedding was stored
sample_key = "bikes:010"
res = r.json().get(sample_key, "$.description_embeddings")
print(res)

In [None]:
# Vector field over the stored embeddings: FLAT index, cosine distance
vector_field = VectorField(
    "$.description_embeddings",
    "FLAT",
    {"TYPE": "FLOAT32", "DIM": VECTOR_DIMENSION, "DISTANCE_METRIC": "COSINE"},
    as_name="vector",
)

# Searchable fields of each bike document; as_name is the alias used in queries
schema = (
    TextField("$.model", no_stem=True, as_name="model"),
    TextField("$.brand", no_stem=True, as_name="brand"),
    NumericField("$.price", as_name="price"),
    TagField("$.type", as_name="type"),
    TextField("$.description", as_name="description"),
    vector_field,
)

# Index only JSON documents whose key starts with "bikes:"
definition = IndexDefinition(prefix=["bikes:"], index_type=IndexType.JSON)
bikes_index = r.ft("idx:bikes_vss")
res = bikes_index.create_index(fields=schema, definition=definition)
print(res)

In [None]:
# Inspect the index to confirm that the documents were indexed
info = r.ft("idx:bikes_vss").info()
num_docs, indexing_failures = info["num_docs"], info["hash_indexing_failures"]
print(f"Number of documents indexed: {num_docs}")
print(f"Number of indexing failures: {indexing_failures}")

In [None]:
# Run a query that matches bikes of a certain brand
query = Query("@brand:Peaknetic")
search_result = r.ft("idx:bikes_vss").search(query)
res = search_result.docs
print(res)

In [None]:
# Same brand filter, but limit the result to selected fields
returned_fields = ("id", "brand", "model", "price")
query = Query("@brand:Peaknetic").return_fields(*returned_fields)
res = r.ft("idx:bikes_vss").search(query).docs
print(res)

In [None]:
# Combine two conditions: the brand filter plus a price between 0 and 1000
query = Query("@brand:Peaknetic @price:[0 1000]")
query = query.return_fields("id", "brand", "model", "price")
search_result = r.ft("idx:bikes_vss").search(query)
res = search_result.docs
print(res)

In [None]:
# Create a list of queries and generate embeddings for each
# These are natural-language search phrases; each one is encoded with the same
# embedder that was used for the bike descriptions.
queries = [
    "Bike for small kids",
    "Best Mountain bikes for kids",
    "Cheap Mountain bike for kids",
    "Female specific mountain bike",
    "Road bike for beginners",
    "Commuter bike for people over 60",
    "Comfortable commuter bike",
    "Good bike for college students",
    "Mountain bike for beginners",
    "Vintage bike",
    "Comfortable city bike",
]

# One embedding per query, in the same order as `queries`
encoded_queries = embedder.encode(queries)
# Display the number of encoded queries as the cell output
len(encoded_queries)

In [None]:
# Run the queries against the Redis DB
def create_query_table(query, queries, encoded_queries, extra_params=None):
    """Run ``query`` once per encoded query and print the hits as a Markdown table.

    The search runs against the ``idx:bikes_vss`` index through the notebook's
    Redis connection ``r``.

    Args:
        query: redis-py ``Query`` that takes a ``$query_vector`` parameter and
            returns the ``vector_score``, ``brand``, ``model`` and
            ``description`` fields for each hit.
        queries: Human-readable query strings, aligned index-for-index with
            ``encoded_queries``; used to label the rows.
        encoded_queries: One embedding per entry of ``queries``.
        extra_params: Optional mapping of additional query parameters
            (e.g. ``{"range": 0.55}``). Its entries take precedence over
            ``query_vector`` when keys collide.

    Returns:
        None. The table, or a short notice when nothing matched, is printed.
    """
    results_list = []
    for i, encoded_query in enumerate(encoded_queries):
        query_params = {
            # The vector is passed to Redis as raw float32 bytes
            "query_vector": np.array(encoded_query, dtype=np.float32).tobytes(),
            **(extra_params or {}),
        }
        result_docs = r.ft("idx:bikes_vss").search(query, query_params).docs
        for doc in result_docs:
            # The index uses cosine distance; report 1 - distance so higher is better
            vector_score = round(1 - float(doc.vector_score), 2)
            results_list.append(
                {
                    "query": queries[i],
                    "score": vector_score,
                    "id": doc.id,
                    "brand": doc.brand,
                    "model": doc.model,
                    "description": doc.description,
                }
            )

    # Without any hits the frame would have no columns and sorting would raise KeyError
    if not results_list:
        print("No results found.")
        return

    # Convert the table to Markdown using Pandas
    queries_table = pd.DataFrame(results_list).sort_values(
        by=["query", "score"], ascending=[True, False]
    )
    # Show the query text only on the first row of each group
    queries_table["query"] = queries_table.groupby("query")["query"].transform(
        lambda x: [x.iloc[0]] + [""] * (len(x) - 1)
    )
    # Truncate long descriptions to 500 characters, including the ellipsis
    queries_table["description"] = queries_table["description"].apply(
        lambda x: (x[:497] + "...") if len(x) > 500 else x
    )
    print(queries_table.to_markdown(index=False))
    


In [None]:
# Vector similarity search: the 3 nearest neighbours of each query vector
# among all documents, ordered by vector_score
query = Query("(*)=>[KNN 3 @vector $query_vector AS vector_score]")
query = query.sort_by("vector_score")
query = query.return_fields("vector_score", "id", "brand", "model", "description")
query = query.dialect(2)

create_query_table(query, queries, encoded_queries)


In [None]:

# Hybrid query: filter on the brand text field first, then take the
# 3 nearest neighbours of the query vector among the matching documents
hybrid_fields = ("vector_score", "id", "brand", "model", "description")
hybrid_query = Query(
    "(@brand:Peaknetic)=>[KNN 3 @vector $query_vector AS vector_score]"
)
hybrid_query = (
    hybrid_query.sort_by("vector_score").return_fields(*hybrid_fields).dialect(2)
)
create_query_table(hybrid_query, queries, encoded_queries)


In [None]:

# Range query: match documents whose vector lies within $range of the query
# vector, yield the distance as vector_score, and return the first 4 results
range_query = Query(
    "@vector:[VECTOR_RANGE $range $query_vector]=>{$YIELD_DISTANCE_AS: vector_score}"
)
range_query = range_query.sort_by("vector_score")
range_query = range_query.return_fields(
    "vector_score", "id", "brand", "model", "description"
)
range_query = range_query.paging(0, 4).dialect(2)

# Only the first query is used here, with the range parameter set to 0.55
create_query_table(range_query, queries[:1], encoded_queries[:1], {"range": 0.55})

In [None]:
# Cleanup (delete all keys in DB)
# WARNING: this is not limited to the "bikes:" keys written by this notebook;
# do not run it against a database that also holds other data.
r.flushdb()