In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, MatchAny, FieldCondition, Filter, Prefetch, FusionQuery

import pandas as pd
import openai
import json
import tiktoken

### Load Amazon dataset (Items)

In [None]:
df_items = pd.read_json("../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

In [None]:
df_items_sample = df_items.sample(n=50, random_state=25)

### Define functions to preprocess the title and features data and to extract the URL of the first large image in the images list


In [None]:
def preprocess_items_data(row):
    """Build the text representation of one item.

    The row's ``title`` is followed by a single space and then by all
    entries of ``features`` joined with single spaces.
    """
    joined_features = " ".join(row["features"])
    return "{} {}".format(row["title"], joined_features)

In [None]:
def extract_first_large_image(row):
    """Return the ``'large'`` entry of the row's first image, or ``''``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose ``images``
            entry is expected to be a sequence of dicts.

    Returns:
        The value stored under ``'large'`` in the first image dict. An empty
        string is returned when there is no first image (empty or missing
        ``images``) or when the first image has no ``'large'`` key.
    """
    try:
        first_image = row['images'][0]
    except (IndexError, KeyError, TypeError):
        # Items without images (empty list, None, NaN) would otherwise crash
        # the whole DataFrame.apply; use the same '' fallback as a missing key.
        return ''
    return first_image.get('large', '')

In [None]:
df_items_sample["preprocessed_data"] = df_items_sample.apply(preprocess_items_data, axis=1)
df_items_sample["first_large_image"] = df_items_sample.apply(extract_first_large_image, axis=1)

In [None]:
df_items_sample.head(2)

### Load Amazon dataset (reviews)

In [None]:
df_reviews = pd.read_json("../data/Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

In [None]:
len(df_reviews)

In [None]:
# Keep only the reviews that belong to the sampled items. The explicit .copy()
# makes this an independent DataFrame, so the column assignments performed on
# df_reviews_sample in later cells do not raise pandas' SettingWithCopyWarning.
df_reviews_sample = df_reviews[df_reviews['parent_asin'].isin(df_items_sample['parent_asin'])].copy()

In [None]:
df_reviews_sample.head(2)

In [None]:
len(df_reviews_sample)

### Define functions to preprocess reviews data

In [None]:
def preprocess_reviews_data(row):
    """Combine a review's ``title`` and ``text`` into one space-separated string."""
    return "{} {}".format(row["title"], row["text"])

In [None]:
def token_count(row, model="text-embedding-3-small"):
    """Count the tokens in ``row["preprocessed_data"]``.

    The text is encoded with the tiktoken encoding registered for ``model``
    and the number of resulting tokens is returned.
    """
    tokens = tiktoken.encoding_for_model(model).encode(row["preprocessed_data"])
    return len(tokens)

In [None]:
df_reviews_sample["preprocessed_data"] = df_reviews_sample.apply(preprocess_reviews_data, axis=1)

In [None]:
df_reviews_sample["preprocessed_data_token_count"] = df_reviews_sample.apply(token_count, axis=1)

In [None]:
# Keep only the reviews whose token count is below 8192.
# NOTE(review): 8192 is presumably tied to the embedding model's input limit — confirm.
df_reviews_sample = df_reviews_sample[df_reviews_sample["preprocessed_data_token_count"] < 8192]

In [None]:
df_reviews_sample.head(2)

### Create a new Qdrant collection for items

In [None]:
qdrant_client = QdrantClient(url="http://localhost:6333")

In [None]:
# Create the items collection only when it is missing, so that re-running the
# notebook against a Qdrant instance that already has it does not fail.
if not qdrant_client.collection_exists(collection_name="Amazon-items-collection-02-items"):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-02-items",
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

In [None]:
# Index the "text" payload field of the items collection with the TEXT schema;
# retrieve_data filters on this field with MatchText.
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-items",
    field_name="text",
    field_schema=PayloadSchemaType.TEXT
)

In [None]:
# Index the "parent_asin" payload field of the items collection as a KEYWORD field.
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-items",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

### Create a new Qdrant collection for reviews

In [None]:
# Create the reviews collection only when it is missing, so that re-running the
# notebook against a Qdrant instance that already has it does not fail.
if not qdrant_client.collection_exists(collection_name="Amazon-items-collection-02-reviews"):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-02-reviews",
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

In [None]:
# Index the "parent_asin" payload field of the reviews collection as a KEYWORD
# field; retrieve_prefiltered_reviews_data filters on it with MatchAny.
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-reviews",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

### Embedding functions

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """Embed a single string and return its embedding.

    ``text`` is passed to ``openai.embeddings.create`` as a one-element input
    list; the ``embedding`` of the first entry in the response data is returned.
    """
    result = openai.embeddings.create(model=model, input=[text])
    first_item = result.data[0]
    return first_item.embedding

In [None]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    """Embed a list of texts, sending at most ``batch_size`` texts per request.

    Args:
        text_list: Texts to embed.
        model: Model name passed to ``openai.embeddings.create``.
        batch_size: Maximum number of texts per request; must be positive.

    Returns:
        A list with the ``embedding`` of every entry in the response data,
        with the batches concatenated in input order. An empty ``text_list``
        yields an empty list without calling the API.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A non-positive step would either crash range() or silently skip
        # every text, so reject it up front.
        raise ValueError("batch_size must be a positive integer")

    total = len(text_list)
    if total == 0:
        # Nothing to embed; do not send an empty input list to the API.
        return []

    if total <= batch_size:
        # Everything fits into a single request; no progress output needed.
        response = openai.embeddings.create(input=text_list, model=model)
        return [item.embedding for item in response.data]

    all_embeddings = []
    for start in range(0, total, batch_size):
        batch = text_list[start:start + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend(item.embedding for item in response.data)
        # The last batch may be shorter than batch_size, so report the number
        # of texts actually processed instead of a multiple of batch_size.
        print(f"Processed {start + len(batch)} of {total}")

    return all_embeddings

### Embed the text data and add additional fields to the payload of each vector (items)

In [None]:
data_to_embed_items = df_items_sample[["preprocessed_data", "first_large_image", "rating_number", "price", "average_rating", "parent_asin"]].to_dict(orient="records")

In [None]:
data_to_embed_items[0]

In [None]:
text_to_embed_items = [data["preprocessed_data"] for data in data_to_embed_items]

In [None]:
embeddings_items = get_embeddings_batch(text_to_embed_items)

In [None]:
# Show only how many embeddings were produced; displaying the full list of
# vectors would flood the notebook output.
len(embeddings_items)

In [None]:
# Build one point per item: the vector is the item's embedding, the payload
# holds the embedded text plus the item's metadata, and IDs count up from 1.
pointstructs = [
    PointStruct(
        id=point_id,
        vector=vector,
        payload={
            "text": record["preprocessed_data"],
            "first_large_image": record["first_large_image"],
            "average_rating": record["average_rating"],
            "rating_number": record["rating_number"],
            "price": record["price"],
            "parent_asin": record["parent_asin"],
        },
    )
    for point_id, (vector, record) in enumerate(
        zip(embeddings_items, data_to_embed_items), start=1
    )
]

In [None]:
# Preview only the first two points; every point carries a full embedding
# vector, so displaying the whole list would flood the notebook output.
pointstructs[:2]

In [None]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-02-items",
    wait=True,
    points=pointstructs
)

### Embed the text data and add additional fields to the payload of each vector (reviews)

In [None]:
data_to_embed_reviews = df_reviews_sample[["preprocessed_data", "parent_asin"]].to_dict(orient="records")

In [None]:
text_to_embed_reviews = [data["preprocessed_data"] for data in data_to_embed_reviews]

In [None]:
embeddings_reviews = get_embeddings_batch(text_to_embed_reviews)

In [None]:
len(embeddings_reviews)

In [None]:
# Build one point per review: the vector is the review's embedding, the
# payload holds the embedded text and the parent ASIN, and IDs count up from 1.
pointstructs = [
    PointStruct(
        id=point_id,
        vector=vector,
        payload={
            "text": record["preprocessed_data"],
            "parent_asin": record["parent_asin"],
        },
    )
    for point_id, (vector, record) in enumerate(
        zip(embeddings_reviews, data_to_embed_reviews), start=1
    )
]

In [None]:
# Preview only the first two points; every point carries a full embedding
# vector, so displaying the whole list would flood the notebook output.
pointstructs[:2]

In [None]:
# Upsert the review points into Qdrant in batches of batch_size_qdrant.
batch_size_qdrant = 100
total_points = len(pointstructs)
for start in range(0, total_points, batch_size_qdrant):
    batch = pointstructs[start:start + batch_size_qdrant]
    qdrant_client.upsert(
        collection_name="Amazon-items-collection-02-reviews",
        wait=True,
        points=batch
    )
    # The last batch may be shorter than batch_size_qdrant, so report the
    # number of points actually sent rather than a multiple of the batch size.
    print(f"Processed {start + len(batch)} of {total_points}")

### Perform hybrid search and apply RRF (reciprocal rank fusion) to the retrieved results

In [None]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

def retrieve_data(query, k=5):
    """Query the items collection with two prefetch branches fused by "rrf".

    One branch uses the embedding of ``query`` as its search vector; the
    other has no vector and only applies a ``MatchText`` filter on the
    ``text`` payload field. Each branch is limited to 20 points, and the
    fused query is limited to ``k`` points.

    Returns:
        Whatever ``qdrant_client.query_points`` returns for this request.
    """
    vector_branch = Prefetch(query=get_embedding(query), limit=20)

    text_condition = FieldCondition(key="text", match=MatchText(text=query))
    text_branch = Prefetch(filter=Filter(must=[text_condition]), limit=20)

    return qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-items",
        prefetch=[vector_branch, text_branch],
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )

In [None]:
result = retrieve_data("earphones")

In [None]:
result.points

In [None]:
# Collect the parent ASIN stored in the payload of every retrieved point.
parent_asins = [point.payload["parent_asin"] for point in result.points]

In [None]:
parent_asins

### A function to run a search against reviews on a prefiltered set of product IDs

In [None]:
def retrieve_prefiltered_reviews_data(query, parent_asins, k=5):
    """Query the reviews collection, restricted to the given parent ASINs.

    A single prefetch branch searches with the embedding of ``query`` among
    points whose ``parent_asin`` payload matches any value in
    ``parent_asins`` (limit 20). That branch is passed through the "rrf"
    fusion query, and the request is limited to ``k`` points.

    Returns:
        Whatever ``qdrant_client.query_points`` returns for this request.
    """
    query_vector = get_embedding(query)

    asin_condition = FieldCondition(
        key="parent_asin",
        match=MatchAny(any=parent_asins),
    )
    filtered_branch = Prefetch(
        query=query_vector,
        filter=Filter(must=[asin_condition]),
        limit=20,
    )

    return qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-reviews",
        prefetch=[filtered_branch],
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )

In [None]:
reviews = retrieve_prefiltered_reviews_data("bad quality", ['B09NLTDHQ6', 'B098K6N6TX', 'B0C6KBJMHP', 'B0B1DM4Y5C', 'B09Q5W9HPQ'], k=5)

In [None]:
reviews.points

In [None]:
# Run the same "bad quality" query against different subsets of product IDs.
# NOTE(review): reviews_3 is assigned twice below, so the result for
# 'B09Q5W9HPQ' is overwritten by the result for 'B09NLTDHQ6' — confirm whether
# the second assignment was meant to use a different variable name.
reviews_1 = retrieve_prefiltered_reviews_data("bad quality", ['B09NLTDHQ6', 'B098K6N6TX'], k=5)
reviews_2 = retrieve_prefiltered_reviews_data("bad quality", ['B0C6KBJMHP', 'B0B1DM4Y5C'], k=5)
reviews_3 = retrieve_prefiltered_reviews_data("bad quality", ['B09Q5W9HPQ'], k=5)
reviews_3 = retrieve_prefiltered_reviews_data("bad quality", ['B09NLTDHQ6'], k=5)

In [None]:
reviews_1.points

In [None]:
reviews_4 = retrieve_prefiltered_reviews_data("bad", ['B09NLTDHQ6'], k=5)

In [None]:
reviews_4.points