### Import dependencies

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, MatchAny, FieldCondition, Filter, Prefetch, FusionQuery

import pandas as pd
import numpy as np
import openai
import json
import tiktoken

### Retrieve all item IDs from Amazon Items Qdrant Collection

In [None]:
# Client for the Qdrant instance at http://localhost:6333; reused by the
# later Qdrant calls in this notebook.
qdrant_client = QdrantClient(url="http://localhost:6333")

In [None]:
# All-zero 1536-dimensional placeholder vector (as a plain list), passed as
# the query vector in the item lookup below.
dummy_vector = np.zeros(1536).tolist()

In [None]:
# Fetch up to 1000 points from the items collection, returning only the
# "parent_asin" payload field and no vectors.
# NOTE(review): this is a vector query on the placeholder zero vector, used
# as a way to list points; `limit=1000` caps the result, so a larger
# collection would be silently truncated — confirm the collection size.
payload = qdrant_client.query_points(
    collection_name="Amazon-items-collection-01-hybrid-search",
    query=dummy_vector,
    using="text-embedding-3-small",
    limit=1000,
    with_payload=["parent_asin"],
    with_vectors=False
)

In [None]:
# Inspect the returned points.
payload.points

In [None]:
# Collect the "parent_asin" payload value of every returned point.
parent_asin_list = [item.payload["parent_asin"] for item in payload.points]

In [None]:
# Inspect the collected parent ASINs.
parent_asin_list

In [None]:
# Number of parent ASINs retrieved (at most 1000, the query limit above).
len(parent_asin_list)

### Load Amazon Reviews Dataset

In [None]:
# Load the reviews file; it is read as JSON Lines (one JSON object per
# line), hence lines=True.
df_reviews = pd.read_json("../../data/Electornics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

In [None]:
# Preview the first rows of the loaded reviews.
df_reviews.head()

In [None]:
# Number of reviews loaded.
len(df_reviews)

In [None]:
# Keep only the reviews whose "parent_asin" appears in parent_asin_list.
# .copy() makes the result an independent DataFrame: later cells add columns
# to df_reviews_sample, and assigning into a boolean-mask slice of
# df_reviews would raise pandas' SettingWithCopyWarning.
df_reviews_sample = df_reviews[df_reviews["parent_asin"].isin(parent_asin_list)].copy()

In [None]:
# Number of reviews left after filtering by parent ASIN.
len(df_reviews_sample)

### Define functions to preprocess reviews data

In [None]:
def preprocess_reviews_data(row):
    """Build the text that is embedded for a single review.

    Args:
        row: Mapping-like review record (for example a DataFrame row)
            providing "title" and "text" entries.

    Returns:
        The title followed by the review text, separated by one space.
    """
    review_title = row["title"]
    review_body = row["text"]
    return "{} {}".format(review_title, review_body)

In [None]:
# Tokenizer that tiktoken associates with the "text-embedding-3-small" model.
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

In [None]:
# Quick check: encode a sample query and display the result.
encoding.encode("Can I gert some earphones?")

In [None]:
def token_count(row, model="text-embedding-3-small"):
    """Count the tokens in a row's preprocessed review text.

    Args:
        row: Mapping-like record providing a "preprocessed_data" entry.
        model: Model name used to look up the tiktoken encoding.

    Returns:
        The length of the encoded "preprocessed_data" value.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(row["preprocessed_data"])
    return len(token_ids)

In [None]:
# Add the text to embed: one string per review, built row by row with
# preprocess_reviews_data.
df_reviews_sample["preprocessed_data"] = df_reviews_sample.apply(preprocess_reviews_data, axis=1)

In [None]:
# Add the token count of each review's preprocessed text, computed row by
# row with token_count.
df_reviews_sample["preprocessed_data_token_count"] = df_reviews_sample.apply(token_count, axis=1)

In [None]:
# Preview the sample with the two new columns.
df_reviews_sample.head()

In [None]:
# Row count before the token-count filter below.
len(df_reviews_sample)

In [None]:
# Drop reviews whose preprocessed text has 8192 or more tokens.
# NOTE(review): presumably this guards the embedding model's per-input token
# limit — confirm against the model documentation.
df_reviews_sample = df_reviews_sample[df_reviews_sample["preprocessed_data_token_count"] < 8192]

In [None]:
# Row count after the token-count filter.
len(df_reviews_sample)

In [None]:
# Total number of tokens across all remaining reviews.
total_tokens = df_reviews_sample["preprocessed_data_token_count"].sum()

In [None]:
# Display the total token count.
total_tokens

### Create a new Qdrant collection for reviews

In [None]:
# Create the reviews collection only when it does not exist yet, so this
# cell can be re-run without failing on an already-existing collection.
if not qdrant_client.collection_exists("Amazon-items-collection-01-reviews"):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-01-reviews",
        # Single unnamed vector per point: 1536 dimensions, cosine distance.
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

In [None]:
# Index the "parent_asin" payload field as a keyword; the review search
# defined later in this notebook filters on this field.
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-01-reviews",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

### Embedding functions

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """Embed a single text with the OpenAI embeddings endpoint.

    Args:
        text: The text to embed; it is sent as a one-element input list.
        model: Name of the embedding model to use.

    Returns:
        The embedding of the first (and only) item in the response.
    """
    api_response = openai.embeddings.create(model=model, input=[text])
    first_item = api_response.data[0]
    return first_item.embedding

In [None]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    """Embed many texts, sending them to the OpenAI embeddings endpoint in batches.

    Args:
        text_list: Sequence of texts to embed.
        model: Name of the embedding model to use.
        batch_size: Maximum number of texts sent per request; must be positive.

    Returns:
        A list with the embeddings of all batches concatenated in request
        order (each batch in the order the API returned it). An empty
        input yields an empty list without any request being made.

    Raises:
        ValueError: If batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    total = len(text_list)
    if total == 0:
        # Nothing to embed; avoid sending a request with no inputs.
        return []

    # Progress is only reported when the input spans more than one batch.
    report_progress = total > batch_size

    all_embeddings = []
    for start in range(0, total, batch_size):
        batch = text_list[start:start + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend(item.embedding for item in response.data)
        if report_progress:
            # Clamp to the total so the final, shorter batch does not
            # report more items than exist.
            print(f"Processed {min(start + batch_size, total)} of {total}")

    return all_embeddings

### Embed the text and add additional fields to the payload of each vector for reviews

In [None]:
# One dict per review holding only the text to embed and its parent ASIN.
data_to_embed_reviews = df_reviews_sample[["preprocessed_data", "parent_asin"]].to_dict(orient="records")

In [None]:
# Inspect the records that will be embedded.
data_to_embed_reviews

In [None]:
# Just the texts, in the same order as data_to_embed_reviews.
text_to_embed_reviews = [data["preprocessed_data"] for data in data_to_embed_reviews]

In [None]:
# Inspect the texts that will be embedded.
text_to_embed_reviews

In [None]:
# Embed all review texts, 500 texts per request.
embeddings_reviews = get_embeddings_batch(text_to_embed_reviews, batch_size=500)

In [None]:
# Number of embeddings returned.
len(embeddings_reviews)

In [None]:
# Build one PointStruct per review, pairing each embedding with its source
# record. Point IDs are sequential integers starting at 1.
# zip() would silently drop trailing items if the two lists differed in
# length, so a mismatch is reported explicitly instead.
if len(embeddings_reviews) != len(data_to_embed_reviews):
    raise ValueError(
        f"Got {len(embeddings_reviews)} embeddings for "
        f"{len(data_to_embed_reviews)} reviews"
    )

pointstructs = [
    PointStruct(
        id=point_id,
        vector=embedding,
        payload={
            "text": data["preprocessed_data"],
            "parent_asin": data["parent_asin"],
        },
    )
    for point_id, (embedding, data) in enumerate(
        zip(embeddings_reviews, data_to_embed_reviews), start=1
    )
]

In [None]:
# Upload the points to the reviews collection in batches of
# batch_size_qdrant, printing progress after each batch.
batch_size_qdrant = 100
total_points = len(pointstructs)
for start in range(0, total_points, batch_size_qdrant):
    batch = pointstructs[start:start + batch_size_qdrant]
    qdrant_client.upsert(
        collection_name="Amazon-items-collection-01-reviews",
        wait=True,
        points=batch,
    )
    # Clamp to the total so the final, shorter batch does not report more
    # points than exist.
    print(f"Processed {min(start + batch_size_qdrant, total_points)} of {total_points}")

### A function to run search against reviews on a prefiltered set of product IDs

In [None]:
def retrieve_prefiltered_reviews_data(query, parent_asins, k=5):
    """Search the reviews collection, restricted to the given products.

    The query text is embedded with get_embedding, then used as a prefetch
    query filtered to points whose "parent_asin" is one of `parent_asins`.
    The prefetched candidates are combined with RRF fusion and the top `k`
    are returned.

    Args:
        query: Free-text search query.
        parent_asins: Parent ASINs the search is restricted to.
        k: Maximum number of results to return.

    Returns:
        The response of `qdrant_client.query_points`; the matches are
        available on its `points` attribute.
    """
    query_embedding = get_embedding(query)

    # Only reviews belonging to one of the requested products are candidates.
    asin_filter = Filter(
        must=[
            FieldCondition(
                key="parent_asin",
                match=MatchAny(any=parent_asins),
            )
        ]
    )

    # The final result can never contain more points than were prefetched,
    # so the candidate pool grows with k instead of staying fixed at 20.
    prefetch_limit = max(20, k)

    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-01-reviews",
        prefetch=[
            Prefetch(
                query=query_embedding,
                filter=asin_filter,
                limit=prefetch_limit,
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )

    return results

In [None]:
# Example: search reviews of a single product (default k=5).
reviews = retrieve_prefiltered_reviews_data("bad quality", ["B09WCFC5D9"])

In [None]:
# Inspect the matches for the single-product search.
reviews.points

In [None]:
# Example: search reviews across two products (default k=5).
reviews = retrieve_prefiltered_reviews_data("bad quality", ["B09WCFC5D9", "B0CF1WM24K"])

In [None]:
# Inspect the matches for the two-product search.
reviews.points

In [None]:
# Example: same two-product search, asking for up to 20 results.
reviews = retrieve_prefiltered_reviews_data("bad quality", ["B09WCFC5D9", "B0CF1WM24K"], k=20)

In [None]:
# Print each match's product ID and review text, followed by a divider line.
divider = "-" * 100
for hit in reviews.points:
    hit_payload = hit.payload
    print(hit_payload["parent_asin"])
    print(hit_payload["text"])
    print(divider)