Use the Qdrant client. Qdrant is a vector database that stores embeddings as vectors.

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

import pandas as pd
import openai

Read the sampled dataset with Amazon inventory data

In [None]:
# Load the sampled Amazon Electronics metadata; the file is JSON Lines
# (one JSON record per line), hence lines=True.
df_items = pd.read_json(
    "../../../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl",
    lines=True,
)


In [None]:
# Preview the first five rows to check which columns were loaded.
df_items.head(n=5)

Examine the first of the head records to see its feature description

In [None]:
# First (index, features) pair, read lazily: no need to build a list of
# every row just to look at element 0.
next(iter(df_items["features"].items()))

Check the images as well for the first item

In [None]:
# First (index, images) pair, read lazily: no need to build a list of
# every row just to look at element 0.
next(iter(df_items["images"].items()))

Function to concatenate the title and the features into a single description string

In [None]:
def preprocess_description(row):
    """Build the text to embed for one item: its title followed by its features.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``title`` entry
            and a ``features`` entry holding an iterable of feature strings.

    Returns:
        ``"<title> <feature 1> <feature 2> ..."`` with no leading or trailing
        whitespace. A missing title or feature list (None / NaN) contributes
        nothing instead of raising or producing the text "None"/"nan".
    """
    title = row["title"]
    if title is None or isinstance(title, float):  # None or NaN from pandas
        title = ""

    features = row["features"]
    if features is None or isinstance(features, float):  # None or NaN from pandas
        features = []
    elif isinstance(features, str):
        # A bare string would otherwise be joined character by character.
        features = [features]

    joined_features = " ".join(str(item) for item in features if item is not None)
    # strip() removes the dangling separator when either side is empty.
    return f"{title} {joined_features}".strip()

Function to extract first large image from list of images

In [None]:
def extract_first_large_image(row):
    """Return the ``large`` URL of the item's first image, or ``""`` if none.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an ``images``
            entry holding a sequence of dicts, one per image.

    Returns:
        The value stored under ``"large"`` in the first image dict. An empty
        string is returned when the image list is missing or empty, when the
        first entry is not a dict, or when it has no usable ``"large"`` value,
        so the result is always a string.
    """
    images = row["images"]
    # None / NaN (missing) or an empty list: there is no first image to read.
    if images is None or isinstance(images, float) or len(images) == 0:
        return ""

    first_image = images[0]
    if not isinstance(first_image, dict):
        return ""
    # `or ""` also covers an explicit None stored under "large".
    return first_image.get("large") or ""

Add new columns to df_items for the description and the image

In [None]:
# Derive both helper columns row by row (axis=1) and attach them in one step.
df_items = df_items.assign(
    description=df_items.apply(preprocess_description, axis=1),
    image=df_items.apply(extract_first_large_image, axis=1),
)

In [None]:
# Preview again to confirm the new `description` and `image` columns are present.
df_items.head(n=5)

In [None]:
# First (index, description) pair, read lazily: no need to build a list of
# every row just to look at element 0.
next(iter(df_items["description"].items()))

In [None]:
# Image value of the first row, by position.
df_items["image"].iloc[0]

Sample 50 items from the dataset

In [None]:
# Draw 50 rows; the fixed random_state makes the sample reproducible across runs.
df_sample = df_items.sample(n=50, random_state=42)

In [None]:

# Row count of the sample (expected: 50).
df_sample.shape[0]

Take the specific columns that we are interested in and convert them into records. Each element of the resulting list is a dictionary

In [None]:
# Keep the text to embed plus the metadata columns, then turn each row into a
# plain dict (one dict per item).
data_to_embed = (
    df_sample
    .loc[:, ["description", "image", "rating_number", "price", "average_rating", "parent_asin"]]
    .to_dict(orient="records")
)


In [None]:
# Show only the first few records; printing all 50 floods the notebook output.
data_to_embed[:3]

Test the embedding API with a random text

In [None]:
# Smoke-test the embeddings endpoint with a throwaway string before wrapping
# the call in a helper function.
response = openai.embeddings.create(model="text-embedding-3-small", input="Random text")


In [None]:
# Length of the returned embedding vector; compare it with the `size` passed
# to VectorParams when the collection is created further down.
len(response.data[0].embedding)

In [None]:
# Show the response metadata only; displaying the whole object would print the
# full embedding vector as well.
response.model, response.usage

In [None]:
# First few components are enough to see the shape of the data; the full
# vector is very long.
response.data[0].embedding[:5]

Create an embedding function for a text that returns the actual embedding vector

In [None]:

def get_embedding(text, model="text-embedding-3-small"):
    """Embed ``text`` through the OpenAI embeddings API.

    Args:
        text: Input passed as ``input`` to ``openai.embeddings.create``.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    api_result = openai.embeddings.create(model=model, input=text)
    first_item = api_result.data[0]
    return first_item.embedding



In [None]:
# Quick check of the helper; show only a prefix instead of the whole vector.
get_embedding("Venkatadri")[:5]

Define Qdrant Database and collection

In [None]:
# Client for the Qdrant HTTP endpoint at localhost:6333. A Qdrant server must
# be reachable at that address for the calls in the following cells to work.
qdrant_client = QdrantClient(url="http://localhost:6333")

In [None]:
# Create the collection only when it is missing, so re-running this cell (or
# the whole notebook) does not fail on an already-existing collection.
# Vectors are declared with 1536 dimensions and compared by cosine distance.
if not qdrant_client.collection_exists(collection_name="Amazon-items-collection-00"):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-00",
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

In [None]:
# The collection is already created by the previous cell; calling
# create_collection a second time with the same name raises an error.
# Inspect the existing collection's configuration instead.
qdrant_client.get_collection(collection_name="Amazon-items-collection-00")


Embed data

In [None]:
# Build one throwaway point by hand to see what a PointStruct looks like:
# an id, an embedding vector and a free-form payload dict.
pointstruct = PointStruct(
    id=0,
    vector=get_embedding("Venkatadri Ganesan"),
    payload=dict(text="Test text", model="text-embedding-3-small"),
)

In [None]:
# Summarise the point instead of printing its full vector.
pointstruct.id, pointstruct.payload, len(pointstruct.vector)

Amazon Data

In [None]:
# One point per sampled item: the description is embedded to form the vector,
# the whole record is stored as payload, and the list position is the id.
pointstructs = [
    PointStruct(
        id=position,
        vector=get_embedding(record["description"]),
        payload=record,
    )
    for position, record in enumerate(data_to_embed)
]


In [None]:
# Compact preview (id, parent_asin, vector length) of the first few points;
# printing every point would dump all of their full-length vectors.
[(point.id, point.payload["parent_asin"], len(point.vector)) for point in pointstructs[:3]]

In [None]:
# One point is built per record in data_to_embed, so this should equal the
# sample size (50).
len(pointstructs)

Write embedded data to Qdrant collection

In [None]:
# Write all points to the collection; wait=True asks the client to wait for
# the operation to finish before returning.
qdrant_client.upsert(
    collection_name="Amazon-items-collection-00",
    points=pointstructs,
    wait=True,
)

Define a function for data retrieval (k is the number of nearest items returned by the search)

In [None]:
def retrieve_data(query, k=5, collection_name="Amazon-items-collection-00"):
    """Return the stored items closest to ``query`` in embedding space.

    Args:
        query: Free-text search string; it is embedded with ``get_embedding``.
        k: Maximum number of points to return (passed as ``limit``).
        collection_name: Qdrant collection to search. Defaults to the
            collection populated earlier in this notebook, so existing calls
            keep working unchanged.

    Returns:
        The object returned by ``qdrant_client.query_points``; the matched
        points are read from its ``.points`` attribute.
    """
    query_embedding = get_embedding(query)
    return qdrant_client.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=k,
    )

Test retrieval

In [None]:
# Ask for the 10 nearest items to a sample question and show the matches
# found on the `.points` attribute of the query result.
retrieve_data("What kind of charging cords do you offer?", k=10).points

