
# Retrieval-Augmented Shopping Assistant - Embedding Experiments

In [1]:
# Generate Embeddings from embedding_input

!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 k

In [2]:
# Use the parquet file generated in EDA phase.
# The loaded frame already has an `embedding_vector` column (inspected in the next
# cell); a later cell recomputes the embeddings and overwrites that column.

import pandas as pd

inScopeMetadata = pd.read_parquet("/kaggle/input/abo-english-metadata-parquet/inScopeMetadata_with_embeddings.parquet")

In [3]:
# Check how the stored embeddings were deserialized: show the Python type of the
# first row's `embedding_vector` before resuming.

type(inScopeMetadata['embedding_vector'].iloc[0])


numpy.ndarray

In [8]:
# (rows, columns) of the loaded metadata frame.
print(inScopeMetadata.shape)

(122734, 41)


In [9]:
# Display one randomly chosen row to eyeball the raw and `*_flat` columns.
inScopeMetadata.sample(1)

Unnamed: 0,brand,bullet_point,color,item_id,item_name,item_weight,material,model_name,model_number,product_type,...,item_name_flat,item_weight_flat,material_flat,model_name_flat,model_number_flat,product_type_flat,item_keywords_flat,node_flat,embedding_input,embedding_vector
64145,"[{'language_tag': 'en_IN', 'value': 'Amazon Br...","[{'language_tag': 'en_IN', 'value': '3D Printe...","[{'language_tag': 'en_IN', 'standardized_value...",B07TRX487W,"[{'alternate_representations': None, 'language...","[{'normalized_value': {'unit': 'pounds', 'valu...",,"[{'language_tag': 'en_IN', 'value': 'Oppo F11 ...",[{'value': 'gz8637-SL40780'}],[{'value': 'CELLULAR_PHONE_CASE'}],...,Amazon Brand - Solimo Designer Multicolor Circ...,50,,Oppo F11 Pro,gz8637-SL40780,CELLULAR_PHONE_CASE,"mobile cover, back cover, mobile case, phone c...",,Amazon Brand - Solimo Designer Multicolor Circ...,"[-0.112759896, 0.02776274, 0.032392133, -0.028..."


In [10]:
# Count rows per item_id and keep only the ids that occur more than once.
duplicate_counts = (
    inScopeMetadata['item_id']
    .value_counts()
    .loc[lambda counts: counts > 1]
)
print(duplicate_counts)

item_id
B07WC622LH    28
B0746MMVXW    28
B07797D9MW    27
B073S3R169    26
B01928HSB4    26
              ..
B07GN662BP     7
B07RRWD8QB     7
B07HFTZ8YW     7
B07RR3RFHT     7
B01719EX2S     7
Name: count, Length: 9225, dtype: int64


In [12]:
!pip install faiss-cpu --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [19]:
# Load the sentence-embedding model and encode every `embedding_input` string.

from sentence_transformers import SentenceTransformer

# Load the model (compact + effective)
model = SentenceTransformer('all-MiniLM-L12-v2')

# Generate dense embeddings. This model yields 384-dimensional vectors (see the
# matrix shape printed a few cells below), not 512.
# `convert_to_numpy=True` asks for NumPy output.
embedding_list = model.encode(
    inScopeMetadata['embedding_input'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True
)

Batches:   0%|          | 0/3836 [00:00<?, ?it/s]

In [20]:
# Attach to the dataframe

import numpy as np

# Store one embedding per row in `embedding_vector`, replacing whatever that
# column held when the parquet file was loaded.
inScopeMetadata['embedding_vector'] = list(embedding_list)

In [21]:
import numpy as np

# Stack the per-row embeddings into one 2-D matrix (one matrix row per metadata row).
embedding_matrix = np.vstack(embedding_list)

In [22]:
# (number of embedded rows, embedding width).
print(embedding_matrix.shape)

(122734, 384)


In [23]:
# Create vector index

import faiss

# The index dimensionality must equal the embedding width (number of matrix columns).
dim = embedding_matrix.shape[1]
faiss_index = faiss.IndexFlatL2(dim)  # flat index using L2 distance
faiss_index.add(embedding_matrix)  # vectors are added in matrix-row order; later lookups rely on ids matching row positions

In [14]:
# Persist the embeddings and the FAISS index so a later session can reload them
# instead of re-encoding every row.
np.save("embedding_matrix.npy", embedding_matrix)
# embedding_matrix = np.load("embedding_matrix.npy")
# The index object built above is named `faiss_index`; no `index` variable exists.
faiss.write_index(faiss_index, "faiss_index.index")
# faiss_index = faiss.read_index("faiss_index.index")

With the files saved above, we now have:

faiss_index.index → FAISS binary index
inScopeMetadata_with_embeddings.parquet → all metadata + embedding inputs
embedding_matrix.npy → optional fallback

In [24]:

# Show full cell contents and every column when displaying DataFrames.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [25]:
import numpy as np
import faiss
import pandas as pd
from collections import OrderedDict

def rerank_faiss_results(query_embedding, faiss_index, metadata_df, top_k=50, final_k=10):
    """
    Search the FAISS index and keep the closest hit for each distinct item_id.

    query_embedding: 1-D query embedding (e.g. shape (384,) from all-MiniLM-L12-v2);
        a (1, dim) array is accepted as well
    faiss_index: FAISS index object
    metadata_df: DataFrame with a column 'item_id' aligned by FAISS vector index
        (row position i describes vector id i)
    top_k: how many closest vectors to fetch initially
    final_k: maximum number of unique item_ids to return

    Returns a list of dicts with keys 'item_id', 'index', 'distance' and
    'metadata' (the full row as a dict), in the order the search returned them.
    Fewer than final_k entries come back when the top_k hits contain fewer
    distinct item_ids; an empty list is returned when top_k or final_k is <= 0.
    """
    if top_k <= 0 or final_k <= 0:
        return []

    # The search expects a 2-D float32 batch; reshape(1, -1) handles both
    # (dim,) and (1, dim) inputs.
    query_batch = np.asarray(query_embedding, dtype='float32').reshape(1, -1)

    # Get top_k nearest embeddings (one result row because the batch has one query)
    distances, indices = faiss_index.search(query_batch, top_k)

    # Track top unique items, preserving the order in which they were found
    seen_item_ids = OrderedDict()

    for dist, idx in zip(distances[0], indices[0]):
        # FAISS fills unused result slots with -1 when it has fewer than top_k
        # vectors. `.iloc[-1]` would silently return the LAST row, so skip them.
        if idx < 0:
            continue

        row = metadata_df.iloc[idx]
        item_id = row['item_id']
        if item_id in seen_item_ids:
            # A closer hit for this item was already recorded.
            continue

        seen_item_ids[item_id] = {
            'item_id': item_id,
            'index': idx,
            'distance': dist,
            'metadata': row.to_dict()
        }
        if len(seen_item_ids) >= final_k:
            break

    return list(seen_item_ids.values())

In [26]:
# List every column name, raw and `*_flat`, available for building prompts.
print(inScopeMetadata.columns.tolist())

['brand', 'bullet_point', 'color', 'item_id', 'item_name', 'item_weight', 'material', 'model_name', 'model_number', 'product_type', 'main_image_id', 'other_image_id', 'item_keywords', 'country', 'marketplace', 'domain_name', 'node', 'style', 'item_dimensions', 'model_year', 'color_code', 'spin_id', '3dmodel_id', 'fabric_type', 'item_shape', 'pattern', 'product_description', 'finish_type', 'brand_flat', 'bullet_point_flat', 'color_flat', 'item_name_flat', 'item_weight_flat', 'material_flat', 'model_name_flat', 'model_number_flat', 'product_type_flat', 'item_keywords_flat', 'node_flat', 'embedding_input', 'embedding_vector']


In [30]:
from sentence_transformers import SentenceTransformer

# Encode a sample query with the same model name used for the indexed vectors,
# then retrieve the closest distinct products.
model = SentenceTransformer('all-MiniLM-L12-v2')
query = "Shoe"
query_embedding = model.encode(query)

top_results = rerank_faiss_results(query_embedding, faiss_index, inScopeMetadata, top_k=50, final_k=10)

# One line per result: item id, distance from the index search, flattened item name.
# Fewer than final_k lines appear when the top_k hits collapse to fewer distinct item_ids.
for res in top_results:
    print(res['item_id'], res['distance'], res['metadata']['item_name_flat'])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

B07D451GP9 0.7125936 Amazon Brand - find. Women's Ankle boots
B07WKBW4RR 0.78247225 find. Women's Lace-s-2-46 Open Toe Sandals
B07KMRBT8Q 0.7946671 find. Round Toe Block Heel Leather Court, Women’s Closed-Toe Pumps
B0812BSW74 0.79647666 find. Men's Cupsole Boat Shoe, Blue Navy Suede, women 2


In [39]:
from openai import OpenAI

def openai_chat_completion(prompt, model="gpt-4", temperature=0.7, max_tokens=300):
    """
    Send `prompt` as a single user message to the OpenAI Chat Completions API
    and return the assistant's reply text.

    prompt: content of the user message
    model: chat model name forwarded to the API
    temperature: sampling temperature forwarded to the API
    max_tokens: completion-length limit forwarded to the API

    Returns the reply with surrounding whitespace stripped ('' when the API
    returns no text content). Errors raised by the client propagate to the caller.
    """
    # The client reads its credentials from the environment (OPENAI_API_KEY).
    client = OpenAI()

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )

    # The v1 client returns typed objects rather than dicts, so use attribute
    # access; `content` may be None, hence the fallback to an empty string.
    content = response.choices[0].message.content
    return (content or "").strip()

In [36]:
def generate_image_url(main_image_id, base_url="https://amazon-berkeley-objects.s3.amazonaws.com/images/"):
    """
    Build the URL of a product's main image.

    main_image_id: image identifier; None, NaN or a blank string means "no image"
    base_url: prefix the id is appended to. Update this to your actual image
        bucket/path.

    Returns "<base_url><main_image_id>.jpg", or "" when no id is available.
    NOTE(review): assumes images sit directly under base_url as <id>.jpg;
    confirm against the real bucket layout.
    """
    # `x != x` is only true for NaN, which is how pandas marks a missing id.
    if main_image_id is None or main_image_id != main_image_id:
        return ""
    if not str(main_image_id).strip():
        return ""
    return f"{base_url}{main_image_id}.jpg"

In [37]:
def _prompt_text(value):
    """Render one metadata value as prompt text; missing values become ''."""
    if hasattr(value, "tolist"):
        # numpy arrays / scalars -> plain Python lists / scalars
        value = value.tolist()
    if value is None:
        return ""
    if isinstance(value, float) and value != value:  # NaN marks a missing value
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, dict):
        # Raw (non-flattened) columns hold dicts such as
        # {'language_tag': ..., 'value': ...}; only the value is useful in a prompt.
        return _prompt_text(value["value"]) if "value" in value else str(value)
    if isinstance(value, (list, tuple)):
        parts = [_prompt_text(entry) for entry in value]
        return "; ".join(part for part in parts if part)
    return str(value)


def rag_response(query, faiss_index, model, metadata_df, top_k=50, final_k=5):
    """
    Answer a shopping query with retrieval-augmented generation.

    query: free-text user query
    faiss_index: FAISS index passed to rerank_faiss_results
    model: embedding model exposing .encode(query)
    metadata_df: product metadata passed to rerank_faiss_results
    top_k: number of nearest vectors to fetch
    final_k: maximum number of distinct products placed in the prompt

    Returns the text produced by openai_chat_completion for the built prompt.
    Missing metadata values (None/NaN) are rendered as empty strings rather than
    the literal text "None"/"nan", and raw list-of-dict values are reduced to
    their 'value' entries.
    """
    # Embed query
    query_embedding = model.encode(query)

    # Retrieve top results
    top_items = rerank_faiss_results(query_embedding, faiss_index, metadata_df, top_k=top_k, final_k=final_k)

    # Build context
    context_snippets = []
    for item in top_items:
        meta = item['metadata']

        # Only build a URL when an image id is actually present.
        image_id = _prompt_text(meta.get('main_image_id'))
        image_url = generate_image_url(image_id) if image_id else ''

        # Fall back to the bullet points when the description is missing or empty.
        description = (
            _prompt_text(meta.get('product_description'))
            or _prompt_text(meta.get('bullet_point_flat'))
        )

        snippet = (
            f"Item: {_prompt_text(meta.get('item_name_flat'))}\n"
            f"Brand: {_prompt_text(meta.get('brand_flat'))}\n"
            f"Color: {_prompt_text(meta.get('color_flat'))}\n"
            f"Material: {_prompt_text(meta.get('material_flat'))}\n"
            f"Style: {_prompt_text(meta.get('style'))}\n"
            f"Description: {description}\n"
            f"Image: {image_url}\n"
        )
        context_snippets.append(snippet.strip())

    context = "\n\n".join(context_snippets)

    # Prompt
    prompt = f"""
You are a shopping assistant. Based on the following product data, recommend options to answer the query: "{query}"

{context}

List the product name, brand, and one-line summary for each. Do not hallucinate or invent items.
"""

    return openai_chat_completion(prompt)

In [41]:
import os

# Only fill in the key when the environment does not already provide one, so a
# real key is never overwritten by this placeholder. Replace "sk-..." locally and
# never commit a real key to the notebook.
os.environ.setdefault("OPENAI_API_KEY", "sk-...")

In [42]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L12-v2')
query = "Shoe"

# rag_response returns the LLM's answer as a single string, not a list of
# result dicts, so print it directly instead of iterating over it.
answer = rag_response(query, faiss_index, model, inScopeMetadata, top_k=50, final_k=10)

print(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}