# Retrieval-Augmented Shopping Assistant - EDA

This notebook explores the Amazon Berkeley Objects (ABO) dataset to gather initial insights.

In [None]:
import pandas as pd

In [None]:
# Read the ABO image metadata CSV into a DataFrame.
images_csv_path = "/kaggle/input/amazon-berkeley-objects/images/metadata/images.csv"
df_img = pd.read_csv(images_csv_path)

In [None]:
# 1. Check structure
print(df_img.shape)
print(df_img.columns)
print(df_img.dtypes)

# 2. Check missing values
# Note: this prints the number of NON-null entries per column; compare it
# with the row count printed above to see how many values are missing.
print(df_img.notnull().sum())

# 3. Sample record
# A bare expression is only rendered when it is the last statement of a cell,
# so the sample has to be printed explicitly here.
print(df_img.sample(5))

# 4. (If possible) display an image
from PIL import Image
import matplotlib.pyplot as plt

sample_rel_path = '8c/8ccb5859.jpg'
matching_paths = df_img.loc[df_img['path'] == sample_rel_path, 'path']
if matching_paths.empty:
    raise ValueError(f"No row with path {sample_rel_path!r} found in df_img")
# Read the cell value directly instead of parsing the Series' printed repr,
# which depends on display settings and silently yields garbage on no match.
img_loc = matching_paths.iloc[0]
img_path = '/kaggle/input/amazon-berkeley-objects/images/small/' + img_loc  # relative path taken from the 'path' column
img = Image.open(img_path)
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
# Load ABO metadata dataset

import os, glob

# Read every listings file (JSON Lines: one record per line) into its own
# frame and concatenate ONCE at the end. Concatenating inside the loop
# re-copies all previously loaded rows on every iteration.
metadata_frames = []
# sorted() makes the row order independent of the filesystem's listing order.
for json_file in sorted(glob.glob("/kaggle/input/listing/listings/metadata/*.json")):
    print('Loading file: ' + json_file + '\n')
    df_metadata = pd.read_json(json_file, lines = True)
    metadata_frames.append(df_metadata)

# ignore_index=True yields a single unique 0..n-1 index; without it every
# file contributes its own 0..k-1 labels, so labels repeat across files and
# no longer identify a row. With no input files the result is an empty frame.
if metadata_frames:
    full_metadata = pd.concat(metadata_frames, ignore_index=True)
else:
    full_metadata = pd.DataFrame()

In [None]:
# 1. Check structure: dimensions, then column labels, then per-column dtypes
for structure_view in (full_metadata.shape, full_metadata.columns, full_metadata.dtypes):
    print(structure_view)

In [None]:
# 2. Check missing values (reported as the number of non-null entries per column)
non_null_counts = full_metadata.notna().sum()
print(non_null_counts)

From the counts above, the "finish_type" column has the fewest values present: only 1,536 of the 147,702 product entries.

In [None]:
# Inspect the listing row(s) recorded for one specific item id.
is_target_item = full_metadata['item_id'] == 'B07TGZZMDK'
print(full_metadata.loc[is_target_item])

In [None]:
# 3. Sample record
# Lift pandas' display limits on column width and column count so that
# nested cell contents are shown in full.
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)
#full_metadata.sample(5)

From the output above, it is clear that:
1. **item_name** is the only column that is populated for every product.
2. We will therefore use **item_name** to filter for the in-scope language, English.

In [None]:
# 4. Display an image using image id from the metadata

from PIL import Image
import matplotlib.pyplot as plt

target_image_id = '81iZlv3bjpL'

# Listing row(s) whose main image is the target image.
product_rows = full_metadata.loc[full_metadata['main_image_id'] == target_image_id]
print(product_rows)
if product_rows.empty:
    raise ValueError(f"No listing with main_image_id {target_image_id!r} found in full_metadata")
img_id = product_rows.iloc[0]['main_image_id']

# Resolve the image id to its relative file path through the image index.
image_paths = df_img.loc[df_img['image_id'] == img_id, 'path']
if image_paths.empty:
    raise ValueError(f"No image with image_id {img_id!r} found in df_img")
# Read the cell value directly instead of parsing the Series' printed repr,
# which depends on display settings and silently yields garbage on no match.
img_location = image_paths.iloc[0]
img_path = '/kaggle/input/amazon-berkeley-objects/images/small/' + img_location
img = Image.open(img_path)
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
# Analyse which language tags occur in item_name (the English variants are selected in the following cells).
# Every entry of every product is inspected, not just the first one, so the
# set agrees with the row filter applied later, which also checks all entries.
lang_set = set()
for name_entries in full_metadata['item_name']:
    # Skip missing / malformed values instead of failing on them.
    if not isinstance(name_entries, list):
        continue
    for entry in name_entries:
        if isinstance(entry, dict) and 'language_tag' in entry:
            lang_set.add(entry['language_tag'])
print(lang_set)

Of these, only the English-language tags are in scope, i.e.:
en_SG,
en_CA,
en_AU,
en_GB, 
en_AE, 
en_US,
en_IN

In [None]:
# Measure the share of products in scope, i.e. products with at least one
# English-tagged item_name entry (same rule as the row filter in the next cell).
english_language_tags = {'en_SG', 'en_CA','en_AU','en_GB','en_AE','en_US','en_IN'}

countTotalProdDesc = 0
countTotalEngDesc = 0
for name_entries in full_metadata['item_name']:
    # Count every product exactly once. Counting inside a loop over the keys
    # of an entry would inflate the total and deflate the percentage.
    countTotalProdDesc += 1
    if isinstance(name_entries, list) and any(
        isinstance(entry, dict) and entry.get('language_tag') in english_language_tags
        for entry in name_entries
    ):
        countTotalEngDesc += 1

print('countTotalEngDesc: ', countTotalEngDesc)
print('countTotalProdDesc: ', countTotalProdDesc)
# Guard against an empty metadata frame (no products loaded).
if countTotalProdDesc:
    print('Percentage of data under scope:', countTotalEngDesc/countTotalProdDesc * 100)
else:
    print('Percentage of data under scope:', 0.0)

In [None]:
# Reduce the metadata to in scope languages only

valid_languages = {'en_SG', 'en_CA','en_AU','en_GB','en_AE','en_US','en_IN'}

# Step 1: Store the POSITIONS of rows that match the language criteria.
# Positions (via enumerate) are required because Step 2 selects with .iloc,
# which is positional. Index labels from iterrows() are not interchangeable
# with positions: after concatenating several files without resetting the
# index, labels repeat, so using them with .iloc picks the wrong rows.
matching_indices = [
    position
    for position, item_name in enumerate(full_metadata['item_name'])
    if isinstance(item_name, list)
    and any(
        isinstance(entry, dict) and entry.get('language_tag') in valid_languages
        for entry in item_name
    )
]

# Step 2: Filter all at once using .iloc
inScopeMetadata = full_metadata.iloc[matching_indices].reset_index(drop=True)

In [None]:
def auto_flatten_json_columns(df, keys_to_try=('value', 'name')):
    """
    Detects and flattens columns containing lists of JSON objects,
    extracting specified keys.
    Adds new columns with a `_flat` suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. It is modified in place: one `<col>_flat` column is
        added (or overwritten) for every detected column.
    keys_to_try : sequence of str
        Candidate keys in priority order. For each cell, the first key that
        occurs in at least one of its dicts is used; all values found under
        that key are converted to `str` and joined with ", ".

    Returns
    -------
    tuple
        `(df, flattened)`: the same DataFrame object and the list of
        `_flat` column names that were written.

    Notes
    -----
    A column is detected from its first non-empty list value rather than
    from row 0, so a missing value or empty list in the first row does not
    hide the column. An empty frame yields no flattened columns.
    """
    def extract_from_list(ld, keys):
        # Non-list cells (e.g. missing values) flatten to None.
        if isinstance(ld, list):
            for key in keys:
                values = [str(d.get(key)) for d in ld if isinstance(d, dict) and key in d]
                if values:  # found at least one valid value
                    return ", ".join(values)
        return None

    # Track flattened columns
    flattened = []

    # Snapshot the labels because new columns are added while looping.
    for col in list(df.columns):
        # First informative value of the column; None when there is none
        # (empty frame, all-missing column, or a column without lists).
        sample = next((v for v in df[col] if isinstance(v, list) and v), None)
        if sample is not None and all(isinstance(i, dict) for i in sample):
            flat_col = f"{col}_flat"
            df[flat_col] = df[col].apply(lambda x: extract_from_list(x, keys_to_try))
            flattened.append(flat_col)

    return df, flattened

In [None]:
# Report the (rows, columns) size of the in-scope subset.
subset_shape = inScopeMetadata.shape
print(subset_shape)

In [None]:
# Flatten the list-of-dict columns; the helper returns the frame together
# with the names of the `_flat` columns it wrote.
flatten_result = auto_flatten_json_columns(inScopeMetadata)
inScopeMetadata, flattened_cols = flatten_result
print("Flattened columns:", flattened_cols)


In [None]:
# List every column currently present in the in-scope frame.
available_columns = inScopeMetadata.columns
print(available_columns)

In [None]:
# Construct the Embedding Input Text

embedding_cols = [
    'item_name_flat',
    'brand_flat',
    'product_type_flat',
    'material_flat',
    'bullet_point_flat',
    'color_flat',
    'item_keywords_flat'
]

# A `_flat` column only exists when the flattening step detected its source
# column, so report absent ones and treat them as empty instead of failing
# with a KeyError.
missing_embedding_cols = [col for col in embedding_cols if col not in inScopeMetadata.columns]
if missing_embedding_cols:
    print('Embedding columns not found (treated as empty):', missing_embedding_cols)

# reindex() keeps the requested column order and creates all-missing columns
# for the absent names; missing values then become empty strings.
embedding_fields = inScopeMetadata.reindex(columns=embedding_cols).fillna('')

# Join only the non-empty fields so that missing values do not leave runs of
# separator spaces in the text handed to the encoder.
inScopeMetadata['embedding_input'] = [
    ' '.join(part for part in row if part)
    for row in embedding_fields.itertuples(index=False, name=None)
]

In [None]:
# Count the non-null entries in every column (shown as the cell's result).
inScopeMetadata.notna().sum()

In [None]:
# Generate Embeddings from embedding_input

!pip install -U sentence-transformers

In [None]:
#Load model and encode

from sentence_transformers import SentenceTransformer

# Compact general-purpose sentence encoder
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)

# Encode one dense vector per product text, returned as a NumPy array
# (all-MiniLM-L6-v2 produces 384-dimensional embeddings).
product_texts = inScopeMetadata['embedding_input'].tolist()
embedding_list = model.encode(
    product_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

In [None]:
# Attach to the dataframe

import numpy as np

# Store each product's embedding as its own array in a new column
inScopeMetadata['embedding_vector'] = [row_vector for row_vector in embedding_list]