## Import libraries

In [1]:
import faiss
import numpy as np
import pandas as pd
from read_s3 import read_s3_file

## Process data

In [None]:
BATCH_SIZE = 50000
BATCH_NUMBER = 10
BATCH_COUNT = 0
BATCH_COUNT_LIMIT = 10

df = pd.DataFrame(columns=["parent_asin", "embedding"])

# append all batches into one file
for i in range(BATCH_COUNT_LIMIT):
    key = f"data_vectors/file_0_7M_{BATCH_COUNT * BATCH_SIZE}_{(BATCH_COUNT + 1) * BATCH_SIZE - 1}"

    data = read_s3_file(key)

    if BATCH_COUNT == 0:
        df = data
    else:
        df = pd.concat([df, data], ignore_index=True)
    BATCH_COUNT += 1

print(df.shape)


(500000, 2)


In [None]:
## unpack embeddings into matrix

is_valid = df['embedding'].apply(lambda x: isinstance(x, np.ndarray) and x.shape == (384,))
df_clean = df[is_valid]

matrix = np.stack(df_clean['embedding'].values).astype('float32')

print(matrix.shape)
# (499971, 384)

(499971, 384)


## Initialize Index using FAISS

In [4]:
index = faiss.IndexFlatL2(384)
index.add(matrix)

## Query

In [None]:
from sentence_transformers import SentenceTransformer

# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# example
query = "soften and brighten your skin with this vitamin C serum"

# convert query to vector
query_embedding = model.encode(query, convert_to_numpy=True).astype('float32')

# number of results to retrieve
k = 20

## Search
D, I = index.search(query_embedding.reshape(1, -1), k)

print("Distances:", D)
print("Indices:", I)

# Retrieve the top-k results
top_k_results = df_clean.iloc[I[0]]
print("Top-k results:")
print(top_k_results[['parent_asin', 'embedding']])

  from .autonotebook import tqdm as notebook_tqdm


Distances: [[0.3288989  0.3613476  0.38152355 0.3880829  0.39246958 0.39396772
  0.39432865 0.40481067 0.4073006  0.40756983 0.41131586 0.42026377
  0.42201316 0.42642474 0.4339104  0.43516773 0.43556416 0.44140053
  0.44345325 0.44444698]]
Indices: [[210587  70315  45819 437083 234842 149415  39628 139357 313286 457380
  496676 106026   1519 437368 342703 127453 481490 123321 453238  63114]]
Top-k results:
       parent_asin                                          embedding
210594  B0BCNMB5FX  [-0.034809195, 0.006670833, 0.006502685, 0.072...
70316   B003L8RVZ0  [-0.07871929, -0.009345709, -0.012235099, 0.05...
45819   B01DSV2DZW  [-0.06418097, 0.00893156, -0.018984204, 0.0595...
437108  B0BC6WBLNL  [-0.014649218, -0.035720766, -0.01795996, 0.10...
234849  B0C548K7T5  [-0.07057112, 0.012313232, -0.022379879, 0.023...
149421  B08QFM3ZNR  [-0.0701939, -0.03036184, -0.010968303, 0.0493...
39628   B07X37B5V6  [-0.06915631, -0.013314929, -0.03038595, 0.068...
139362  B08CV94WQR  [-0.03131

## Validate

In [None]:
# retrieve review data
key_new = "data/beauty_personal_care_0_7000000.parquet"

validate_data = read_s3_file(key_new)

validate_data = validate_data[validate_data['parent_asin'].isin(top_k_results['parent_asin'])]

# print all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [7]:
# retrieve meta data for product names
metadata_key = 'meta_data.parquet'

metadata = read_s3_file(metadata_key)

# top k parent_asins
top_k_parent_asins = top_k_results['parent_asin'].unique()

# search relevant metadata
metadata_top_k = metadata[metadata['parent_asin'].isin(top_k_parent_asins)]

# calculate weighted average rating using bayesian average
# https://www.algolia.com/doc/guides/managing-results/must-do/custom-ranking/how-to/bayesian-average/
# C (confidence number): 25% percentile
C = metadata_top_k['average_rating'].quantile(0.25)
# m: smoothing factor -- median of rating_number
m = metadata_top_k['rating_number'].median()

metadata_top_k['bayes_average_rating'] = (
    (metadata_top_k['average_rating'] * metadata_top_k['rating_number'] + C * m) /
    (metadata_top_k['rating_number'] + m)
)

# sort by bayesian average rating and return top 5
metadata_top_k_sorted = metadata_top_k.sort_values(by='bayes_average_rating', ascending=False)
metadata_top_k_sorted.head(5)

# print(C)
# print(top_k_parent_asins)
# metadata_top_k


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_top_k['bayes_average_rating'] = (


Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,bayes_average_rating
151948,All Beauty,"TruSkin Vitamin C-Plus Super Serum, Anti Aging...",4.5,11546,[PREMIUM AGE DEFENSE SERUM – Powered by a syne...,[],24.79,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Our Point of View on the TruSkin V...,TruSkin Naturals,"[Beauty & Personal Care, Skin Care, Face, Trea...","{""Skin Type"": ""All"", ""Product Benefits"": ""Anti...",B0BNR35C6C,,,,4.492439
182641,Premium Beauty,Mario Badescu Vitamin C Serum for All Skin Typ...,4.5,1288,[],[],45.0,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['Why I Love This Vitamin C Serum', ...",,"[Beauty & Personal Care, Skin Care, Face, Trea...","{""Skin Type"": ""All"", ""Product Benefits"": ""Brig...",B003L8RVZ0,,,,4.442818
259042,All Beauty,Poppy Austin 60mL Face Serum- Pure Hyaluronic ...,4.4,11729,[],[],19.99,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Vitamin C Serum - Try On and Revie...,poppy austin,"[Beauty & Personal Care, Skin Care, Face, Trea...","{""Skin Type"": ""Sensitive Skin,All Types Of Ski...",B0C548K7T5,,,,4.394845
611640,All Beauty,[SKIN&LAB] Vitamin C Brightening Serum for Fac...,4.4,1171,[🍊 Fill Your Skin with Vitamin C: A lightweigh...,[],21.98,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['so natural and glowy!', 'SKIN&LAB ...",SKIN & LAB SKIN SCIENCE SOLUTION,"[Beauty & Personal Care, Skin Care, Face, Trea...","{""Skin Type"": ""All, Oily, Combination, Normal""...",B08CV94WQR,,,,4.35721
201822,All Beauty,TruSkin Vitamin C Serum for Face – Anti Aging ...,4.3,139747,[SKIN BRIGHTENING SERUM: Our most popular face...,[],21.97,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['Perfection in a bottle', 'DOES IT ...",TruSkin Naturals,"[Beauty & Personal Care, Skin Care, Face, Trea...","{""Skin Type"": ""Oily, Combination, Sensitive, D...",B0BS71PXPX,,,,4.299755
