# Huggingface Amazon Reviews Dataset

https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023

https://amazon-reviews-2023.github.io/


Note: Loading through Hugging Face is not supported, so we will download the files manually with curl.

We will download datasets in the beauty / personal care category

In [None]:
# Download the All_Beauty review dump and its matching product-metadata dump
# (gzipped JSON Lines) into the current working directory.
!curl -O https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/All_Beauty.jsonl.gz
!curl -O https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_All_Beauty.jsonl.gz

# Larger related categories, left disabled:
# !curl -O https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/Beauty_and_Personal_Care.jsonl.gz
# !curl -O https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Beauty_and_Personal_Care.jsonl.gz

# !curl -O https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/Health_and_Personal_Care.jsonl.gz
# !curl -O https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Health_and_Personal_Care.jsonl.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 90.0M  100 90.0M    0     0  40.2M      0  0:00:02  0:00:02 --:--:-- 40.2M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 38.0M  100 38.0M    0     0  29.7M      0  0:00:01  0:00:01 --:--:-- 29.7M


In [None]:
# Decompress the downloaded archives; gunzip replaces each .gz file with the
# plain .jsonl file of the same name.
!gunzip All_Beauty.jsonl.gz
# !gunzip Beauty_and_Personal_Care.jsonl.gz
# !gunzip Health_and_Personal_Care.jsonl.gz
!gunzip meta_All_Beauty.jsonl.gz
# !gunzip meta_Beauty_and_Personal_Care.jsonl.gz
# !gunzip meta_Health_and_Personal_Care.jsonl.gz

In [None]:
# Install the third-party packages used by this notebook (-q keeps pip quiet).
!pip install jsonlines pinecone openai azure-core -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m745.9/745.9 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.0/214.0 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.9/280.9 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import userdata

In [None]:
import json
def pprint(x):
    """Print a dict as 2-space-indented JSON; pass anything else to display()."""
    if isinstance(x, dict):
        return print(json.dumps(x, indent=2))
    return display(x)

In [None]:
import jsonlines

def load_jsonl(file_path):
    """Read a JSON Lines file and return every decoded object in file order.

    Args:
        file_path: Path of the .jsonl file to open with ``jsonlines``.

    Returns:
        list: One entry per object yielded by the reader.
    """
    with jsonlines.open(file_path) as reader:
        # Materialise inside the `with` so the file is still open while reading.
        return list(reader)


In [None]:
# Every review record from the decompressed All_Beauty dump, held in memory.
all_beauty_reviews_raw = load_jsonl(file_path='All_Beauty.jsonl')

In [None]:
# Every product-metadata record from the decompressed meta_All_Beauty dump.
all_beauty_items_raw = load_jsonl(file_path='meta_All_Beauty.jsonl')

In [None]:
# Health_and_Personal_Care_reviews_raw = load_jsonl('Health_and_Personal_Care.jsonl')

In [None]:
# Health_and_Personal_Care_items_raw = load_jsonl('meta_Health_and_Personal_Care.jsonl')

## Data Processing

- Remove Irrelevant Keys

JSON objects need to be converted to text before they can be embedded. We need to extract 2 types of information:
- Semantic text (Names, descriptions etc). These will be used to generate embeddings.
- Product ID, price, main category, ratings. These will be inserted as metadata.

In [None]:
def replace_empty_with_zero(data_list):
    """Overwrite ``None`` and ``""`` values with ``0`` in each top-level dict.

    Only the first layer of every dictionary is inspected; nested containers
    are left untouched, and entries of ``data_list`` that are not dicts are
    skipped. The dictionaries are modified in place.

    Args:
        data_list: Iterable of records, normally dicts.

    Returns:
        The same ``data_list`` object that was passed in.
    """
    for record in data_list:
        if not isinstance(record, dict):
            continue
        # Collect the affected keys first, then overwrite them.
        blank_keys = [
            field
            for field, content in record.items()
            if content is None or content == ""
        ]
        for field in blank_keys:
            record[field] = 0
    return data_list

In [None]:
import re

def remove_invalid_chars(text):
    """Normalise review text to letters, digits, whitespace and double quotes.

    HTML artifacts are resolved *before* punctuation is stripped. Stripping
    first turns ``&#34;`` into a bare ``34`` and ``<br />`` into a bare ``br``;
    patching those afterwards also corrupts real numbers containing "34"
    (e.g. "1234") and real words ending in "br".

    Args:
        text: Raw text, possibly containing HTML entities and ``<br>`` tags.

    Returns:
        str: Cleaned text with leading/trailing whitespace removed.
    """
    import html  # local import keeps this cell self-contained

    # 1. Decode entities such as &#34; / &quot; / &amp; into real characters.
    decoded = html.unescape(text)

    # 2. Turn <br>, <br/>, <br /> (any case) into a space so words stay separated.
    decoded = re.sub(r'<\s*br\s*/?\s*>', ' ', decoded, flags=re.IGNORECASE)

    # 3. Keep alphanumerics, whitespace and double quotes; drop everything else.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s"]', '', decoded)

    # 4. Trim surrounding whitespace.
    return cleaned_text.strip()

In [None]:
def extract_keys_items(data):
    """Project each product record onto the fields stored as item metadata.

    Args:
        data: Iterable of product dicts.

    Returns:
        list[dict]: One new dict per input record containing only the
        whitelisted keys that the record actually has, in the record's own
        key order.
    """
    wanted = ["main_category", "title", "parent_asin", "average_rating", "rating_number", "price", "store"]

    trimmed = []
    for record in data:
        kept = {}
        for field, content in record.items():
            if field in wanted:
                kept[field] = content
        trimmed.append(kept)
    return trimmed

In [None]:
def extract_keys_reviews(data):
    """Project each review record onto the fields stored as review metadata.

    Args:
        data: Iterable of review dicts.

    Returns:
        list[dict]: One new dict per input record containing only the
        whitelisted keys that the record actually has, in the record's own
        key order.
    """
    wanted = ["text", "rating", "parent_asin", "helpful_vote", "verified_purchase"]

    trimmed = []
    for record in data:
        kept = {}
        for field, content in record.items():
            if field in wanted:
                kept[field] = content
        trimmed.append(kept)
    return trimmed

In [None]:
def serialize_item(item):
    """Render one product record as the multi-line text that gets embedded.

    ``details`` may be a JSON-encoded string or an already-parsed dict. A
    dict used to be replaced by ``{}``, which dropped the details entirely.
    NOTE(review): records parsed from JSON Lines presumably carry ``details``
    as a dict; confirm against the data. Any other type yields empty details.
    Keys containing "UPC" are left out of the text.

    Args:
        item: Product dict with ``title``, ``main_category``, ``features``
            (iterable of str), ``description`` (iterable of str) and
            ``details``.

    Returns:
        str: "Product / Category / Features / Description / Details" lines
        joined by newlines.

    Raises:
        KeyError: If one of the fields above is missing from ``item``.
    """
    raw_details = item['details']
    if isinstance(raw_details, str):
        details_dict = json.loads(raw_details)
    elif isinstance(raw_details, dict):
        details_dict = raw_details
    else:
        details_dict = {}
    valid_details = {k: v for k, v in details_dict.items() if "UPC" not in k}

    lines = [
        f"Product: {item['title']}",
        f"Category: {item['main_category']}",
        f"Features: {', '.join(item['features'])}",
        f"Description: {', '.join(item['description'])}",
        f"Details: {valid_details}",
    ]
    return "\n".join(lines)

In [None]:
def serialize_review(review):
    """Render a review as a two-line "Title / Review" string for embedding.

    Both the title and the body are passed through ``remove_invalid_chars``
    before being formatted.
    """
    clean_title = remove_invalid_chars(review['title'])
    clean_body = remove_invalid_chars(review['text'])
    return f"Title: {clean_title}\nReview: {clean_body}"

In [None]:
# all_beauty_reviews_text = []
# all_beauty_items_text =[]


# for i in all_beauty_reviews_raw:
#   all_beauty_reviews_text.append(serialize_review(i))


# for j in all_beauty_items_raw:
#   all_beauty_items_text.append(serialize_item(j))

In [None]:
# all_beauty_reviews_meta = extract_keys_reviews(all_beauty_reviews_raw)
# all_beauty_items_meta = extract_keys_items(all_beauty_items_raw)

## Filter out items for Sanity

There are 112590 All Beauty Items.

It takes 45 minutes to generate embeddings for all of them, and the resulting embedding file is around 600 MB.

```
Memory consumed by array elements (nbytes): 659.70703125 mb
Memory consumed by the NumPy object (sys.getsizeof): 659.7071533203125 mb
```

For the purposes of this demo we will limit it to the first 5000 items.

In [None]:
# Number of products kept for the demo; raise it to embed more of the catalogue.
MAX_ITEMS = 5000
all_beauty_items_raw_short = all_beauty_items_raw[:MAX_ITEMS]

In [None]:
# IDs of the retained products, as a set for fast membership checks.
unique_parent_asin = {product['parent_asin'] for product in all_beauty_items_raw_short}

In [None]:
# Keep only the reviews that belong to one of the retained products.
all_beauty_reviews_raw_short = [
    entry
    for entry in all_beauty_reviews_raw
    if entry['parent_asin'] in unique_parent_asin
]

In [None]:
# Drop reviews whose body or title is missing or whitespace-only, to prevent errors.
all_beauty_reviews_raw_short = [
    entry
    for entry in all_beauty_reviews_raw_short
    if all(entry[field] and entry[field].strip() for field in ('text', 'title'))
]

In [None]:
# Number of reviews left after the product and empty-text filters.
len(all_beauty_reviews_raw_short)

40348

In [None]:
# limit to 600 characters per review to prevent overload
# (the cut is applied after serialization, so the "Title:"/"Review:" labels count toward it)
all_beauty_reviews_text = [
    serialize_review(review)[:600] for review in all_beauty_reviews_raw_short
]

# Product texts are embedded at full length.
all_beauty_items_text = [
    serialize_item(product) for product in all_beauty_items_raw_short
]

In [None]:
# Metadata dicts stored alongside each vector; order matches the text lists.
all_beauty_items_meta = extract_keys_items(data=all_beauty_items_raw_short)
all_beauty_reviews_meta = extract_keys_reviews(data=all_beauty_reviews_raw_short)

In [None]:
# Swap None / "" metadata values for 0 (the dicts are also modified in place).
all_beauty_reviews_meta = replace_empty_with_zero(data_list=all_beauty_reviews_meta)
all_beauty_items_meta = replace_empty_with_zero(data_list=all_beauty_items_meta)

## Generate Embeddings

In [None]:
import os
from openai import AzureOpenAI


# Azure OpenAI resource and the embedding deployment used by this notebook.
endpoint = "https://openai-gdig.cognitiveservices.azure.com/"
model_name = "text-embedding-3-small"
deployment = "text-embedding-3-small"


# Single source of truth for the API version. It is the value the client was
# already being created with; the variable used to hold a different, unused one.
api_version = "2024-12-01-preview"


client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    # The key is read from Colab's userdata store, not hard-coded.
    api_key=userdata.get('azure_openai')
)


# response = client.embeddings.create(
#     input=["first phrase","second phrase","third phrase"],
#     model=deployment
# )


# for item in response.data:
#     length = len(item.embedding)
#     print(
#         f"data[{item.index}]: length={length}, "
#         f"[{item.embedding[0]}, {item.embedding[1]}, "
#         f"..., {item.embedding[length-2]}, {item.embedding[length-1]}]"
#     )
# print(response.usage)

We need to batch and throttle the requests to avoid hitting rate limits. Alternatively, we could increase our quota on Azure.

In [None]:
from tqdm import tqdm

In [None]:
import time

batch_size = 256   # texts sent per request; tune this to be under rate limits
delay = 2.5        # seconds to pause after every request

item_embeddings = []

# One request per consecutive slice of `batch_size` product texts.
batch_starts = range(0, len(all_beauty_items_text), batch_size)
for start in tqdm(batch_starts):
    batch = all_beauty_items_text[start : start + batch_size]
    response = client.embeddings.create(model=deployment, input=batch)

    # Accumulate the per-text result objects.
    item_embeddings.extend(response.data)

    # Throttle before the next request.
    time.sleep(delay)


100%|██████████| 20/20 [01:55<00:00,  5.80s/it]


In [None]:

batch_size = 128                # tune this to be under rate limits
delay = 2.5                     # seconds to wait between calls
max_attempts = 3                # tries per batch before giving up

review_embeddings = []

for i in tqdm(range(0, len(all_beauty_reviews_text), batch_size)):
    batch = all_beauty_reviews_text[i : i + batch_size]

    # Retry a failed batch instead of falling through. Continuing after an
    # error would re-append the previous batch's `response` (or hit a
    # NameError), leaving the vectors out of step with the review metadata
    # they are later paired with.
    for attempt in range(1, max_attempts + 1):
        try:
            response = client.embeddings.create(
                model=deployment,
                input=batch
            )
            break
        except Exception as e:
            print(f"Error occurred with input: {batch}")
            print(f"Error details: {e}")
            if attempt == max_attempts:
                # Stop rather than silently produce misaligned embeddings.
                raise
            # Back off a little longer on each retry.
            time.sleep(delay * attempt)

    review_embeddings.extend(response.data)

    # wait before the next batch
    time.sleep(delay)


100%|██████████| 316/316 [20:22<00:00,  3.87s/it]


In [None]:
# Replace each API result object with just its `.embedding` payload.
# NOTE(review): the names are rebound, so re-running this cell presumably fails
# because the new elements no longer have an `.embedding` attribute.
item_embeddings = [entry.embedding for entry in item_embeddings]
review_embeddings = [entry.embedding for entry in review_embeddings]

In [None]:
import sys
import numpy as np

# Pack the item vectors into one float32 array so their size can be measured.
embeddings_np = np.array(item_embeddings, dtype=np.float32)
# np.save('item_embeddings.npy', embeddings_np)

BYTES_PER_MB = 1024 * 1024

# Size of the raw element buffer only.
memory_bytes_nbytes = embeddings_np.nbytes
megabyte_value = memory_bytes_nbytes / BYTES_PER_MB
print(f"Memory consumed by array elements (nbytes): {megabyte_value} mb")

# Size reported by sys.getsizeof for the array object as a whole.
memory_bytes_sys = sys.getsizeof(embeddings_np)
megabyte_value_sys = memory_bytes_sys / BYTES_PER_MB
print(f"Memory consumed by the NumPy object (sys.getsizeof): {megabyte_value_sys} mb")


Memory consumed by array elements (nbytes): 29.296875 mb
Memory consumed by the NumPy object (sys.getsizeof): 29.2969970703125 mb


Check for duplicate `parent_asin` values among the products

In [None]:
import json

def has_duplicates_by_key(data_list, key_name):
    """Report whether any value of ``key_name`` occurs more than once.

    Records that do not contain ``key_name`` are ignored.

    Args:
        data_list (list): Records (dicts) to inspect.
        key_name (str): Key whose values are compared.

    Returns:
        bool: True if at least two records share a value for ``key_name``,
        False otherwise.
    """
    seen = set()
    for record in data_list:
        if key_name not in record:
            continue
        candidate = record[key_name]
        if candidate in seen:
            return True
        seen.add(candidate)
    return False

In [None]:
# parent_asin is used as the vector id for products, so repeats would collide.
ids_repeat = has_duplicates_by_key(all_beauty_items_meta, 'parent_asin')
print("Duplicate IDs found." if ids_repeat else "No duplicate IDs.")

No duplicate IDs.


## Upsert Embeddings into Pinecone with Metadata

In [None]:
def build_vector_objects(vectors, json_objects):
    """Pair each vector with its metadata dict as an id/values/metadata record.

    Args:
        vectors: Sequence of embedding vectors.
        json_objects: Sequence of metadata dicts, aligned with ``vectors``;
            each must contain ``parent_asin``, which becomes the record id.

    Returns:
        list[dict]: Records with keys ``id``, ``values`` and ``metadata``.
        ``metadata`` is the original dict object, not a copy. If the inputs
        differ in length, pairing stops at the shorter one.
    """
    return [
        {
            "id": meta["parent_asin"],
            "values": embedding,
            "metadata": meta,
        }
        for embedding, meta in zip(vectors, json_objects)
    ]


In [None]:
def build_vector_reviews(vectors, json_objects):
    """Pair each review vector with its metadata as an id/values/metadata record.

    Several reviews can share one ``parent_asin``, so the id is the review's
    1-based position in the input, as a string, rather than the product id.

    Args:
        vectors: Sequence of embedding vectors.
        json_objects: Sequence of metadata dicts, aligned with ``vectors``.

    Returns:
        list[dict]: Records with keys ``id`` ("1", "2", ...), ``values`` and
        ``metadata`` (the original dict object). If the inputs differ in
        length, pairing stops at the shorter one.
    """
    return [
        {
            "id": str(position),    # running 1-based counter
            "values": vector,       # vector values
            "metadata": obj,        # entire json object as metadata
        }
        for position, (vector, obj) in enumerate(zip(vectors, json_objects), start=1)
    ]


In [None]:
# Upsert-ready product records keyed by parent_asin.
item_vectors = build_vector_objects(
    vectors=item_embeddings,
    json_objects=all_beauty_items_meta,
)

In [None]:
# Upsert-ready review records keyed by a running counter.
review_vectors = build_vector_reviews(
    vectors=review_embeddings,
    json_objects=all_beauty_reviews_meta,
)

In [None]:
from pinecone import Pinecone
import itertools

# Initialize Pinecone client; the API key is read from Colab's userdata store.
pc = Pinecone(api_key=userdata.get('pinecone'))
# NOTE(review): the index is only opened here, never created, so it presumably
# already exists in the Pinecone project. Confirm before running.
index_name = "amazon-beauty-items"
index = pc.Index(index_name)

# Helper that walks any iterable and hands it back in fixed-size pieces
def chunks(iterable, batch_size=100):
    """Yield successive lists of up to ``batch_size`` items from ``iterable``.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        piece = list(itertools.islice(source, batch_size))
        if not piece:
            return
        yield piece

# Send the product records to the index 100 at a time
# (200 may also work when the metadata is very small)
for vector_batch in chunks(item_vectors, batch_size=100):
    index.upsert(vectors=vector_batch)

# Confirm completion and show the index statistics
print("Upsert complete.")
print(index.describe_index_stats())

Upsert complete.
{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '188',
                                    'content-type': 'application/json',
                                    'date': 'Fri, 09 Jan 2026 17:13:08 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '40',
                                    'x-pinecone-request-id': '3889210117587860160',
                                    'x-pinecone-request-latency-ms': '39',
                                    'x-pinecone-response-duration-ms': '41'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 5000}},
 'storageFullness': 0.0,
 'total_vector_count': 5000,
 'vector_type': 'dense'}


In [None]:
from pinecone import Pinecone
import itertools

# Initialize Pinecone client; the API key is read from Colab's userdata store.
pc = Pinecone(api_key=userdata.get('pinecone'))
# NOTE(review): the index is only opened here, never created, so it presumably
# already exists in the Pinecone project. Confirm before running.
index_name = "amazon-beauty-reviews"
index = pc.Index(index_name)

# Helper that walks any iterable and hands it back in fixed-size pieces
def chunks(iterable, batch_size=100):
    """Yield successive lists of up to ``batch_size`` items from ``iterable``.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        piece = list(itertools.islice(source, batch_size))
        if not piece:
            return
        yield piece

# Send the review records to the index 100 at a time
# (200 may also work when the metadata is very small)
for vector_batch in chunks(review_vectors, batch_size=100):
    index.upsert(vectors=vector_batch)

# Confirm completion and show the index statistics
print("Upsert complete.")
print(index.describe_index_stats())

Upsert complete.
{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '190',
                                    'content-type': 'application/json',
                                    'date': 'Fri, 09 Jan 2026 17:36:09 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '42',
                                    'x-pinecone-request-id': '1937050039581043016',
                                    'x-pinecone-request-latency-ms': '41',
                                    'x-pinecone-response-duration-ms': '43'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 40348}},
 'storageFullness': 0.0,
 'total_vector_count': 40348,
 'vector_type': 'dense'}
