In [1]:
import sys

# Report which Python interpreter is executing this notebook, so it is obvious
# which (virtual) environment the kernel is bound to.
kernel_python = sys.executable
print(kernel_python)

/Users/qyxmacmini/Documents/GitHub/e-commerce-chatbot/.venv/bin/python


In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path

# --- Load Project-Specific Environment Variables ---
# load_dotenv() searches upward from the current directory for a .env file and
# copies its entries into the process environment. Its boolean result says
# whether any variable was found, so a missing/empty .env does not go unnoticed.
dotenv_loaded = load_dotenv()
if not dotenv_loaded:
    print("Warning: no .env entries were loaded; relying on the existing environment.")

# Verify that the Hugging Face cache location is configured for this session.
# NOTE(review): this check runs before `datasets` is imported in the next cell;
# presumably the library reads HF_HOME at import time, so keep this cell first.
hf_home = os.getenv("HF_HOME")
if hf_home:
    print(f"Hugging Face cache is set to: {hf_home}")
else:
    # Without HF_HOME the large Electronics datasets would presumably be
    # downloaded into the library's default cache directory.
    print("Warning: HF_HOME is not set; the Hugging Face default cache location will be used.")

# Manual recovery commands, kept for reference only. They presumably remove stale
# `.lock` files left by an interrupted dataset load; verify the paths before
# uncommenting, as they are specific to one machine.
# !rm /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data/datasets/_Volumes_ExtremeSSD_workingspace_ChatBotAmazon_data_datasets_McAuley-Lab___amazon-reviews-2023_raw_meta_Electronics_0.0.0_16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8.lock
#
# !rm /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data/datasets/_Volumes_ExtremeSSD_workingspace_ChatBotAmazon_data_datasets_McAuley-Lab___amazon-reviews-2023_raw_review_Electronics_0.0.0_16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8.lock

Hugging Face cache is set to: /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data


In [2]:
import pandas as pd
from datasets import load_dataset

# Both configurations come from the same Hub repository and share the same
# loader options, so those are spelled out once.
# NOTE: trust_remote_code=True permits the repository's own loading script to run.
repo_id = "McAuley-Lab/Amazon-Reviews-2023"
loader_options = {"trust_remote_code": True}

# --- Product metadata handle ---
# Per the original author's note this is fast and does not pull the records
# into RAM; the printed DatasetDict summarises the available split.
print("Loading metadata dataset object...")
dataset_meta = load_dataset(repo_id, "raw_meta_Electronics", **loader_options)
print("Metadata object loaded.")
print(dataset_meta)

# --- Customer reviews handle ---
print("\nLoading reviews dataset object...")
dataset_reviews = load_dataset(repo_id, "raw_review_Electronics", **loader_options)
print("Reviews object loaded.")
print(dataset_reviews)

Loading metadata dataset object...
Metadata object loaded.
DatasetDict({
    full: Dataset({
        features: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
        num_rows: 1610012
    })
})

Loading reviews dataset object...


Generating full split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/34 [00:00<?, ?it/s]

Reviews object loaded.
DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 43886944
    })
})


In [10]:
import json
import pandas as pd
from bs4 import BeautifulSoup

# --- Helper Functions (CORRECTED) ---

def is_empty_definitive(value):
    """
    Decide whether a single cell value should be treated as "empty".

    A value is empty when it is:
      * a string that is blank or spells "none" (any case, surrounding
        whitespace ignored),
      * a zero-length container (list, tuple, dict, set, frozenset),
      * a zero-length array-like (e.g. numpy array, pandas Series),
      * None / NaN / any other scalar that pd.isnull reports as missing.

    Args:
        value: Any Python object taken from a record or DataFrame cell.

    Returns:
        bool: True if the value is considered empty, otherwise False.
    """
    # 1. Strings first: they are the most common case and must never reach the
    #    container checks below.
    if isinstance(value, str):
        stripped = value.strip()
        return stripped == '' or stripped.lower() == 'none'

    # 2. Built-in containers are judged by length only.
    if isinstance(value, (list, tuple, dict, set, frozenset)):
        return len(value) == 0

    # 3. Array-likes with at least one dimension must be handled before
    #    pd.isnull: on them pd.isnull returns an element-wise result whose
    #    truth value is ambiguous and raises ValueError inside an `if`.
    if getattr(value, 'ndim', 0):
        return len(value) == 0

    # 4. Whatever is left is a scalar; pd.isnull covers None, NaN, NaT, pd.NA.
    return bool(pd.isnull(value))

def robust_clean_html(data):
    """
    Strip HTML markup from a string or from a list of string fragments.

    Args:
        data: Either a single string, or a list/tuple of fragments (for example
            a multi-paragraph description). Non-string fragments are ignored.
            Any other input type is treated as "no text".

    Returns:
        str: The visible text with markup removed, text nodes separated by a
        single space. Returns "" when there is nothing to clean.
    """
    if isinstance(data, str):
        text_to_clean = data
    elif isinstance(data, (list, tuple)):
        # Keep every fragment, not just the first one, so multi-paragraph
        # fields are not silently truncated.
        text_to_clean = " ".join(part for part in data if isinstance(part, str))
    else:
        text_to_clean = ""

    # Nothing worth parsing: skip the HTML parser entirely.
    if not text_to_clean.strip():
        return ""

    soup = BeautifulSoup(text_to_clean, "lxml")
    return soup.get_text(separator=" ", strip=True)

def create_knowledge_document(product: dict) -> str:
    """
    Build one plain-text knowledge-base document describing a product.

    Args:
        product: Mapping of product metadata. The keys read are 'title',
            'store', 'categories', 'price', 'average_rating', 'rating_number',
            'features', 'description' and 'details'. Every key is optional and
            may hold an empty value (as judged by is_empty_definitive).
            'details' may be a JSON object encoded as a string or an
            already-decoded dict.

    Returns:
        str: A multi-line document. The title, price and rating lines are
        always present; brand, categories, features, description and technical
        details are included only when they carry content.
    """
    def _value_or(key, fallback):
        # Read a field, substituting `fallback` when it is missing or empty.
        candidate = product.get(key)
        return fallback if is_empty_definitive(candidate) else candidate

    parts = [f"Product Title: {_value_or('title', 'N/A')}\n"]

    store = product.get('store')
    if not is_empty_definitive(store):
        parts.append(f"Brand: {store}\n")

    categories = product.get('categories')
    if not is_empty_definitive(categories):
        # A bare string must not be joined character by character.
        if isinstance(categories, str):
            categories = [categories]
        categories_str = ", ".join(str(category) for category in categories)
        parts.append(f"Categories: {categories_str}\n")

    parts.append(f"Price: {_value_or('price', 'N/A')}\n")
    parts.append(
        f"Average Rating: {_value_or('average_rating', 'N/A')} "
        f"({_value_or('rating_number', 0)} ratings)\n"
    )

    features = product.get('features')
    if not is_empty_definitive(features):
        if isinstance(features, str):
            features = [features]
        # Drop bullets that become empty once their markup is stripped.
        cleaned_features = [text for text in map(robust_clean_html, features) if text]
        if cleaned_features:
            parts.append("\nFeatures:\n- " + "\n- ".join(cleaned_features) + "\n")

    description = product.get('description')
    if not is_empty_definitive(description):
        cleaned_description = robust_clean_html(description)
        if cleaned_description:
            parts.append(f"\nDescription:\n{cleaned_description}\n")

    details = product.get('details')
    if not is_empty_definitive(details):
        if isinstance(details, str):
            try:
                details = json.loads(details)
            except json.JSONDecodeError:
                # Best effort: malformed JSON simply yields no details section.
                details = None
        # Only a non-empty mapping can be rendered as key/value bullets; any
        # other decoded shape (list, number, ...) is skipped instead of crashing.
        if isinstance(details, dict) and details:
            parts.append("\nTechnical Details:\n")
            parts.extend(f"- {key}: {value}\n" for key, value in details.items())

    return "".join(parts)

# --- Preview the Transformation on a few sample rows ---
print("--- Previewing Knowledge Document Creation (with fix) ---\n")

# `sample_df` is built by the sampling cell, which sits further down in this
# notebook. On a fresh kernel run top-to-bottom it does not exist yet, so fall
# back to materialising the same first three rows of the seed-42 shuffle.
try:
    preview_df = sample_df.head(3)
except NameError:
    preview_df = pd.DataFrame(
        dataset_meta['full'].shuffle(seed=42).select(range(3))
    )

for row_label, row in preview_df.iterrows():
    product_dict = row.to_dict()
    knowledge_doc = create_knowledge_document(product_dict)

    print(f"parent_asin: {product_dict.get('parent_asin')}")
    print("-------------------- knowledge_doc --------------------")
    print(knowledge_doc)
    print("="*60 + "\n")

--- Previewing Knowledge Document Creation (with fix) ---

parent_asin: B00SXU9RTO
-------------------- knowledge_doc --------------------
Product Title: AOER Acoustic Tube Earpiece Headset Mic for Motorola XPR6500 XPR6550 XPR6580 APX7000 APX6000 Radio Security Door Supervisor
Brand: AOER
Categories: Electronics, Portable Audio & Video, CB & Two-Way Radios, Accessories, Headsets & Microphones
Price: 24.99
Average Rating: 3.9 (9 ratings)

Features:
- Clear Acoustic Tube Covert Earpiece Earphone Headset PTT(Push To Talk) with built-in line mic microphone for Motorola XPR6500 XPR6550 XPR6580 APX7000 APX6000 Radio Security Door Supervisor.
- Clip this hands free earpiece onto your shirt and begin using your radio device to communicate discretely and easily, whether you're in security, retail, or hospitality.
- Provides excellent durability, reliability and performance in low-profile environment.
- Ideal for demanding covert security applications where sound clarity and earpiece durability 

  soup = BeautifulSoup(text_to_clean, "lxml")


In [3]:
# --- Inspect both schemas ---
# `.features` is the formal description of each column and its data type.
# The reviews heading carries a leading newline to separate the two dumps.
schema_sections = (
    ("--- Electronics Metadata Schema ---", dataset_meta),
    ("\n--- Electronics Reviews Schema ---", dataset_reviews),
)
for heading, dataset_dict in schema_sections:
    print(heading)
    print(dataset_dict['full'].features)

--- Electronics Metadata Schema ---
{'main_category': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'average_rating': Value(dtype='float64', id=None), 'rating_number': Value(dtype='int64', id=None), 'features': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'description': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'price': Value(dtype='string', id=None), 'images': Sequence(feature={'hi_res': Value(dtype='string', id=None), 'large': Value(dtype='string', id=None), 'thumb': Value(dtype='string', id=None), 'variant': Value(dtype='string', id=None)}, length=-1, id=None), 'videos': Sequence(feature={'title': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None), 'user_id': Value(dtype='string', id=None)}, length=-1, id=None), 'store': Value(dtype='string', id=None), 'categories': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'details': Value(dtype='string', id=None), 'parent_

In [4]:
# --- Look at one concrete metadata record ---
# Indexing the split with an integer returns just that record; the original
# author notes this is efficient and leaves the rest of the data untouched.
print("--- First Record from Metadata ---")
first_meta_record = dataset_meta['full'][0]
print(first_meta_record)

--- First Record from Metadata ---
{'main_category': 'All Electronics', 'title': 'FS-1051 FATSHARK TELEPORTER V3 HEADSET', 'average_rating': 3.5, 'rating_number': 6, 'features': [], 'description': ['Teleporter V3 The “Teleporter V3” kit sets a new level of value in the FPV world with Fat Shark renowned performance and quality. The fun of FPV is experienced firsthand through the large screen FPV headset with integrated NexwaveRF receiver technology while simultaneously recording onboard HD footage with the included “PilotHD” camera. The “Teleporter V3” kit comes complete with everything you need to step into the cockpit of your FPV vehicle. We’ve included our powerful 250mW 5.8Ghz transmitter, 25 degree FOV headset (largest QVGA display available), the brand new “PilotHD” camera with live AV out and all the cables, antennas and connectors needed.'], 'price': 'None', 'images': {'hi_res': [None], 'large': ['https://m.media-amazon.com/images/I/41qrX56lsYL._AC_.jpg'], 'thumb': ['https://m.m

In [6]:
# --- Draw a small random sample for in-memory exploration ---
# Shuffle with a fixed seed, then keep the first 5000 shuffled records so the
# sample is reproducible.
print("Creating a random sample of 5000 records for exploration...")
shuffled_meta = dataset_meta['full'].shuffle(seed=42)
meta_sample_ds = shuffled_meta.select(range(5000))

# Only this small sample is converted to a pandas DataFrame.
sample_df = pd.DataFrame(meta_sample_ds)

print("Sample DataFrame created successfully.")
sample_df.info()

Creating a random sample of 5000 records for exploration...
Sample DataFrame created successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   main_category    4661 non-null   object 
 1   title            5000 non-null   object 
 2   average_rating   5000 non-null   float64
 3   rating_number    5000 non-null   int64  
 4   features         5000 non-null   object 
 5   description      5000 non-null   object 
 6   price            5000 non-null   object 
 7   images           5000 non-null   object 
 8   videos           5000 non-null   object 
 9   store            4981 non-null   object 
 10  categories       5000 non-null   object 
 11  details          5000 non-null   object 
 12  parent_asin      5000 non-null   object 
 13  bought_together  0 non-null      object 
 14  subtitle         2 non-null      object 
 15  author 

In [7]:
import numpy as np

# Note: All analysis is now performed on the much smaller `sample_df`

def is_empty_definitive(value):
    """
    Decide whether a single cell value should be treated as "empty".

    A value is empty when it is:
      * a string that is blank or spells "none" (any case, surrounding
        whitespace ignored),
      * a zero-length container (list, tuple, dict, set, frozenset),
      * a zero-length numpy array,
      * None / NaN / any other scalar that pd.isnull reports as missing.

    Args:
        value: Any Python object taken from a DataFrame cell.

    Returns:
        bool: True if the value is considered empty, otherwise False.
    """
    # Type checks come before any `==` comparison: comparing a numpy array with
    # a string is element-wise, and using that result in an `if` raises
    # ValueError (ambiguous truth value).
    if isinstance(value, str):
        stripped = value.strip()
        return stripped == '' or stripped.lower() == 'none'

    if isinstance(value, (list, tuple, dict, set, frozenset)):
        return len(value) == 0

    # Zero-dimensional arrays have no length; let them fall through to the
    # scalar check below.
    if isinstance(value, np.ndarray) and value.ndim > 0:
        return len(value) == 0

    return bool(pd.isnull(value))

# Per-column tally of the sampled cells that is_empty_definitive flags as empty.
empty_counts = sample_df.map(is_empty_definitive).sum()

# Assemble the report column by column: the raw empty count, then the share of
# cells that are filled, expressed as a percentage of the sample size.
sample_size = len(sample_df)
completeness_report = empty_counts.to_frame(name='empty_count (in sample)')
completeness_report['fill_rate_in_sample (%)'] = (1 - (empty_counts / sample_size)) * 100

print("--- Realistic Fill Rate Analysis (based on a 5000-record sample) ---")
# Best-populated columns are listed first.
report_by_fill_rate = completeness_report.sort_values(
    by='fill_rate_in_sample (%)',
    ascending=False,
)
print(report_by_fill_rate)

--- Realistic Fill Rate Analysis (based on a 5000-record sample) ---
                 empty_count (in sample)  fill_rate_in_sample (%)
title                                  0                   100.00
average_rating                         0                   100.00
rating_number                          0                   100.00
images                                 0                   100.00
videos                                 0                   100.00
details                                0                   100.00
parent_asin                            0                   100.00
store                                 19                    99.62
main_category                        339                    93.22
categories                           381                    92.38
features                            1283                    74.34
description                         2123                    57.54
price                               3421                    31.58
subtitl

In [11]:
# --- Walking a large dataset in fixed-size batches ---
# Dataset.iter(batch_size=...) yields one batch at a time, so only a single
# batch needs to be held as a DataFrame at any moment.

print("Demonstrating safe iteration over the full dataset in batches.")
print("We will process 15,000 records in batches of 5,000 to show the concept.")

# Restrict the demo to a view over the first 15,000 metadata records.
subset_to_process = dataset_meta['full'].select(range(15000))
total_processed_count = 0

for batch in subset_to_process.iter(batch_size=5000):
    # Each batch is a dict mapping a column name to a list of values
    # (e.g., {'title': [...], 'price': [...]}), which pandas accepts directly.
    batch_df = pd.DataFrame(batch)

    # A real pipeline would transform the batch here (for instance, build the
    # knowledge documents); the demo only reports progress.
    print(f"Processed a batch of {len(batch_df)} records.")
    total_processed_count = total_processed_count + len(batch_df)

print(f"\nFinished iterating. Total records processed: {total_processed_count}")

Demonstrating safe iteration over the full dataset in batches.
We will process 15,000 records in batches of 5,000 to show the concept.
Processed a batch of 5000 records.
Processed a batch of 5000 records.
Processed a batch of 5000 records.

Finished iterating. Total records processed: 15000


In [12]:
# Display the DataFrame left over from the final pass of the batch loop
# (`batch_df` is reassigned on every iteration, so this is the last batch).
batch_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Computers,MOSISO Laptop Shoulder Bag Compatible with Mac...,4.8,3242,[Internal Dimensions: 15.74 x 0.79 x 11.4 inch...,[],25.99,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Mandala Pattern Laptop Shoulder Ba...,MOSISO,"[Electronics, Computers & Accessories, Laptop ...","{""Standing screen display size"": ""15.6 Inches""...",B0893864XN,,,
1,All Electronics,Professional Carrying Case for DJI Spark and D...,5.0,1,[► The new DJI Spark & Goggles Case by MC-CASE...,[The new DJI Spark & Goggles Case by MC-CASES ...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",mc-cases,"[Electronics, Camera & Photo]","{""Brand Name"": ""mc-cases"", ""Item Weight"": ""5.9...",B075ZVQBX2,,,
2,Cell Phones & Accessories,Case for Samsung Galaxy Tab A 8.4 (2020) SM-T3...,4.6,647,[【Compatibility】Designed for Samsung Galaxy Ta...,[],,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['DUX DUCIS Case for Samsung Galaxy ...,DUX DUCIS,"[Electronics, Computers & Accessories, Tablet ...","{""Product Dimensions"": ""9.06 x 6.3 x 0.59 inch...",B0873594CB,,,
3,Cell Phones & Accessories,Catalyst Ring Clip case for AirPods 1 & 2 Sili...,3.2,43,"[✅ EASY ""CLIP AND GO"" - Carry your AirPods aro...",[],,"{'hi_res': [None, 'https://m.media-amazon.com/...",{'title': ['Ring Clip case for AirPods by Cata...,Catalyst,"[Electronics, Headphones, Earbuds & Accessorie...","{""Product Dimensions"": ""2.36 x 1.93 x 1.02 inc...",B083LBRSZ2,,,
4,Cell Phones & Accessories,Graphic4You Fireworks United States 4th of Jul...,5.0,1,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",GARDWEN,"[Electronics, Car & Vehicle Electronics, Vehic...","{""Other display features"": ""Wireless"", ""Color""...",B01I4B5CYU,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Camera & Photo,SmallRig Rotatable Cold Shoe Mount Adapter (Si...,4.5,15,[【360° Adjustable】SmallRig 2935 can rotate 360...,[],,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Rotatable Cold Shoe Mount Adapter ...,SMALLRIG,"[Electronics, Camera & Photo, Accessories, Tri...","{""Product Dimensions"": ""1.18 x 0.98 x 0.85 inc...",B08K7HC75M,,,
4996,Computers,"Vintage Canvas Business Laptop Backpack,Tezoo ...",4.2,8,[Multifunction: the tezoo backpack with USB ba...,[Description: material: nylon inner pocket mat...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",gracosy,"[Electronics, Computers & Accessories, Laptop ...","{""Product Dimensions"": ""18.67 x 14.61 x 4.33 i...",B072QBWQT6,,,
4997,Cell Phones & Accessories,Sennheiser VMX 100 Bluetooth Headset (Black),2.7,19,[Includes USB wall charger as well as car char...,[Improvements in speech clarity and noise redu...,,"{'hi_res': [None, None], 'large': ['https://m....","{'title': [], 'url': [], 'user_id': []}",Sennheiser,"[Electronics, Headphones, Earbuds & Accessorie...","{""Product Dimensions"": ""4 x 1 x 4 inches"", ""It...",B000VZN9HA,,,
4998,Industrial & Scientific,"YOTENKO CB Coax Cable,RG58 Coaxial Cable 25Ft,...",4.6,1328,[【Cable Type & Details】- Cable & Connector: RG...,[],18.68,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['UHF PL259 to UHF PL259 Cable RG58 ...,YOTENKO,"[Electronics, Home Audio, Home Audio Accessori...","{""Brand"": ""YOTENKO"", ""Connector Type"": ""Pl259 ...",B0B3XN9ZQN,,,


In [15]:
# Show the DataFrame's column index next to the dataset's declared feature
# schema so the two can be compared.
for schema_view in (batch_df.columns, subset_to_process.features):
    print(schema_view)

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
      dtype='object')
{'main_category': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'average_rating': Value(dtype='float64', id=None), 'rating_number': Value(dtype='int64', id=None), 'features': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'description': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'price': Value(dtype='string', id=None), 'images': Sequence(feature={'hi_res': Value(dtype='string', id=None), 'large': Value(dtype='string', id=None), 'thumb': Value(dtype='string', id=None), 'variant': Value(dtype='string', id=None)}, length=-1, id=None), 'videos': Sequence(feature={'title': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None), 'user_id': Value(dtype='st