In [1]:
import sys

# Print the absolute path of the Python interpreter backing this kernel.
# Handy as a sanity check that the notebook is running on the interpreter
# (e.g. a project virtual environment) you expect.
print(sys.executable)

/Users/qyxmacmini/Documents/GitHub/e-commerce-chatbot/.venv/bin/python


In [2]:
import os
from dotenv import load_dotenv
from pathlib import Path

# --- Load Project-Specific Environment Variables ---
# load_dotenv() looks for a .env file (searching upward from the current
# directory) and copies its entries into os.environ for this kernel session.
#
# ORDER MATTERS: this call must stay above the `datasets` import below, so
# that HF_HOME is already present in the environment when the Hugging Face
# libraries are imported and work out where their cache lives.
load_dotenv()

# Verify that the cache location was actually picked up, instead of
# reporting "None" as if it were a valid path.
hf_home = os.getenv("HF_HOME")
if hf_home:
    print(f"Hugging Face cache is set to: {hf_home}")
else:
    print(
        "WARNING: HF_HOME is not set (no .env file was found, or it has no "
        "HF_HOME entry). Hugging Face will use its default cache directory."
    )


# --- Now you can proceed with your imports and data loading ---
# Deliberately imported after load_dotenv() -- see the note above.
from datasets import load_dataset
import pandas as pd

# Streaming mode: records are fetched lazily while iterating, so nothing is
# fully downloaded up front.
# trust_remote_code=True lets the dataset repository's own loading script run
# locally; only enable it for repositories you trust.
print("\nLoading Electronics metadata...")
meta_dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    name="raw_meta_Electronics",
    streaming=True,
    trust_remote_code=True,
)

# ... rest of your notebook

Hugging Face cache is set to: /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data

Loading Electronics metadata...


In [2]:
import json
import os
from pathlib import Path

import pandas as pd

# --- 1. Load Environment Variables (cache location, authentication) ---
# This has to happen BEFORE `datasets` is imported: the Hugging Face libraries
# read HF_HOME when they are first imported, so loading .env afterwards would
# leave the cache in the default location instead of the one configured in
# .env (the portable drive).
from dotenv import load_dotenv
load_dotenv()
print("Environment variables loaded.")

from datasets import load_dataset

# Number of leading rows pulled into pandas for quick exploration.
SAMPLE_SIZE = 1000

# --- 2. Load the FULL dataset (non-streaming) ---
# This downloads the entire metadata file (~5.25 GB) into the Hugging Face
# cache directory. This will take some time on the first run; later runs
# reuse the cached copy.
print("Loading Electronics metadata (non-streaming mode)...")
print("This may take several minutes as it downloads the full dataset...")

# Reset first so that, if the load below fails, the exploration step cannot
# silently run against a stale `meta_dataset` left behind by an earlier cell
# (e.g. the streaming object, which does not support integer indexing).
meta_dataset = None

try:
    # Default, non-streaming load; split="full" selects the entire dataset.
    # trust_remote_code=True lets the dataset repository's own loading script
    # run locally; only enable it for repositories you trust.
    meta_dataset = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        name="raw_meta_Electronics",
        split="full",
        trust_remote_code=True,
    )
    print("\nDataset loaded successfully!")
    print(f"Total number of products in Electronics metadata: {len(meta_dataset)}")

except Exception as e:
    # Best-effort: report the failure and skip the exploration step below.
    print(f"An error occurred: {e}")


# --- 3. Now you can explore the data ---
if meta_dataset is not None:
    # Print the first product to see the structure of a single record.
    print("\n--- First Product Data ---")
    first_product = meta_dataset[0]
    print(json.dumps(first_product, indent=2))

    # Slicing the dataset yields a column -> list-of-values mapping, which
    # pandas turns directly into a DataFrame.
    print("\n--- Sample as DataFrame ---")
    meta_sample_df = pd.DataFrame(meta_dataset[:SAMPLE_SIZE])
    display(meta_sample_df.head())

    # Per-column count of missing values within the sample.
    print(f"\nMissing values in the first {len(meta_sample_df)} products:")
    display(meta_sample_df.isnull().sum())

Environment variables loaded.
Loading Electronics metadata (non-streaming mode)...
This may take several minutes as it downloads the full dataset...


README.md:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

meta_Electronics.jsonl:   0%|          | 0.00/5.25G [00:00<?, ?B/s]

Generating full split:   0%|          | 0/1610012 [00:00<?, ? examples/s]


Dataset loaded successfully!
Total number of products in Electronics metadata: 1610012

--- First Product Data ---
{
  "main_category": "All Electronics",
  "title": "FS-1051 FATSHARK TELEPORTER V3 HEADSET",
  "average_rating": 3.5,
  "rating_number": 6,
  "features": [],
  "description": [
    "Teleporter V3 The \u201cTeleporter V3\u201d kit sets a new level of value in the FPV world with Fat Shark renowned performance and quality. The fun of FPV is experienced firsthand through the large screen FPV headset with integrated NexwaveRF receiver technology while simultaneously recording onboard HD footage with the included \u201cPilotHD\u201d camera. The \u201cTeleporter V3\u201d kit comes complete with everything you need to step into the cockpit of your FPV vehicle. We\u2019ve included our powerful 250mW 5.8Ghz transmitter, 25 degree FOV headset (largest QVGA display available), the brand new \u201cPilotHD\u201d camera with live AV out and all the cables, antennas and connectors needed

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Electronics,FS-1051 FATSHARK TELEPORTER V3 HEADSET,3.5,6,[],[Teleporter V3 The “Teleporter V3” kit sets a ...,,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Fat Shark,"[Electronics, Television & Video, Video Glasses]","{""Date First Available"": ""August 2, 2014"", ""Ma...",B00MCW7G9M,,,
1,All Electronics,Ce-H22B12-S1 4Kx2K Hdmi 4Port,5.0,1,"[UPC: 662774021904, Weight: 0.600 lbs]",[HDMI In - HDMI Out],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",SIIG,"[Electronics, Television & Video, Accessories,...","{""Product Dimensions"": ""0.83 x 4.17 x 2.05 inc...",B00YT6XQSE,,,
2,Computers,Digi-Tatoo Decal Skin Compatible With MacBook ...,4.5,246,[WARNING: Please IDENTIFY MODEL NUMBER on the ...,[],19.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['AL 2Sides Video', 'MacBook Protect...",Digi-Tatoo,"[Electronics, Computers & Accessories, Laptop ...","{""Brand"": ""Digi-Tatoo"", ""Color"": ""Fresh Marble...",B07SM135LS,,,
3,AMAZON FASHION,NotoCity Compatible with Vivoactive 4 band 22m...,4.5,233,[☛NotoCity 22mm band is designed for Vivoactiv...,[],9.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",NotoCity,"[Electronics, Wearable Technology, Clips, Arm ...","{""Date First Available"": ""May 29, 2020"", ""Manu...",B089CNGZCW,,,
4,Cell Phones & Accessories,Motorola Droid X Essentials Combo Pack,3.8,64,"[New Droid X Essentials Combo Pack, Exclusive ...",[all Genuine High Quality Motorola Made Access...,14.99,"{'hi_res': [None, None, None, None, None], 'la...","{'title': [], 'url': [], 'user_id': []}",Verizon,"[Electronics, Computers & Accessories, Compute...","{""Product Dimensions"": ""11.6 x 6.9 x 3.1 inche...",B004E2Z88O,,,



Missing values in the first 1000 products:


main_category        11
title                 0
average_rating        0
rating_number         0
features              0
description           0
price                 0
images                0
videos                0
store                 5
categories            0
details               0
parent_asin           0
bought_together    1000
subtitle            999
author             1000
dtype: int64