In [5]:
import json
import pandas as pd
from datasets import load_dataset
from tqdm.notebook import tqdm
import ast  # Import the ast module

# Setup for progress bars in pandas
tqdm.pandas()

In [6]:
# The qa_Electronics.json file uses single quotes, so we'll parse it line by line
# using ast.literal_eval, which safely evaluates it as a Python dictionary.

qa_data_path = '../data/qa_Electronics.json'
qa_asins = set()

with open(qa_data_path, 'r') as f:
    for line in tqdm(f, desc="Parsing qa_Electronics.json"):
        try:
            # Use ast.literal_eval to parse the string as a Python dictionary
            data = ast.literal_eval(line)
            if 'asin' in data:
                qa_asins.add(data['asin'])
        except (ValueError, SyntaxError) as e:
            # This will catch any lines that are not valid Python literals
            print(f"Skipping line due to error: {e}")

print(f"Found {len(qa_asins)} unique ASINs in {qa_data_path}")

Parsing qa_Electronics.json: 0it [00:00, ?it/s]

Found 39371 unique ASINs in ../data/qa_Electronics.json


In [7]:
# This file is also in a JSON-lines format, with one object per line.
# We will read it line-by-line and parse each line individually using ast.literal_eval.

qa_1_to_many_path = '../data/QA_Electronics_1_to_many.json'
qa_1_to_many_asins = set()

with open(qa_1_to_many_path, 'r') as f:
    for line in tqdm(f, desc="Parsing QA_Electronics_1_to_many.json"):
        try:
            # Use ast.literal_eval to parse each line as a Python dictionary
            data = ast.literal_eval(line)
            if 'asin' in data:
                qa_1_to_many_asins.add(data['asin'])
        except (ValueError, SyntaxError) as e:
            # This will catch any lines that are not valid Python literals
            print(f"Skipping line due to error: {e}")


print(f"Found {len(qa_1_to_many_asins)} unique ASINs in {qa_1_to_many_path}")

Parsing QA_Electronics_1_to_many.json: 0it [00:00, ?it/s]

Found 38959 unique ASINs in ../data/QA_Electronics_1_to_many.json


In [8]:
import os
import glob
from dotenv import load_dotenv

# Load environment variables to find the cache path
load_dotenv()
cache_directory = os.getenv("HF_HOME", "../.cache")

print(f"Checking for and removing .lock files in cache: {cache_directory}")

# Construct a pattern to find all .lock files recursively
# The '**' pattern searches through all subdirectories
lock_file_pattern = os.path.join(cache_directory, '**', '*.lock')

# Find all files matching the pattern
lock_files = glob.glob(lock_file_pattern, recursive=True)

if not lock_files:
    print("No .lock files found. Cache is clean.")
else:
    for lock_file in lock_files:
        try:
            os.remove(lock_file)
            print(f"Removed lock file: {lock_file}")
        except OSError as e:
            print(f"Error removing file {lock_file}: {e}")
    print("Lock cleaning complete.")

Checking for and removing .lock files in cache: /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data
Removed lock file: /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data/_Volumes_ExtremeSSD_workingspace_ChatBotAmazon_data_McAuley-Lab___amazon-reviews-2023_raw_review_Electronics_0.0.0_16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8.lock
Removed lock file: /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data/McAuley-Lab___amazon-reviews-2023/raw_review_Electronics/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8_builder.lock
Removed lock file: /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data/McAuley-Lab___amazon-reviews-2023/raw_review_Electronics/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8.incomplete_info.lock
Lock cleaning complete.


In [9]:
import os
import pandas as pd
import pyarrow.parquet as pq
from datasets import load_dataset
from huggingface_hub import hf_hub_download, list_repo_files
from dotenv import load_dotenv
from tqdm.notebook import tqdm

# Load environment variables from .env file
load_dotenv()

# Use the HF_HOME environment variable for the cache directory
cache_directory = os.getenv("HF_HOME", "../.cache")
print(f"Using Hugging Face cache directory: {cache_directory}")

# --- Part 1: Process Reviews by loading the full dataset (from cache if available) ---

print("\nExtracting ASINs from reviews (loading full dataset)...")
reviews_asins = set()
try:
    print("Loading review dataset (this will be fast if cached)...")
    reviews_dataset = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        "raw_review_Electronics",
        trust_remote_code=True,
        cache_dir=cache_directory
    )['full']

    print("Extracting unique ASINs from reviews...")
    reviews_asins = set(reviews_dataset['asin'])

    print(f"Found {len(reviews_asins)} unique ASINs in reviews.")
except Exception as e:
    print(f"An error occurred while processing reviews: {e}")
    reviews_asins = None


# --- Part 2: Process Metadata by dynamically finding the file and reading from cache ---

print("\nExtracting ASINs from metadata (using direct Parquet read from cache)...")
meta_asins = set()
try:
    # Step A: Find the correct file path instead of hardcoding it.
    print("Discovering the correct metadata file path in the repository...")
    repo_id = "McAuley-Lab/Amazon-Reviews-2023"
    all_files = list_repo_files(repo_id, repo_type="dataset")

    # Filter to find the parquet file within the raw_meta_Electronics directory
    meta_filename = next((f for f in all_files if "raw_meta_Electronics" in f and f.endswith('.parquet')), None)

    if not meta_filename:
        raise FileNotFoundError("Could not dynamically find the Parquet file for raw_meta_Electronics.")

    print(f"Found metadata file: {meta_filename}")

    # Step B: Download (if not cached) and get the local path.
    parquet_file_path = hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename=meta_filename,
        cache_dir=cache_directory,
    )
    print(f"Reading from local file: {parquet_file_path}")

    # Step C: Read only the 'parent_asin' column from the local Parquet file.
    parquet_file = pq.ParquetFile(parquet_file_path)
    for i in tqdm(range(parquet_file.num_row_groups), desc="Reading metadata from cache"):
        row_group = parquet_file.read_row_group(i, columns=['parent_asin'])
        meta_asins.update(row_group.column('parent_asin').unique().to_pylist())

    print(f"Found {len(meta_asins)} unique ASINs in metadata.")
except Exception as e:
    print(f"An error occurred while processing metadata: {e}")
    meta_asins = None

# --- Combine ASINs ---
if meta_asins is not None and reviews_asins is not None:
    all_huggingface_asins = meta_asins.union(reviews_asins)
    print(f"\nFound {len(all_huggingface_asins)} unique ASINs in total from the Hugging Face datasets.")
else:
    all_huggingface_asins = set()
    print("\nCould not process one or both Hugging Face datasets. The set of Hugging Face ASINs will be empty.")

Using Hugging Face cache directory: /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data

Extracting ASINs from reviews (loading full dataset)...
Loading review dataset (this will be fast if cached)...


Loading dataset shards:   0%|          | 0/34 [00:00<?, ?it/s]

Extracting unique ASINs from reviews...
Found 1946161 unique ASINs in reviews.

Extracting ASINs from metadata (using direct Parquet read from cache)...
Discovering the correct metadata file path in the repository...
Found metadata file: raw_meta_Electronics/full-00000-of-00010.parquet
Reading from local file: /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data/datasets--McAuley-Lab--Amazon-Reviews-2023/snapshots/2b6d039ed471f2ba5fd2acb718bf33b0a7e5598e/raw_meta_Electronics/full-00000-of-00010.parquet


Reading metadata from cache:   0%|          | 0/162 [00:00<?, ?it/s]

Found 161002 unique ASINs in metadata.

Found 1980815 unique ASINs in total from the Hugging Face datasets.


In [11]:
# You have these variables from the previous cells:
# qa_asins: ASINs from qa_Electronics.json
# qa_1_to_many_asins: ASINs from QA_Electronics_1_to_many.json
# all_huggingface_asins: Combined ASINs from meta and reviews

print("--- Verifying Overlap ---")
print(f"Unique ASINs in qa_Electronics.json: {len(qa_asins)}")
print(f"Unique ASINs in QA_Electronics_1_to_many.json: {len(qa_1_to_many_asins)}")
print(f"Unique ASINs in Hugging Face (meta & reviews): {len(all_huggingface_asins)}")

# First, find the ASINs that are common to both Q&A files
common_qa_asins = qa_asins.intersection(qa_1_to_many_asins)
print(f"\nASINs found in BOTH Q&A files: {len(common_qa_asins)}")

# Now, find the final intersection with the products from Hugging Face
final_intersection_asins = common_qa_asins.intersection(all_huggingface_asins)

print(f"----------------------------------------------------")
print(f"Final number of products with rich data (meta, reviews, and both Q&As): {len(final_intersection_asins)}")
print("These are the products we will use to build the final dataset.")

--- Verifying Overlap ---
Unique ASINs in qa_Electronics.json: 39371
Unique ASINs in QA_Electronics_1_to_many.json: 38959
Unique ASINs in Hugging Face (meta & reviews): 1980815

ASINs found in BOTH Q&A files: 38959
----------------------------------------------------
Final number of products with rich data (meta, reviews, and both Q&As): 29178
These are the products we will use to build the final dataset.


In [8]:

import os
import glob
import pandas as pd
import pyarrow.parquet as pq
import ast
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv
from tqdm.notebook import tqdm

# --- 1. Setup and Configuration ---
print("--- Step 1: Initializing Setup ---")
load_dotenv()
cache_directory = os.getenv("HF_HOME", "../.cache")
qa_file_1 = '../data/qa_Electronics.json'
qa_file_2 = '../data/QA_Electronics_1_to_many.json'
repo_id = "McAuley-Lab/Amazon-Reviews-2023"
output_dir = "../data/gold_standard_data"
os.makedirs(output_dir, exist_ok=True)
print(f"Final data will be saved in: {output_dir}")

# --- 2. Get ASINs from ALL Original Sources ---
print("\n--- Step 2: Extracting ASINs from all original sources ---")

# Get ASINs from Q&A files (with cleaning)
def get_asins_from_qa_file(path):
    with open(path, 'r') as f:
        return {str(ast.literal_eval(line).get('asin')).strip() for line in f}
qa1_asins = get_asins_from_qa_file(qa_file_1)
qa2_asins = get_asins_from_qa_file(qa_file_2)
qa_asins = qa1_asins.union(qa2_asins) # Combine all Q&A ASINs
print(f"Found {len(qa_asins)} unique ASINs in all Q&A files.")

# Get ASINs from the full reviews dataset
print("Loading review ASINs...")
reviews_dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Electronics", trust_remote_code=True, cache_dir=cache_directory)['full']
# Use the efficient .unique() method and clean the ASINs
reviews_asins = {str(asin).strip() for asin in reviews_dataset.unique('asin')}
print(f"Found {len(reviews_asins)} unique ASINs in reviews dataset.")

# Get ASINs from all 10 metadata shards
print("Loading metadata ASINs...")
meta_asins = set()
for i in tqdm(range(10), desc="Scanning meta shards for ASINs"):
    shard_filename = f"raw_meta_Electronics/full-{i:05d}-of-00010.parquet"
    local_path = hf_hub_download(repo_id, filename=shard_filename, repo_type="dataset", cache_dir=cache_directory)
    # Read only the 'parent_asin' column for efficiency
    asin_column = pq.read_table(local_path, columns=['parent_asin']).to_pandas()['parent_asin']
    meta_asins.update(asin_column.astype(str).str.strip())
print(f"Found {len(meta_asins)} unique ASINs in metadata.")

# --- 3. Calculate the TRUE, STRICT Intersection ---
print("\n--- Step 3: Calculating the strict intersection of all sources ---")
gold_standard_asins = list(qa_asins.intersection(reviews_asins).intersection(meta_asins))
print(f"Found {len(gold_standard_asins)} products that exist in ALL three original datasets.")

# --- 4. Filter Original Data Sources ONCE with the Gold Standard Set ---
print("\n--- Step 4: Filtering original data sources with the gold standard set ---")

# Filter Metadata
print("Filtering metadata...")
meta_df_list = []
for i in tqdm(range(10), desc="Filtering meta shards"):
    shard_filename = f"raw_meta_Electronics/full-{i:05d}-of-00010.parquet"
    local_path = hf_hub_download(repo_id, filename=shard_filename, repo_type="dataset", cache_dir=cache_directory)
    # Filter the data on disk before loading into pandas
    table = pq.read_table(local_path, filters=[('parent_asin', 'in', gold_standard_asins)])
    meta_df_list.append(table.to_pandas())
gold_meta_df = pd.concat(meta_df_list, ignore_index=True)
print(f"Final metadata count: {len(gold_meta_df)}")


# Filter Reviews
print("Filtering reviews...")
filtered_reviews_dataset = reviews_dataset.filter(
    lambda example: str(example["asin"]).strip() in gold_standard_asins,
    num_proc=4
)
gold_reviews_df = filtered_reviews_dataset.to_pandas()
print(f"Final reviews count: {len(gold_reviews_df)}")


# Filter Q&A
print("Filtering Q&A...")
def load_and_filter_qa(path, asins_to_keep):
    with open(path, 'r') as f:
        records = [ast.literal_eval(line) for line in f if str(ast.literal_eval(line).get('asin')).strip() in asins_to_keep]
    return pd.DataFrame(records)
qa_df1 = load_and_filter_qa(qa_file_1, gold_standard_asins)
qa_df2 = load_and_filter_qa(qa_file_2, gold_standard_asins)
gold_qa_df = pd.concat([qa_df1, qa_df2], ignore_index=True)
print(f"Final Q&A count: {len(gold_qa_df)}")



--- Step 1: Initializing Setup ---
Final data will be saved in: ../data/gold_standard_data

--- Step 2: Extracting ASINs from all original sources ---
Found 39371 unique ASINs in all Q&A files.
Loading review ASINs...


Loading dataset shards:   0%|          | 0/34 [00:00<?, ?it/s]

Found 1946161 unique ASINs in reviews dataset.
Loading metadata ASINs...


Scanning meta shards for ASINs:   0%|          | 0/10 [00:00<?, ?it/s]

Found 1610012 unique ASINs in metadata.

--- Step 3: Calculating the strict intersection of all sources ---
Found 22974 products that exist in ALL three original datasets.

--- Step 4: Filtering original data sources with the gold standard set ---
Filtering metadata...


Filtering meta shards:   0%|          | 0/10 [00:00<?, ?it/s]

Final metadata count: 22974
Filtering reviews...


Filter (num_proc=4):   0%|          | 0/43886944 [00:00<?, ? examples/s]

Final reviews count: 3473796
Filtering Q&A...
Final Q&A count: 203738

--- Step 5: Saving the final 'gold standard' files ---


UnicodeEncodeError: 'utf-8' codec can't encode character '\ude03' in position 71: surrogates not allowed

In [9]:
# This cell assumes the previous steps have created:
# - gold_meta_df (DataFrame)
# - gold_reviews_df (DataFrame)
# - gold_qa_df (DataFrame)
# - output_dir (string path)

print("\n--- Step 5: Cleaning and Saving the final 'gold standard' files ---")

# --- Clean the Q&A DataFrame for Unicode errors ---
print("Cleaning text data in the Q&A DataFrame...")

def clean_surrogates(text):
    """
    Cleans invalid surrogate characters from a string to ensure UTF-8 compatibility.
    """
    if not isinstance(text, str):
        return text
    # The 'surrogatepass' error handler correctly handles these broken characters
    return text.encode('utf-8', 'surrogatepass').decode('utf-8')

# Apply the cleaning function to all object (likely text) columns in the Q&A data
for col in gold_qa_df.select_dtypes(include=['object']).columns:
    gold_qa_df[col] = gold_qa_df[col].astype(str).apply(clean_surrogates)

print("Q&A data has been cleaned.")


# --- Save the final, consistent DataFrames ---
try:
    print("Saving the final, clean files...")
    # Save the metadata (which was already clean)
    gold_meta_df.to_parquet(os.path.join(output_dir, "gold_metadata.parquet"), index=False)

    # Save the reviews (which was also clean)
    gold_reviews_df.to_parquet(os.path.join(output_dir, "gold_reviews.parquet"), index=False)

    # Save the now-clean Q&A data
    gold_qa_df.to_parquet(os.path.join(output_dir, "gold_qa.parquet"), index=False)

    print("\n✅ Process complete. You now have three perfectly aligned and clean Parquet files.")

except Exception as e:
    print(f"\nAn error occurred during saving: {e}")


--- Step 5: Cleaning and Saving the final 'gold standard' files ---
Cleaning text data in the Q&A DataFrame...
Q&A data has been cleaned.
Saving the final, clean files...

✅ Process complete. You now have three perfectly aligned and clean Parquet files.


In [12]:
import pandas as pd
import os

print("--- Loading Gold Standard Files ---")

# Define the path to your final, clean data
data_dir = "../data/gold_standard_data"

# Load each of the final, consistent Parquet files
meta_df = pd.read_parquet(os.path.join(data_dir, "gold_metadata.parquet"))
reviews_df = pd.read_parquet(os.path.join(data_dir, "gold_reviews.parquet"))
qa_df = pd.read_parquet(os.path.join(data_dir, "gold_qa.parquet"))

print(f"Loaded {len(meta_df)} metadata records.")
print(f"Loaded {len(reviews_df)} review records.")
print(f"Loaded {len(qa_df)} Q&A records.")

--- Loading Gold Standard Files ---
Loaded 22974 metadata records.
Loaded 3473796 review records.
Loaded 203738 Q&A records.


In [13]:
print("\n--- Aggregating and Merging Data ---")

# 1. Aggregate reviews: Group by 'asin' and create a list of all review data for each product.
print("Aggregating reviews...")
# Using .apply(list) after grouping is a standard way to create these lists of records.
reviews_agg = (reviews_df.groupby('asin')
               .apply(lambda x: x.to_dict(orient='records'))
               .reset_index(name='reviews'))
print(f"Aggregated reviews for {len(reviews_agg)} unique products.")


# 2. Aggregate Q&A data in the same way.
print("Aggregating Q&A data...")
qa_agg = (qa_df.groupby('asin')
          .apply(lambda x: x.to_dict(orient='records'))
          .reset_index(name='q_and_a'))
print(f"Aggregated Q&A for {len(qa_agg)} unique products.")


# 3. Merge into the final master DataFrame
print("Merging all data sources...")
# Start with the metadata as our base
master_df = meta_df.copy()
# Rename the metadata's 'parent_asin' to 'asin' for a clean merge
master_df.rename(columns={'parent_asin': 'asin'}, inplace=True)

# Merge the aggregated reviews
master_df = pd.merge(master_df, reviews_agg, on='asin', how='left')

# Merge the aggregated Q&A data
master_df = pd.merge(master_df, qa_agg, on='asin', how='left')


print("\n--- Merge Complete ---")
print(f"Final master dataset has {len(master_df)} rows (products).")
print("Columns:", master_df.columns.tolist())
print("\nSample of the final master dataset:")
display(master_df.head())


--- Aggregating and Merging Data ---
Aggregating reviews...


  .apply(lambda x: x.to_dict(orient='records'))


Aggregated reviews for 22974 unique products.
Aggregating Q&A data...
Aggregated Q&A for 22974 unique products.
Merging all data sources...

--- Merge Complete ---
Final master dataset has 22974 rows (products).
Columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'asin', 'bought_together', 'subtitle', 'author', 'reviews', 'q_and_a']

Sample of the final master dataset:


  .apply(lambda x: x.to_dict(orient='records'))


Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,asin,bought_together,subtitle,author,reviews,q_and_a
0,Portable Audio & Accessories,Dock Audio Extender Adapter Converter Cable fo...,3.6,45,[],[],,"{'hi_res': [None, None], 'large': ['https://m....","{'title': [], 'url': [], 'user_id': []}",iteck,"[Electronics, Computers & Accessories, Tablet ...","{""Brand Name"": ""iTeck"", ""Item Weight"": ""0.8 ou...",B00CFGZAT8,,,,"[{'rating': 1.0, 'title': 'One Star', 'text': ...","[{'questionType': 'yes/no', 'asin': 'B00CFGZAT..."
1,Camera & Photo,GGS Swivi HD DSLR LCD Universal Foldable Viewf...,3.7,52,[],[],149.0,"{'hi_res': [None, None, None, None, None, None...","{'title': [], 'url': [], 'user_id': []}",Swivi,"[Electronics, Camera & Photo, Accessories, Vie...","{""Product Dimensions"": ""3.8 x 4.3 x 6.9 inches...",B00BGFTSGU,,,,"[{'rating': 4.0, 'title': 'Great except for on...","[{'questionType': 'yes/no', 'asin': 'B00BGFTSG..."
2,Camera & Photo,BM Premium 2-Pack of NP-85 Batteries and Charg...,4.7,345,[NP85 Li-ion Battery for FujiFilm FinePix SL24...,[Bring your digital camera back to life with a...,22.99,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Wasabi Power Fujifilm NP-95 2 Pack...,BM Premium,"[Electronics, Camera & Photo, Accessories, Bat...","{""Package Dimensions"": ""6.46 x 3.94 x 1.85 inc...",B00AZOIG9I,,,,"[{'rating': 5.0, 'title': 'Five Stars', 'text'...","[{'questionType': 'open-ended', 'asin': 'B00AZ..."
3,Computers,ZAGG InvisibleShield HD – EZ Apply Film Screen...,4.1,158,[100% Clear: Independent light transmission te...,"[Exceptionally clear, unbelievably thin, and v...",,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['ZAGG invisibleSHIELD Screen Protec...,InvisibleShield,"[Electronics, Computers & Accessories, Tablet ...","{""Standing screen display size"": ""9.7 Inches"",...",B00F361KLO,,,,"[{'rating': 5.0, 'title': 'Five Stars', 'text'...","[{'questionType': 'yes/no', 'asin': 'B00F361KL..."
4,Computers,Mobile Edge ECO Laptop Messenger (Eco-Friendly...,3.5,12,[Fits 17.3-Inch laptops. Laptop compartment di...,[The 17.3-Inch ECO Messenger is the latest add...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Mobile Edge,"[Electronics, Computers & Accessories, Laptop ...","{""Product Dimensions"": ""4.25 x 18 x 13.5 inche...",B0029L7N76,,,,"[{'rating': 2.0, 'title': 'Too big!', 'text': ...","[{'questionType': 'yes/no', 'asin': 'B0029L7N7..."


In [14]:
# Define the final output path for the master dataset
master_output_path = os.path.join(data_dir, "master_chatbot_dataset.parquet")

print(f"\n--- Saving Final Master Dataset ---")
print(f"Saving to: {master_output_path}")

try:
    # Save the final DataFrame to a single Parquet file
    master_df.to_parquet(master_output_path, index=False)
    print("\n✅ Project Complete! ✅")
    print("Your master dataset is now saved and ready for building your chatbot.")
except Exception as e:
    print(f"\nAn error occurred during the final save: {e}")


--- Saving Final Master Dataset ---
Saving to: ../data/gold_standard_data/master_chatbot_dataset.parquet

✅ Project Complete! ✅
Your master dataset is now saved and ready for building your chatbot.
