# Download Amazon Reviews 2023 - Books Dataset

**Dataset:** [Amazon Reviews 2023](https://amazon-reviews-2023.github.io/)

**Method:** Direct download of the raw files (the Hugging Face `datasets` library no longer supports `trust_remote_code`)

**Purpose:** Download and load the Books dataset for Semantic ID generation


In [None]:
# Notebook-wide imports. Not every name is used in the cells visible here;
# presumably the rest are needed by later cells -- TODO confirm before pruning.

# Standard libraries
import pandas as pd
import numpy as np
import json
import gzip
import os
import urllib.request
from tqdm import tqdm
import warnings
# NOTE: installs a blanket "ignore" filter, so every warning category
# (including deprecation warnings) is hidden for the rest of the session.
warnings.filterwarnings('ignore')

# PyTorch with MPS support
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

print("✅ Libraries imported!")


In [None]:
# Select the compute device: the MPS (Metal Performance Shaders) backend
# when PyTorch reports it as available, otherwise fall back to the CPU.
print("🔍 Checking M4 GPU support...")
print(f"PyTorch version: {torch.__version__}")
mps_ready = torch.backends.mps.is_available()
print(f"MPS available: {mps_ready}")

if not mps_ready:
    device = torch.device("cpu")
    print(f"⚠️  MPS not available, using CPU: {device}")
else:
    device = torch.device("mps")
    print(f"✅ Using M4 GPU: {device}")

    # Smoke test: multiply two random 1000x1000 matrices on the device.
    x = torch.randn(1000, 1000, device=device)
    y = torch.randn(1000, 1000, device=device)
    z = x.mm(y)
    print(f"✅ GPU test successful! Result shape: {z.shape}")


In [None]:
# Ensure the data/ subfolders used by this notebook exist. exist_ok=True
# makes the cell safe to re-run when the folders are already there.
for _stage in ("raw", "processed", "embeddings"):
    os.makedirs(f"data/{_stage}", exist_ok=True)

print("📁 Data directories created")


In [None]:
# Download Books category (reviews + item metadata) into data/raw.
CATEGORY = "Books"
_RAW_BASE_URL = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw"


def _download(url, dest_path, label):
    """Fetch *url* into *dest_path* unless a non-empty file is already there.

    The payload is first written to ``dest_path + ".part"`` and only renamed
    to *dest_path* once ``urlretrieve`` has returned successfully, so an
    interrupted or failed transfer never leaves a truncated file under the
    final name. Delete *dest_path* manually to force a fresh download.

    Args:
        url: Remote file to fetch.
        dest_path: Local path the finished file should end up at.
        label: Lower-case noun used in the progress messages
            (e.g. ``"reviews"``).

    Returns:
        True if the file was downloaded, False if it was already present.

    Raises:
        urllib.error.URLError: The request failed.
        urllib.error.ContentTooShortError: Fewer bytes arrived than the
            server announced.
        OSError: The file could not be written or renamed.
    """
    if os.path.isfile(dest_path) and os.path.getsize(dest_path) > 0:
        print(f"⏭️  {label.capitalize()} already present, skipping download: {dest_path}")
        return False

    # Allow this cell to run even if the directory-setup cell was skipped.
    parent_dir = os.path.dirname(dest_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    part_path = dest_path + ".part"
    print(f"Downloading {label} from: {url}")
    try:
        urllib.request.urlretrieve(url, part_path)
        # Atomic within one filesystem: dest_path appears only when complete.
        os.replace(part_path, dest_path)
    finally:
        # After a successful rename the .part file is gone and this is a
        # no-op; on failure or interrupt it removes the partial download.
        if os.path.exists(part_path):
            os.remove(part_path)
    print(f"✅ {label.capitalize()} downloaded to: {dest_path}")
    return True


print(f"📥 Downloading {CATEGORY} category...")
print("This may take a few minutes due to file size...")

# Download reviews
reviews_url = f"{_RAW_BASE_URL}/review_categories/{CATEGORY}.jsonl.gz"
reviews_path = f"data/raw/{CATEGORY}_reviews.jsonl.gz"
_download(reviews_url, reviews_path, "reviews")

# Download metadata
meta_url = f"{_RAW_BASE_URL}/meta_categories/meta_{CATEGORY}.jsonl.gz"
meta_path = f"data/raw/{CATEGORY}_meta.jsonl.gz"
_download(meta_url, meta_path, "metadata")
