# Download Amazon Reviews 2023 - Books Dataset

**Dataset:** [Amazon Reviews 2023](https://amazon-reviews-2023.github.io/)

In [5]:
# Data handling, serialization, filesystem and download utilities
# (pandas, numpy and tqdm are third-party; the rest are standard library)
import pandas as pd
import numpy as np
import json
import gzip
import os
import urllib.request
from tqdm import tqdm
import warnings
# NOTE(review): with no category/module argument this suppresses every warning
# for the rest of the kernel session, including deprecation notices.
warnings.filterwarnings('ignore')

# PyTorch; availability of the MPS (Metal) backend is checked in the next cell
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Simple confirmation that the import cell ran to completion
print("Libraries imported!")

Libraries imported!


In [6]:
# Probe the Metal Performance Shaders (MPS) backend and pick the compute device.
print("🔍 Checking M4 GPU support...")
print(f"PyTorch version: {torch.__version__}")
mps_available = torch.backends.mps.is_available()
print(f"MPS available: {mps_available}")

if not mps_available:
    # No Metal backend: everything downstream runs on the CPU.
    device = torch.device("cpu")
    print(f"MPS not available, using CPU: {device}")
else:
    device = torch.device("mps")
    print(f"Using M4 GPU: {device}")

    # Smoke test: multiply two random 1000x1000 matrices on the selected device.
    x = torch.randn(1000, 1000, device=device)
    y = torch.randn(1000, 1000, device=device)
    z = x.mm(y)
    print(f"GPU test successful! Result shape: {z.shape}")


🔍 Checking M4 GPU support...
PyTorch version: 2.9.0
MPS available: True
Using M4 GPU: mps
GPU test successful! Result shape: torch.Size([1000, 1000])


In [7]:
# Make sure every data folder used by the notebook exists.
# exist_ok=True keeps the cell safe to re-run.
for directory in ('data/raw', 'data/processed', 'data/embeddings'):
    os.makedirs(directory, exist_ok=True)

print("Data directories created")

Data directories created


In [9]:
# Category of the Amazon Reviews 2023 dataset to download.
# CATEGORY is interpolated into both the remote URLs and the local file names
# built further down in this cell.
CATEGORY = "Books"

print(f"Downloading {CATEGORY} category...")

def download_with_progress(url, filename):
    """Download ``url`` to ``filename`` while printing a one-line progress readout.

    The transfer is written to ``<filename>.part`` and only moved to
    ``filename`` once it has completed, so an interrupted or failed download
    never leaves a truncated file at the final path.

    Args:
        url: Address of the resource to fetch (anything ``urllib`` can open).
        filename: Destination path; its parent directory must already exist.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        urllib.error.ContentTooShortError: If fewer bytes arrive than the
            server announced.
    """
    def progress_hook(block_num, block_size, total_size):
        # urlretrieve counts whole blocks, so the last (short) block makes
        # block_num * block_size overshoot the real size; clamp when known.
        downloaded = block_num * block_size
        if total_size > 0:
            downloaded = min(downloaded, total_size)
            percent = (downloaded * 100) / total_size
            print(f"\rDownloading: {percent:.1f}% ({downloaded/1024/1024:.1f}MB/{total_size/1024/1024:.1f}MB)", end="")
        else:
            # No Content-Length was sent: report the running byte count only.
            print(f"\rDownloading: {downloaded/1024/1024:.1f}MB", end="")

    partial_path = f"{filename}.part"
    try:
        urllib.request.urlretrieve(url, partial_path, progress_hook)
        # Atomic rename: the final path appears only for a complete download.
        os.replace(partial_path, filename)
    finally:
        # After a successful rename nothing is left here; otherwise clean up
        # the incomplete data (also on KeyboardInterrupt).
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"\n Downloaded: {filename}")

# Remote source and local destination for the review records
reviews_url = f"https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/{CATEGORY}.jsonl.gz"
reviews_path = f"data/raw/{CATEGORY}_reviews.jsonl.gz"

# Remote source and local destination for the item metadata
meta_url = f"https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_{CATEGORY}.jsonl.gz"
meta_path = f"data/raw/{CATEGORY}_meta.jsonl.gz"

# Both archives are several GB, so a file that is already on disk is reused
# instead of being fetched again; this keeps the cell cheap to re-run.
# A non-empty file is trusted as complete: delete it to force a fresh
# download (for example after an interrupted run).
for label, source_url, target_path in (
    ("reviews", reviews_url, reviews_path),
    ("metadata", meta_url, meta_path),
):
    if os.path.isfile(target_path) and os.path.getsize(target_path) > 0:
        size_mb = os.path.getsize(target_path) / 1024 / 1024
        print(f" Skipping {label}: {target_path} already exists ({size_mb:.1f}MB)")
        continue
    print(f" Downloading {label} from: {source_url}")
    download_with_progress(source_url, target_path)

print("Files saved to: data/raw/")
print(f"   - Reviews: {reviews_path}")
print(f"   - Metadata: {meta_path}")

Downloading Books category...
 Downloading reviews from: https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/Books.jsonl.gz
Downloading: 100.0% (5928.8MB/5928.8MB)
 Downloaded: data/raw/Books_reviews.jsonl.gz
 Downloading metadata from: https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Books.jsonl.gz
Downloading: 100.0% (4713.2MB/4713.2MB)
 Downloaded: data/raw/Books_meta.jsonl.gz
Files saved to: data/raw/
   - Reviews: data/raw/Books_reviews.jsonl.gz
   - Metadata: data/raw/Books_meta.jsonl.gz
