In [7]:
#!/usr/bin/env python3
import os, json, time
import requests
from google.cloud import storage

# Destination bucket and object-name prefix; either can be overridden through
# the environment (GCS_BUCKET / GCS_PREFIX).
BUCKET = os.getenv("GCS_BUCKET", "qst843-project")
PREFIX = os.getenv("GCS_PREFIX", "amazon_reviews_2023")

# URL templates for the per-category review and metadata archives. The `{cat}`
# placeholder is filled with a category name via str.format().
_RAW_ROOT = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw"
BASE_REVIEW = _RAW_ROOT + "/review_categories/{cat}.jsonl.gz"
BASE_META = _RAW_ROOT + "/meta_categories/meta_{cat}.jsonl.gz"

# Category names to fetch; each one is substituted into both templates above.
CATEGORIES = [
    "All_Beauty",
    "Arts_Crafts_and_Sewing",
    "Automotive",
    "Baby_Products",
    "Books",
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Electronics",
    "Grocery_and_Gourmet_Food",
    "Health_and_Personal_Care",
    "Home_and_Kitchen",
    "Musical_Instruments",
    "Office_Products",
    "Patio_Lawn_and_Garden",
    "Pet_Supplies",
    "Sports_and_Outdoors",
    "Tools_and_Home_Improvement",
    "Toys_and_Games",
]

def stream_url_to_gcs(url: str, bucket: storage.Bucket, dest_blob: str):
    """Stream a download from ``url`` directly into a GCS blob (no local file).

    The HTTP body is read in 1 MiB pieces and written through the blob's
    file-like writer, which uploads in 8 MiB chunks. If anything goes wrong
    (HTTP error, dropped connection, interrupt, short body) the partially
    written object is removed before the error is re-raised, so that a later
    "skip if it already exists" check cannot mistake a truncated upload for a
    finished one.

    Args:
        url: HTTP(S) location to download.
        bucket: Bucket that receives the object.
        dest_blob: Object name to write inside ``bucket``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        IOError: The number of bytes received differs from the response's
            ``Content-Length`` header.
    """
    print(f"→ {dest_blob}  ({url})")
    blob = bucket.blob(dest_blob)
    blob.chunk_size = 8 * 1024 * 1024  # 8MB resumable chunks
    received = 0
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            declared = r.headers.get("Content-Length")
            # iter_content() undoes any Content-Encoding, so the byte count is
            # only comparable to Content-Length when no such encoding is set.
            comparable = (
                declared is not None
                and declared.isdigit()
                and "Content-Encoding" not in r.headers
            )
            # open a resumable writer to GCS and stream in chunks
            with blob.open("wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
                        received += len(chunk)
        if comparable and received != int(declared):
            raise IOError(
                f"incomplete download of {url}: "
                f"got {received} bytes, expected {declared}"
            )
    except BaseException:
        # The writer is closed when its `with` block exits, even on error,
        # which can commit whatever was written so far. Remove that object
        # (best effort) so a re-run downloads it again, then re-raise the
        # original error unchanged.
        try:
            if blob.exists():
                blob.delete()
        except Exception as cleanup_err:
            print(f"! could not remove partial {dest_blob}: {cleanup_err}")
        raise

def compose_into(bucket: storage.Bucket, dest_blob: str, part_paths):
    """Compose existing blobs, in order, into a single destination blob.

    A single GCS compose request accepts at most 32 source objects. When more
    parts are given, the first 32 are composed into the destination and the
    remaining ones are appended in follow-up requests that use the destination
    itself as the first source. For 32 parts or fewer this is exactly one
    compose call.

    Args:
        bucket: Bucket holding both the parts and the destination.
        dest_blob: Object name of the composed result.
        part_paths: Iterable of object names to concatenate, in order.
    """
    paths = list(part_paths)
    print(f"Composing {len(paths)} parts → {dest_blob}")
    max_sources = 32  # per-request limit of the GCS compose operation
    target = bucket.blob(dest_blob)
    target.compose([bucket.blob(p) for p in paths[:max_sources]])
    # Each follow-up request spends one source slot on the destination itself,
    # leaving room for max_sources - 1 new parts.
    step = max_sources - 1
    for start in range(max_sources, len(paths), step):
        batch = [bucket.blob(p) for p in paths[start:start + step]]
        target.compose([target] + batch)

def main():
    """Stage every category archive in GCS, then compose the combined files.

    For each entry in CATEGORIES the review and metadata archives are streamed
    to ``{PREFIX}/tmp/...`` unless an object with that name already exists.
    The staged objects are then composed into ``all_reviews.ndjson.gz`` and
    ``all_meta.ndjson.gz`` under PREFIX. When the DELETE_PARTS environment
    variable equals "1", the staged objects are deleted afterwards.
    """
    gcs = storage.Client()
    bucket = gcs.bucket(BUCKET)

    # 1) Stage each category under tmp/…, reviews first, then metadata.
    sources = (("reviews", BASE_REVIEW), ("meta", BASE_META))
    staged = {kind: [] for kind, _ in sources}
    for cat in CATEGORIES:
        for kind, template in sources:
            path = f"{PREFIX}/tmp/{kind}/{cat}.jsonl.gz"
            # An object that is already present is left alone so an
            # interrupted run can be resumed.
            if bucket.blob(path).exists(gcs):
                print(f"✓ exists: {path}")
            else:
                stream_url_to_gcs(template.format(cat=cat), bucket, path)
            staged[kind].append(path)

    # 2) Compose the staged parts into the two combined objects.
    all_reviews = f"{PREFIX}/all_reviews.ndjson.gz"
    all_meta = f"{PREFIX}/all_meta.ndjson.gz"
    compose_into(bucket, all_reviews, staged["reviews"])
    compose_into(bucket, all_meta, staged["meta"])

    print("\nDone.")
    print(f"Reviews → gs://{BUCKET}/{all_reviews}")
    print(f"Meta    → gs://{BUCKET}/{all_meta}")

    # 3) Optional cleanup of the staged parts.
    if os.getenv("DELETE_PARTS", "0") != "1":
        return
    print("Deleting tmp parts…")
    for path in [*staged["reviews"], *staged["meta"]]:
        bucket.blob(path).delete()
    print("Tmp parts deleted.")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


→ amazon_reviews_2023/tmp/reviews/All_Beauty.jsonl.gz  (https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/All_Beauty.jsonl.gz)
→ amazon_reviews_2023/tmp/meta/All_Beauty.jsonl.gz  (https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_All_Beauty.jsonl.gz)
→ amazon_reviews_2023/tmp/reviews/Arts_Crafts_and_Sewing.jsonl.gz  (https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/Arts_Crafts_and_Sewing.jsonl.gz)
→ amazon_reviews_2023/tmp/meta/Arts_Crafts_and_Sewing.jsonl.gz  (https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Arts_Crafts_and_Sewing.jsonl.gz)
→ amazon_reviews_2023/tmp/reviews/Automotive.jsonl.gz  (https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/Automotive.jsonl.gz)
→ amazon_reviews_2023/tmp/meta/Automotive.jsonl.gz  (https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Automotive.jsonl.gz)

NameError: name 'spark' is not defined