## Batch processing of Ethereum data from AWS S3

In [0]:
# Databricks notebook source
# === CONSTANTS ===
# Fixed source location in the public bucket; intentionally not exposed as widgets.
AWS_BUCKET = "aws-public-blockchain"
S3_PREFIX = "v1.0/eth/blocks/"

# === WIDGETS ===
# Only the values that vary between runs are exposed: (name, default, label).
for _name, _default, _label in (
    ("catalog_name", "blockchain", "Catalog Name"),
    ("schema_name", "ethereum", "Schema Name"),
    ("num_files", "20", "Number of Files to Download"),
):
    dbutils.widgets.text(_name, _default, _label)

# === CONFIGURATION ===
# Widget values are strings; num_files is converted to int.
CATALOG, SCHEMA = (
    dbutils.widgets.get(widget) for widget in ("catalog_name", "schema_name")
)
NUM_FILES = int(dbutils.widgets.get("num_files"))

# Unity Catalog volume paths: downloaded data, stream checkpoints, inferred schemas
SCHEMA_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum_schemas"
CHECKPOINT_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum_checkpoints"
DATA_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum"

# Echo the effective settings for this run
print(f"🔧 Using Catalog: {CATALOG}, Schema: {SCHEMA}")
print(f"📦 Downloading {NUM_FILES} files from s3://{AWS_BUCKET}/{S3_PREFIX}")

# === UNITY CATALOG SETUP ===
# Statements are ordered parent -> child: catalog, then schema, then the three
# volumes whose /Volumes/... paths are defined in the configuration above.
stmts = [
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.ethereum",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.ethereum_checkpoints",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.ethereum_schemas",
]

# Statements that raised, kept so the summary below reflects what actually happened.
failed_stmts = []

for i, s in enumerate(stmts, 1):
    print(f"[{i}/{len(stmts)}] {s}")
    try:
        spark.sql(s)
        print("  ✅ Success")
    except Exception as e:
        # Best-effort: report and continue so one failure does not hide the
        # outcome of the remaining statements.
        print(f"  ❌ Error: {e}")
        failed_stmts.append(s)

# Only claim success when every statement succeeded.
if failed_stmts:
    print(
        f"\n⚠️ {len(failed_stmts)} of {len(stmts)} statements failed; "
        "the paths below may not exist:"
    )
else:
    print("\nCreated/verified UC objects. Paths available:")
print(f"  Data: {DATA_VOLUME}")
print(f"  Checkpoints: {CHECKPOINT_VOLUME}")
print(f"  Schemas: {SCHEMA_VOLUME}")

# === DATA DOWNLOAD ===
# Copies up to NUM_FILES parquet objects from s3://AWS_BUCKET/S3_PREFIX into
# DATA_VOLUME, keeping the key layout below the prefix's parent directory.
import os
from itertools import islice

import boto3
from botocore import UNSIGNED
from botocore.client import Config

# Fail early: a non-positive count would otherwise list nothing and surface as
# a misleading "No parquet files found" error.
if NUM_FILES < 1:
    raise ValueError(f"num_files must be a positive integer, got {NUM_FILES}")

print(f"\n📥 Downloading to: {DATA_VOLUME}")
os.makedirs(DATA_VOLUME, exist_ok=True)

# Anonymous (unsigned) S3 client: requests are sent without AWS credentials
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

# List lazily, page by page; islice stops requesting further pages as soon as
# NUM_FILES parquet keys have been collected.
pages = s3.get_paginator("list_objects_v2").paginate(
    Bucket=AWS_BUCKET,
    Prefix=S3_PREFIX,
    PaginationConfig={"PageSize": min(1000, NUM_FILES)},
)
parquet_keys = (
    obj["Key"]
    for page in pages
    for obj in (page.get("Contents") or [])
    if obj["Key"].endswith(".parquet")
)
keys = list(islice(parquet_keys, NUM_FILES))

if not keys:
    raise RuntimeError(f"No parquet files found under s3://{AWS_BUCKET}/{S3_PREFIX}")

# Strip the parent of S3_PREFIX ("v1.0/eth/" for "v1.0/eth/blocks/") so files
# land under DATA_VOLUME/blocks/...; derived from S3_PREFIX instead of repeating
# the literal, and removed only from the front of the key.
_prefix_parent = S3_PREFIX.rstrip("/").rpartition("/")[0]
key_root = f"{_prefix_parent}/" if _prefix_parent else ""

# Download files with progress tracking
for i, key in enumerate(keys, 1):
    rel_path = key[len(key_root):]  # every listed key starts with S3_PREFIX
    dest_path = os.path.join(DATA_VOLUME, rel_path)
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)

    print(f"[{i}/{len(keys)}] {os.path.basename(key)} ...", end=" ", flush=True)
    s3.download_file(AWS_BUCKET, key, dest_path)
    print("✓")

print("✅ Download complete!")

🔧 Using Catalog: blockchain, Schema: ethereum
📦 Downloading 20 files from s3://aws-public-blockchain/v1.0/eth/blocks/
[1/5] CREATE CATALOG IF NOT EXISTS blockchain
  ✅ Success
[2/5] CREATE SCHEMA IF NOT EXISTS blockchain.ethereum
  ✅ Success
[3/5] CREATE VOLUME IF NOT EXISTS blockchain.ethereum.ethereum
  ✅ Success
[4/5] CREATE VOLUME IF NOT EXISTS blockchain.ethereum.ethereum_checkpoints
  ✅ Success
[5/5] CREATE VOLUME IF NOT EXISTS blockchain.ethereum.ethereum_schemas
  ✅ Success

Created/verified UC objects. Paths available:
  Data: /Volumes/blockchain/ethereum/ethereum
  Checkpoints: /Volumes/blockchain/ethereum/ethereum_checkpoints
  Schemas: /Volumes/blockchain/ethereum/ethereum_schemas

📥 Downloading to: /Volumes/blockchain/ethereum/ethereum
[1/20] part-00000-32767f69-9150-49ac-9c03-45f34b103c34-c000.snappy.parquet ... ✓
[2/20] part-00000-62c9c86c-8a10-4196-b54c-01a2a139f4ec-c000.snappy.parquet ... ✓
[3/20] part-00000-5438c668-b9c9-4b0a-8a35-64ff30b73cdf-c000.snappy.parquet ... 

In [0]:
# Rebuild the run configuration from the notebook widgets.
CATALOG, SCHEMA = (
    dbutils.widgets.get(widget) for widget in ("catalog_name", "schema_name")
)
NUM_FILES = int(dbutils.widgets.get("num_files"))

# Unity Catalog volume locations: inferred schemas, stream checkpoints, downloaded data
SCHEMA_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum_schemas"
CHECKPOINT_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum_checkpoints"
DATA_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum"

# Last expression: display the data path as the cell result
DATA_VOLUME

'/Volumes/blockchain/ethereum/ethereum'

In [0]:
# Incrementally read the downloaded parquet files with the "cloudFiles" source.
# The checkpoint is a sink-side setting and is configured on the writer below.
reader = (
    spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "parquet")
        .option("cloudFiles.schemaLocation", SCHEMA_VOLUME)  # where the inferred schema is tracked
        .option("cloudFiles.schemaEvolutionMode", "addNewColumns")
        .option("cloudFiles.schemaHints", "number BIGINT, baseFeePerGas BIGINT")  # pin these two column types
        .load(f"dbfs:{DATA_VOLUME}/blocks/")
)

# Keep every source column and add the `_metadata` column alongside them
blocks_df = reader.select("*", "_metadata")

# Write to Delta using Structured Streaming
blocks_query = (
    blocks_df.writeStream
        .format("delta")
        .outputMode("append")  # non-aggregated stream: rows are only ever added
        .option("checkpointLocation", f"{CHECKPOINT_VOLUME}/blocks/")  # progress tracking for this sink
        .trigger(availableNow=True)  # process everything currently available, then stop
        .table(f"{CATALOG}.{SCHEMA}.blocks")  # starts the query; target is this table
)

# Block until the availableNow run has finished so later cells (e.g. the row
# count) see the complete table, and so a failed stream raises here.
blocks_query.awaitTermination()


In [0]:
%sql
-- Total blocks ingested: row count of the `blocks` table written by the streaming cell.
-- NOTE(review): the table name is hard-coded to the default widget values
-- (catalog_name=blockchain, schema_name=ethereum); confirm it matches the
-- catalog/schema widgets used for this run.
SELECT COUNT(*) 
FROM blockchain.ethereum.blocks;



COUNT(*)
108555
