In [0]:
pip install web3

Collecting web3
  Downloading web3-7.13.0-py3-none-any.whl.metadata (5.6 kB)
Collecting eth-abi>=5.0.1 (from web3)
  Downloading eth_abi-5.2.0-py3-none-any.whl.metadata (3.8 kB)
Collecting eth-account>=0.13.6 (from web3)
  Downloading eth_account-0.13.7-py3-none-any.whl.metadata (3.7 kB)
Collecting eth-hash>=0.5.1 (from eth-hash[pycryptodome]>=0.5.1->web3)
  Downloading eth_hash-0.7.1-py3-none-any.whl.metadata (4.2 kB)
Collecting eth-typing>=5.0.0 (from web3)
  Downloading eth_typing-5.2.1-py3-none-any.whl.metadata (3.2 kB)
Collecting eth-utils>=5.0.0 (from web3)
  Downloading eth_utils-5.3.1-py3-none-any.whl.metadata (5.7 kB)
Collecting hexbytes>=1.2.0 (from web3)
  Downloading hexbytes-1.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting aiohttp>=3.7.4.post0 (from web3)
  Downloading aiohttp-3.12.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (7.7 kB)
Collecting types-requests>=2.0.0 (from web3)
  Downloading types_requests-2.32.4.20250809-py3-none-any.whl.me

In [0]:
import os
import json
import time
import logging
from typing import Any, Dict, Union
from web3 import Web3
from web3.datastructures import AttributeDict
from hexbytes import HexBytes
from requests.exceptions import HTTPError, ConnectionError, Timeout
from web3.exceptions import Web3Exception

# Root logging setup: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>". The module-level `logger` is what the
# rest of this cell logs through.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

class RateLimiter:
    """Blocking spacer that keeps consecutive calls at least ``min_interval`` apart.

    Call :meth:`wait_if_needed` immediately before each rate-limited operation.
    No locking is performed, so an instance should only be shared between
    threads with external synchronization.

    Attributes:
        max_calls_per_second: The configured call rate.
        min_interval: Minimum spacing between calls, ``1 / max_calls_per_second`` seconds.
        last_call_time: ``time.monotonic()`` reading taken at the end of the most
            recent ``wait_if_needed`` call (``-inf`` before the first call).
    """

    def __init__(self, max_calls_per_second: float = 5.0):
        """Create a limiter.

        Args:
            max_calls_per_second: Maximum call rate; must be strictly positive.

        Raises:
            ValueError: If ``max_calls_per_second`` is zero or negative.
        """
        # Reject non-positive rates explicitly instead of failing with a
        # ZeroDivisionError (0) or silently producing a negative interval (< 0).
        if max_calls_per_second <= 0:
            raise ValueError(
                f"max_calls_per_second must be positive, got {max_calls_per_second!r}"
            )
        self.max_calls_per_second = max_calls_per_second
        self.min_interval = 1.0 / max_calls_per_second
        # -inf guarantees the very first call never waits, whatever the
        # monotonic clock's (arbitrary) origin is.
        self.last_call_time = float("-inf")

    def wait_if_needed(self):
        """Sleep just long enough to honour ``min_interval``, then record the call time."""
        # A monotonic clock is used so that wall-clock adjustments (NTP, manual
        # changes) can neither skip the wait nor inflate it.
        remaining = self.min_interval - (time.monotonic() - self.last_call_time)
        if remaining > 0:
            time.sleep(remaining)

        self.last_call_time = time.monotonic()

def check_api_response_status(w3: Web3) -> bool:
    """Probe the RPC endpoint by asking for the current block number.

    Args:
        w3: Web3 instance to probe.

    Returns:
        True when the reported block number is greater than zero; False when
        it is not, or when the request (or the check itself) raises. Every
        outcome is logged; no exception escapes this function.
    """
    try:
        # Cheap request that exercises the whole request/response path.
        current_block = w3.eth.block_number

        if not current_block > 0:
            logger.error(f"❌ API endpoint returned invalid block number: {current_block}")
            return False

        logger.info(f"✅ API endpoint responding correctly (current block: {current_block})")
        return True

    except Web3Exception as web3_error:
        logger.error(f"❌ Web3 error connecting to API endpoint: {web3_error}")
        return False
    except Exception as error:
        logger.error(f"❌ Failed to connect to API endpoint: {error}")
        return False

def to_serializable(obj: Any) -> Any:
    """Recursively turn a Web3 result into plain, JSON-friendly Python values.

    Conversion rules, checked in this order:
      * ``AttributeDict`` / ``dict`` -> ``dict`` with converted values
      * ``list`` / ``tuple``         -> ``list`` with converted items
      * ``HexBytes`` / ``bytes``     -> the result of ``.hex()``
      * any object with ``__dict__`` -> its ``__dict__``, converted
      * anything else                -> returned unchanged
    """
    if isinstance(obj, (AttributeDict, dict)):
        converted = {}
        for name, entry in obj.items():
            converted[name] = to_serializable(entry)
        return converted

    if isinstance(obj, (list, tuple)):
        return list(map(to_serializable, obj))

    # HexBytes and plain bytes were handled by identical branches; one check covers both.
    if isinstance(obj, (HexBytes, bytes)):
        return obj.hex()

    if hasattr(obj, '__dict__'):
        return to_serializable(obj.__dict__)

    return obj

def fetch_block_with_retry(w3: Web3, block_number: int, rate_limiter: RateLimiter,
                          max_retries: int = 3, base_delay: float = 1.0) -> Dict[str, Any]:
    """Fetch one block (with full transactions) and return it in JSON-serializable form.

    Every attempt first goes through ``rate_limiter``. Transport-level failures
    (``HTTPError``, ``ConnectionError``, ``Timeout``) are retried with
    exponential backoff: ``base_delay * 2**attempt`` seconds after attempt
    ``attempt`` (0-based). Any other exception is logged and re-raised at once.

    Args:
        w3: Web3 instance used for the request.
        block_number: Number of the block to fetch.
        rate_limiter: Limiter consulted before every attempt.
        max_retries: Total number of attempts; must be at least 1.
        base_delay: Backoff delay in seconds after the first failed attempt.

    Returns:
        The block converted by ``to_serializable``.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        HTTPError, ConnectionError, Timeout: The last transport error, once all
            attempts are exhausted.
        Exception: Any non-transport error, re-raised after logging.
    """
    # With zero attempts the loop below would never run and the function would
    # fall through returning None, contradicting the declared return type.
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(max_retries):
        try:
            rate_limiter.wait_if_needed()
            block = w3.eth.get_block(block_number, full_transactions=True)
            return to_serializable(block)

        except (HTTPError, ConnectionError, Timeout) as e:
            if attempt == max_retries - 1:
                logger.error(f"Failed to fetch block {block_number} after {max_retries} attempts: {e}")
                raise

            # Exponential backoff
            delay = base_delay * (2 ** attempt)
            logger.warning(f"Attempt {attempt + 1} failed for block {block_number}, retrying in {delay}s: {e}")
            time.sleep(delay)

        except Exception as e:
            # Not a transport problem: retrying is unlikely to help, so surface it.
            logger.error(f"Unexpected error fetching block {block_number}: {e}")
            raise

def main():
    """Poll Ethereum mainnet and persist every new block as one JSON file.

    The RPC endpoint URL (which embeds the provider API key) is read from the
    ``WEB3_PROVIDER_URI`` environment variable so that no credential is stored
    in the notebook. Starting from the chain head at launch, the loop polls
    for the latest block number, fetches each block not yet saved and writes
    it to ``block_<number>.json`` in the output directory.

    The ``last_block`` cursor only advances past a block once it has been
    saved. A failing block stops the current batch and is retried on the next
    polling cycle; after ``max_block_failures`` consecutive failed cycles it is
    skipped with an error log so the fetcher cannot stall forever.

    The loop runs until interrupted with ``KeyboardInterrupt``.
    """
    # Configuration
    provider_uri = os.environ.get("WEB3_PROVIDER_URI")
    if not provider_uri:
        logger.error(
            "WEB3_PROVIDER_URI is not set; provide the RPC endpoint URL through "
            "the environment instead of hard-coding the API key"
        )
        return

    # Rate limiting: Infura free tier allows ~100k requests/day (~1.15 requests/second)
    # Setting to 0.8 requests/second to be safe
    rate_limiter = RateLimiter(max_calls_per_second=0.8)

    # Polling cycles a single block may fail before it is skipped.
    max_block_failures = 3
    consecutive_failures = 0

    # Initialize Web3
    w3 = Web3(Web3.HTTPProvider(provider_uri))

    # Check connection
    if not w3.is_connected():
        logger.error("Failed to connect to Ethereum node")
        return

    logger.info("Connected to Ethereum mainnet")

    # Get starting block
    rate_limiter.wait_if_needed()
    last_block = w3.eth.block_number
    logger.info(f"Starting from block {last_block}")

    # Setup output directory
    output_dir = "/Volumes/ethereum-catalog/ethereum/ethereum-volume/raw/blocks/"
    os.makedirs(output_dir, exist_ok=True)

    # Main loop
    while True:
        try:
            # Get latest block number
            rate_limiter.wait_if_needed()
            latest_block = w3.eth.block_number

            if latest_block > last_block:
                logger.info(f"Processing blocks {last_block + 1} to {latest_block}")

            # The range is evaluated once, so moving `last_block` inside the
            # loop does not affect this iteration.
            for block_number in range(last_block + 1, latest_block + 1):
                try:
                    # Fetch block with rate limiting and retry logic
                    block_dict = fetch_block_with_retry(w3, block_number, rate_limiter)

                    # Serialize before opening the file so a serialization error
                    # cannot leave a truncated file behind. The document is kept
                    # on a single line so that line-delimited JSON readers (the
                    # Spark / Auto Loader default) can parse it.
                    payload = json.dumps(block_dict)

                    # Save to file
                    file_path = os.path.join(output_dir, f"block_{block_number}.json")
                    with open(file_path, "w") as f:
                        f.write(payload)

                    tx_count = len(block_dict.get('transactions', []))
                    logger.info(f"✅ Saved block {block_number} with {tx_count} transactions")

                    # Advance the cursor only past blocks that are safely on disk.
                    last_block = block_number
                    consecutive_failures = 0

                except Exception as e:
                    logger.error(f"❌ Failed to process block {block_number}: {e}")

                    # Check if we should do a health check
                    error_text = str(e).lower()
                    if "web3" in error_text or "connection" in error_text:
                        logger.info("Performing API health check due to connection error...")
                        check_api_response_status(w3)

                    consecutive_failures += 1
                    if consecutive_failures < max_block_failures:
                        # Leave the cursor on the last saved block so this block
                        # is retried on the next polling cycle instead of being
                        # silently dropped.
                        break

                    logger.error(
                        f"❌ Giving up on block {block_number} after "
                        f"{max_block_failures} failed polling cycles; skipping it"
                    )
                    last_block = block_number
                    consecutive_failures = 0

            # Wait before checking for new blocks
            logger.info("Waiting for new blocks...")
            time.sleep(15)  # Check every 15 seconds (Ethereum block time is ~12-13 seconds)

        except KeyboardInterrupt:
            logger.info("Stopping block fetcher...")
            break
        except Exception as e:
            logger.error(f"Unexpected error in main loop: {e}")
            logger.info("Waiting 30 seconds before retrying...")
            time.sleep(30)

if __name__ == "__main__":
    main()

2025-09-11 01:20:02,656 - INFO - Connected to Ethereum mainnet
2025-09-11 01:20:02,683 - INFO - Starting from block 23336499
2025-09-11 01:20:03,937 - INFO - Waiting for new blocks...
2025-09-11 01:20:18,965 - INFO - Processing blocks 23336500 to 23336500
2025-09-11 01:20:20,691 - INFO - ✅ Saved block 23336500 with 253 transactions
2025-09-11 01:20:20,693 - INFO - Waiting for new blocks...
2025-09-11 01:20:35,724 - INFO - Processing blocks 23336501 to 23336501
2025-09-11 01:20:37,350 - INFO - ✅ Saved block 23336501 with 328 transactions
2025-09-11 01:20:37,351 - INFO - Waiting for new blocks...
2025-09-11 01:20:52,377 - INFO - Processing blocks 23336502 to 23336503
2025-09-11 01:20:54,031 - INFO - ✅ Saved block 23336502 with 190 transactions
2025-09-11 01:20:55,144 - INFO - ✅ Saved block 23336503 with 302 transactions
2025-09-11 01:20:55,144 - INFO - Waiting for new blocks...
2025-09-11 01:21:10,169 - INFO - Processing blocks 23336504 to 23336504
2025-09-11 01:21:11,702 - INFO - ✅ Save

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType

# Transaction schema: the subset of fields kept from each transaction object.
# Every field is nullable. `value` is a big number, so it is kept as a string.
tx_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("hash", StringType()),
        ("from", StringType()),
        ("to", StringType()),
        ("value", StringType()),
        ("nonce", LongType()),
        ("gas", LongType()),
    )
])

# Block schema: block header fields plus the nested array of transactions.
block_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("number", LongType()),
        ("hash", StringType()),
        ("miner", StringType()),
        ("timestamp", LongType()),
        ("transactions", ArrayType(tx_schema)),
    )
])


In [0]:
from pyspark.sql.functions import explode, col

# Read the JSON block files incrementally from the Volume with Auto Loader.
raw_blocks = (
    spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "json")
      # Each file holds a single JSON document that may span several lines
      # (the fetcher writes with indent=2); without multiLine the reader
      # treats every physical line as its own record.
      .option("multiLine", "true")
      .schema(block_schema)  # provide schema for better performance
      .load("/Volumes/ethereum-catalog/ethereum/ethereum-volume/raw/blocks/")
)

# Just the block-level data
blocks_df = raw_blocks.select("number", "hash", "miner", "timestamp")

# Write to Delta (block table): process everything currently available, then stop.
blocks_query = (
    blocks_df.writeStream
      .format("delta")
      .outputMode("append")
      .option("checkpointLocation", "/Volumes/ethereum-catalog/ethereum/ethereum-volume/checkpoints/blocks/")
      .trigger(availableNow=True)  # replaces the deprecated trigger(once=True)
      .start("/Volumes/ethereum-catalog/ethereum/ethereum-volume/delta/blocks/")
)

# start() returns immediately; block here so later cells that batch-read the
# Delta table only run once this run has finished writing.
blocks_query.awaitTermination()


In [0]:
# Flatten transactions: one output row per element of each block's
# `transactions` array, tagged with the number of the block it came from.
txs_df = raw_blocks.select(
    col("number").alias("block_number"),
    explode("transactions").alias("tx")
).select(
    col("block_number"),
    col("tx.hash").alias("tx_hash"),
    col("tx.from").alias("from_address"),
    col("tx.to").alias("to_address"),
    col("tx.value"),
    col("tx.nonce"),
    col("tx.gas")
)

# Write transactions to Delta: process everything currently available, then stop.
txs_query = (
    txs_df.writeStream
      .format("delta")
      .outputMode("append")
      .option("checkpointLocation", "/Volumes/ethereum-catalog/ethereum/ethereum-volume/checkpoints/transactions/")
      .trigger(availableNow=True)  # replaces the deprecated trigger(once=True)
      .start("/Volumes/ethereum-catalog/ethereum/ethereum-volume/delta/transactions/")
)

# start() returns immediately; block here so later cells that batch-read the
# Delta table only run once this run has finished writing.
txs_query.awaitTermination()


In [0]:
# Batch-read the blocks Delta table and expose it to SQL cells as `blocks_temp`.
df_blocks = spark.read.load(
    "/Volumes/ethereum-catalog/ethereum/ethereum-volume/delta/blocks/",
    format="delta",
)
df_blocks.createOrReplaceTempView("blocks_temp")


In [0]:
# Batch-read the transactions Delta table and expose it to SQL cells as
# `transactions_temp`.
df_transactions = spark.read.load(
    "/Volumes/ethereum-catalog/ethereum/ethereum-volume/delta/transactions/",
    format="delta",
)
df_transactions.createOrReplaceTempView("transactions_temp")


In [0]:
%sql
-- Number of block rows currently in the blocks temp view
SELECT COUNT(*)
FROM blocks_temp

COUNT(*)
37


In [0]:
%sql
-- Number of transaction rows currently in the transactions temp view
SELECT COUNT(*)
FROM transactions_temp

COUNT(*)
8101


In [0]:
%sql
-- Inspect the rows of the blocks temp view (all columns)
SELECT *
FROM blocks_temp

number,hash,miner,timestamp
23293395,84ddff04c138817f018f5565e8f40f449e8fa33b7385514dcd18415aee93ba56,0x95222290DD7278Aa3Ddd389Cc1E1d165CC4BAfe5,1757032943
23293393,7e48e192c462d272e453a45fb0ce5fdc82f325f51cfcb04453ae7ceb76bd4f2c,0x4838B106FCe9647Bdf1E7877BF73cE8B0BAD5f97,1757032919
23293387,999b5b7f056fde08558ee6ccb60b23f0afb3da5b09fffabb58b1edeb27e82442,0x4838B106FCe9647Bdf1E7877BF73cE8B0BAD5f97,1757032847
23293399,6a099c31c3878a25650312b4ff93d7f197fda2bb888e70b3f1bc60cd586d6a41,0xdadB0d80178819F2319190D340ce9A924f783711,1757032991
23293389,105e0e1ee4b91f3d80ff7730a4449b363e4d301b61bf56b0a26f1d93fc4ef8ed,0xdadB0d80178819F2319190D340ce9A924f783711,1757032871
23293403,d632c61c4994bbd0742450f8ed49e3a12b3c19f35c09a5d59525388044eed874,0x4838B106FCe9647Bdf1E7877BF73cE8B0BAD5f97,1757033039
23293390,bea67b7e7b8298acabcb40114c7abc3c5c1106cf9ff32aba9160bf5ef4d77eda,0x4838B106FCe9647Bdf1E7877BF73cE8B0BAD5f97,1757032883
23293401,713f54340cef210264d3b3d6df549e0242572d4679b531011cc30fbb4cf2cc41,0xdadB0d80178819F2319190D340ce9A924f783711,1757033015
23293396,ce50334b14f47b1d1bace0e5a9c530a6c63ef667a87f2d4ae5f4e3c4f5b35128,0x95222290DD7278Aa3Ddd389Cc1E1d165CC4BAfe5,1757032955
23293391,789e5c43d3472bda845e9895a3808ee707ea66d5262582948e6d201546df7a16,0xdadB0d80178819F2319190D340ce9A924f783711,1757032895


In [0]:
%sql
-- Transactions per block: one row per block_number with its row count
SELECT
  block_number,
  COUNT(*)
FROM transactions_temp
GROUP BY block_number;

block_number,COUNT(*)
23293401,248
23293399,361
23293389,279
23293393,301
23293387,416
23293388,142
23293395,442
23293396,156
23293403,302
23293400,129
