In [0]:
pip install web3

Collecting web3
  Downloading web3-7.13.0-py3-none-any.whl.metadata (5.6 kB)
Collecting eth-abi>=5.0.1 (from web3)
  Downloading eth_abi-5.2.0-py3-none-any.whl.metadata (3.8 kB)
Collecting eth-account>=0.13.6 (from web3)
  Downloading eth_account-0.13.7-py3-none-any.whl.metadata (3.7 kB)
Collecting eth-hash>=0.5.1 (from eth-hash[pycryptodome]>=0.5.1->web3)
  Downloading eth_hash-0.7.1-py3-none-any.whl.metadata (4.2 kB)
Collecting eth-typing>=5.0.0 (from web3)
  Downloading eth_typing-5.2.1-py3-none-any.whl.metadata (3.2 kB)
Collecting eth-utils>=5.0.0 (from web3)
  Downloading eth_utils-5.3.1-py3-none-any.whl.metadata (5.7 kB)
Collecting hexbytes>=1.2.0 (from web3)
  Downloading hexbytes-1.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting aiohttp>=3.7.4.post0 (from web3)
  Downloading aiohttp-3.12.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (7.7 kB)
Collecting types-requests>=2.0.0 (from web3)
  Downloading types_requests-2.32.4.20250913-py3-none-any.whl.me

**## Batch processing of Ethereum data from AWS S3**

In [0]:
# Databricks notebook source
"""
Download Ethereum blockchain data from AWS S3 to Unity Catalog Volume
Target: /Volumes/ethereum-catalog/ethereum/ethereum-batch-volume
"""

import boto3
from botocore import UNSIGNED
from botocore.client import Config
import os

# ============================================
# CONFIGURATION
# ============================================
UC_VOLUME_PATH = "/Volumes/ethereum-catalog/ethereum/ethereum-batch-volume"  # Your Unity Catalog volume
NUM_FILES_TO_DOWNLOAD = 10  # Download 10 files as test

print("Ethereum Blockchain Data Downloader")
print("="*60)
print(f"Target: {UC_VOLUME_PATH}")
print(f"Files to download: {NUM_FILES_TO_DOWNLOAD}")
print("="*60)

# COMMAND ----------

# ============================================
# STEP 1: Connect to S3 (no credentials needed)
# ============================================
print("\nStep 1: Connecting to AWS S3 public blockchain data...")

s3 = boto3.client(
    's3',
    region_name='us-east-2',
    config=Config(signature_version=UNSIGNED)  # Anonymous access
)

print("✓ Connected to S3")

# COMMAND ----------

# ============================================
# STEP 2: List available files
# ============================================
print("\nStep 2: Listing available Ethereum block files...")

try:
    response = s3.list_objects_v2(
        Bucket='aws-public-blockchain',
        Prefix='v1.0/eth/blocks/',
        MaxKeys=50  # Get 50 files to have more options
    )
    
    if 'Contents' not in response:
        print("✗ No files found!")
        exit(1)
    
    # Filter to get only .parquet files
    parquet_files = []
    for obj in response['Contents']:
        if obj['Key'].endswith('.parquet'):
            parquet_files.append({
                's3_key': obj['Key'],
                'size_mb': obj['Size'] / (1024 * 1024),
                'filename': obj['Key'].split('/')[-1]
            })
    
    print(f"✓ Found {len(parquet_files)} Parquet files")
    
    # Show first 5
    print("\nFirst 5 available files:")
    for i, f in enumerate(parquet_files[:5], 1):
        print(f"  {i}. {f['filename']} ({f['size_mb']:.2f} MB)")
    
except Exception as e:
    print(f"✗ ERROR listing files: {e}")
    exit(1)

# COMMAND ----------

# ============================================
# STEP 3: Download files to UC Volume
# ============================================
print(f"\nStep 3: Downloading {NUM_FILES_TO_DOWNLOAD} files to {UC_VOLUME_PATH}...")
print("-"*60)

# Select files to download
files_to_download = parquet_files[:NUM_FILES_TO_DOWNLOAD]

success_count = 0
fail_count = 0
total_size_mb = 0

for i, file_info in enumerate(files_to_download, 1):
    s3_key = file_info['s3_key']
    
    # Create local path preserving the S3 structure
    # Example: v1.0/eth/blocks/date=2024-09-20/file.parquet
    # Becomes: /Volumes/soni/default/ethereum/blocks/date=2024-09-20/file.parquet
    
    relative_path = s3_key.replace('v1.0/eth/', '')  # Remove prefix
    local_path = os.path.join(UC_VOLUME_PATH, relative_path)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    
    # Download the file
    print(f"[{i}/{NUM_FILES_TO_DOWNLOAD}] Downloading: {file_info['filename']}... ", end="", flush=True)
    
    try:
        s3.download_file(
            Bucket='aws-public-blockchain',
            Key=s3_key,
            Filename=local_path
        )
        print(f"✓ ({file_info['size_mb']:.2f} MB)")
        success_count += 1
        total_size_mb += file_info['size_mb']
        
    except Exception as e:
        print(f"✗ Error: {e}")
        fail_count += 1


# COMMAND ----------

# ============================================
# STEP 4: Summary
# ============================================
print("\n" + "="*60)
print("DOWNLOAD COMPLETE!")
print("="*60)
print(f"Successful: {success_count}/{NUM_FILES_TO_DOWNLOAD}")
print(f"Failed: {fail_count}/{NUM_FILES_TO_DOWNLOAD}")
print(f"Total downloaded: {total_size_mb:.2f} MB")
print(f"Location: {UC_VOLUME_PATH}")

# ============================================
# STEP 5: Verify downloaded files
# ============================================
print(f"\nStep 5: Verifying files in {UC_VOLUME_PATH}...")
print("-"*60)

try:
    # Walk through the directory tree
    for root, dirs, files in os.walk(UC_VOLUME_PATH):
        level = root.replace(UC_VOLUME_PATH, '').count(os.sep)
        indent = '  ' * level
        print(f'{indent}{os.path.basename(root)}/')
        
        sub_indent = '  ' * (level + 1)
        for file in files:
            file_path = os.path.join(root, file)
            size_mb = os.path.getsize(file_path) / (1024 * 1024)
            print(f'{sub_indent}{file} ({size_mb:.2f} MB)')
            
    print(f"\n✓ All files saved to {UC_VOLUME_PATH}")
    
except Exception as e:
    print(f"⚠ Could not list directory: {e}")

# COMMAND ----------



Ethereum Blockchain Data Downloader
Target: /Volumes/ethereum-catalog/ethereum/ethereum-batch-volume
Files to download: 10

Step 1: Connecting to AWS S3 public blockchain data...
✓ Connected to S3

Step 2: Listing available Ethereum block files...
✓ Found 50 Parquet files

First 5 available files:
  1. part-00000-32767f69-9150-49ac-9c03-45f34b103c34-c000.snappy.parquet (1.74 MB)
  2. part-00000-62c9c86c-8a10-4196-b54c-01a2a139f4ec-c000.snappy.parquet (1.69 MB)
  3. part-00000-5438c668-b9c9-4b0a-8a35-64ff30b73cdf-c000.snappy.parquet (1.31 MB)
  4. part-00000-e0818341-7c32-4d1d-8fa5-a6fb563777ea-c000.snappy.parquet (1.33 MB)
  5. part-00000-70e7bc53-8610-4048-b386-93edcd06465c-c000.snappy.parquet (1.31 MB)

Step 3: Downloading 10 files to /Volumes/ethereum-catalog/ethereum/ethereum-batch-volume...
------------------------------------------------------------
[1/10] Downloading: part-00000-32767f69-9150-49ac-9c03-45f34b103c34-c000.snappy.parquet... ✓ (1.74 MB)
[2/10] Downloading: part-0000

In [0]:
df_all = spark.read.option("mergeSchema", "true").parquet(
    "/Volumes/ethereum-catalog/ethereum/ethereum-batch-volume/blocks"
)

# Check schema (note: Spark will include 'date' as a partition column)
df_all.printSchema()

# Show some rows
df_all.show(5, truncate=False)

root
 |-- difficulty: double (nullable = true)
 |-- hash: string (nullable = true)
 |-- miner: string (nullable = true)
 |-- nonce: string (nullable = true)
 |-- number: long (nullable = true)
 |-- size: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- total_difficulty: double (nullable = true)
 |-- base_fee_per_gas: long (nullable = true)
 |-- gas_limit: long (nullable = true)
 |-- gas_used: long (nullable = true)
 |-- extra_data: string (nullable = true)
 |-- logs_bloom: string (nullable = true)
 |-- parent_hash: string (nullable = true)
 |-- state_root: string (nullable = true)
 |-- receipts_root: string (nullable = true)
 |-- transactions_root: string (nullable = true)
 |-- sha3_uncles: string (nullable = true)
 |-- transaction_count: long (nullable = true)
 |-- date: date (nullable = true)
 |-- last_modified: timestamp (nullable = true)

+----------------+------------------------------------------------------------------+-------------------------------------

In [0]:
display(df_all)

difficulty,hash,miner,nonce,number,size,timestamp,total_difficulty,base_fee_per_gas,gas_limit,gas_used,extra_data,logs_bloom,parent_hash,state_root,receipts_root,transactions_root,sha3_uncles,transaction_count,date,last_modified
102856293685.0,0xb41d6002c750bfabc4ca507000d32d16367ae0402812d0819e2fddbbea02d8c8,0x9dfc0377058b7b9eb277421769b56df1395705f0,0x7f30a333095c5c64,3730,546,2015-07-30T18:23:09.000Z,179888978823852.0,,5000,0,0x476574682f76312e302e302d66633739643332642f6c696e75782f676f312e34,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0x846078d6d12be57c1b5fb87f3cf2c87f9acc07f789cfb1dc68832ab045633672,0xe35267cdfcaff683bb74ccf6ace2ab505454cdd10761d7261faa37ba8862bf77,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:35:56.747Z
18189628936.0,0x12ef0c55f6a6adcd43b65fdf0c356927b82ef30fb8a72eddad012b726e2824b6,0x1b7047b4338acf65be94c1a3e8c5c9338ad7d67c,0xc79245b4967062c4,131,539,2015-07-30T15:35:07.000Z,2326528090697.0,,5000,0,0x426974636f696e2069732054484520426c6f636b636861696e2e,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0x2807e7d5796039fa4040c5db859dc599b7ac73de38978af8ee01667fb04ec5c0,0xb516807b88579af98aa7531c8492d0311ce727df4cdb352b2c384d87ce82d6db,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:35:09.004Z
31501460475.0,0x6fd8a2720224b4dca3b3c8139d5d695462ba342709ca6e0c87324ef3bc833161,0x0e858640eb9cbeee16b74124658815bb2b5c0686,0xa9b02885aee63128,1256,546,2015-07-30T16:11:21.000Z,29602472045318.0,,5000,0,0x476574682f76312e302e302d30636463373634372f6c696e75782f676f312e34,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0x4a0526dcea5342341d55cbc3c054629690a57046e9a48990017f534c63b91051,0x49a5916f09bc1ded5281eebe85823db0be5574738a4959fbe3d814e1baf16520,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:35:24.124Z
30907398488.0,0x202df65f4ec4c27fd65d4c4d34c13343f46e9013ff0df2b5f00f45a75e74ea08,0x9beb4d346c6309a9ce4f27393c6322c8f03a3be6,0xcffa11e09d7355fa,1217,546,2015-07-30T16:10:03.000Z,28385238997474.0,,5000,0,0x476574682f7370616e6b7930312f76312e302e302f6c696e75782f676f312e34,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0xdb8a95a61b222e044afe27b33a621823fc4f961899edbb68affd74c7dbe282dc,0x2b56db731a948d2301743ff8808465fe9dbf9bfede1c14ac54d8993c4c74745a,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:35:23.636Z
103410095210.0,0x4b1a21458763697d810b197d0f3ae28b733db35cf597145dee1714e8f44acaf5,0x322df8fd261820c36234f806b15f51a2203a8f88,0xf0135edf3f857cec,3741,546,2015-07-30T18:23:53.000Z,181023718160090.0,,5000,0,0x476574682f76312e302e302d66633739643332642f6c696e75782f676f312e34,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0xc01370b102a0c675a83ed90526226a7044226d44daa067f78790fb98aed68e49,0xef86b96914f288f9010b0ff49bf7f4c232418a6bdcbaee43dc248ee64af3a42b,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:35:56.914Z
155144276420.0,0xdcb92e0dd722a648ec13743a2ad7c789b8a1c505a1d6319882dc0e50368ecbcd,0xf927a40c8b7f6e07c5af7fa2155b4864a4112b13,0xb8a6c488573314ea,4654,539,2015-07-30T19:32:55.000Z,298067009033463.0,,5000,0,0x476574682f76312e302e302f6c696e75782f676f312e342e32,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0x2b7dd6c90799caa8f5666ac80a3d27f34b3211786f54259883df379a549933bb,0x2b2dbb8d5d2c81b8362a25e603de6f123068e91aa2a352268eb46e4b59b502c8,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:36:10.128Z
223841247918.0,0x32e1aa1ad14b6bc3ebf42332621750d361d3e1beb7554183758a1e5d4c4d5b43,0x9dfc0377058b7b9eb277421769b56df1395705f0,0xff1e6b100cf52fad,5641,546,2015-07-30T21:17:46.000Z,484406949148753.0,,5000,0,0x476574682f76312e302e302d66633739643332642f6c696e75782f676f312e34,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0xdb244445e3db88c9b15ebac9019c5f58c71005c7e34f5c49aa95920553f137f7,0x4ff7282477301451dfe9559f46c6b3262b0085780e5af08bf43f282a3e35674e,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:36:23.236Z
316398355179.0,0xd91eae51202137710d74266c4c2dd2b78e6143e4e67585ab0ff8bad755f87b72,0xd7e30ae310c1d1800f5b641baa7af95b2e1fd98c,0xa021c448b61f3927,6702,546,2015-07-30T23:33:05.000Z,770179747249710.0,,5000,0,0x476574682f6b6c6f737572652f76312e302e302d66633739643332642f6c696e,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0x2d9cae59176bf564c71f3358218d30b7e3388410dcad3d9d2b7ada5d505db295,0x65d1406c77a5fe03ae818f8aa5e3f897406295b3001501fcafc6c9ee91297bd4,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:36:37.791Z
67330454791.0,0x177f5be5972c2a492da34390d9182d8681c454009866d6375c7f3a429cd809fa,0xa1623430350c5df1b52b0b57483a5bb45d1796da,0x2ceadd5beb90ed6c,2824,546,2015-07-30T17:25:07.000Z,103640806765184.0,,5000,0,0x476574682f76312e302e302d30636463373634372f6c696e75782f676f312e34,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0x7c614d10de28d834a8029a26465279c64acbd62aea1c4d2c078bb88dff788627,0x440b41dc8aee54ab6d9dc1571e720b1b1c1a4f5d02a17d5a888f0a8707594968,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:35:44.862Z
46166957217.0,0xd3aed9f3cf1e70e3dc553a3e9f6d0ac552f73d37fbe77d49cdddf6ae694b615c,0x8f22398f1567cddaba1b6bb1973e62b4992d5c9c,0xc74003dc9bcb8994,2043,539,2015-07-30T16:45:36.000Z,59811785042578.0,,5000,0,0x476574682f76312e302e302f6c696e75782f676f312e342e32,0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,0xc432a66dbdf03d6c404990335dd1c5a2b6d440fda942c3f21b118d6841302e90,0xd78806f44666068e53e13177a131ec2df42532f8c240b39f28f249ef444f7f4c,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347,0,2015-07-30,2022-09-11T23:35:34.346Z


In [0]:
import os
import json
import time
import logging
from typing import Any, Dict, Union
from web3 import Web3
from web3.datastructures import AttributeDict
from hexbytes import HexBytes
from requests.exceptions import HTTPError, ConnectionError, Timeout
from web3.exceptions import Web3Exception

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class RateLimiter:
    """Simple rate limiter to control API call frequency"""
    def __init__(self, max_calls_per_second: float = 5.0):
        self.max_calls_per_second = max_calls_per_second
        self.min_interval = 1.0 / max_calls_per_second
        self.last_call_time = 0
    
    def wait_if_needed(self):
        """Wait if necessary to respect rate limit"""
        current_time = time.time()
        time_since_last_call = current_time - self.last_call_time
        
        if time_since_last_call < self.min_interval:
            sleep_time = self.min_interval - time_since_last_call
            time.sleep(sleep_time)
        
        self.last_call_time = time.time()

def check_api_response_status(w3: Web3) -> bool:
    """Check if the API endpoint is responding correctly using Web3"""
    try:
        # Make a simple test request using Web3
        block_number = w3.eth.block_number
        
        if block_number > 0:
            logger.info(f"✅ API endpoint responding correctly (current block: {block_number})")
            return True
        else:
            logger.error(f"❌ API endpoint returned invalid block number: {block_number}")
            return False
            
    except Web3Exception as e:
        logger.error(f"❌ Web3 error connecting to API endpoint: {e}")
        return False
    except Exception as e:
        logger.error(f"❌ Failed to connect to API endpoint: {e}")
        return False

def to_serializable(obj: Any) -> Any:
    """Convert Web3 objects to JSON-serializable format"""
    if isinstance(obj, (AttributeDict, dict)):
        return {key: to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [to_serializable(item) for item in obj]
    elif isinstance(obj, HexBytes):
        return obj.hex()
    elif isinstance(obj, bytes):
        return obj.hex()
    elif hasattr(obj, '__dict__'):
        return to_serializable(obj.__dict__)
    else:
        return obj

def fetch_block_with_retry(w3: Web3, block_number: int, rate_limiter: RateLimiter, 
                          max_retries: int = 3, base_delay: float = 1.0) -> Dict[str, Any]:
    """Fetch a block with rate limiting and retry logic"""
    for attempt in range(max_retries):
        try:
            rate_limiter.wait_if_needed()
            block = w3.eth.get_block(block_number, full_transactions=True)
            return to_serializable(block)
            
        except (HTTPError, ConnectionError, Timeout) as e:
            if attempt == max_retries - 1:
                logger.error(f"Failed to fetch block {block_number} after {max_retries} attempts: {e}")
                raise
            
            # Exponential backoff
            delay = base_delay * (2 ** attempt)
            logger.warning(f"Attempt {attempt + 1} failed for block {block_number}, retrying in {delay}s: {e}")
            time.sleep(delay)
            
        except Exception as e:
            logger.error(f"Unexpected error fetching block {block_number}: {e}")
            raise

def main():
    # Configuration
    provider_uri = "https://mainnet.infura.io/v3/31966bfed8674afaab1a9d9ba201665a"
    
    # Rate limiting: Infura free tier allows ~100k requests/day (~1.15 requests/second)
    # Setting to 0.8 requests/second to be safe
    rate_limiter = RateLimiter(max_calls_per_second=0.8)
    
    # Initialize Web3
    w3 = Web3(Web3.HTTPProvider(provider_uri))
    
    # Check connection
    if not w3.is_connected():
        logger.error("Failed to connect to Ethereum node")
        return
    
    logger.info("Connected to Ethereum mainnet")
    
    # Get starting block
    rate_limiter.wait_if_needed()
    last_block = w3.eth.block_number
    logger.info(f"Starting from block {last_block}")
    
    # Setup output directory
    output_dir = "/Volumes/ethereum-catalog/ethereum/ethereum-volume/raw/blocks/"
    os.makedirs(output_dir, exist_ok=True)
    
    # Main loop
    while True:
        try:
            # Get latest block number
            rate_limiter.wait_if_needed()
            latest_block = w3.eth.block_number
            
            # Process new blocks
            blocks_to_process = list(range(last_block + 1, latest_block + 1))
            
            if blocks_to_process:
                logger.info(f"Processing blocks {last_block + 1} to {latest_block}")
                
                # Debug: log the block numbers we're about to process
                logger.debug(f"Block numbers to process: {blocks_to_process}")
            
            for block_number in blocks_to_process:
                try:
                    # Ensure block_number is an integer
                    if not isinstance(block_number, int):
                        logger.error(f"❌ Invalid block number type: {type(block_number)} - {block_number}")
                        continue
                        
                    logger.debug(f"Processing block {block_number} (type: {type(block_number)})")
                    
                    # Fetch block with rate limiting and retry logic
                    block_dict = fetch_block_with_retry(w3, block_number, rate_limiter)
                    
                    # Save to file
                    file_path = os.path.join(output_dir, f"block_{block_number}.json")
                    with open(file_path, "w") as f:
                        json.dump(block_dict, f, indent=2)
                    
                    tx_count = len(block_dict.get('transactions', []))
                    logger.info(f"✅ Saved block {block_number} with {tx_count} transactions")
                    
                except Exception as e:
                    logger.error(f"❌ Failed to process block {block_number}: {e}")
                    logger.debug(f"Block number type: {type(block_number)}, value: {repr(block_number)}")
                    # Check if we should do a health check
                    if "web3" in str(e).lower() or "connection" in str(e).lower():
                        logger.info("Performing API health check due to connection error...")
                        check_api_response_status(w3)
                    # Continue with next block instead of crashing
                    continue
            
            last_block = latest_block
            
            # Wait before checking for new blocks
            logger.info("Waiting for new blocks...")
            time.sleep(15)  # Check every 15 seconds (Ethereum block time is ~12-13 seconds)
            
        except KeyboardInterrupt:
            logger.info("Stopping block fetcher...")
            break
        except Exception as e:
            logger.error(f"Unexpected error in main loop: {e}")
            logger.info("Waiting 30 seconds before retrying...")
            time.sleep(30)

if __name__ == "__main__":
    main()

2025-09-22 01:06:24,029 - INFO - Connected to Ethereum mainnet
2025-09-22 01:06:24,056 - INFO - Starting from block 23415182
2025-09-22 01:06:25,301 - INFO - Processing blocks 23415183 to 23415183
2025-09-22 01:06:26,846 - INFO - ✅ Saved block 23415183 with 185 transactions
2025-09-22 01:06:26,847 - INFO - Waiting for new blocks...
2025-09-22 01:06:41,870 - INFO - Processing blocks 23415184 to 23415184
2025-09-22 01:06:43,360 - INFO - ✅ Saved block 23415184 with 210 transactions
2025-09-22 01:06:43,361 - INFO - Waiting for new blocks...
2025-09-22 01:06:58,390 - INFO - Processing blocks 23415185 to 23415185
2025-09-22 01:06:59,871 - INFO - ✅ Saved block 23415185 with 181 transactions
2025-09-22 01:06:59,871 - INFO - Waiting for new blocks...
2025-09-22 01:07:14,897 - INFO - Processing blocks 23415186 to 23415187
2025-09-22 01:07:16,386 - INFO - ✅ Saved block 23415186 with 212 transactions
2025-09-22 01:07:17,628 - INFO - ✅ Saved block 23415187 with 235 transactions
2025-09-22 01:07:17,

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType

# Transaction schema
tx_schema = StructType([
    StructField("hash", StringType(), True),
    StructField("from", StringType(), True),
    StructField("to", StringType(), True),
    StructField("value", StringType(), True),   # value is big number, keep as string
    StructField("nonce", LongType(), True),
    StructField("gas", LongType(), True)
])

# Block schema
block_schema = StructType([
    StructField("number", LongType(), True),
    StructField("hash", StringType(), True),
    StructField("miner", StringType(), True),
    StructField("timestamp", LongType(), True),
    StructField("transactions", ArrayType(tx_schema), True)
])


In [0]:
%sql
-- Check your current catalog
SELECT current_catalog();

-- List schemas in your catalog
SHOW SCHEMAS;

CREATE SCHEMA IF NOT EXISTS workspace.ethereum;


databaseName
default
ethereum
ethereum_delta_catalog
information_schema


In [0]:
from pyspark.sql.functions import explode, col

# ✅ Read JSON block files incrementally from your Volume
raw_blocks = (
    spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "json")
      .schema(block_schema)  # provide schema for better performance
      .load("/Volumes/ethereum-catalog/ethereum/ethereum-volume/raw/blocks/")
)

# Extract block-level fields
blocks_df = raw_blocks.select("number", "hash", "miner", "timestamp")

# Write to Delta using Structured Streaming
blocks_query = (
    blocks_df.writeStream
        .format("delta")  # Delta Lake sink
        .outputMode("append")  # Append new blocks
        .option("checkpointLocation", "/Volumes/ethereum-catalog/ethereum/ethereum-volume/checkpoints/blocks/")  # Required for streaming
        .trigger(availableNow=True)  # Process all available data immediately
        .table("ethereum.blocks")  # Save directly as Delta table in Unity Catalog
)

In [0]:
from pyspark.sql.functions import explode, col

# Flatten nested transactions
txs_df = raw_blocks.select(
    col("number").alias("block_number"),
    explode("transactions").alias("tx")
).select(
    col("block_number"),
    col("tx.hash").alias("tx_hash"),
    col("tx.from").alias("from_address"),
    col("tx.to").alias("to_address"),
    col("tx.value"),
    col("tx.nonce"),
    col("tx.gas")
)

# Write to Delta using Structured Streaming
txs_query = (
    txs_df.writeStream
        .format("delta")  # Delta Lake sink
        .outputMode("append")  # Append new transactions
        .option("checkpointLocation", "/Volumes/ethereum-catalog/ethereum/ethereum-volume/checkpoints/transactions/")  # Required for streaming
        .trigger(availableNow=True)  # Process all available data immediately
        .table("ethereum.transactions")  # Save directly as Delta table in Unity Catalog
)


In [0]:
%sql
-- Total blocks ingested
SELECT COUNT(*) 
FROM ethereum.blocks;



COUNT(*)
11


In [0]:
%sql

-- Total transactions ingested
SELECT COUNT(*) 
FROM ethereum.transactions;



COUNT(*)
2441


In [0]:
%sql
-- Top miners by number of blocks mined
SELECT miner, COUNT(*) AS blocks_mined
FROM ethereum.blocks
GROUP BY miner
ORDER BY blocks_mined DESC;


miner,blocks_mined
0xdadB0d80178819F2319190D340ce9A924f783711,8
0x4838B106FCe9647Bdf1E7877BF73cE8B0BAD5f97,2
0x388C818CA8B9251b393131C08a736A67ccB19297,1


In [0]:
%sql

-- High-value transfers (greater than 1 ETH)
SELECT *
FROM ethereum.transactions
WHERE CAST(value AS DECIMAL(38,0)) > 1000000000000000000;

block_number,tx_hash,from_address,to_address,value,nonce,gas
23415045,4c760d62865a16248c300067ad0286415b740899f28e97ffad9b749efbe0d1d0,0xf4da41ebED948663f988aFaf80c53996b9a6834d,0x1231DEB6f5749EF6cE6943a275A1D3E7486F4EaE,1160000000000000000,139,177350
23415045,6177a3046243563c8f79fe7e99b8ca61f76199716e87c4a682528a9049839983,0x95793142648358E4De193Da6507EBA28e4A7cDFf,0x7a250d5630B4cF539739dF2C5dAcb4c659F2488D,1700000000000000000,7,300000
23415045,7cd0c3ddde44f8a40ad009665894673e67631cc68c2365c70296d01db067a469,0x21a31Ee1afC51d94C2eFcCAa2092aD1028285549,0xcF30FcF35845713a7A86915b6DbfA27908286eD0,1961316000000000000,12956996,207128
23415046,b1a52c05fd7fc25a0616ee76d3ac3becca67dbed5cbfad0848637297d952331c,0xf35eaA2F01CDbc11C5181751528970F95BFea253,0xa03400E098F4421b34a3a44A1B4e571419517687,50516796971647967000,1810,30000
23415046,6dbef7f8cc6b4338e040b5afad94eec0692b2a6683e27dc97e9e92c30517919c,0x41eb5aB31Aa0e4A4027547C05853af2Fd13A11Dc,0x241f4848678302Fa027DFdF05f6d548fe3CDeaf7,183180000000000000000,11,24150
23415046,b2255f7aad15f306316d752d9a68c5cb3571d75c7cb58b5efaf40cfdd9837678,0xaF62aecC38802DE1765733C40c55a28584779729,0xaF62aecC38802DE1765733C40c55a28584779729,1359691118269870569,143,94041
23415046,389844477f2dddeae662691efab04b5a8e8568a7a9c83b138e56cece4e03b916,0xdD9BD62D1c08210fDDa6f841eD5196B45A939625,0xd01607c3C5eCABa394D8be377a08590149325722,222000000000000000000,2773,300000
23415046,dca652a1502ef80b5b7c2955212f1fb9116107bfaf353573e232b0290fb62576,0xaEAb0c0499cED7cB0cf32A15d642530B3c96bbE3,0xA9D1e08C7793af67e9d92fe308d5697FB81d3E43,499999953400000000000,0,23300
23415046,ca26ffaeca384e82cd883f97a02a32a9266019cbd575b56956b59e8c42506845,0x4BE9b904707c0E346f9629e2e29F71bf87F49dD1,0x5FE6cF1542C27a2a19bB28f427d546F296cdfa25,2000000000000000000,17,21000
23415046,a75a8ce3d06759b6b430d4ba07873f01b8328eb7a2c8feb8399826a1693363c5,0x9696f59E4d72E237BE84fFD425DCaD154Bf96976,0xF214798A8aF12Ad98e173171ee2D8d7ea11CE75C,24836000000000000000,8231593,207128
