In [0]:
%pip install web3

Collecting web3
  Downloading web3-7.13.0-py3-none-any.whl.metadata (5.6 kB)
Collecting eth-abi>=5.0.1 (from web3)
  Downloading eth_abi-5.2.0-py3-none-any.whl.metadata (3.8 kB)
Collecting eth-account>=0.13.6 (from web3)
  Downloading eth_account-0.13.7-py3-none-any.whl.metadata (3.7 kB)
Collecting eth-hash>=0.5.1 (from eth-hash[pycryptodome]>=0.5.1->web3)
  Downloading eth_hash-0.7.1-py3-none-any.whl.metadata (4.2 kB)
Collecting eth-typing>=5.0.0 (from web3)
  Downloading eth_typing-5.2.1-py3-none-any.whl.metadata (3.2 kB)
Collecting eth-utils>=5.0.0 (from web3)
  Downloading eth_utils-5.3.1-py3-none-any.whl.metadata (5.7 kB)
Collecting hexbytes>=1.2.0 (from web3)
  Downloading hexbytes-1.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting aiohttp>=3.7.4.post0 (from web3)
  Downloading aiohttp-3.13.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting types-requests>=2.0.0 (from web3)
  Downloading types_requests-2.32.4.20250913-

In [0]:
from web3 import Web3



# === SIMPLE PARAMETERIZATION (VARIABLES FOR REUSABILITY) ===
dbutils.widgets.text("catalog_name", "web3_data", "Catalog Name")
dbutils.widgets.text("schema_name", "ethereum_mainnet", "Schema Name")
dbutils.widgets.text("poll_interval", "15", "Polling Interval (seconds)")
dbutils.widgets.text("microbatch_size", "10", "Blocks per Microbatch")
dbutils.widgets.text("s3_managed_bucket","eth-stream-ingestion","S3 Managed Bucket")
dbutils.widgets.text("max_calls_per_second","0.8","Max calls/sec")

# === CONFIGURATION ===
CATALOG = dbutils.widgets.get("catalog_name")
SCHEMA = dbutils.widgets.get("schema_name")
S3_MANAGED_BUCKET = dbutils.widgets.get('s3_managed_bucket')
POLL_INTERVAL = int(dbutils.widgets.get("poll_interval"))
MICROBATCH_SIZE = int(dbutils.widgets.get("microbatch_size"))
MAX_CALLS_PER_SECOND = dbutils.widgets.get("max_calls_per_second")


# Unity Catalog volume paths
DATA_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum_blocks"
CHECKPOINT_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum_checkpoints"
SCHEMA_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum_schemas"
OUTPUT_VOLUME = f"/Volumes/{CATALOG}/{SCHEMA}/ethereum_output"

print(f"🔧 Using Catalog: {CATALOG}, Schema: {SCHEMA}")
print(f"⏱ Poll Interval: {POLL_INTERVAL}s")
print(f"📦 Microbatch Size: {MICROBATCH_SIZE} blocks per batch")
print(f"📁 Data: {DATA_VOLUME}")
print(f"📁 Checkpoints: {CHECKPOINT_VOLUME}")
print(f"📁 Schemas: {SCHEMA_VOLUME}")

# === UNITY CATALOG SETUP ===
stmts = [
    f"CREATE CATALOG IF NOT EXISTS {CATALOG} MANAGED LOCATION 's3://{S3_MANAGED_BUCKET}/'",
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.ethereum_blocks",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.ethereum_checkpoints",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.ethereum_schemas",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.ethereum_output",
]

for i, s in enumerate(stmts, 1):
    print(f"[{i}/{len(stmts)}] {s}")
    try:
        spark.sql(s)
        print("  ✅ Success")
    except Exception as e:
        print(f"  ❌ Error: {e}")


🔧 Using Catalog: web3_data, Schema: ethereum_mainnet
⏱ Poll Interval: 15s
📦 Microbatch Size: 10 blocks per batch
📁 Data: /Volumes/web3_data/ethereum_mainnet/ethereum_blocks
📁 Checkpoints: /Volumes/web3_data/ethereum_mainnet/ethereum_checkpoints
📁 Schemas: /Volumes/web3_data/ethereum_mainnet/ethereum_schemas
[1/6] CREATE CATALOG IF NOT EXISTS web3_data MANAGED LOCATION 's3://eth-stream-ingestion/'
  ✅ Success
[2/6] CREATE SCHEMA IF NOT EXISTS web3_data.ethereum_mainnet
  ✅ Success
[3/6] CREATE VOLUME IF NOT EXISTS web3_data.ethereum_mainnet.ethereum_blocks
  ✅ Success
[4/6] CREATE VOLUME IF NOT EXISTS web3_data.ethereum_mainnet.ethereum_checkpoints
  ✅ Success
[5/6] CREATE VOLUME IF NOT EXISTS web3_data.ethereum_mainnet.ethereum_schemas
  ✅ Success
[6/6] CREATE VOLUME IF NOT EXISTS web3_data.ethereum_mainnet.ethereum_output
  ✅ Success


In [0]:

import os, json, time, logging
from pyspark.sql.datasource import DataSource, DataSourceStreamReader, InputPartition
from pyspark.sql.types import StructType, StructField, LongType, StringType
from pyspark.sql import Row
from web3 import Web3

            

# -----------------------------
# Logging setup
# -----------------------------
logging.basicConfig(
    level=logging.INFO, 
    format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger("EthereumStream")

# -----------------------------
# Partition class
# -----------------------------
class BlockRangePartition(InputPartition):
    def __init__(self, start, end):
        self.start = start
        self.end = end
        logger.debug(f"Created partition: blocks {start} to {end}")

# -----------------------------
# StreamReader
# -----------------------------
class EthereumStreamReader(DataSourceStreamReader):
    def __init__(self, schema, options):
        logger.info("=" * 60)
        logger.info("Initializing EthereumStreamReader")
        logger.info("=" * 60)
        
        self.schema = schema
        self.options = options
        self.provider_uri = options.get("provider_uri")
        self.start_block = int(options.get("start_block", 1))
        self.output_dir = options.get("output_dir", "dbfs:/Volumes/web3_data/ethereum_mainnet/ethereum_blocks/raw")
        self.current_block = self.start_block
        self.max_calls_per_second = float(options.get("max_calls_per_second", 0.8))

        logger.info(f"Configuration:")
        logger.info(f"  - Provider URI: {self.provider_uri}")
        logger.info(f"  - Start block: {self.start_block}")
        logger.info(f"  - Output directory: {self.output_dir}")
        logger.info(f"  - Max calls/sec: {self.max_calls_per_second}")
        logger.info("=" * 60)
        print(f"start Block at , {self.start_block}")
    # -------------------------
    # Offset handling (dict-based)
    # -------------------------
    def initialOffset(self):
        offset = {"offset": self.current_block}
        logger.info(f"initialOffset() called -> Returning: {offset}")
        return offset

    def latestOffset(self):
        logger.info("latestOffset() called -> Connecting to Ethereum provider...")
        try:
            w3 = Web3(Web3.HTTPProvider(self.provider_uri))
            if not w3.is_connected():
                logger.error("Failed to connect to Ethereum provider in latestOffset()")
                raise RuntimeError("Failed to connect to Ethereum provider")
            
            latest = w3.eth.block_number
            offset = {"offset": latest}
            logger.info(f"latestOffset() -> Latest block from chain: {latest}")
            logger.info(f"latestOffset() -> Returning: {offset}")
            print(f"latestOffset() -> Returning: {offset}")
            return offset
        except Exception as e:
            logger.error(f"Error in latestOffset(): {e}", exc_info=True)
            raise

    # -------------------------
    # Partition planning
    # -------------------------
    def partitions(self, start_offset, end_offset):
        """Legacy method name - calls planPartitions internally"""
        return self.planPartitions(start_offset, end_offset)
    
    def planPartitions(self, start_offset, end_offset):
        logger.info("-" * 60)
        logger.info("planPartitions() called")
        logger.info(f"  - start_offset: {start_offset}")
        logger.info(f"  - end_offset: {end_offset}")
        
        start = start_offset.get("offset", self.start_block) if start_offset else self.start_block
        end = end_offset.get("offset", start + 1000) if end_offset else start + 1000
        step = int(self.options.get("batch_size", 10))
        
        logger.info(f"  - Computed start: {start}")
        print(f"  - Computed start: {start}")

        logger.info(f"  - Computed end: {end}")
        print(f"  - Computed end: {end}")

        logger.info(f"  - Batch size: {step}")
        print(f"  - Batch size: {step}")




        
        partitions = []
        for i in range(start, end + 1, step):
            partition_end = min(i + step - 1, end)
            partitions.append(BlockRangePartition(i, partition_end))
        
        logger.info(f"  - Created {len(partitions)} partition(s)")
        for idx, p in enumerate(partitions):
            logger.info(f"    Partition {idx}: blocks {p.start} to {p.end} ({p.end - p.start + 1} blocks)")
        logger.info("-" * 60)
        
        return partitions

    # -------------------------
    # Reader
    # -------------------------
    def read(self, partition):
        logger.info("*" * 60)
        logger.info(f"read() called for partition: blocks {partition.start} to {partition.end}")
        logger.info("*" * 60)
        
        try:
            w3 = Web3(Web3.HTTPProvider(self.provider_uri))
            if not w3.is_connected():
                logger.error("Failed to connect to Ethereum provider in read()")
                raise RuntimeError("Failed to connect to Ethereum provider")
            
            logger.info("Successfully connected to Ethereum provider")
        except Exception as e:
            logger.error(f"Connection error in read(): {e}", exc_info=True)
            raise

        last_call_time = 0
        blocks_processed = 0
        blocks_failed = 0
        
        for blk_num in range(partition.start, partition.end + 1):
            try:
                # Rate limiting
                now = time.time()
                sleep_time = (1 / self.max_calls_per_second) - (now - last_call_time)
                if sleep_time > 0:
                    logger.debug(f"Rate limiting: sleeping for {sleep_time:.3f}s")
                    time.sleep(sleep_time)
                last_call_time = time.time()

                logger.info(f"Fetching block {blk_num}...")
                block = w3.eth.get_block(blk_num, full_transactions=False)

                blk_dict = {
                    "block_number": block.number,
                    "block_hash": block.hash.hex(),
                    "timestamp": block.timestamp,
                    "tx_count": len(block.transactions)
                }

                # Save JSON locally (optional)
                file_path = os.path.join(self.output_dir, f"block_{blk_num}.json")
                with open(file_path, "w") as f:
                    json.dump(blk_dict, f, indent=2)
                logger.debug(f"Saved block data to {file_path}")

                logger.info(f"✓ Block {blk_num}: hash={block.hash.hex()[:10]}..., "
                           f"timestamp={block.timestamp}, tx_count={len(block.transactions)}")

                yield Row(
                    block_number=block.number,
                    block_hash=block.hash.hex(),
                    timestamp=block.timestamp,
                    tx_count=len(block.transactions)
                )
                
                self.current_block = blk_num + 1
                blocks_processed += 1

            except Exception as e:
                blocks_failed += 1
                logger.error(f"✗ Error fetching block {blk_num}: {e}", exc_info=True)
                continue
        
        logger.info("*" * 60)
        logger.info(f"read() completed for partition {partition.start}-{partition.end}")
        logger.info(f"  - Blocks processed: {blocks_processed}")
        logger.info(f"  - Blocks failed: {blocks_failed}")
        logger.info("*" * 60)

    # -------------------------
    # Commit method for checkpointing
    # -------------------------
    def commit(self, end_offset):
        logger.info("=" * 60)
        logger.info(f"commit() called with end_offset: {end_offset}")
        logger.info(f"Successfully committed up to block: {end_offset.get('offset')}")
        logger.info("=" * 60)

    # -------------------------
    # Pickle support
    # -------------------------
    def __getstate__(self):
        logger.debug("__getstate__() called for serialization")
        state = self.__dict__.copy()
        return state

    def __setstate__(self, state):
        logger.debug("__setstate__() called for deserialization")
        self.__dict__.update(state)

# -----------------------------
# DataSource wrapper
# -----------------------------
class EthereumDataSource(DataSource):
    def __init__(self, options):
        logger.info("=" * 60)
        logger.info("EthereumDataSource.__init__() called")
        logger.info(f"Options: {options}")
        logger.info("=" * 60)
        self.options = options

    @classmethod
    def name(cls):
        logger.debug("name() called -> returning 'ethereum'")
        return "ethereum"

    def schema(self):
        schema = StructType([
            StructField("block_number", LongType()),
            StructField("block_hash", StringType()),
            StructField("timestamp", LongType()),
            StructField("tx_count", LongType())
        ])
        logger.info("schema() called -> returning schema with 4 fields")
        logger.debug(f"Schema: {schema}")
        return schema

    def streamReader(self, schema):
        logger.info("streamReader() called -> creating EthereumStreamReader")
        return EthereumStreamReader(schema, self.options)



In [0]:

#import uuid
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType, ArrayType, MapType
from web3 import Web3


# Hard-coded constants
ETH_PROVIDER_URI = "https://mainnet.infura.io/v3/6504e6a7883c4b49ac1cf17099e2ea3a"

# Connect to an Ethereum node (Infura, Alchemy, etc.)
w3 = Web3(Web3.HTTPProvider(ETH_PROVIDER_URI))

# Get latest block number
start_block = w3.eth.block_number
print("Latest Ethereum block number:", start_block)

# --- Spark session ---
spark = SparkSession.builder.appName("EthereumStream").getOrCreate()

eth_schema = StructType([
    StructField("block_number", LongType(), True),
    StructField("block_hash", StringType(), True),
    StructField("timestamp", LongType(), True),
    StructField("tx_count", LongType(), True)
])


checkpoint_path = f"{CHECKPOINT_VOLUME}/"
output_path = f"{OUTPUT_VOLUME}/"
eth_provider_uri = f"{ETH_PROVIDER_URI}"
start_block = f"{start_block}"
poll_interval = f"{POLL_INTERVAL}"

print("✅ Using checkpoint:", checkpoint_path)
print("✅ Using output:", output_path)
print("✅ Using eth_provider_uri:", eth_provider_uri)
print("✅ Using start_block:", start_block)
print("✅ Using poll_interval:", poll_interval)

spark.dataSource.register(EthereumDataSource)

# --- Read from custom Ethereum source ---
df = spark.readStream \
    .format("ethereum") \
    .schema(eth_schema) \
    .option("provider_uri",eth_provider_uri) \
    .option("start_block", start_block) \
    .option("poll_interval", poll_interval) \
    .load()

# --- Write with dynamic paths ---
query = df.writeStream \
    .format("delta") \
    .outputMode("append") \
    .option("checkpointLocation", checkpoint_path) \
    .trigger(availableNow=True) \
    .start(output_path)

logger.info("Streaming query started. Awaiting termination...")



Latest Ethereum block number: 23522635
✅ Using checkpoint: /Volumes/web3_data/ethereum_mainnet/ethereum_checkpoints/
✅ Using output: /Volumes/web3_data/ethereum_mainnet/ethereum_output/
✅ Using eth_provider_uri: https://mainnet.infura.io/v3/6504e6a7883c4b49ac1cf17099e2ea3a
✅ Using start_block: 23522635
✅ Using poll_interval: 15


2025-10-07 01:35:19 [INFO] [EthereumStream] Streaming query started. Awaiting termination...


In [0]:
df = spark.read.format("delta").load("/Volumes/web3_data/ethereum_mainnet/ethereum_output")

display(df)


block_number,block_hash,timestamp,tx_count
23522115,48962791406f2d43f9e5eb781b2abd56a3576eb58a8c610412559e397693177c,1759794635,280
23522116,26f8cca59c572203bae2177e6df033b84f1ce3c879309bc160426312d862dcaa,1759794647,165
23522117,c3e5b919aaa9ac063f0cced47d0217c583ba890694a31d38f2be5e4ff5344ce1,1759794659,152
23522118,44ab9dd502a52acc6847122a19d538cd87d821979aee86412ae04bdc3d9d4ee1,1759794671,204
23522119,241de434fa347f5f1c98c4f48436e50ca26d598dcc68d791f9c4ca7e6acb0903,1759794683,576
23522120,e83f2128d34653848353a1a7d5822da171fd566ed72adbc60665987ebca9d17a,1759794695,268
23522121,45b2c37a83dd78cda3ad538e223721ff7501f5ded8ca5d7a423834d735fea102,1759794707,145
23522122,670c312ba9d891567bd65cb5962069976a9181990cde8262aa574c54eb36ca4d,1759794719,131
23522123,2f441f426c5f53bfd43e69bfbfbe65cbcaeeb535b68e4db1c31b58c1e4422930,1759794731,137
23522124,84b2b2b4990273c9c0a20dad97db14676a8267a1f0c950a6cf207cffe0a85c66,1759794743,211
