In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, TimestampType

# Block schema: one row per block. Column order must match the field order
# of the rows produced for the "blocks" entity.
block_schema = StructType([
    StructField(column, dtype)
    for column, dtype in (
        ("block_number", LongType()),
        ("block_hash", StringType()),
        ("miner", StringType()),
        ("timestamp", TimestampType()),
    )
])

# Transaction schema: one row per transaction. Column order must match the
# field order of the rows produced for the "transactions" entity.
tx_schema = StructType([
    StructField(column, dtype)
    for column, dtype in (
        ("tx_hash", StringType()),
        ("block_number", LongType()),
        ("from_address", StringType()),
        ("to_address", StringType()),
        ("value", DoubleType()),
        ("timestamp", TimestampType()),
    )
])


In [0]:
from web3.datastructures import AttributeDict
from hexbytes import HexBytes
from datetime import datetime
from pyspark.sql import Row
import json

def to_serializable(obj):
    """Recursively convert a web3 result into plain Python containers.

    HexBytes values become the string returned by their ``.hex()`` method,
    AttributeDict and dict instances become plain dicts with converted
    values, and lists are converted element by element. Any other value is
    returned unchanged.
    """
    if isinstance(obj, HexBytes):
        return obj.hex()
    if isinstance(obj, (AttributeDict, dict)):
        return {key: to_serializable(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [to_serializable(item) for item in obj]
    return obj

def block_to_row(block):
    """Build a Row matching ``block_schema`` from a web3 block.

    Only the four fields that end up in the row are converted. The block may
    carry every full transaction, so deep-converting the whole object just to
    read four scalars is avoidable work.

    Args:
        block: Mapping-like block (dict or AttributeDict) with the keys
            "number", "hash", "miner" and "timestamp".

    Returns:
        Row with fields block_number, block_hash, miner, timestamp.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    return Row(
        block_number=block["number"],
        block_hash=to_serializable(block["hash"]),
        miner=to_serializable(block["miner"]),
        # Naive local-time datetime built from the block's epoch seconds.
        timestamp=datetime.fromtimestamp(block["timestamp"]),
    )

def tx_to_row(tx, block_number, timestamp):
    """Build a Row matching ``tx_schema`` from a web3 transaction.

    Args:
        tx: Transaction object; it is passed through ``to_serializable`` and
            must then expose the keys "hash", "from", "to" and "value".
        block_number: Number of the block containing the transaction.
        timestamp: Timestamp stored on the row as-is.

    Returns:
        Row with fields tx_hash, block_number, from_address, to_address,
        value (wei divided by 1e18) and timestamp.
    """
    fields = to_serializable(tx)
    tx_hash = fields["hash"]
    sender = fields["from"]
    recipient = fields["to"]
    ether_value = float(fields["value"]) / 1e18  # convert wei to ether
    return Row(
        tx_hash=tx_hash,
        block_number=block_number,
        from_address=sender,
        to_address=recipient,
        value=ether_value,
        timestamp=timestamp,
    )


In [0]:
from pyspark.sql.datasource import DataSource, DataSourceStreamReader, DataSourceStreamWriter
from web3 import Web3
import os
import time

# JSON-RPC endpoint. The URL embeds the provider API key, so prefer supplying
# it through the ETH_PROVIDER_URI environment variable instead of editing the
# notebook; the literal below is only a placeholder default.
provider_uri = os.environ.get(
    "ETH_PROVIDER_URI",
    "https://mainnet.infura.io/v3/YOUR_INFURA_CODE",
)
w3 = Web3(Web3.HTTPProvider(provider_uri))

# Root under which raw JSON, Delta output and checkpoints are stored.
base_volume = "/Volumes/ethereum_catalog/ethereum/ethereum-volume"

# Volume paths for the raw JSON dumps (one file per block / per transaction).
raw_blocks_dir = f"{base_volume}/raw/blocks/"
raw_tx_dir = f"{base_volume}/raw/transactions/"
# Directory creation is left disabled as in the original notebook; the raw
# JSON writes therefore need these directories to exist already.
#os.makedirs(raw_blocks_dir, exist_ok=True)
#os.makedirs(raw_tx_dir, exist_ok=True)

# Delta output locations and streaming checkpoint locations, one per entity.
delta_blocks_path = f"{base_volume}/delta/blocks/"
delta_tx_path = f"{base_volume}/delta/transactions/"
checkpoint_blocks = f"{base_volume}/checkpoints/blocks/"
checkpoint_tx = f"{base_volume}/checkpoints/transactions/"

class EthereumStreamReader(DataSourceStreamReader):
    """Streaming reader that turns newly mined Ethereum blocks into rows.

    Progress is tracked as an offset of the form ``{"block_number": N}``,
    meaning "every block up to and including N has been read". The reader
    implements the offset-based hooks of the Python streaming data source
    interface (``initialOffset``, ``latestOffset``, ``partitions``, ``read``).
    The original ``readSchema``, ``planInputPartitions`` and ``getRows``
    methods are kept so existing direct callers keep working.

    Args:
        schema: Schema reported by ``readSchema``.
        options: Data source options, stored unchanged.
        entity: "blocks" to emit one row per block; any other value emits one
            row per transaction.
    """

    def __init__(self, schema, options, entity="blocks"):
        self._schema = schema
        self.options = options
        self.entity = entity
        # Chain head at construction time; the stream starts after this block.
        self.last_block = self._client().eth.block_number

    @staticmethod
    def _client():
        """Create a Web3 client for ``provider_uri``.

        The client is built on demand and never stored on the instance, so
        the reader object itself only holds plain, easily serializable state.
        """
        return Web3(Web3.HTTPProvider(provider_uri))

    def readSchema(self):
        """Return the schema this reader was constructed with."""
        return self._schema

    def planInputPartitions(self):
        """Legacy hook kept for compatibility; always returns an empty list."""
        return []

    def initialOffset(self):
        """Return the offset a brand-new stream starts from."""
        return {"block_number": self.last_block}

    def latestOffset(self):
        """Return the offset of the current chain head."""
        return {"block_number": self._client().eth.block_number}

    def partitions(self, start, end):
        """Plan the blocks in ``(start, end]`` as a single input partition.

        Returns an empty list when the range holds no blocks.
        """
        # Local import: the notebook's import cell does not bring this name in.
        from pyspark.sql.datasource import InputPartition

        first_block = start["block_number"] + 1
        final_block = end["block_number"]
        if first_block > final_block:
            return []
        return [InputPartition((first_block, final_block))]

    def read(self, partition):
        """Yield the rows for one partition produced by ``partitions``."""
        first_block, final_block = partition.value
        return self._rows_for_range(first_block, final_block)

    @staticmethod
    def _dump_json(file_path, payload):
        """Write the serializable form of ``payload`` to ``file_path`` as JSON."""
        with open(file_path, "w") as f:
            json.dump(to_serializable(payload), f)

    def _rows_for_range(self, first_block, final_block):
        """Fetch blocks ``first_block..final_block`` (inclusive) and yield rows.

        Each block (or each transaction, depending on ``entity``) is also
        saved as raw JSON before its row is yielded.
        """
        client = self._client()
        for block_number in range(first_block, final_block + 1):
            block = client.eth.get_block(block_number, full_transactions=True)

            if self.entity == "blocks":
                self._dump_json(
                    os.path.join(raw_blocks_dir, f"block_{block_number}.json"),
                    block,
                )
                yield block_to_row(block)
            else:
                timestamp = datetime.fromtimestamp(block.timestamp)
                for tx in block.transactions:
                    self._dump_json(
                        os.path.join(raw_tx_dir, f"tx_{tx.hash.hex()}.json"),
                        tx,
                    )
                    yield tx_to_row(tx, block_number, timestamp)

            print(f"✅ Processed block {block_number}")

    def getRows(self):
        """Legacy polling helper: return rows for all blocks after ``last_block``.

        Advances ``last_block`` to the chain head observed at the start of the
        call, then sleeps for the 10-second poll interval before returning.
        """
        latest_block = self._client().eth.block_number
        rows = list(self._rows_for_range(self.last_block + 1, latest_block))
        self.last_block = latest_block
        time.sleep(10)  # poll interval
        return rows

class EthereumStreamWriter(DataSourceStreamWriter):
    """Placeholder stream writer returned by the data source's ``streamWriter``.

    It only stores the options it is given; no rows are written anywhere.

    NOTE(review): ``createWriter`` does not look like one of the hooks the
    DataSourceStreamWriter interface calls (it appears to expect ``write`` /
    ``commit`` / ``abort``), so writing through this class would presumably
    fail — confirm before using it as a sink.
    """

    def __init__(self, options):
        # Keep the data source options; nothing in this class reads them.
        self.options = options

    def createWriter(self, partitionId, taskId, epochId):
        # Stub: no per-partition writer is created.
        return None

class EthereumCustomDataSource(DataSource):
    """Custom data source exposing Ethereum blocks or transactions.

    The ``entity`` option selects what is produced: "blocks" (the default)
    uses ``block_schema``; any other value uses ``tx_schema``. ``schema`` and
    ``streamReader`` resolve the schema through the same helper so they
    cannot disagree for a given entity.
    """

    @classmethod
    def name(cls):
        """Return the short format name used with ``format(...)``."""
        return "ethereum_custom"

    def _entity(self):
        """Return the configured entity, defaulting to "blocks"."""
        return self.options.get("entity", "blocks")

    def _entity_schema(self):
        """Return the schema that matches the configured entity."""
        return block_schema if self._entity() == "blocks" else tx_schema

    def schema(self):
        """Return the default schema for the configured entity."""
        return self._entity_schema()

    def streamReader(self, schema):
        """Create the streaming reader for the configured entity.

        The schema is derived from the entity rather than from ``schema``,
        because the rows the reader emits have a fixed shape per entity.
        """
        return EthereumStreamReader(self._entity_schema(), self.options, self._entity())

    def streamWriter(self, schema, overwrite):
        """Create the stream writer; ``schema`` and ``overwrite`` are unused."""
        return EthereumStreamWriter(self.options)


In [0]:
# Blocks stream
# The custom format must be registered with the session before it can be
# resolved by name in format(...).
spark.dataSource.register(EthereumCustomDataSource)

df_blocks = (
    spark.readStream.format("ethereum_custom")
    .option("entity", "blocks")
    .load(schema=block_schema)
)

blocks_query = (
    df_blocks.writeStream.format("delta")
    .option("checkpointLocation", checkpoint_blocks)
    .option("path", delta_blocks_path)
    .outputMode("append")
    .trigger(once=True)
    .start()
)

# start() returns immediately. This is a one-shot run, so wait for it to
# finish: failures surface in this cell and any output is committed before
# later cells run.
blocks_query.awaitTermination()

In [0]:
# Transactions stream
# The custom format must be registered with the session before it can be
# resolved by name in format(...).
spark.dataSource.register(EthereumCustomDataSource)

df_tx = (
    spark.readStream.format("ethereum_custom")
    .option("entity", "transactions")
    .load(schema=tx_schema)
)

# No trigger is set, so this query keeps running in the background. Keep the
# handle so it can be inspected (tx_query.status) or stopped (tx_query.stop()).
tx_query = (
    df_tx.writeStream.format("delta")
    .option("checkpointLocation", checkpoint_tx)
    .option("path", delta_tx_path)
    .outputMode("append")
    .start()
)


In [0]:
# Expose each Delta output location as a named table; IF NOT EXISTS makes the
# statement a no-op when the table is already defined.
for table_name, location in (
    ("ethereum_blocks", delta_blocks_path),
    ("ethereum_transactions", delta_tx_path),
):
    spark.sql(f"""
CREATE TABLE IF NOT EXISTS {table_name}
USING DELTA
LOCATION '{location}'
""")
