In [1]:
import duckdb
print(duckdb.__version__)
import os

1.2.0


## Ingest data

In [2]:
y  = "/mnt/ugreen/HuggingFace/Ethereum_blockchain_parquet"
y2 = "/mnt/ugreen/Eth_pq_sample/Ethereum_blockchain_parquet_full_zstd_10"
y3 = "/mnt/ugreen/Ethereum_blockchain_parquet_full_zstd_10"

def get_blocks_folder(folder):
    return os.path.join(folder, "blocks", "*.parquet")

def get_tx_folder(folder):
    return os.path.join(folder, "transactions", "*.parquet")

In [3]:
def ingest_pq(*cols, folder):

    # Convert column names to a comma-separated string
    col_str = ", ".join(cols) if cols else "*"

    pq_input = duckdb.sql(f"""
                           SELECT {col_str} 
                           FROM read_parquet('{folder}')
                           """)

    return pq_input

## Run query

### Count

In [4]:
def get_num_rows(folder):

    one_col = ingest_pq("block_number", folder = folder)

    num_rows = duckdb.sql("""
                          SELECT COUNT(*) FROM one_col
                          """)

    print(num_rows)

    return None    

In [7]:
%%time

get_num_rows(get_blocks_folder(y))

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       634440 │
└──────────────┘

CPU times: user 28.3 ms, sys: 11.8 ms, total: 40.1 ms
Wall time: 18.1 ms


In [8]:
%%time

get_num_rows(get_tx_folder(y))

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     93492934 │
└──────────────┘

CPU times: user 126 ms, sys: 11.5 ms, total: 138 ms
Wall time: 47.3 ms


### Get time range

In [10]:
def get_time_range(folder):

    one_col = ingest_pq("timestamp", folder = folder)

    min_max_time = duckdb.sql("""
                    SELECT 
                    MIN(TO_TIMESTAMP(timestamp)) FILTER (WHERE timestamp > 0) AS min_time, 
                    MAX(TO_TIMESTAMP(timestamp)) AS max_time
                    FROM one_col   
                    """)

    return min_max_time   

In [11]:
%%time

get_time_range(get_blocks_folder(y))

CPU times: user 1.55 ms, sys: 3 ms, total: 4.55 ms
Wall time: 3.66 ms


┌──────────────────────────┬──────────────────────────┐
│         min_time         │         max_time         │
│ timestamp with time zone │ timestamp with time zone │
├──────────────────────────┼──────────────────────────┤
│ 2016-11-17 01:40:08+01   │ 2025-03-25 22:17:35+01   │
└──────────────────────────┴──────────────────────────┘

### Filter blocks

In [12]:
def get_blocks_filtered(t_start, t_end, folder):

    dim_cols = ingest_pq("block_number", "timestamp",
                          folder = folder)

    filter_blocks = duckdb.sql(f"""
                                SELECT 
                                TO_TIMESTAMP(timestamp) AS timestamp, 
                                block_number
                                FROM dim_cols
                                WHERE TO_TIMESTAMP(timestamp) 
                                BETWEEN '{t_start}' AND '{t_end}'
                               """)

    return filter_blocks    

In [13]:
# %%time

# get_blocks_filtered("2022-01-01", "2024-12-31")

### Block level analytics

#### Tx count per block

In [14]:
def get_tx_count_per_block(t_start, t_end, folder):
    
    blocks = get_blocks_filtered(t_start, t_end, get_blocks_folder(folder))

    tx_cols = ingest_pq("block_number",
                        "success",
                        folder = get_tx_folder(folder)
                       )
    
    tx_cols_success = duckdb.sql("""
                                 SELECT * FROM tx_cols
                                 WHERE tx_cols.success = TRUE
                                 """)

    count_per_block = duckdb.sql("""
                                SELECT 
                                    tx_cols_success.block_number, 
                                    blocks.timestamp, 
                                    COUNT(*) AS count_tx
                                FROM tx_cols_success
                                INNER JOIN blocks 
                                    ON tx_cols_success.block_number = blocks.block_number
                                GROUP BY 
                                    tx_cols_success.block_number, 
                                    blocks.timestamp
                                ORDER BY count_tx DESC
                                LIMIT 100
                                """).df()        
       
    return count_per_block

In [21]:
%%time

get_tx_count_per_block("2020-01-01", "2025-01-01", y)

CPU times: user 3.84 s, sys: 120 ms, total: 3.96 s
Wall time: 208 ms


Unnamed: 0,block_number,timestamp,count_tx
0,13330992,2021-10-01 05:19:25+02:00,1425
1,13331492,2021-10-01 07:13:20+02:00,1413
2,15471742,2022-09-04 14:50:39+02:00,1402
3,15273992,2022-08-04 07:25:20+02:00,1399
4,13235517,2021-09-16 09:49:44+02:00,1393
...,...,...,...
95,20137892,2024-06-21 07:05:47+02:00,1082
96,13657742,2021-11-21 11:53:26+01:00,1082
97,13795742,2021-12-13 09:32:27+01:00,1077
98,15553367,2022-09-17 14:41:35+02:00,1069


In [19]:
# %%time

# get_tx_count_per_block("2023-09-01", "2025-03-01", y2)

#### Get total ETH per block

In [22]:
def get_eth_per_block(t_start, t_end, folder):
    
    blocks = get_blocks_filtered(t_start, t_end, get_blocks_folder(folder))

    tx_cols = ingest_pq("block_number",
                        "success",
                        "value_f64",
                        folder = get_tx_folder(folder)
                       )
    
    tx_cols_success = duckdb.sql("""
                                 SELECT * FROM tx_cols
                                 WHERE tx_cols.success = TRUE
                                 """)

    eth_per_block = duckdb.sql("""
                                SELECT 
                                tx_cols_success.block_number, 
                                blocks.timestamp, 
                                COUNT(*) AS count_tx,
                                SUM(tx_cols_success.value_f64 / 1e18) AS total_tx_eth
                                FROM tx_cols_success
                                INNER JOIN blocks 
                                    ON tx_cols_success.block_number = blocks.block_number
                                GROUP BY 
                                    tx_cols_success.block_number, 
                                    blocks.timestamp
                                ORDER BY total_tx_eth DESC
                                LIMIT 100
                                """).df()    
       
    return eth_per_block

In [24]:
%%time

get_eth_per_block("2020-01-01", "2025-01-01", y)

CPU times: user 4.85 s, sys: 241 ms, total: 5.09 s
Wall time: 259 ms


Unnamed: 0,block_number,timestamp,count_tx,total_tx_eth
0,10328842,2020-06-24 15:54:49+02:00,206,789744.896731
1,15093817,2022-07-07 08:30:47+02:00,353,598912.604601
2,12506592,2021-05-26 01:59:57+02:00,136,527673.911818
3,12461692,2021-05-19 02:44:33+02:00,261,500081.936254
4,15433542,2022-08-29 12:19:06+02:00,249,500057.213779
...,...,...,...,...
95,15946367,2022-11-11 12:05:35+01:00,131,50059.604783
96,21128917,2024-11-06 14:30:35+01:00,191,50041.179866
97,12839192,2021-07-16 19:15:58+02:00,243,50040.950173
98,18484467,2023-11-02 13:42:23+01:00,160,50031.292705


In [25]:
# %%time

# get_eth_per_block("2023-01-01", "2025-01-01", y3)

#### Get total gas used % per block

In [27]:
def get_gas_per_block(t_start, t_end, folder):
    
    blocks = get_blocks_filtered(t_start, t_end, get_blocks_folder(folder))

    tx_cols = ingest_pq("block_number",
                        "success",
                        "gas_limit",
                        "gas_used",
                        folder = get_tx_folder(folder)
                       )
    
    tx_cols_success = duckdb.sql("""
                                 SELECT * FROM tx_cols
                                 WHERE tx_cols.success = TRUE
                                 """)

    gas_per_block = duckdb.sql("""
                                SELECT 
                                    tx.block_number, 
                                    b.timestamp, 
                                    SUM(tx.gas_limit) AS sum_gas_limit,
                                    SUM(tx.gas_used) AS sum_gas_used,
                                    (SUM(tx.gas_used) * 100.0 / NULLIF(SUM(tx.gas_limit), 0)) AS gas_pert
                                FROM tx_cols_success tx
                                INNER JOIN blocks b
                                    ON tx.block_number = b.block_number
                                GROUP BY 
                                    tx.block_number, 
                                    b.timestamp
                                ORDER BY gas_pert DESC
                                LIMIT 100;
                                """).df()    
       
    return gas_per_block

In [28]:
%%time

get_gas_per_block("2020-01-01", "2025-01-01", y)

CPU times: user 6.22 s, sys: 1.45 s, total: 7.67 s
Wall time: 2.26 s


Unnamed: 0,block_number,timestamp,sum_gas_limit,sum_gas_used,gas_pert
0,14752267,2022-05-11 04:42:55+02:00,21000.0,21000.0,100.000000
1,13212767,2021-09-12 21:11:39+02:00,21000.0,21000.0,100.000000
2,20388292,2024-07-26 06:20:23+02:00,84209.0,84209.0,100.000000
3,15055692,2022-07-01 11:03:36+02:00,214996.0,214996.0,100.000000
4,9646492,2020-03-10 23:38:16+01:00,21000.0,21000.0,100.000000
...,...,...,...,...,...
95,16024067,2022-11-22 08:29:47+01:00,30125599.0,29890435.0,99.219388
96,12513192,2021-05-27 02:36:21+02:00,15050770.0,14932003.0,99.210891
97,12211017,2021-04-10 10:22:47+02:00,12585000.0,12482857.0,99.188375
98,12446267,2021-05-16 17:24:36+02:00,14985339.0,14859411.0,99.159659
