In [17]:
import duckdb
print(duckdb.__version__)
import os

1.2.1


## Ingest data

In [18]:
# Root directory of the local parquet export. Set the ETH_PARQUET_DIR
# environment variable to point the notebook at another copy of the dataset;
# when it is unset the original location is used.
yearly_folder = os.environ.get(
    "ETH_PARQUET_DIR",
    "/home/vikas/Desktop/HuggingFace/Ethereum_blockchain_parquet",
)

# Glob patterns (one per table) that are later handed to read_parquet().
blocks = os.path.join(yearly_folder, "blocks", "*.parquet")
tx = os.path.join(yearly_folder, "transactions", "*.parquet")

In [9]:
def ingest_pq(*cols, folder):
    """Build a DuckDB query that selects columns from parquet files.

    Args:
        *cols: Column names (or SQL expressions) to select. They are joined
            with commas and inserted into the query verbatim. When no column
            is given, ``*`` is selected.
        folder: Path or glob pattern passed to ``read_parquet``.

    Returns:
        The object returned by ``duckdb.sql`` for the generated SELECT.
    """
    # Convert column names to a comma-separated string
    col_str = ", ".join(cols) if cols else "*"

    # The path is embedded in a single-quoted SQL literal; double any quote in
    # it so a path containing "'" cannot terminate the literal early.
    folder_literal = str(folder).replace("'", "''")

    return duckdb.sql(f"""
        SELECT {col_str}
        FROM read_parquet('{folder_literal}')
    """)

## Run query

### Count

In [12]:
def get_num_rows(folder):
    """Print the result of a ``COUNT(*)`` over the parquet files in ``folder``.

    Args:
        folder: Path or glob pattern forwarded to ``ingest_pq``. The files
            must contain a ``block_number`` column, which is the only column
            selected before counting.

    Returns:
        None. The count is only printed.
    """
    # The query text below refers to this local variable by name, so the
    # variable name and the name in the SQL must stay in sync.
    one_col = ingest_pq("block_number", folder=folder)

    count_query = """
                          SELECT COUNT(*) FROM one_col
                          """

    print(duckdb.sql(count_query))

In [19]:
%%time

get_num_rows(blocks)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       634440 │
└──────────────┘

CPU times: user 26.2 ms, sys: 2.4 ms, total: 28.6 ms
Wall time: 14.2 ms


In [22]:
%%time

get_num_rows(tx)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     93492934 │
└──────────────┘

CPU times: user 134 ms, sys: 11.8 ms, total: 146 ms
Wall time: 50 ms


### Get time range

In [23]:
def get_time_range(folder=None):
    """Return the earliest and latest ``timestamp`` found in a parquet folder.

    Args:
        folder: Path or glob pattern of parquet files with a ``timestamp``
            column. Defaults to the module-level ``blocks`` pattern, which is
            what the function used before the parameter existed.

    Returns:
        The object returned by ``duckdb.sql`` for a query producing one row
        with the columns ``min_time`` and ``max_time``.
    """
    if folder is None:
        folder = blocks

    # The query text below refers to this local variable by name, so the
    # variable name and the name in the SQL must stay in sync.
    one_col = ingest_pq("timestamp", folder=folder)

    # TO_TIMESTAMP preserves ordering, so aggregate the raw values first and
    # convert only the two extremes instead of converting every row.
    return duckdb.sql("""
        SELECT
            TO_TIMESTAMP(MIN(timestamp)) AS min_time,
            TO_TIMESTAMP(MAX(timestamp)) AS max_time
        FROM one_col
    """)

In [24]:
%%time

get_time_range()

CPU times: user 3.05 ms, sys: 0 ns, total: 3.05 ms
Wall time: 1.89 ms


┌──────────────────────────┬──────────────────────────┐
│         min_time         │         max_time         │
│ timestamp with time zone │ timestamp with time zone │
├──────────────────────────┼──────────────────────────┤
│ 2016-11-17 01:40:08+01   │ 2025-03-25 22:17:35+01   │
└──────────────────────────┴──────────────────────────┘

### Filter blocks

In [33]:
def get_blocks_filtered(t_start, t_end):
    """Select the blocks whose timestamp lies between two bounds.

    Args:
        t_start: Lower bound, inserted into the query as a quoted literal
            (e.g. ``"2022-01-01"``).
        t_end: Upper bound, inserted the same way.

    Returns:
        The object returned by ``duckdb.sql`` for a query with the columns
        ``timestamp`` (the converted value) and ``block_number``.

    NOTE(review): BETWEEN includes both bounds, but a date-only ``t_end`` such
    as ``"2024-12-31"`` is presumably read as midnight at the start of that
    day, which would leave out the rest of that day -- confirm this is the
    intended range.
    """
    # The query text below refers to this local variable by name, so the
    # variable name and the name in the SQL must stay in sync.
    dim_cols = ingest_pq("block_number", "timestamp", folder=blocks)

    # Both bounds end up inside single-quoted SQL literals; double any quote
    # so a stray "'" cannot end the literal early and change the query.
    start_literal = str(t_start).replace("'", "''")
    end_literal = str(t_end).replace("'", "''")

    return duckdb.sql(f"""
        SELECT
            TO_TIMESTAMP(timestamp) AS timestamp,
            block_number
        FROM dim_cols
        WHERE TO_TIMESTAMP(timestamp)
            BETWEEN '{start_literal}' AND '{end_literal}'
    """)

In [32]:
%%time

get_blocks_filtered("2022-01-01", "2024-12-31")

CPU times: user 2.37 ms, sys: 417 µs, total: 2.78 ms
Wall time: 1.93 ms


┌──────────────────────────┬──────────────┐
│        timestamp         │ block_number │
│ timestamp with time zone │    uint32    │
├──────────────────────────┼──────────────┤
│ 2022-01-01 00:04:01+01   │     13915917 │
│ 2022-01-01 00:08:54+01   │     13915942 │
│ 2022-01-01 00:14:28+01   │     13915967 │
│ 2022-01-01 00:20:05+01   │     13915992 │
│ 2022-01-01 00:24:23+01   │     13916017 │
│ 2022-01-01 00:31:13+01   │     13916042 │
│ 2022-01-01 00:39:10+01   │     13916067 │
│ 2022-01-01 00:44:38+01   │     13916092 │
│ 2022-01-01 00:49:37+01   │     13916117 │
│ 2022-01-01 00:54:27+01   │     13916142 │
│           ·              │         ·    │
│           ·              │         ·    │
│           ·              │         ·    │
│ 2022-03-19 09:00:34+01   │     14415667 │
│ 2022-03-19 09:07:18+01   │     14415692 │
│ 2022-03-19 09:12:23+01   │     14415717 │
│ 2022-03-19 09:16:53+01   │     14415742 │
│ 2022-03-19 09:21:56+01   │     14415767 │
│ 2022-03-19 09:29:44+01   │    

### Block level analytics

#### Tx count per block

In [56]:
def get_tx_count_per_block(t_start, t_end, limit=100):
    """Rank blocks in a time window by their number of successful transactions.

    Args:
        t_start: Lower time bound, forwarded to ``get_blocks_filtered``.
        t_end: Upper time bound, forwarded to ``get_blocks_filtered``.
        limit: Maximum number of rows to return (default 100, the value that
            used to be fixed in the query). Must be convertible with ``int``.

    Returns:
        The object returned by ``duckdb.sql`` for a query with the columns
        ``block_number``, ``timestamp`` and ``count_tx``, ordered by
        ``count_tx`` descending.
    """
    # The SQL below refers to these local variables by name, so the Python
    # names and the names in the query text must stay in sync. The filtered
    # blocks get their own name so they no longer shadow the module-level
    # `blocks` path pattern.
    blocks_filtered = get_blocks_filtered(t_start, t_end)

    tx_cols = ingest_pq("block_number", "success", folder=tx)

    # Keep only transactions flagged as successful.
    tx_cols_success = duckdb.sql("""
        SELECT * FROM tx_cols
        WHERE tx_cols.success = TRUE
    """)

    # Several blocks can share the same count, so the grouping keys are added
    # as tie-breakers; otherwise the rows kept by LIMIT (and their order) are
    # not guaranteed to be the same from run to run.
    return duckdb.sql(f"""
        SELECT
            tx_cols_success.block_number,
            blocks_filtered.timestamp,
            COUNT(*) AS count_tx
        FROM tx_cols_success
        INNER JOIN blocks_filtered
            ON tx_cols_success.block_number = blocks_filtered.block_number
        GROUP BY
            tx_cols_success.block_number,
            blocks_filtered.timestamp
        ORDER BY
            count_tx DESC,
            tx_cols_success.block_number,
            blocks_filtered.timestamp
        LIMIT {int(limit)}
    """)

In [58]:
%%time

get_tx_count_per_block("2017-01-01", "2024-12-31")

CPU times: user 6.88 ms, sys: 81 µs, total: 6.96 ms
Wall time: 5.25 ms


┌──────────────┬──────────────────────────┬──────────┐
│ block_number │        timestamp         │ count_tx │
│    uint32    │ timestamp with time zone │  int64   │
├──────────────┼──────────────────────────┼──────────┤
│     13330992 │ 2021-10-01 05:19:25+02   │     1425 │
│     13331492 │ 2021-10-01 07:13:20+02   │     1413 │
│     15471742 │ 2022-09-04 14:50:39+02   │     1402 │
│     15273992 │ 2022-08-04 07:25:20+02   │     1399 │
│     13235517 │ 2021-09-16 09:49:44+02   │     1393 │
│     14619167 │ 2022-04-20 04:04:41+02   │     1388 │
│     13240667 │ 2021-09-17 04:49:36+02   │     1382 │
│     13543492 │ 2021-11-03 11:59:34+01   │     1381 │
│     13319492 │ 2021-09-29 10:03:32+02   │     1381 │
│     13332792 │ 2021-10-01 12:09:00+02   │     1380 │
│         ·    │           ·              │       ·  │
│         ·    │           ·              │       ·  │
│         ·    │           ·              │       ·  │
│     14586692 │ 2022-04-15 02:13:14+02   │     1094 │
│     1399

#### Get total ETH per block

In [63]:
def get_eth_per_block(t_start, t_end, limit=100):
    """Rank blocks in a time window by the summed value of their successful transactions.

    Args:
        t_start: Lower time bound, forwarded to ``get_blocks_filtered``.
        t_end: Upper time bound, forwarded to ``get_blocks_filtered``.
        limit: Maximum number of rows to return (default 100, the value that
            used to be fixed in the query). Must be convertible with ``int``.

    Returns:
        The object returned by ``duckdb.sql`` for a query with the columns
        ``block_number``, ``timestamp``, ``count_tx`` and ``total_tx_eth``,
        ordered by ``total_tx_eth`` descending.
    """
    # The SQL below refers to these local variables by name, so the Python
    # names and the names in the query text must stay in sync. The filtered
    # blocks get their own name so they no longer shadow the module-level
    # `blocks` path pattern.
    blocks_filtered = get_blocks_filtered(t_start, t_end)

    tx_cols = ingest_pq("block_number", "success", "value_f64", folder=tx)

    # Keep only transactions flagged as successful.
    tx_cols_success = duckdb.sql("""
        SELECT * FROM tx_cols
        WHERE tx_cols.success = TRUE
    """)

    # value_f64 is divided by 1e18 before summing -- presumably a wei -> ETH
    # conversion; confirm against the dataset schema.
    # The grouping keys are added as tie-breakers so that blocks with equal
    # totals come back in a reproducible order under LIMIT.
    return duckdb.sql(f"""
        SELECT
            tx_cols_success.block_number,
            blocks_filtered.timestamp,
            COUNT(*) AS count_tx,
            SUM(tx_cols_success.value_f64 / 1e18) AS total_tx_eth
        FROM tx_cols_success
        INNER JOIN blocks_filtered
            ON tx_cols_success.block_number = blocks_filtered.block_number
        GROUP BY
            tx_cols_success.block_number,
            blocks_filtered.timestamp
        ORDER BY
            total_tx_eth DESC,
            tx_cols_success.block_number,
            blocks_filtered.timestamp
        LIMIT {int(limit)}
    """)

In [65]:
%%time

get_eth_per_block("2017-01-01", "2024-12-31")

CPU times: user 0 ns, sys: 10.9 ms, total: 10.9 ms
Wall time: 18 ms


┌──────────────┬──────────────────────────┬──────────┬────────────────────┐
│ block_number │        timestamp         │ count_tx │    total_tx_eth    │
│    uint32    │ timestamp with time zone │  int64   │       double       │
├──────────────┼──────────────────────────┼──────────┼────────────────────┤
│      8118067 │ 2019-07-09 18:08:37+02   │      223 │ 1944140.3861762336 │
│      6803842 │ 2018-12-01 03:13:35+01   │       98 │ 1538628.4441383088 │
│     10328842 │ 2020-06-24 15:54:49+02   │      206 │  789744.8967305534 │
│     15093817 │ 2022-07-07 08:30:47+02   │      353 │  598912.6046008696 │
│      6801842 │ 2018-11-30 19:20:49+01   │      170 │  577531.2432517316 │
│     12506592 │ 2021-05-26 01:59:57+02   │      136 │   527673.911817796 │
│     12461692 │ 2021-05-19 02:44:33+02   │      261 │  500081.9362541297 │
│     15433542 │ 2022-08-29 12:19:06+02   │      249 │ 500057.21377944015 │
│     12249042 │ 2021-04-16 06:42:05+02   │      220 │ 451389.63398052484 │
│      85574