In [10]:
# pip install psycopg2-binary duckdb

In [2]:
import os
import shutil
import pandas as pd
import numpy as np
import random
import pyarrow as pa
import pyarrow.parquet as pq
from sqlalchemy import create_engine

def create_mock_data(num_rows=1000, start_date='2025-02-14', end_date='2025-02-19'):
    """
    Generate a DataFrame with mock data over a five-day period.
    
    Columns:
      - id: Sequential identifier.
      - created_on: Random timestamps within the specified range.
      - some_numeric: A float column.
      - name: A simple text column.
      - col_mixed: Mostly integers; about 20% of rows contain the string "error".
      - col_list: With 50% probability, a string representation of a list; otherwise, a plain string.
    """
    start_ts = pd.Timestamp(start_date)
    end_ts = pd.Timestamp(end_date)
    random.seed(42)
    np.random.seed(42)
    random_seconds = np.random.randint(
        start_ts.value // 10**9,
        end_ts.value // 10**9 + 1,
        size=num_rows
    )
    created_on = pd.to_datetime(random_seconds, unit='s')
    ids = np.arange(1, num_rows + 1)
    
    def random_mixed_value():
        return random.randint(1, 100) if random.random() < 0.8 else "error"
    col_mixed = [random_mixed_value() for _ in range(num_rows)]
    
    def random_list_value():
        return str([random.randint(1, 10), random.randint(1, 10)]) if random.random() < 0.5 else "single_value"
    col_list = [random_list_value() for _ in range(num_rows)]
    
    df = pd.DataFrame({
        "id": ids,
        "created_on": created_on,
        "some_numeric": np.random.randn(num_rows),
        "name": [f"Name_{i}" for i in ids],
        "col_mixed": col_mixed,
        "col_list": col_list
    })
    df.sort_values("created_on", inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def write_data_to_postgres(df, table_name="mockdata"):
    """
    Write the DataFrame to PostgreSQL using SQLAlchemy.
    This creates or replaces the table 'mockdata'.
    """
    connection_str = "postgresql+psycopg2://testuser:testpassword@postgres:5432/testdb"
    engine = create_engine(connection_str)
    df.to_sql(table_name, engine, if_exists="replace", index=False)
    print(f"Data written to table '{table_name}' in PostgreSQL.")
    return engine

def run_etl(engine, root_path, partition_cols=["year", "month", "day"], overwrite_date=None):
    """
    Run ETL by:
      1. Querying data from PostgreSQL.
      2. Adding partition columns (year, month, day) from 'created_on'.
      3. If overwrite_date is provided, clearing the contents of the corresponding partition folder.
      4. Converting the DataFrame to a pyarrow Table.
      5. Writing the table to a Parquet dataset partitioned by the specified partition columns.
      
    When overwrite_date is provided (as "YYYY-MM-DD"), the query fetches only that day's data.
    """
    # 1. Build a query that fetches only one day if overwrite_date is provided.
    if overwrite_date is not None:
        query = f"""
            SELECT
                id,
                name,
                some_numeric,
                created_on
            FROM mockdata
            WHERE to_char(created_on, 'YYYY-MM-DD') = '{overwrite_date}'
        """
    else:
        query = """
            SELECT
                id,
                name,
                some_numeric,
                created_on
            FROM mockdata
            WHERE 1=1
        """
    df = pd.read_sql_query(query, engine, dtype_backend="pyarrow")
    
    # 2. Add partition columns based on 'created_on'
    df["year"] = df["created_on"].dt.strftime("%Y")
    df["month"] = df["created_on"].dt.strftime("%m")
    df["day"] = df["created_on"].dt.strftime("%d")
    
    # 3. If overwrite_date is provided, clear only the target partition folder.
    if overwrite_date is not None:
        # Extract partition folder from overwrite_date.
        y, m, d = overwrite_date.split("-")
        partition_dir = os.path.join(root_path, f"year={y}", f"month={m}", f"day={d}")
        if os.path.exists(partition_dir):
            print(f"Clearing contents in partition directory: {partition_dir}")
            for entry in os.scandir(partition_dir):
                if entry.is_file() or entry.is_symlink():
                    os.remove(entry.path)
                elif entry.is_dir():
                    shutil.rmtree(entry.path)
            print(f"Cleared contents in partition: {partition_dir}")
        else:
            print(f"Partition directory {partition_dir} does not exist. It will be created.")
    
    # 4. Convert DataFrame to pyarrow Table.
    table = pa.Table.from_pandas(df, preserve_index=False)
    
    # 5. Write the table to a Parquet dataset partitioned by the specified columns.
    pq.write_to_dataset(
        table=table,
        root_path=root_path,
        partition_cols=partition_cols
    )
    print("ETL complete. Data written to partitions under:", root_path)


In [14]:
# Step 1: Create mock data.
df = create_mock_data(num_rows=1_000_000)
df.info()
df.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   id            1000000 non-null  int64         
 1   created_on    1000000 non-null  datetime64[ns]
 2   some_numeric  1000000 non-null  float64       
 3   name          1000000 non-null  object        
 4   col_mixed     1000000 non-null  object        
 5   col_list      1000000 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 45.8+ MB


Unnamed: 0,id,created_on,some_numeric,name,col_mixed,col_list
0,203234,2025-02-14 00:00:01,-0.494074,Name_203234,51,single_value
1,672607,2025-02-14 00:00:02,-2.331103,Name_672607,error,"[7, 9]"
2,675467,2025-02-14 00:00:02,-0.608402,Name_675467,1,single_value
3,132097,2025-02-14 00:00:03,1.093924,Name_132097,36,single_value
4,897351,2025-02-14 00:00:03,-0.353474,Name_897351,90,single_value
5,593221,2025-02-14 00:00:03,0.988131,Name_593221,error,"[5, 2]"
6,916728,2025-02-14 00:00:03,0.134446,Name_916728,69,"[10, 7]"
7,895512,2025-02-14 00:00:04,0.456805,Name_895512,78,single_value
8,400742,2025-02-14 00:00:04,0.458349,Name_400742,45,single_value
9,272198,2025-02-14 00:00:04,1.462708,Name_272198,12,single_value


In [15]:
# Step 2: Write mock data to PostgreSQL.
engine = write_data_to_postgres(df, table_name="mockdata")

Data written to table 'mockdata' in PostgreSQL.


In [5]:
# Root path for output Parquet dataset.
parquet_root = "l.fact_basic"

# Run ETL in full mode (all data).
# run_etl(engine, parquet_root)

# To process one specific day (e.g. '2025-02-17') and clear only that partition's files:
run_etl(engine, parquet_root, overwrite_date="2025-02-17")

Partition directory l.fact_basic/year=2025/month=02/day=17 does not exist. It will be created.
ETL complete. Data written to partitions under: l.fact_basic


In [6]:
run_etl(engine, parquet_root, overwrite_date="2025-02-18")

Partition directory l.fact_basic/year=2025/month=02/day=18 does not exist. It will be created.
ETL complete. Data written to partitions under: l.fact_basic


In [16]:
run_etl(engine, parquet_root, overwrite_date="2025-02-19")

Partition directory l.fact_basic/year=2025/month=02/day=19 does not exist. It will be created.
ETL complete. Data written to partitions under: l.fact_basic


In [11]:
import duckdb
import pandas as pd

# Connect to an in-memory DuckDB instance.
con = duckdb.connect(database=':memory:')

# Use DuckDB's read_parquet() function to read the dataset.
# Assume the dataset is stored in the directory "output_parquet_dataset" with hive partitions:
# output_parquet_dataset/year=2025/month=02/day=17/...
query = """
SELECT *
FROM read_parquet('l.fact_basic/**/*.parquet')
WHERE year = '2025'
  AND month = '02'
"""

df2 = con.execute(query).arrow().to_pandas(types_mapper=pd.ArrowDtype)

print("DuckDB Parquet Query Result:")
df2.info()
df2.head(20)


DuckDB Parquet Query Result:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype                 
---  ------        --------------  -----                 
 0   id            6 non-null      int64[pyarrow]        
 1   name          6 non-null      string[pyarrow]       
 2   some_numeric  6 non-null      double[pyarrow]       
 3   created_on    6 non-null      timestamp[ns][pyarrow]
 4   day           6 non-null      int64[pyarrow]        
 5   month         6 non-null      string[pyarrow]       
 6   year          6 non-null      int64[pyarrow]        
dtypes: double[pyarrow](1), int64[pyarrow](3), string[pyarrow](2), timestamp[ns][pyarrow](1)
memory usage: 480.0 bytes


Unnamed: 0,id,name,some_numeric,created_on,day,month,year
0,17,Name_17,0.122219,2025-02-17 05:16:07,17,2,2025
1,19,Name_19,-0.600254,2025-02-17 19:29:25,17,2,2025
2,4,Name_4,-0.924083,2025-02-18 05:37:18,18,2,2025
3,13,Name_13,-0.629475,2025-02-18 08:07:51,18,2,2025
4,14,Name_14,0.59772,2025-02-18 11:54:28,18,2,2025
5,11,Name_11,-0.703344,2025-02-18 23:33:30,18,2,2025
