Ingesting data into the database

In [None]:
# Define a dlt resource that downloads the monthly Parquet files and loads them as a single table
@dlt.resource(name="rides", write_disposition="replace")
def download_parquet():
    """Yield the 2024 Yellow Taxi trip data, one DataFrame per month (January-June).

    Each monthly Parquet file is fetched over HTTP and parsed entirely in memory.

    Yields:
        pandas.DataFrame: the contents of one monthly Parquet file.

    Raises:
        requests.HTTPError: if a download responds with a 4xx/5xx status.
        requests.Timeout: if the server stops responding during a download.
    """
    prefix = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata'
    for month in range(1, 7):
        print(f"Downloading data for month: {month}")
        # Zero-pad via the format spec so the URL stays valid if the range is
        # ever extended to two-digit months.
        url = f"{prefix}_2024-{month:02d}.parquet"
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=120)
        # Fail fast on an HTTP error instead of handing an error page to the
        # Parquet reader, which would raise a confusing parse error.
        response.raise_for_status()
        yield pd.read_parquet(BytesIO(response.content))

# Resolve the DuckDB database path and the dataset name.
import os
# Any pre-set DUCKDB_PATH is discarded first, so the lookup below always
# resolves to the default path; only DUCKDB_DATASET is actually taken from the
# environment.
os.environ.pop('DUCKDB_PATH', None)
duckdb_path = os.getenv('DUCKDB_PATH', 'data/rides_pipeline.db')
duckdb_dataset = os.getenv('DUCKDB_DATASET', 'rides_dataset')

# Make sure the directory that will hold the database file exists, so a fresh
# checkout without a data/ folder can still create the file.
os.makedirs(os.path.dirname(duckdb_path) or '.', exist_ok=True)

# Unset GCP-related env vars to avoid contamination
# os.environ.pop('GCP_CREDENTIALS_PATH', None)
# os.environ.pop('BUCKET_URL', None)
# os.environ.pop('DESTINATION__CREDENTIALS', None)

# Export the resolved path through the environment as well
os.environ['DUCKDB_DATABASE'] = duckdb_path

# Build the destination first: the DuckDB destination factory is given the
# path of the database file the pipeline should write to.
rides_destination = dlt.destinations.duckdb(duckdb_path)

# Create the pipeline that loads into the configured dataset
duck_pipeline = dlt.pipeline(
    pipeline_name="rides_pipeline",
    destination=rides_destination,
    dataset_name=duckdb_dataset,
)

# Load the Parquet data produced by the resource into DuckDB and show the
# load summary
info = duck_pipeline.run(download_parquet)
print(info)


In [None]:
import duckdb

# Open the database file the pipeline was configured to write to. The
# "<pipeline_name>.duckdb" file name would point at a different file, because
# this pipeline was created with an explicit path (duckdb_path).
conn = duckdb.connect(duckdb_path)

# Set search path to the dataset so its tables resolve without a schema prefix
conn.sql(f"SET search_path = '{duck_pipeline.dataset_name}'")

# Describe the dataset to see loaded tables
res = conn.sql("DESCRIBE").df()
print(res)

## Question 1. Counting records

    What is the count of records for the 2024 Yellow Taxi Data?

In [None]:
# The table is named after the dlt resource ("rides"); count all of its rows
row_count_sql = "SELECT count(1) FROM rides"
with duck_pipeline.sql_client() as client, client.execute_query(row_count_sql) as cursor:
    data = cursor.df()
print(data)

## Question 2. Data read estimation

    Write a query to count the distinct number of PULocationIDs for the entire dataset on both the tables.
    What is the estimated amount of data that will be read when this query is executed on the External Table and the Table?

In [None]:
# Number of distinct pickup locations (pu_location_id) in the DuckDB rides table
distinct_pu_sql = "SELECT COUNT(DISTINCT pu_location_id) AS distinct_pu FROM rides"
with duck_pipeline.sql_client() as client, client.execute_query(distinct_pu_sql) as cursor:
    duckdb_distinct = cursor.df()
print("DuckDB Table distinct pu_location_id:", duckdb_distinct)

