# Module 3: Data Ingestion with dlt (Data Load Tool)
**Description:** Discovery and Prototyping for 2024 Yellow Taxi Ingestion.
**Course:** DataTalksClub Data Engineering Zoomcamp (2026)

**Credits:**

* **Original dlt logic:** DataTalksClub / dltHub
* **Modifications & Infrastructure Sync:** Victoria T.
* **Key Modification:** Replaced manual secrets with local `terraform.tfvars` parsing and Application Default Credentials (ADC) authentication.

In [None]:
import os
import hcl2
import dlt
from dlt.destinations import filesystem

In [None]:
# load Terraform config
def get_tf_config(path="../terraform/terraform.tfvars"):
    """Read a Terraform ``.tfvars`` file into a flat dict.

    Values that the HCL parser returns wrapped in a non-empty list are
    reduced to their first element; every other value (including an empty
    list) is passed through unchanged.

    Args:
        path: Location of the tfvars file, relative to the working directory.

    Returns:
        Mapping of variable name to value, or an empty dict when the file
        does not exist (a warning is printed in that case).
    """
    try:
        # explicit encoding so parsing does not depend on the platform default
        with open(path, 'r', encoding="utf-8") as f:
            config = hcl2.load(f)
    except FileNotFoundError:
        print("Warning: terraform.tfvars not found. Falling back to local defaults.")
        return {}

    # standard hcl2 extraction; the `and v` guard keeps an empty list from
    # raising IndexError on v[0].
    # NOTE(review): a genuinely list-typed variable is also reduced to its
    # first element here — confirm against the installed python-hcl2 version.
    return {k: v[0] if isinstance(v, list) and v else v for k, v in config.items()}

# resolve project settings from the Terraform variables, with placeholder
# defaults for anything that is missing
CONFIG = get_tf_config()
PROJECT_ID = CONFIG.get("project", "your-fallback-project")
BUCKET_NAME = CONFIG.get("gcs_bucket_name", "your-fallback-bucket")

# pass the settings to dlt through environment variables; no secrets are set
# here (use 'gcloud login' session)
os.environ.update(
    {
        "DESTINATION__FILESYSTEM__BUCKET_URL": f"gs://{BUCKET_NAME}",
        "DESTINATION__BIGQUERY__PROJECT_ID": PROJECT_ID,
    }
)

print(f"dlt linked to Bucket: {BUCKET_NAME}")

In [None]:
import requests
import pandas as pd
from io import BytesIO

Ingesting Parquet files into GCS.

In [None]:
@dlt.source(name="rides")
def download_parquet():
    """Yield one dlt resource per monthly 2024 Yellow Taxi Parquet file.

    Months 1 through 6 are downloaded from the CloudFront URL prefix below,
    read into a pandas DataFrame, and wrapped in a dlt resource named after
    the file. A month whose download does not return HTTP 200 is skipped,
    and a warning is printed so the gap is visible in the notebook output.

    Yields:
        A dlt resource holding the DataFrame for one month.
    """
    prefix = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024"
    for month in range(1, 7):
        # zero-pad through the format spec so the pattern stays correct if
        # the range is ever extended to two-digit months
        file_name = f"yellow_tripdata_2024-{month:02d}.parquet"
        url = f"{prefix}-{month:02d}.parquet"

        print(f"Downloading: {url}")
        # bounded wait so a stalled connection cannot hang the notebook
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            # best-effort: keep going, but do not drop the month silently
            print(f"Warning: skipping {url} (HTTP {response.status_code})")
            continue

        df = pd.read_parquet(BytesIO(response.content))
        # return the dataframe as a dlt resource for ingestion
        yield dlt.resource(df, name=file_name)

# configure the pipeline that writes through dlt's filesystem destination
# (the GCS bucket); files are laid out as <schema_name>/<table_name>.<ext>
pipeline = dlt.pipeline(
    dataset_name="rides_dataset",
    destination=filesystem(layout="{schema_name}/{table_name}.{ext}"),
    pipeline_name="rides_pipeline_gcs",
)

# run the source and write its tables as Parquet files to the filesystem destination
load_info = pipeline.run(
    download_parquet(),
    loader_file_format="parquet",
)
print(load_info)

Ingesting data into a local DuckDB database.

In [None]:
# Re-using the resource for local testing
@dlt.resource(name="rides", write_disposition="replace")
def download_parquet_single():
    """Yield the 2024 Yellow Taxi data for months 1-6 as pandas DataFrames.

    Each month is downloaded from the CloudFront URL prefix below and
    yielded as one DataFrame into the single ``rides`` resource. A month
    whose download does not return HTTP 200 is skipped, and a warning is
    printed so the gap is visible in the notebook output.

    Yields:
        The DataFrame read from one monthly Parquet file.
    """
    prefix = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024'
    for month in range(1, 7):
        # zero-pad through the format spec so the pattern stays correct if
        # the range is ever extended to two-digit months
        url = f"{prefix}-{month:02d}.parquet"
        # bounded wait so a stalled connection cannot hang the notebook
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            # best-effort: keep going, but do not drop the month silently
            print(f"Warning: skipping {url} (HTTP {response.status_code})")
            continue
        yield pd.read_parquet(BytesIO(response.content))

# local pipeline that loads the 'rides' resource into DuckDB for testing
pipeline_test = dlt.pipeline(
    dataset_name="rides_test_dataset",
    destination="duckdb",
    pipeline_name="rides_test_pipeline",
)

# the decorated resource is handed to run() as-is
info = pipeline_test.run(
    download_parquet_single,
)
print(info)

In [None]:
import duckdb

# Inspect what the test pipeline loaded by opening its DuckDB file directly.
# The file name is derived from the pipeline name: "<pipeline_name>.duckdb".
conn = duckdb.connect(f"{pipeline_test.pipeline_name}.duckdb")

# Read the schema name from the pipeline object rather than hard-coding it.
# NOTE(review): the original comment said dlt may prefix or modify this name
# internally — confirm against the dlt version in use.
dataset_schema = pipeline_test.dataset_name

try:
    # Point unqualified table names at the pipeline's dataset schema.
    conn.sql(f"SET search_path = '{dataset_schema}'")
    print(f"Search path set to: {dataset_schema}")
    
    # describe the dataset to see the 'rides' table
    res = conn.sql("DESCRIBE").df()
    print(res)
except duckdb.CatalogException:
    # Fallback for a catalog error: list the schemas that do exist so the
    # right name can be picked by hand.
    print(f"'{dataset_schema}' not found. Available schemas:")
    print(conn.sql("SELECT schema_name FROM information_schema.schemata").df())

In [None]:
# count the rows in the 'rides' table through the pipeline's own SQL client;
# both the client and the cursor are released when the block exits
with pipeline_test.sql_client() as client, client.execute_query(
    "SELECT count(1) FROM rides"
) as cursor:
    data = cursor.df()
print(data)