In [None]:
!pip install dlt[duckdb]

In [1]:
# Sanity check: print the version reported by the dlt command-line tool.
!dlt --version

[39mdlt 1.6.1[0m


In [None]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator


# The @dlt.resource decorator registers this generator as the API source;
# the resource is named "ny_taxi_rides".
@dlt.resource(name="ny_taxi_rides")
def ny_taxi():
    """Yield NYC taxi ride data from the Zoomcamp API, one page per item.

    Pagination is delegated to dlt's built-in REST client: page numbers
    start at 1 and no total-page field is configured (``total_path=None``),
    so the paginator presumably stops on the first empty page -- confirm
    against the dlt paginator documentation.

    Yields:
        Each page of results returned by ``RESTClient.paginate``.
    """
    page_paginator = PageNumberPaginator(base_page=1, total_path=None)
    api = RESTClient(
        base_url="https://us-central1-dlthub-analytics.cloudfunctions.net",
        paginator=page_paginator,
    )

    yield from api.paginate("data_engineering_zoomcamp_api")


# Create the dlt pipeline object used by every later cell.
pipeline = dlt.pipeline(
    destination="duckdb",              # load into DuckDB
    dataset_name="ny_taxi_data",       # dataset the tables are loaded into
    pipeline_name="ny_taxi_pipeline",  # also reused later to locate the .duckdb file
)


# Extract from the API resource and load the data into DuckDB for querying.
# write_disposition="replace" makes this cell idempotent: the DuckDB file persists
# between kernel restarts, so the default append behaviour would add a duplicate
# copy of every row each time the cell (or the whole notebook) is re-run.
load_info = pipeline.run(ny_taxi, write_disposition="replace")
print(load_info)

In [None]:
import duckdb

# The interactive table formatter only exists on Google Colab. Guard the import so
# this cell still runs in plain Jupyter / JupyterLab, where DataFrames simply use
# the default pandas rendering.
try:
    from google.colab import data_table
except ImportError:
    pass  # not on Colab: nothing to enable
else:
    data_table.enable_dataframe_formatter()

# Connect to the DuckDB database file named after the pipeline.
conn = duckdb.connect(f"{pipeline.pipeline_name}.duckdb")

# Make the pipeline's dataset the default schema so tables can be referenced unqualified.
conn.sql(f"SET search_path = '{pipeline.dataset_name}'")

# Show the tables in the dataset (last expression, so it renders as the cell output).
conn.sql("DESCRIBE").df()

In [6]:
# Read the loaded ny_taxi_rides table back through the pipeline as a DataFrame.
df = (
    pipeline
    .dataset(dataset_type="default")
    .ny_taxi_rides
    .df()
)
# Number of rows that were loaded.
print(df.shape[0])

10000


In [8]:
# Average difference, in minutes, between pickup and dropoff timestamps
# across all rows of ny_taxi_rides.
avg_duration_query = """
            SELECT
            AVG(date_diff('minute', trip_pickup_date_time, trip_dropoff_date_time))
            FROM ny_taxi_rides;
            """

# Run the query through the pipeline's own SQL client; the context manager
# releases the client when the block exits.
with pipeline.sql_client() as client:
    res = client.execute_sql(avg_duration_query)
    print(res)

[(12.3049,)]
