In [1]:
import dlt
import dlt
from itertools import islice
from dlt.sources.rest_api import rest_api_source



In [2]:
def openlibrary_source(query: str = "harry potter", page_size: int = 100):
    """Build a dlt REST API source that searches Open Library for books.

    Args:
        query: Search text sent as the ``q`` parameter of ``search.json``.
        page_size: Number of records requested per page. The same value
            feeds both the request's ``limit`` parameter and the offset
            paginator, so the two settings cannot drift apart.

    Returns:
        The object returned by ``rest_api_source``. It declares a single
        ``books`` resource whose records are taken from the ``docs`` field
        of each response, keyed on ``key`` and written with ``replace``.

    Raises:
        ValueError: If ``page_size`` is not a positive number.
    """
    if page_size < 1:
        raise ValueError(f"page_size must be a positive integer, got {page_size!r}")

    return rest_api_source({
        "client": {
            "base_url": "https://openlibrary.org",
        },
        "resource_defaults": {
            "primary_key": "key",
            "write_disposition": "replace",
        },
        "resources": [
            {
                "name": "books",
                "endpoint": {
                    "path": "search.json",
                    "params": {
                        "q": query,
                        "limit": page_size,
                    },
                    "data_selector": "docs",
                    # Offset pagination; the paginator is pointed at the
                    # response's "numFound" field for the total count.
                    "paginator": {
                        "type": "offset",
                        "limit": page_size,
                        "offset_param": "offset",
                        "limit_param": "limit",
                        "total_path": "numFound",
                    },
                },
            },
        ],
    })


In [3]:
# Define the pipeline: DuckDB destination, data written to the "ol_data" dataset.
pipeline = dlt.pipeline(
    pipeline_name="ol_demo",
    destination="duckdb",
    dataset_name="ol_data",
    progress="log"  # optional: logs the progress of each pipeline step
)

In [6]:
# Run only the extract step, using the source's default query.
extract_info = pipeline.extract(openlibrary_source())

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 0.00s | Rate: 0.00/s
Memory usage: 48.98 MB (84.80%) | CPU usage: 0.00%

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 0.59s | Rate: 0.00/s
books: 100  | Time: 0.00s | Rate: 24672376.47/s
Memory usage: 58.45 MB (83.60%) | CPU usage: 0.00%

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 1.82s | Rate: 0.00/s
books: 400  | Time: 1.22s | Rate: 327.19/s
Memory usage: 87.06 MB (82.90%) | CPU usage: 0.00%

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 2.87s | Rate: 0.00/s
books: 600  | Time: 2.27s | Rate: 264.03/s
Memory usage: 88.14 MB (83.20%) | CPU usage: 0.00%

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 4.38s | Rate: 0.00/

In [8]:
# Take the last load id reported by the extract step and its first metrics entry.
load_id = extract_info.loads_ids[-1]
m = extract_info.metrics[load_id][0]

# Summary header: resource names, table names, and the load id.
resource_metrics = m["resource_metrics"]
print("Resources:", list(resource_metrics))
print("Tables:", list(m["table_metrics"]))
print("Load ID:", load_id)
print()

# One short section per resource with its extracted item count.
for resource_name, stats in resource_metrics.items():
    print(f"Resource: {resource_name}")
    print(f"rows extracted: {stats.items_count}")
    print()


Resources: ['books']
Tables: ['books']
Load ID: 1771919162.319891

Resource: books
rows extracted: 3758



In [9]:
# Run the normalize step on the data extracted above.
normalize_info = pipeline.normalize()

------------------- Normalize rest_api in 1771919162.319891 --------------------
Files: 0/2 (0.0%) | Time: 0.00s | Rate: 0.00/s
Memory usage: 42.89 MB (86.10%) | CPU usage: 0.00%

------------------- Normalize rest_api in 1771919162.319891 --------------------
Files: 0/2 (0.0%) | Time: 0.00s | Rate: 0.00/s
Items: 0  | Time: 0.00s | Rate: 0.00/s
Memory usage: 43.12 MB (86.10%) | CPU usage: 0.00%

------------------- Normalize rest_api in 1771919162.319891 --------------------
Files: 10/2 (500.0%) | Time: 0.50s | Rate: 20.18/s
Items: 23006  | Time: 0.49s | Rate: 46530.16/s
Memory usage: 100.36 MB (85.30%) | CPU usage: 0.00%



In [10]:
# Full pipeline run with a fresh source instance; the log below shows the
# extract step running again.
# NOTE(review): extract and normalize were already run step by step above. If
# the goal is only to finish loading that package, `pipeline.load()` may be
# what is wanted here instead of a second full run — confirm.
load_info = pipeline.run(openlibrary_source())

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 0.00s | Rate: 0.00/s
Memory usage: 82.28 MB (84.60%) | CPU usage: 0.00%

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 0.44s | Rate: 0.00/s
books: 100  | Time: 0.00s | Rate: 19972876.19/s
Memory usage: 86.64 MB (84.40%) | CPU usage: 0.00%

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 1.54s | Rate: 0.00/s
books: 400  | Time: 1.10s | Rate: 362.92/s
Memory usage: 86.06 MB (84.60%) | CPU usage: 0.00%

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 2.60s | Rate: 0.00/s
books: 600  | Time: 2.16s | Rate: 277.82/s
Memory usage: 85.06 MB (84.60%) | CPU usage: 0.00%

------------------------------- Extract rest_api -------------------------------
Resources: 0/1 (0.0%) | Time: 3.65s | Rate: 0.00/

In [11]:
# Get a handle on the pipeline's dataset for inspection.
ds = pipeline.dataset()

In [12]:
# Show the names of the tables in the dataset (data tables plus dlt's own `_dlt_*` tables).
ds.tables


['books',
 'books__author_key',
 'books__author_name',
 'books__ia',
 'books__ia_collection',
 'books__language',
 'books__id_standard_ebooks',
 'books__id_librivox',
 'books__id_project_gutenberg',
 '_dlt_version',
 '_dlt_loads',
 '_dlt_pipeline_state']