In [1]:
# Echo the kernel's current working directory (the printed path is machine-specific).
%pwd

'/Users/tbrekalo/src/amphinicy/misc'

In [15]:
import asyncio
import itertools as it

from collections.abc import Awaitable, Callable, Iterable, Iterator, Sequence, Set
from typing import TypeVar, Union

import httpx
import polars as pl

from pydantic import BaseModel, ValidationError
from pydantic_extra_types.isbn import ISBN

In [3]:
# Generic element/result type used in the signatures of the batching helpers below.
T = TypeVar("T")

In [4]:
# Recursive alias for any value that can appear in a decoded JSON document.
# `float` is included because JSON numbers are not restricted to integers.
JSON = Union[None, int, float, str, bool, list["JSON"], dict[str, "JSON"]]

In [5]:
class Book(BaseModel):
    """Validated record for a single book.

    The model is configured as frozen, and keys that are not declared as
    fields are ignored rather than rejected.
    """

    model_config = {
        "frozen": True,
        "extra": "ignore",
    }

    # Field order is the key order of `model_dump()`, so keep it stable.
    title: str
    authors: Set[str]
    publisher: str
    # Stored as a plain string, not parsed into a date.
    publish_date: str
    # Validated by pydantic_extra_types' ISBN type.
    isbn: ISBN


class BookError:
    """Error value returned (not raised) when a books request fails.

    Attributes:
        message: Human-readable description of the failure.
    """

    message: str

    def __init__(self, message: str) -> None:
        # Without an explicit initialiser, `BookError(text)` raises
        # "TypeError: BookError() takes no arguments".
        self.message = message

    def __repr__(self) -> str:
        return f"{type(self).__name__}(message={self.message!r})"

In [6]:
# Endpoint queried for volumes, and the authors whose books are fetched.
BOOKS_API = "https://www.googleapis.com/books/v1/volumes"
AUTHORS = ["William Shakespeare", "George Orwell", "Aldous Huxley", "Hermann Hesse"]

# Async HTTP client created once at notebook level and passed to the request helpers.
# NOTE(review): no `aclose()` call is visible in these cells — confirm it is closed somewhere.
aclient = httpx.AsyncClient()

In [7]:
def try_parse_book(book: JSON) -> Book | None:
    """Build a ``Book`` from one ``volumeInfo`` object, or return ``None``.

    Args:
        book: Decoded JSON for a single volume's ``volumeInfo``.

    Returns:
        A ``Book`` when the payload is a mapping that carries an ISBN-13 or
        ISBN-10 identifier (ISBN-13 preferred), a list of author names, and
        fields that pass ``Book`` validation; otherwise ``None``.
    """
    # Anything that is not a JSON object cannot describe a volume.
    if not isinstance(book, dict):
        return None

    identifiers = book.get("industryIdentifiers", [])
    if not isinstance(identifiers, list):
        return None

    isbn_by_type = {
        entry["type"]: entry["identifier"]
        for entry in identifiers
        if isinstance(entry, dict)
        and entry.get("type") in {"ISBN_10", "ISBN_13"}
        and "identifier" in entry
    }

    isbn = isbn_by_type.get("ISBN_13", isbn_by_type.get("ISBN_10"))
    if isbn is None:
        return None

    # A missing "authors" key used to reach `set(None)` and raise TypeError,
    # which the ValidationError handler below does not catch.
    authors = book.get("authors")
    if not isinstance(authors, list) or not all(
        isinstance(name, str) for name in authors
    ):
        return None

    try:
        return Book(
            title=book.get("title"),
            authors=set(authors),
            publisher=book.get("publisher"),
            publish_date=book.get("publishedDate"),
            isbn=isbn,
        )
    except ValidationError:
        # Missing/invalid title, publisher, date or ISBN: treat as unparseable.
        return None

In [8]:
def slice(src: Iterator[T], length: int | None = None) -> Iterable[Sequence[T]]:
    """Split ``src`` into consecutive tuples of at most ``length`` items.

    Note that this name shadows the builtin ``slice`` once the cell has run.

    Args:
        src: Items to split; it is consumed via ``iter``/``tuple``.
        length: Maximum chunk size. ``None`` means one single chunk holding
            every item (an empty ``src`` then yields one empty tuple).

    Returns:
        A lazy iterator of tuples. With a ``length`` given, iteration stops at
        the first empty chunk, so an empty ``src`` yields nothing.
    """
    if length is not None:
        source = iter(src)

        def take_chunk():
            return tuple(it.islice(source, length))

        # Two-argument iter(): keep calling take_chunk until it returns ().
        return iter(take_chunk, ())

    # No chunk size: lazily produce exactly one tuple with all items.
    return map(tuple, (src,))

In [9]:
async def batch_calls(
    client: httpx.AsyncClient,
    *,
    tasks: Sequence[Callable[[httpx.AsyncClient], Awaitable[T]]],
    batch_size: int | None = None,
) -> Sequence[T]:
    """Run ``tasks`` against ``client`` in sequential batches.

    Every task in a batch is started together with ``asyncio.gather``; the
    next batch starts only after the current one has finished.

    Args:
        client: Passed as the sole positional argument to each task.
        tasks: Callables that take the client and return an awaitable.
        batch_size: Maximum number of tasks awaited concurrently. ``None``
            runs all tasks in a single batch.

    Returns:
        One result per task, in the same order as ``tasks``.

    Raises:
        ValueError: If ``batch_size`` is given and is smaller than 1.
    """
    pending = list(tasks)
    if not pending:
        return []

    if batch_size is None:
        batch_size = len(pending)
    elif batch_size < 1:
        # A zero batch size previously produced no batches at all, silently
        # returning [] without running any task.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results: list[T] = []
    # Index-based batching keeps this function self-contained.
    for start in range(0, len(pending), batch_size):
        batch = pending[start : start + batch_size]
        results.extend(await asyncio.gather(*(task(client) for task in batch)))

    return results

In [10]:
async def get_by_author(
    client: httpx.AsyncClient,
    author: str,
) -> Sequence[Book] | BookError:
    """Fetch the volumes for ``author`` and parse them into ``Book`` objects.

    Sends a GET request to ``BOOKS_API`` with the query ``inauthor:<author>``.

    Returns:
        A ``BookError`` built from the response text when the response is an
        error; otherwise a list of the books that ``try_parse_book`` accepted.
        Items without a ``volumeInfo`` key are skipped.
    """
    response = await client.get(BOOKS_API, params={"q": f"inauthor:{author}"})
    if response.is_error:
        return BookError(response.text)

    parsed = []
    for volume in response.json().get("items", []):
        if "volumeInfo" not in volume:
            continue
        candidate = try_parse_book(volume["volumeInfo"])
        if candidate is not None:
            parsed.append(candidate)
    return parsed

In [None]:
tasks = [
    lambda client: get_by_author(client=client, author=author) for author in AUTHORS
]

books = [
    book
    for books in await batch_calls(
        aclient,
        tasks=tasks,
        batch_size=2,
    )
    for book in books
]

In [24]:
# One row per book. `authors` is an unordered set on the model, so sort the
# names before joining to get the same string on every run.
df = pl.from_dicts(
    {**record, "authors": ", ".join(sorted(record["authors"]))}
    for record in (book.model_dump() for book in books)
)

df.head()

title,authors,publisher,publish_date,isbn
str,str,str,str,str
"""Hermann Hesse""","""Hermann Hesse""","""Farrar, Straus and Giroux""","""2013-01-22""","""9781466835085"""
"""The Prodigy""","""Hermann Hesse""","""Peter Owen Publishers""","""2002""","""9780720611748"""
"""The Seasons of the Soul""","""Hermann Hesse""","""North Atlantic Books""","""2011-10-11""","""9781583943410"""
"""The Fairy Tales of Hermann Hes…","""Hermann Hesse""","""Bantam""","""2009-09-30""","""9780307420510"""
"""Soul of the Age""","""Hermann Hesse""","""Farrar, Straus and Giroux""","""2013-01-22""","""9781466835191"""
