From b14574de21d8e18f2eb322c49b5397760f0ad222 Mon Sep 17 00:00:00 2001 From: Zsolt Dollenstein Date: Sat, 1 Jun 2024 19:09:40 +0100 Subject: [PATCH] Add more logging Add more verbose `logging` logs, as well as a new `--trace` flag to dump chrome traces powered by `keke`. --- .pre-commit-config.yaml | 1 + py_wtf/cli/__init__.py | 95 ++++++++++++++++++++------------------ py_wtf/indexer/file.py | 5 ++ py_wtf/indexer/pypi.py | 33 +++++++++++-- py_wtf/logging.py | 9 +++- py_wtf/repository.py | 5 +- pyproject.toml | 1 + requirements.txt | 4 ++ tests/indexer/test_file.py | 4 +- 9 files changed, 105 insertions(+), 52 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 886817674..466e88918 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,6 +26,7 @@ repos: - cattrs==22.1.0 - click>=8.1.3 - httpx==0.23.0 + - keke>=0.1.0 - rich>=12.5.1 - libcst>=1.1.0 - trailrunner>=1.2.1 diff --git a/py_wtf/cli/__init__.py b/py_wtf/cli/__init__.py index fe15ed6e6..e1e88349e 100644 --- a/py_wtf/cli/__init__.py +++ b/py_wtf/cli/__init__.py @@ -13,12 +13,13 @@ from functools import partial, wraps from pathlib import Path -from typing import Callable, Coroutine, Iterable +from typing import Callable, Coroutine, IO, Iterable import click import httpx import rich import rich.progress +from keke import kev, TraceOutput from py_wtf.__about__ import __version__ from py_wtf.indexer import index_dir, index_file, index_project @@ -131,7 +132,7 @@ def index_file_cmd(file: str) -> None: async def index_dir_cmd(dir: str) -> None: path = Path(dir) cnt = 0 - async for mod in index_dir(path): + async for mod in index_dir(ProjectName(""), path): cnt += 1 rich.print(mod) rich.print(f"Found {cnt} modules in total.") @@ -139,49 +140,55 @@ async def index_dir_cmd(dir: str) -> None: @py_wtf.command(name="index-since") @click.option("--since", type=click.DateTime(), required=True) +@click.option("--trace", type=click.File(mode="w")) @click.argument("directory") @coroutine -async def index_since(directory: str, since: datetime) -> None: - from google.cloud import bigquery - - client = bigquery.Client() - time_format = "%Y-%m-%d %H:%M:%S" - threadpool = ThreadPoolExecutor() - rows = await asyncio.get_running_loop().run_in_executor( - threadpool, - client.query_and_wait, - f""" - SELECT distinct name - FROM - `bigquery-public-data.pypi.distribution_metadata` - WHERE - TIMESTAMP(upload_time) >= TIMESTAMP("{since.strftime(time_format)}") - """, - ) - rows = list(rows) - logger.info(f"Found {len(rows)} new projects to index") - out_dir = Path(directory) - out_dir.mkdir(parents=True, exist_ok=True) - async with httpx.AsyncClient() as client: - resp = await client.get(f"https://py.wtf/_index/{METADATA_FILENAME}") - resp.raise_for_status() - (out_dir / METADATA_FILENAME).write_bytes(resp.content) - - logger.info("Fetched prod index") - repo = ProjectRepository(out_dir) - rets = await asyncio.gather( - *[ - repo.get(ProjectName(row.name), partial(index_project, repo=repo)) - for row in rows - ], - return_exceptions=True, - ) - for ret in rets: - if isinstance(ret, Exception): - logger.exception(ret) - logger.info("Done indexing") - repo.update_index() - logger.info("Wrote new index") +async def index_since(directory: str, since: datetime, trace: IO[str] | None) -> None: + with TraceOutput(file=trace): # type: ignore + from google.cloud import bigquery + + with kev("bigquery"): + client = bigquery.Client() + time_format = "%Y-%m-%d %H:%M:%S" + threadpool = ThreadPoolExecutor() + rows = await 
asyncio.get_running_loop().run_in_executor( + threadpool, + client.query_and_wait, + f""" + SELECT distinct name + FROM + `bigquery-public-data.pypi.distribution_metadata` + WHERE + TIMESTAMP(upload_time) >= TIMESTAMP("{since.strftime(time_format)}") + """, + ) + rows = list(rows) + logger.info(f"Found {len(rows)} new projects to index") + out_dir = Path(directory) + out_dir.mkdir(parents=True, exist_ok=True) + with kev("fetch prod index"): + async with httpx.AsyncClient() as client: + resp = await client.get(f"https://py.wtf/_index/{METADATA_FILENAME}") + resp.raise_for_status() + (out_dir / METADATA_FILENAME).write_bytes(resp.content) + + logger.info("Fetched prod index") + repo = ProjectRepository(out_dir) + with kev("index projects"): + rets = await asyncio.gather( + *[ + repo.get(ProjectName(row.name), partial(index_project, repo=repo)) + for row in rows + ], + return_exceptions=True, + ) + for ret in rets: + if isinstance(ret, Exception): + logger.error(ret, exc_info=ret) + logger.info("Done indexing") + with kev("update index"): + repo.update_index() + logger.info("Wrote new index") @py_wtf.command() @@ -209,7 +216,7 @@ async def generate_test_index(dir: str | None) -> None: upload_time=parse_upload_time(proj_metadata.get("upload_time")), ) symbol_table = SymbolTable({FQName("alpha.bar"): ProjectName("project-alpha")}) - mods = [mod async for mod in index_dir(proj_dir, symbol_table)] + mods = [mod async for mod in index_dir(proj_name, proj_dir, symbol_table)] proj = Project( proj_name, metadata=proj_info, diff --git a/py_wtf/indexer/file.py b/py_wtf/indexer/file.py index 8d676b6e2..4e0b05896 100644 --- a/py_wtf/indexer/file.py +++ b/py_wtf/indexer/file.py @@ -7,6 +7,8 @@ import libcst as cst import trailrunner + +from keke import ktrace from libcst import matchers as m from libcst.codemod import CodemodContext from libcst.codemod.visitors import GatherExportsVisitor @@ -23,6 +25,7 @@ Function, Module, Parameter, + ProjectName, SymbolTable, Type, Variable, @@ -34,7 +37,9 @@ logger = logging.getLogger(__name__) +@ktrace("_project_name", "dir") async def index_dir( + _project_name: ProjectName, dir: Path, symbol_table: SymbolTable | None = None, executor: Executor | None = None, diff --git a/py_wtf/indexer/pypi.py b/py_wtf/indexer/pypi.py index 92c850769..7ecd597a5 100644 --- a/py_wtf/indexer/pypi.py +++ b/py_wtf/indexer/pypi.py @@ -15,6 +15,7 @@ import aiofiles.tempfile import httpx +from keke import ktrace from packaging.requirements import Requirement from rich.progress import Progress, TaskID @@ -97,6 +98,7 @@ def blocklisted_project_factory(project_name: ProjectName) -> Project: ) +@ktrace("project_name") async def index_project( project_name: ProjectName, repo: ProjectRepository, progress: Progress | None = None ) -> AsyncIterable[Project]: @@ -108,12 +110,21 @@ async def index_project( task_id = progress.add_task(project_name, action="Fetching", total=3) deps: list[Project] = [] with TemporaryDirectory() as tmpdir: - src_dir, info, description = await download(project_name, Path(tmpdir)) + try: + src_dir, info, description = await download(project_name, Path(tmpdir)) + except Exception as err: + logger.error( + f"Unable to download project {project_name}, skipping", exc_info=err + ) + return if progress: progress.update( task_id, action="Gathering deps for", visible=True, advance=1 ) dep_project_names = [ProjectName(dep) for dep in info.dependencies] + logger.debug( + f"Found {project_name}'s dependencies ({len(dep_project_names)}): {dep_project_names}" + ) dep_projects = await 
asyncio.gather( *[ @@ -134,7 +145,10 @@ async def index_project( if progress: progress.update(task_id, action="Indexing", visible=True, advance=1) - modules = [mod async for mod in index_dir(src_dir, symbols, executor)] + logger.info(f"Starting indexing of {project_name}") + modules = [ + mod async for mod in index_dir(project_name, src_dir, symbols, executor) + ] if progress: progress.advance(task_id) @@ -154,12 +168,14 @@ async def index_project( ) if progress: progress.update(task_id, visible=False) + logger.info(f"Done indexing of {project_name}") yield proj Archive = TarFile | ZipFile +@ktrace("src") def extract_archive(src: str, dir: Path) -> None: if is_tarfile(src): f = TarFile.open(src) @@ -240,13 +256,18 @@ def pick_artifact(artifacts: list[Artifact]) -> Artifact | None: sem = asyncio.BoundedSemaphore(value=20) +@ktrace("project_name") async def download( project_name: str, directory: Path ) -> Tuple[Path, ProjectMetadata, ProjectDescription]: async with sem, httpx.AsyncClient(timeout=httpx.Timeout(None)) as client: - proj_data = ( - await client.get(f"https://pypi.org/pypi/{project_name}/json") - ).json() + try: + resp = await client.get(f"https://pypi.org/pypi/{project_name}/json") + resp.raise_for_status() + except httpx.HTTPStatusError as e: + error = f"Unable to find project {project_name} on pypi, got HTTP {e.response.status_code}" + raise ValueError(error) from e + proj_data = resp.json() pypi_info = proj_data["info"] latest_version = pypi_info["version"] project_urls = pypi_info.get("project_urls") or {} @@ -276,6 +297,7 @@ async def download( upload_time=parse_upload_time(artifact["upload_time"]), ) + logger.debug(f"Fetching {project_name} sources from {artifact['url']}") # this is a bit unnecessary 🙃 async with ( client.stream("GET", artifact["url"]) as response, @@ -286,6 +308,7 @@ async def download( archive_name = str(src_archive.name) try: + logger.debug(f"Extracting sources for {project_name} from {archive_name}") extract_archive(archive_name, directory) finally: os.unlink(archive_name) diff --git a/py_wtf/logging.py b/py_wtf/logging.py index 3dcb8c2b7..5a6054886 100644 --- a/py_wtf/logging.py +++ b/py_wtf/logging.py @@ -15,4 +15,11 @@ def setup_logging(level: int | None = None, to_file: bool = False) -> None: dir.mkdir(exist_ok=True, parents=True) handler = logging.FileHandler(dir / "indexer.log") - logging.basicConfig(level=level, handlers=[handler], force=True) + logging.basicConfig( + level=level, + handlers=[handler], + force=True, + format="[{name}] {message}", + style="{", + ) + logging.captureWarnings(True) diff --git a/py_wtf/repository.py b/py_wtf/repository.py index 4f0edd341..fd7764349 100644 --- a/py_wtf/repository.py +++ b/py_wtf/repository.py @@ -8,6 +8,7 @@ from typing import AsyncIterable, Callable, Tuple from cattrs.preconf.json import make_converter +from keke import ktrace from py_wtf.types import Index, Project, ProjectMetadata, ProjectName @@ -37,6 +38,7 @@ def _load_from_disk(self, key: ProjectName) -> None: proj = converter.loads(index_contents, Project) self._cache[key].set_result(proj) + @ktrace("project.name") def _save(self, project: Project) -> None: name = ProjectName(project.name) if self._cache[name].done(): @@ -46,6 +48,7 @@ def _save(self, project: Project) -> None: index_file = self._index_file(name) index_file.write_text(converter.dumps(project)) + @ktrace("key") async def get( self, key: ProjectName, @@ -68,7 +71,7 @@ async def get( self._save(project) if not fut.done(): - raise ValueError(f"{key} was never yielded by 
{factory}") + raise ValueError(f"{key} was never yielded by factory") return fut.result() def generate_index( diff --git a/pyproject.toml b/pyproject.toml index 1a7c64ea7..c67d50a44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "click>=8.1.3", "google-cloud-bigquery>=3.23.1", "httpx==0.23.0", + "keke>=0.1.4", "rich>=12.5.1", "libcst>=1.1.0", "trailrunner>=1.2.1", diff --git a/requirements.txt b/requirements.txt index 947e67991..22716d1cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ # - click>=8.1.3 # - google-cloud-bigquery>=3.23.1 # - httpx==0.23.0 +# - keke>=0.1.4 # - libcst>=1.1.0 # - packaging>=21 # - rich>=12.5.1 @@ -380,6 +381,9 @@ iniconfig==2.0.0 \ jinja2==3.1.4 \ --hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \ --hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d +keke==0.1.4 \ + --hash=sha256:a8653b7d9936ddae08d1ea2c298e5e36a50eb379705e3f74ab08b1f6ae3831a0 \ + --hash=sha256:b16eb1f8dd768747cf55e9e272a02fc642a4a80718ff4feccb5cdcdf9290af0a libcst==1.4.0 \ --hash=sha256:061d6855ef30efe38b8a292b7e5d57c8e820e71fc9ec9846678b60a934b53bbb \ --hash=sha256:17d71001cb25e94cfe8c3d997095741a8c4aa7a6d234c0f972bc42818c88dfaf \ diff --git a/tests/indexer/test_file.py b/tests/indexer/test_file.py index 13e7408c0..e691e5edb 100644 --- a/tests/indexer/test_file.py +++ b/tests/indexer/test_file.py @@ -34,7 +34,9 @@ async def test_index_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Non some_file = tmp_path / "somefile.py" some_file.touch() - assert [mod async for mod in index_dir(tmp_path)] == [empty_module] + assert [mod async for mod in index_dir(ProjectName("UNUSED"), tmp_path)] == [ + empty_module + ] def test_index_file_syntax_error(tmp_path: Path) -> None:
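
Note for reviewers unfamiliar with keke: the change leans on three primitives that all appear in the hunks above — TraceOutput (a context manager that writes Chrome-trace JSON to the given file), kev (a context manager that records a single named span), and ktrace (a decorator that records one span per call and attaches the named arguments, e.g. @ktrace("project_name")). A minimal, self-contained sketch of how they compose; the names do_work, slow_step, and trace.json are illustrative only and are not part of this patch:

    import time

    from keke import kev, ktrace, TraceOutput


    @ktrace("n")  # one span per call, tagged with the value of `n`
    def do_work(n: int) -> None:
        time.sleep(0.01 * n)


    def main() -> None:
        # Spans recorded inside this block are written to trace.json as
        # Chrome trace events when the block exits.
        with open("trace.json", "w") as f, TraceOutput(file=f):
            with kev("slow_step"):  # a hand-placed span, like kev("bigquery") above
                for i in range(3):
                    do_work(i)


    if __name__ == "__main__":
        main()

The resulting JSON can be opened in chrome://tracing or Perfetto, which is what the new --trace option exposes for index-since, e.g. (assuming the installed console script is named py-wtf): py-wtf index-since --since "2024-06-01 00:00:00" --trace trace.json ./out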