Add more logging #172

Merged: merged 1 commit on Jun 1, 2024
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -26,6 +26,7 @@ repos:
- cattrs==22.1.0
- click>=8.1.3
- httpx==0.23.0
- keke>=0.1.0
- rich>=12.5.1
- libcst>=1.1.0
- trailrunner>=1.2.1
95 changes: 51 additions & 44 deletions py_wtf/cli/__init__.py
@@ -13,12 +13,13 @@
from functools import partial, wraps
from pathlib import Path

from typing import Callable, Coroutine, Iterable
from typing import Callable, Coroutine, IO, Iterable

import click
import httpx
import rich
import rich.progress
from keke import kev, TraceOutput

from py_wtf.__about__ import __version__
from py_wtf.indexer import index_dir, index_file, index_project
@@ -131,57 +132,63 @@ def index_file_cmd(file: str) -> None:
async def index_dir_cmd(dir: str) -> None:
path = Path(dir)
cnt = 0
async for mod in index_dir(path):
async for mod in index_dir(ProjectName("<UNKNOWN>"), path):
cnt += 1
rich.print(mod)
rich.print(f"Found {cnt} modules in total.")


@py_wtf.command(name="index-since")
@click.option("--since", type=click.DateTime(), required=True)
@click.option("--trace", type=click.File(mode="w"))
@click.argument("directory")
@coroutine
async def index_since(directory: str, since: datetime) -> None:
from google.cloud import bigquery

client = bigquery.Client()
time_format = "%Y-%m-%d %H:%M:%S"
threadpool = ThreadPoolExecutor()
rows = await asyncio.get_running_loop().run_in_executor(
threadpool,
client.query_and_wait,
f"""
SELECT distinct name
FROM
`bigquery-public-data.pypi.distribution_metadata`
WHERE
TIMESTAMP(upload_time) >= TIMESTAMP("{since.strftime(time_format)}")
""",
)
rows = list(rows)
logger.info(f"Found {len(rows)} new projects to index")
out_dir = Path(directory)
out_dir.mkdir(parents=True, exist_ok=True)
async with httpx.AsyncClient() as client:
resp = await client.get(f"https://py.wtf/_index/{METADATA_FILENAME}")
resp.raise_for_status()
(out_dir / METADATA_FILENAME).write_bytes(resp.content)

logger.info("Fetched prod index")
repo = ProjectRepository(out_dir)
rets = await asyncio.gather(
*[
repo.get(ProjectName(row.name), partial(index_project, repo=repo))
for row in rows
],
return_exceptions=True,
)
for ret in rets:
if isinstance(ret, Exception):
logger.exception(ret)
logger.info("Done indexing")
repo.update_index()
logger.info("Wrote new index")
async def index_since(directory: str, since: datetime, trace: IO[str] | None) -> None:
with TraceOutput(file=trace): # type: ignore
from google.cloud import bigquery

with kev("bigquery"):
client = bigquery.Client()
time_format = "%Y-%m-%d %H:%M:%S"
threadpool = ThreadPoolExecutor()
rows = await asyncio.get_running_loop().run_in_executor(
threadpool,
client.query_and_wait,
f"""
SELECT distinct name
FROM
`bigquery-public-data.pypi.distribution_metadata`
WHERE
TIMESTAMP(upload_time) >= TIMESTAMP("{since.strftime(time_format)}")
""",
)
rows = list(rows)
logger.info(f"Found {len(rows)} new projects to index")
out_dir = Path(directory)
out_dir.mkdir(parents=True, exist_ok=True)
with kev("fetch prod index"):
async with httpx.AsyncClient() as client:
resp = await client.get(f"https://py.wtf/_index/{METADATA_FILENAME}")
resp.raise_for_status()
(out_dir / METADATA_FILENAME).write_bytes(resp.content)

logger.info("Fetched prod index")
repo = ProjectRepository(out_dir)
with kev("index projects"):
rets = await asyncio.gather(
*[
repo.get(ProjectName(row.name), partial(index_project, repo=repo))
for row in rows
],
return_exceptions=True,
)
for ret in rets:
if isinstance(ret, Exception):
logger.error(ret, exc_info=ret)
logger.info("Done indexing")
with kev("update index"):
repo.update_index()
logger.info("Wrote new index")


@py_wtf.command()
@@ -209,7 +216,7 @@ async def generate_test_index(dir: str | None) -> None:
upload_time=parse_upload_time(proj_metadata.get("upload_time")),
)
symbol_table = SymbolTable({FQName("alpha.bar"): ProjectName("project-alpha")})
mods = [mod async for mod in index_dir(proj_dir, symbol_table)]
mods = [mod async for mod in index_dir(proj_name, proj_dir, symbol_table)]
proj = Project(
proj_name,
metadata=proj_info,
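
Note (not part of the diff): a minimal, hypothetical sketch of the tracing wiring this file now uses, assuming keke's documented behaviour that TraceOutput is a no-op when file is None and otherwise writes Chrome trace-event JSON (viewable in Perfetto or chrome://tracing), and that kev("...") records a named span:

from typing import IO

import click
from keke import TraceOutput, kev


@click.command()
@click.option("--trace", type=click.File(mode="w"), default=None)
def demo(trace: IO[str] | None) -> None:
    # Tracing is opt-in: with --trace out.json the block below is recorded,
    # without the flag TraceOutput does nothing.
    with TraceOutput(file=trace):
        with kev("expensive step"):
            total = sum(i * i for i in range(1_000_000))
    click.echo(f"total={total}")


if __name__ == "__main__":
    demo()

Running this sketch with --trace trace.json would produce a trace file to inspect; omitting the flag leaves the command's behaviour unchanged, which mirrors how the new --trace option on index-since is expected to work.
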
5 changes: 5 additions & 0 deletions py_wtf/indexer/file.py
@@ -7,6 +7,8 @@

import libcst as cst
import trailrunner

from keke import ktrace
from libcst import matchers as m
from libcst.codemod import CodemodContext
from libcst.codemod.visitors import GatherExportsVisitor
@@ -23,6 +25,7 @@
Function,
Module,
Parameter,
ProjectName,
SymbolTable,
Type,
Variable,
@@ -34,7 +37,9 @@
logger = logging.getLogger(__name__)


@ktrace("_project_name", "dir")
async def index_dir(
_project_name: ProjectName,
dir: Path,
symbol_table: SymbolTable | None = None,
executor: Executor | None = None,
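
Note (not part of the diff): @ktrace wraps a function so each call shows up as a span, with the named arguments attached as trace metadata, which is presumably why index_dir gains the otherwise-unused _project_name parameter above. A small self-contained sketch under that assumption (count_py_files is a made-up function, not py-wtf code):

from pathlib import Path

from keke import TraceOutput, ktrace


@ktrace("project_name", "dir")  # argument values are recorded on the trace event
def count_py_files(project_name: str, dir: Path) -> int:
    return sum(1 for _ in dir.rglob("*.py"))


if __name__ == "__main__":
    with open("trace.json", "w") as out, TraceOutput(file=out):
        print(count_py_files("demo-project", Path(".")))
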
33 changes: 28 additions & 5 deletions py_wtf/indexer/pypi.py
@@ -15,6 +15,7 @@
import aiofiles.tempfile
import httpx

from keke import ktrace
from packaging.requirements import Requirement
from rich.progress import Progress, TaskID

@@ -97,6 +98,7 @@ def blocklisted_project_factory(project_name: ProjectName) -> Project:
)


@ktrace("project_name")
async def index_project(
project_name: ProjectName, repo: ProjectRepository, progress: Progress | None = None
) -> AsyncIterable[Project]:
@@ -108,12 +110,21 @@ async def index_project(
task_id = progress.add_task(project_name, action="Fetching", total=3)
deps: list[Project] = []
with TemporaryDirectory() as tmpdir:
src_dir, info, description = await download(project_name, Path(tmpdir))
try:
src_dir, info, description = await download(project_name, Path(tmpdir))
except Exception as err:
logger.error(
f"Unable to download project {project_name}, skipping", exc_info=err
)
return
if progress:
progress.update(
task_id, action="Gathering deps for", visible=True, advance=1
)
dep_project_names = [ProjectName(dep) for dep in info.dependencies]
logger.debug(
f"Found {project_name}'s dependencies ({len(dep_project_names)}): {dep_project_names}"
)

dep_projects = await asyncio.gather(
*[
@@ -134,7 +145,10 @@
if progress:
progress.update(task_id, action="Indexing", visible=True, advance=1)

modules = [mod async for mod in index_dir(src_dir, symbols, executor)]
logger.info(f"Starting indexing of {project_name}")
modules = [
mod async for mod in index_dir(project_name, src_dir, symbols, executor)
]

if progress:
progress.advance(task_id)
@@ -154,12 +168,14 @@
)
if progress:
progress.update(task_id, visible=False)
logger.info(f"Done indexing of {project_name}")
yield proj


Archive = TarFile | ZipFile


@ktrace("src")
def extract_archive(src: str, dir: Path) -> None:
if is_tarfile(src):
f = TarFile.open(src)
@@ -240,13 +256,18 @@ def pick_artifact(artifacts: list[Artifact]) -> Artifact | None:
sem = asyncio.BoundedSemaphore(value=20)


@ktrace("project_name")
async def download(
project_name: str, directory: Path
) -> Tuple[Path, ProjectMetadata, ProjectDescription]:
async with sem, httpx.AsyncClient(timeout=httpx.Timeout(None)) as client:
proj_data = (
await client.get(f"https://pypi.org/pypi/{project_name}/json")
).json()
try:
resp = await client.get(f"https://pypi.org/pypi/{project_name}/json")
resp.raise_for_status()
except httpx.HTTPStatusError as e:
error = f"Unable to find project {project_name} on pypi, got HTTP {e.response.status_code}"
raise ValueError(error) from e
proj_data = resp.json()
pypi_info = proj_data["info"]
latest_version = pypi_info["version"]
project_urls = pypi_info.get("project_urls") or {}
@@ -276,6 +297,7 @@ async def download(
upload_time=parse_upload_time(artifact["upload_time"]),
)

logger.debug(f"Fetching {project_name} sources from {artifact['url']}")
# this is a bit unnecessary 🙃
async with (
client.stream("GET", artifact["url"]) as response,
@@ -286,6 +308,7 @@

archive_name = str(src_archive.name)
try:
logger.debug(f"Extracting sources for {project_name} from {archive_name}")
extract_archive(archive_name, directory)
finally:
os.unlink(archive_name)
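
Note (not part of the diff): a condensed, hypothetical sketch of the fail-soft pattern introduced above: a download failure is logged with its traceback attached via exc_info and the project is skipped rather than aborting the whole run, while the download step itself converts an HTTP error into a ValueError with a readable message. Function names here are illustrative, not py-wtf's.

import asyncio
import logging

import httpx

logger = logging.getLogger(__name__)


async def fetch_metadata(project_name: str) -> dict:
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.get(f"https://pypi.org/pypi/{project_name}/json")
            resp.raise_for_status()
        except httpx.HTTPStatusError as e:
            # Turn an opaque HTTP failure into an actionable error message.
            raise ValueError(
                f"Unable to find project {project_name} on pypi, "
                f"got HTTP {e.response.status_code}"
            ) from e
        return resp.json()


async def index_one(project_name: str) -> None:
    try:
        meta = await fetch_metadata(project_name)
    except Exception as err:
        # Log with the traceback, then move on to the next project.
        logger.error("Unable to download project %s, skipping", project_name, exc_info=err)
        return
    logger.info("Indexing %s %s", project_name, meta["info"]["version"])


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(index_one("this-project-hopefully-does-not-exist"))
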
9 changes: 8 additions & 1 deletion py_wtf/logging.py
@@ -15,4 +15,11 @@ def setup_logging(level: int | None = None, to_file: bool = False) -> None:
dir.mkdir(exist_ok=True, parents=True)
handler = logging.FileHandler(dir / "indexer.log")

logging.basicConfig(level=level, handlers=[handler], force=True)
logging.basicConfig(
level=level,
handlers=[handler],
force=True,
format="[{name}] {message}",
style="{",
)
logging.captureWarnings(True)
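
Note (not part of the diff): with the brace-style format above, every record is prefixed with its logger name, which is what makes per-module lines such as the new "Starting indexing of ..." messages attributable in indexer.log. A quick illustration:

import logging

logging.basicConfig(level=logging.INFO, format="[{name}] {message}", style="{", force=True)

logging.getLogger("py_wtf.indexer.pypi").info("Starting indexing of %s", "example-project")
# prints: [py_wtf.indexer.pypi] Starting indexing of example-project
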
5 changes: 4 additions & 1 deletion py_wtf/repository.py
@@ -8,6 +8,7 @@
from typing import AsyncIterable, Callable, Tuple

from cattrs.preconf.json import make_converter
from keke import ktrace

from py_wtf.types import Index, Project, ProjectMetadata, ProjectName

@@ -37,6 +38,7 @@ def _load_from_disk(self, key: ProjectName) -> None:
proj = converter.loads(index_contents, Project)
self._cache[key].set_result(proj)

@ktrace("project.name")
def _save(self, project: Project) -> None:
name = ProjectName(project.name)
if self._cache[name].done():
@@ -46,6 +48,7 @@ def _save(self, project: Project) -> None:
index_file = self._index_file(name)
index_file.write_text(converter.dumps(project))

@ktrace("key")
async def get(
self,
key: ProjectName,
@@ -68,7 +71,7 @@ async def get(
self._save(project)

if not fut.done():
raise ValueError(f"{key} was never yielded by {factory}")
raise ValueError(f"{key} was never yielded by factory")
return fut.result()

def generate_index(
1 change: 1 addition & 0 deletions pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
"click>=8.1.3",
"google-cloud-bigquery>=3.23.1",
"httpx==0.23.0",
"keke>=0.1.4",
"rich>=12.5.1",
"libcst>=1.1.0",
"trailrunner>=1.2.1",
4 changes: 4 additions & 0 deletions requirements.txt
@@ -15,6 +15,7 @@
# - click>=8.1.3
# - google-cloud-bigquery>=3.23.1
# - httpx==0.23.0
# - keke>=0.1.4
# - libcst>=1.1.0
# - packaging>=21
# - rich>=12.5.1
@@ -380,6 +381,9 @@ iniconfig==2.0.0 \
jinja2==3.1.4 \
--hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \
--hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d
keke==0.1.4 \
--hash=sha256:a8653b7d9936ddae08d1ea2c298e5e36a50eb379705e3f74ab08b1f6ae3831a0 \
--hash=sha256:b16eb1f8dd768747cf55e9e272a02fc642a4a80718ff4feccb5cdcdf9290af0a
libcst==1.4.0 \
--hash=sha256:061d6855ef30efe38b8a292b7e5d57c8e820e71fc9ec9846678b60a934b53bbb \
--hash=sha256:17d71001cb25e94cfe8c3d997095741a8c4aa7a6d234c0f972bc42818c88dfaf \
4 changes: 3 additions & 1 deletion tests/indexer/test_file.py
@@ -34,7 +34,9 @@ async def test_index_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:

some_file = tmp_path / "somefile.py"
some_file.touch()
assert [mod async for mod in index_dir(tmp_path)] == [empty_module]
assert [mod async for mod in index_dir(ProjectName("UNUSED"), tmp_path)] == [
empty_module
]


def test_index_file_syntax_error(tmp_path: Path) -> None: