scripts/check-more-info-urls.py: add script #12506

Draft · wants to merge 14 commits into main · changes from 6 commits
2 changes: 2 additions & 0 deletions scripts/README.md
@@ -11,6 +11,7 @@ This section contains a summary of the scripts available in this directory. For
- [build-index.sh](build-index.sh) script builds the index of available pages.
- [check-pr.sh](check-pr.sh) script checks the pages syntax and performs various checks on the PR.
- [deploy.sh](deploy.sh) script deploys the ZIP and PDF archives to the static website repository.
- [check-more-info-urls.py](check-more-info-urls.py) is a Python script that checks for "More information" links that are broken or redirect to another URL, using asynchronous requests for speed.
- [send-to-bot.py](send-to-bot.py) is a Python script that sends the build or test output to tldr-bot.
- [set-alias-page.py](set-alias-page.py) is a Python script to generate or update alias pages.
- [set-more-info-link.py](set-more-info-link.py) is a Python script to generate or update more information links across pages.
@@ -27,6 +28,7 @@ The below table shows the compatibility of user-executable scripts with differen
| [render.py](pdf/render.py) | ✅ | ✅ | ✅ |
| [build-pdf.sh](pdf/build-pdf.sh) | ✅ | ✅ | ❌ |
| [build.sh](build.sh) | ✅ | ✅ | ❌ |
| [check-more-info-urls.py](check-more-info-urls.py) | ✅ | ✅ | ✅ |
| [set-alias-page.py](set-alias-page.py) | ✅ | ✅ | ✅ |
| [set-more-info-link.py](set-more-info-link.py) | ✅ | ✅ | ✅ |
| [wrong-filename.sh](wrong-filename.sh) | ✅ | ❌ | ❌ |
137 changes: 137 additions & 0 deletions scripts/check-more-info-urls.py
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT

"""
A Python script to check for bad (HTTP status code outside the 2xx range) "More information" URLs across all pages.

These bad codes typically indicate a missing page or a redirection. They are written to bad-urls.txt with their respective URLs.

Usage:
python3 scripts/check-more-info-urls.py
"""

import asyncio
import re

import aiohttp
from aioconsole import aprint
from aiofile import AIOFile, Writer
from aiopath import AsyncPath

MAX_CONCURRENCY = 500

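# Bound the number of concurrent page reads and URL checks, so the gather()
# calls below do not exhaust file descriptors or flood remote hosts.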
sem = asyncio.Semaphore(MAX_CONCURRENCY)


async def find_md_files(search_path: AsyncPath) -> set[AsyncPath]:
    """Find all .md files in the specified search path."""
    md_files = set()
    # Print each platform directory (e.g. pages/common) as a progress indicator.
    async for path_dir in search_path.glob("*"):
        await aprint(path_dir.name)
    async for file in search_path.glob("*/*.md"):
        md_files.add(file)
    return md_files


async def append_if_is_file(path_list: set[AsyncPath], path: AsyncPath):
    """Add the path to the set if it is a regular file."""
    if await path.is_file():
        path_list.add(path)


async def filter_files(md_files: set[AsyncPath]) -> set[AsyncPath]:
    """Filter out non-file paths (e.g. directories) from the set."""
    filtered_files = set()
    await asyncio.gather(
        *(append_if_is_file(filtered_files, path) for path in md_files)
    )
    return filtered_files


async def process_file(
    file: AsyncPath,
    writer: Writer,
    session: aiohttp.ClientSession,
) -> None:
    """Extract the URL of a single .md file and check it."""
    async with sem:
        async with file.open("r") as f:
            try:
                content = await f.read()
            except Exception as exc:
                # Report the unreadable page (platform/page path) and move on.
                await aprint(f"Could not read {'/'.join(file.parts[-3:])}: {exc}")
                return

        url = extract_url(content)

        if url is not None:
            await check_url_and_write_if_bad(url, writer, session)


def extract_url(content: str) -> str | None:
    """Extract the URL of the '> More information:' line, if any."""
    return next(
        (
            match.group(1)
            for match in re.finditer(r"> More information: <(.+)>", content)
        ),
        None,
    )
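
# Illustration (not executed), using a hypothetical page body: a page containing
#   "> More information: <https://www.gnu.org/software/tar>."
# yields "https://www.gnu.org/software/tar", while a page without a
# "More information" line yields None.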


async def check_url_and_write_if_bad(
    url: str, writer: Writer, session: aiohttp.ClientSession
) -> None:
    """Check URL status and write bad URLs to a file."""
    await aprint(f"??? {url}")
    code = -1
    try:
        code = await check_url(url, session)
    except aiohttp.ClientError as exc:
        # Connection-level failures leave code at -1 so the URL is still flagged.
        if hasattr(exc, "strerror") and exc.strerror:
            await aprint(f"\033[31m{exc.strerror}\033[0m")
        elif hasattr(exc, "message"):
            await aprint(f"\033[31m{exc.message}\033[0m")
        else:
            await aprint(f"\033[31m{exc}\033[0m")
    await aprint(f"{code} {url}")

    # 2xx means the link is healthy; 3xx (redirects), 4xx/5xx and connection
    # failures (-1) are recorded.
    if code < 200 or code >= 300:
        await writer(f"{code}|{url}\n")


async def check_url(url: str, session: aiohttp.ClientSession) -> int:
    """Get the status code of a URL via a HEAD request.

    Note: aiohttp does not follow redirects for HEAD requests by default,
    so 3xx responses are returned as-is.
    """
    async with session.head(url) as response:
        return response.status


async def find_and_write_bad_urls(
output_file: AsyncPath, search_path: str = "."
) -> None:
"""Find and write bad URLs to a specified file."""
search_path = AsyncPath(search_path)
await aprint("Getting pages...")
md_files = await filter_files(await find_md_files(search_path))
await aprint("Found all pages!")

async with AIOFile(output_file.name, "a") as afp:
writer = Writer(afp)
async with aiohttp.ClientSession(
trust_env=True, timeout=aiohttp.ClientTimeout(total=500)
) as session:
await asyncio.gather(
*(process_file(file, writer, session) for file in md_files)
)
await afp.fsync()


async def main():
await find_and_write_bad_urls(AsyncPath("bad-urls.txt"), search_path="./pages")


if __name__ == "__main__":
asyncio.run(main())
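
For reference, running `python3 scripts/check-more-info-urls.py` from the repository root (as in the docstring) appends one `{code}|{url}` line to `bad-urls.txt` for each flagged link. A hypothetical excerpt, with made-up URLs purely to illustrate the format:

```
404|https://example.com/removed-manual-page
301|https://example.com/moved-manual-page
-1|https://unreachable.example.com/tool
```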