diff --git a/src/kbmate_cli/main.py b/src/kbmate_cli/main.py
index 11603e8..9821bdc 100644
--- a/src/kbmate_cli/main.py
+++ b/src/kbmate_cli/main.py
@@ -1,7 +1,10 @@
 from pathlib import Path
+from urllib.parse import urlparse
+from urllib.request import url2pathname
 
 import typer
 from pymupdf4llm.helpers.utils import md_path as _md_path
+from kbmate_cli.url_downloader import download_to_temp, is_url, print_cleanup_hint, resolve_file_type
 
 app = typer.Typer()
 
@@ -13,9 +16,30 @@ def main():
 
 @app.command()
 def convert(
-    source_file: str = typer.Argument(..., help="Path to the .docx or .pdf file"),
+    source_file: str = typer.Argument(..., help="Path or URL to the .docx or .pdf file"),
     output_dir: str = typer.Option("raw", help="Output directory"),
 ):
+    temp_path: Path | None = None
+
+    if is_url(source_file):
+        parsed = urlparse(source_file)
+        if parsed.scheme == "file":
+            # Percent-decode so URLs such as file:///a%20b.pdf map to real paths.
+            source_file = url2pathname(parsed.path)
+        else:
+            try:
+                suffix = resolve_file_type(source_file)
+                temp_path = download_to_temp(source_file, suffix)
+                if temp_path.stat().st_size == 0:
+                    temp_path.unlink()
+                    typer.echo("Error: downloaded file is empty", err=True)
+                    raise typer.Exit(code=1)
+                source_file = str(temp_path)
+            except (ValueError, OSError) as e:
+                # URLError, HTTPError and socket timeouts all derive from OSError.
+                typer.echo(f"Error: {e}", err=True)
+                raise typer.Exit(code=1)
+
     src = Path(source_file)
     if not src.exists():
         typer.echo(f"Error: file not found: {source_file}", err=True)
@@ -39,37 +63,41 @@ def convert(
 
     markdown_content: str = ""
 
-    if ext == ".pdf":
-        from kbmate_cli.pdf_converter import convert_pdf
+    try:
+        if ext == ".pdf":
+            from kbmate_cli.pdf_converter import convert_pdf
 
-        markdown_content = convert_pdf(str(src), str(assets_dir))
+            markdown_content = convert_pdf(str(src), str(assets_dir))
 
-        from kbmate_cli.image_helper import extract_and_relink_images
+            from kbmate_cli.image_helper import extract_and_relink_images
 
-        markdown_content = extract_and_relink_images(
-            markdown_content, str(assets_dir), str(assets_dir)
-        )
+            markdown_content = extract_and_relink_images(
+                markdown_content, str(assets_dir), str(assets_dir)
+            )
 
-    elif ext == ".docx":
-        from kbmate_cli.docx_converter import convert_docx
+        elif ext == ".docx":
+            from kbmate_cli.docx_converter import convert_docx
 
-        pandoc_output = assets_dir / "pandoc_output"
-        markdown_content = convert_docx(str(src), str(pandoc_output))
+            pandoc_output = assets_dir / "pandoc_output"
+            markdown_content = convert_docx(str(src), str(pandoc_output))
 
-        from kbmate_cli.image_helper import normalize_image_refs, extract_and_relink_images
+            from kbmate_cli.image_helper import normalize_image_refs, extract_and_relink_images
 
-        markdown_content = normalize_image_refs(markdown_content)
-        markdown_content = extract_and_relink_images(
-            markdown_content, str(pandoc_output), str(assets_dir)
-        )
-        if pandoc_output.exists():
-            import shutil
+            markdown_content = normalize_image_refs(markdown_content)
+            markdown_content = extract_and_relink_images(
+                markdown_content, str(pandoc_output), str(assets_dir)
+            )
+            if pandoc_output.exists():
+                import shutil
 
-            shutil.rmtree(pandoc_output)
+                shutil.rmtree(pandoc_output)
 
-    md_path = converts_dir / f"{safe_stem}.md"
-    md_path.write_text(markdown_content, encoding="utf-8")
-    typer.echo(f"Converted: {src} -> {md_path}")
+        md_path = converts_dir / f"{safe_stem}.md"
+        md_path.write_text(markdown_content, encoding="utf-8")
+        typer.echo(f"Converted: {src} -> {md_path}")
+    finally:
+        if temp_path:
+            print_cleanup_hint(temp_path)
 
 
 if __name__ == "__main__":
diff --git a/src/kbmate_cli/url_downloader.py b/src/kbmate_cli/url_downloader.py
new file mode 100644
index 0000000..0567c20
--- /dev/null
+++ b/src/kbmate_cli/url_downloader.py
@@ -0,0 +1,95 @@
+"""Helpers for resolving and downloading remote .pdf/.docx sources."""
+
+import os
+import tempfile
+from pathlib import Path
+from urllib.parse import urlparse
+from urllib.request import Request, urlopen
+
+import typer
+
+_URL_SCHEMES = ("http://", "https://", "file://")
+
+_CONTENT_TYPE_MAP = {
+    "application/pdf": ".pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+}
+
+
+def is_url(s: str) -> bool:
+    """Return True if *s* starts with an http, https or file URL scheme.
+
+    Schemes are case-insensitive, so the comparison is done in lowercase.
+    """
+    return s.lower().startswith(_URL_SCHEMES)
+
+
+def guess_ext_from_url(url: str) -> str | None:
+    """Return ".pdf" or ".docx" based on the URL path suffix, else None.
+
+    Only the path component is inspected, so query strings are ignored.
+    """
+    path = urlparse(url).path
+    ext = Path(path).suffix.lower()
+    return ext if ext in (".pdf", ".docx") else None
+
+
+def probe_content_type(url: str) -> str | None:
+    """Map the Content-Type of a HEAD response to a file extension.
+
+    Returns None when the media type is not a supported document type or
+    when the request fails, so callers can fall back to other heuristics.
+    """
+    req = Request(url, method="HEAD")
+    try:
+        with urlopen(req, timeout=10) as resp:
+            # Media types are case-insensitive; drop parameters such as charset.
+            ct = resp.headers.get("Content-Type", "").split(";")[0].strip().lower()
+            return _CONTENT_TYPE_MAP.get(ct)
+    except OSError:
+        # URLError, HTTPError and socket timeouts all derive from OSError.
+        return None
+
+
+def resolve_file_type(url: str) -> str:
+    """Return the document extension (".pdf" or ".docx") for *url*.
+
+    The Content-Type reported by the server is preferred; the URL path
+    suffix is used as a fallback.
+
+    Raises:
+        ValueError: If neither source identifies a supported file type.
+    """
+    ext = probe_content_type(url) or guess_ext_from_url(url)
+    if not ext:
+        raise ValueError(f"cannot determine file type for URL: {url}")
+    return ext
+
+
+def download_to_temp(url: str, suffix: str) -> Path:
+    """Download *url* into a new temp file ending in *suffix*; return its path.
+
+    The caller owns the returned file and is responsible for deleting it.
+
+    Raises:
+        OSError: On network failure (URLError is a subclass) or write failure.
+    """
+    with urlopen(Request(url), timeout=30) as resp:
+        payload = resp.read()
+    # mkstemp creates the file exclusively with owner-only permissions, which
+    # avoids clobbering or following a pre-existing path in the shared temp dir.
+    fd, name = tempfile.mkstemp(prefix="kbmate-", suffix=suffix)
+    tmp_file = Path(name)
+    try:
+        with os.fdopen(fd, "wb") as out:
+            out.write(payload)
+    except BaseException:
+        # Do not leave a truncated download behind.
+        tmp_file.unlink(missing_ok=True)
+        raise
+    return tmp_file
+
+
+def print_cleanup_hint(path: Path) -> None:
+    """Tell the user where the downloaded file was saved; it is not auto-deleted."""
+    typer.echo(f"临时文件已保存至: {path},如不需要请手动删除")
diff --git a/tests/test_cli.py b/tests/test_cli.py
index f58f931..581ecbf 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from unittest.mock import MagicMock, patch
 
 import pytest
 from typer.testing import CliRunner
@@ -42,3 +43,34 @@ def test_convert_pdf_with_spaces_in_filename():
     assert assets_dir.exists()
     images = list(assets_dir.glob("*"))
     assert len(images) > 0, f"No images found in {assets_dir}"
+
+
+def test_convert_file_url():
+    """A percent-encoded file:// URL should resolve to a local path and convert."""
+    pdf = FIXTURE_DIR / "eigent README CN.pdf"
+    file_url = pdf.resolve().as_uri()
+    result = runner.invoke(app, ["convert", file_url, "--output-dir", "/tmp/test_cli_file_url"])
+    assert result.exit_code == 0, f"Failed with output: {result.output}"
+
+
+@patch("kbmate_cli.url_downloader.urlopen")
+def test_convert_http_url(mock_urlopen):
+    """An http(s) URL should be downloaded first and then converted."""
+    # Mock HEAD probe
+    head_resp = MagicMock()
+    head_resp.headers = {"Content-Type": "application/pdf"}
+    head_resp.__enter__.return_value = head_resp
+
+    # Mock GET download
+    pdf_path = FIXTURE_DIR / "eigent README CN.pdf"
+    download_resp = MagicMock()
+    download_resp.read.return_value = pdf_path.read_bytes()
+    download_resp.__enter__.return_value = download_resp
+
+    mock_urlopen.side_effect = [head_resp, download_resp]
+
+    result = runner.invoke(
+        app, ["convert", "https://example.com/doc.pdf", "--output-dir", "/tmp/test_cli_http_url"]
+    )
+    assert result.exit_code == 0, f"Failed with output: {result.output}"
+    assert "临时文件已保存至" in result.stdout
diff --git a/tests/test_url_downloader.py b/tests/test_url_downloader.py
new file mode 100644
index 0000000..d3ae8aa
--- /dev/null
+++ b/tests/test_url_downloader.py
@@ -0,0 +1,129 @@
+import pytest
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+from urllib.error import URLError
+
+from kbmate_cli.url_downloader import (
+    is_url, guess_ext_from_url,
+    probe_content_type, resolve_file_type,
+    download_to_temp, print_cleanup_hint,
+)
+
+
+class TestIsUrl:
+    def test_http(self):
+        assert is_url("http://example.com/doc.pdf") is True
+
+    def test_https(self):
+        assert is_url("https://example.com/doc.pdf") is True
+
+    def test_file_protocol(self):
+        assert is_url("file:///home/user/doc.pdf") is True
+
+    def test_scheme_is_case_insensitive(self):
+        assert is_url("HTTPS://example.com/doc.pdf") is True
+
+    def test_local_path(self):
+        assert is_url("/home/user/doc.pdf") is False
+
+    def test_relative_path(self):
+        assert is_url("doc.pdf") is False
+
+
+class TestGuessExtFromUrl:
+    def test_pdf(self):
+        assert guess_ext_from_url("https://example.com/doc.pdf") == ".pdf"
+
+    def test_docx(self):
+        assert guess_ext_from_url("https://example.com/report.docx") == ".docx"
+
+    def test_no_ext(self):
+        assert guess_ext_from_url("https://example.com/download") is None
+
+    def test_query_string(self):
+        assert guess_ext_from_url("https://example.com/file.pdf?token=abc") == ".pdf"
+
+
+class TestProbeContentType:
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_pdf_content_type(self, mock_urlopen):
+        mock_resp = MagicMock()
+        mock_resp.headers = {"Content-Type": "application/pdf"}
+        mock_urlopen.return_value.__enter__.return_value = mock_resp
+        assert probe_content_type("https://example.com/doc") == ".pdf"
+
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_docx_content_type(self, mock_urlopen):
+        mock_resp = MagicMock()
+        mock_resp.headers = {"Content-Type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"}
+        mock_urlopen.return_value.__enter__.return_value = mock_resp
+        assert probe_content_type("https://example.com/doc") == ".docx"
+
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_content_type_is_case_insensitive(self, mock_urlopen):
+        mock_resp = MagicMock()
+        mock_resp.headers = {"Content-Type": "Application/PDF; charset=binary"}
+        mock_urlopen.return_value.__enter__.return_value = mock_resp
+        assert probe_content_type("https://example.com/doc") == ".pdf"
+
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_unknown_content_type(self, mock_urlopen):
+        mock_resp = MagicMock()
+        mock_resp.headers = {"Content-Type": "application/octet-stream"}
+        mock_urlopen.return_value.__enter__.return_value = mock_resp
+        assert probe_content_type("https://example.com/doc") is None
+
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_network_error_returns_none(self, mock_urlopen):
+        mock_urlopen.side_effect = URLError("connection failed")
+        assert probe_content_type("https://example.com/doc") is None
+
+
+class TestResolveFileType:
+    @patch("kbmate_cli.url_downloader.probe_content_type")
+    def test_probe_success(self, mock_probe):
+        mock_probe.return_value = ".pdf"
+        assert resolve_file_type("https://example.com/doc") == ".pdf"
+        mock_probe.assert_called_once()
+
+    @patch("kbmate_cli.url_downloader.probe_content_type")
+    def test_probe_fallback_to_url(self, mock_probe):
+        mock_probe.return_value = None
+        assert resolve_file_type("https://example.com/doc.pdf") == ".pdf"
+
+    @patch("kbmate_cli.url_downloader.probe_content_type")
+    def test_no_match_raises(self, mock_probe):
+        mock_probe.return_value = None
+        with pytest.raises(ValueError, match="cannot determine file type"):
+            resolve_file_type("https://example.com/doc")
+
+
+class TestDownloadToTemp:
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_download_success(self, mock_urlopen):
+        mock_resp = MagicMock()
+        mock_resp.read.return_value = b"%PDF-1.4 fake content"
+        mock_resp.__enter__.return_value = mock_resp
+        mock_urlopen.return_value = mock_resp
+
+        result = download_to_temp("https://example.com/doc.pdf", ".pdf")
+        assert isinstance(result, Path)
+        assert result.suffix == ".pdf"
+        assert result.exists()
+        assert result.read_bytes() == b"%PDF-1.4 fake content"
+        result.unlink()
+
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_network_error_raises(self, mock_urlopen):
+        mock_urlopen.side_effect = URLError("connection refused")
+        with pytest.raises(URLError):
+            download_to_temp("https://example.com/doc.pdf", ".pdf")
+
+
+class TestPrintCleanupHint:
+    def test_prints_message(self, capsys):
+        p = Path("/tmp/test_file.pdf")
+        print_cleanup_hint(p)
+        captured = capsys.readouterr()
+        assert "/tmp/test_file.pdf" in captured.out
+        assert "手动删除" in captured.out