yandy · yandy · May 25, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/src/kbmate_cli/main.py b/src/kbmate_cli/main.py
@@ -1,7 +1,10 @@
 from pathlib import Path
+from urllib.error import URLError
+from urllib.parse import unquote, urlparse
 
 import typer
 from pymupdf4llm.helpers.utils import md_path as _md_path
+from kbmate_cli.url_downloader import download_to_temp, is_url, print_cleanup_hint, resolve_file_type
 
 app = typer.Typer()
 
@@ -13,9 +16,27 @@ def main():
 
 @app.command()
 def convert(
-    source_file: str = typer.Argument(..., help="Path to the .docx or .pdf file"),
+    source_file: str = typer.Argument(..., help="Path or URL to the .docx or .pdf file"),
     output_dir: str = typer.Option("raw", help="Output directory"),
 ):
+    temp_path: Path | None = None  # set only when a remote file gets downloaded
+
+    if is_url(source_file):
+        if source_file.startswith("file://"):
+            source_file = unquote(urlparse(source_file).path)  # decode %XX escapes
+        else:
+            try:
+                suffix = resolve_file_type(source_file)
+                temp_path = download_to_temp(source_file, suffix)
+                if temp_path.stat().st_size == 0:
+                    temp_path.unlink()
+                    typer.echo("Error: downloaded file is empty", err=True)
+                    raise typer.Exit(code=1)
+                source_file = str(temp_path)
+            except (ValueError, URLError, TimeoutError) as e:  # read timeouts bypass URLError
+                typer.echo(f"Error: {e}", err=True)
+                raise typer.Exit(code=1)
+
     src = Path(source_file)
     if not src.exists():
         typer.echo(f"Error: file not found: {source_file}", err=True)
@@ -39,37 +60,41 @@ def convert(
 
     markdown_content: str = ""
 
-    if ext == ".pdf":
-        from kbmate_cli.pdf_converter import convert_pdf
+    try:
+        if ext == ".pdf":
+            from kbmate_cli.pdf_converter import convert_pdf
 
-        markdown_content = convert_pdf(str(src), str(assets_dir))
+            markdown_content = convert_pdf(str(src), str(assets_dir))
 
-        from kbmate_cli.image_helper import extract_and_relink_images
+            from kbmate_cli.image_helper import extract_and_relink_images
 
-        markdown_content = extract_and_relink_images(
-            markdown_content, str(assets_dir), str(assets_dir)
-        )
+            markdown_content = extract_and_relink_images(
+                markdown_content, str(assets_dir), str(assets_dir)
+            )
 
-    elif ext == ".docx":
-        from kbmate_cli.docx_converter import convert_docx
+        elif ext == ".docx":
+            from kbmate_cli.docx_converter import convert_docx
 
-        pandoc_output = assets_dir / "pandoc_output"
-        markdown_content = convert_docx(str(src), str(pandoc_output))
+            pandoc_output = assets_dir / "pandoc_output"
+            markdown_content = convert_docx(str(src), str(pandoc_output))
 
-        from kbmate_cli.image_helper import normalize_image_refs, extract_and_relink_images
+            from kbmate_cli.image_helper import normalize_image_refs, extract_and_relink_images
 
-        markdown_content = normalize_image_refs(markdown_content)
-        markdown_content = extract_and_relink_images(
-            markdown_content, str(pandoc_output), str(assets_dir)
-        )
-        if pandoc_output.exists():
-            import shutil
+            markdown_content = normalize_image_refs(markdown_content)
+            markdown_content = extract_and_relink_images(
+                markdown_content, str(pandoc_output), str(assets_dir)
+            )
+            if pandoc_output.exists():
+                import shutil
 
-            shutil.rmtree(pandoc_output)
+                shutil.rmtree(pandoc_output)
 
-    md_path = converts_dir / f"{safe_stem}.md"
-    md_path.write_text(markdown_content, encoding="utf-8")
-    typer.echo(f"Converted: {src} -> {md_path}")
+        md_path = converts_dir / f"{safe_stem}.md"
+        md_path.write_text(markdown_content, encoding="utf-8")
+        typer.echo(f"Converted: {src} -> {md_path}")
+    finally:
+        if temp_path:
+            print_cleanup_hint(temp_path)
 
 
 if __name__ == "__main__":

diff --git a/src/kbmate_cli/url_downloader.py b/src/kbmate_cli/url_downloader.py
@@ -0,0 +1,95 @@
+"""Helpers for fetching a remote .pdf/.docx document into a local temp file."""
+
+import tempfile
+import uuid
+from pathlib import Path
+from urllib.error import URLError
+from urllib.parse import urlparse
+from urllib.request import Request, urlopen
+
+import typer
+
+
+def is_url(s: str) -> bool:
+    """Return True if *s* starts with http://, https:// or file://."""
+    return s.startswith(("http://", "https://", "file://"))
+
+
+def guess_ext_from_url(url: str) -> str | None:
+    """Return ".pdf" or ".docx" based on the URL path suffix, else None.
+
+    Only the path component is inspected, so a query string does not
+    affect the result. The comparison is case-insensitive.
+    """
+    ext = Path(urlparse(url).path).suffix.lower()
+    return ext if ext in (".pdf", ".docx") else None
+
+
+_CONTENT_TYPE_MAP = {
+    "application/pdf": ".pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+}
+
+
+def probe_content_type(url: str) -> str | None:
+    """Issue a HEAD request and map its Content-Type to an extension.
+
+    Returns ".pdf" or ".docx" for a recognised media type. Returns None
+    for any other type, or when the request fails or times out, so the
+    caller can fall back to another detection strategy.
+    """
+    req = Request(url, method="HEAD")
+    try:
+        with urlopen(req, timeout=10) as resp:
+            raw = resp.headers.get("Content-Type", "")
+    except (URLError, TimeoutError):
+        # A timeout while waiting for the response surfaces as a bare
+        # TimeoutError rather than URLError, so both are handled here.
+        return None
+    # Strip parameters such as "; charset=..." and lower-case the value:
+    # media type names are case-insensitive.
+    media_type = raw.split(";")[0].strip().lower()
+    return _CONTENT_TYPE_MAP.get(media_type)
+
+
+def resolve_file_type(url: str) -> str:
+    """Determine the document extension (".pdf" or ".docx") for *url*.
+
+    The Content-Type reported by a HEAD request takes precedence; the
+    URL path suffix is used as the fallback.
+
+    Raises:
+        ValueError: if neither source identifies a supported type.
+    """
+    ext = probe_content_type(url) or guess_ext_from_url(url)
+    if not ext:
+        raise ValueError(f"cannot determine file type for URL: {url}")
+    return ext
+
+
+def download_to_temp(url: str, suffix: str) -> Path:
+    """Download *url* to a uniquely named file in the system temp dir.
+
+    The file is named ``kbmate-<random hex><suffix>``. It is not removed
+    by this function on success; the caller owns the returned path. The
+    whole response body is read into memory before being written.
+
+    Raises:
+        URLError: propagated from ``urlopen`` when the request fails.
+        OSError: if the temp file cannot be written.
+    """
+    tmp_file = Path(tempfile.gettempdir()) / f"kbmate-{uuid.uuid4().hex}{suffix}"
+    with urlopen(Request(url), timeout=30) as resp:
+        payload = resp.read()
+    try:
+        tmp_file.write_bytes(payload)
+    except OSError:
+        # Do not leave a truncated file behind if the write fails part-way.
+        tmp_file.unlink(missing_ok=True)
+        raise
+    return tmp_file
+
+
+def print_cleanup_hint(path: Path) -> None:
+    """Print where the temp file was saved and that it must be deleted manually."""
+    typer.echo(f"临时文件已保存至: {path}，如不需要请手动删除")
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from unittest.mock import MagicMock, patch
 
 import pytest
 from typer.testing import CliRunner
@@ -42,3 +43,34 @@ def test_convert_pdf_with_spaces_in_filename():
     assert assets_dir.exists()
     images = list(assets_dir.glob("*"))
     assert len(images) > 0, f"No images found in {assets_dir}"
+
+
+def test_convert_file_url(tmp_path):
+    """A file:// URL should resolve to a local path and convert normally."""
+    pdf = FIXTURE_DIR / "eigent README CN.pdf"
+    file_url = f"file://{pdf.resolve()}"
+    result = runner.invoke(app, ["convert", file_url, "--output-dir", str(tmp_path)])
+    assert result.exit_code == 0, f"Failed with output: {result.output}"
+
+
+@patch("kbmate_cli.url_downloader.urlopen")
+def test_convert_http_url(mock_urlopen, tmp_path):
+    """An http(s) URL should be downloaded and then converted."""
+    # First urlopen call: the HEAD probe that reports the content type.
+    head_resp = MagicMock()
+    head_resp.headers = {"Content-Type": "application/pdf"}
+    head_resp.__enter__.return_value = head_resp
+
+    # Second urlopen call: the GET download returning the fixture's bytes.
+    pdf_path = FIXTURE_DIR / "eigent README CN.pdf"
+    download_resp = MagicMock()
+    download_resp.read.return_value = pdf_path.read_bytes()
+    download_resp.__enter__.return_value = download_resp
+
+    mock_urlopen.side_effect = [head_resp, download_resp]
+
+    result = runner.invoke(
+        app, ["convert", "https://example.com/doc.pdf", "--output-dir", str(tmp_path)]
+    )
+    assert result.exit_code == 0, f"Failed with output: {result.output}"
+    assert "临时文件已保存至" in result.stdout
diff --git a/tests/test_url_downloader.py b/tests/test_url_downloader.py
@@ -0,0 +1,136 @@
+"""Unit tests for kbmate_cli.url_downloader."""
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+from urllib.error import URLError
+
+import pytest
+
+from kbmate_cli.url_downloader import (
+    download_to_temp,
+    guess_ext_from_url,
+    is_url,
+    print_cleanup_hint,
+    probe_content_type,
+    resolve_file_type,
+)
+
+DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+
+class TestIsUrl:
+    """is_url accepts http://, https:// and file:// prefixes only."""
+
+    @pytest.mark.parametrize(
+        "value",
+        [
+            "http://example.com/doc.pdf",
+            "https://example.com/doc.pdf",
+            "file:///home/user/doc.pdf",
+        ],
+    )
+    def test_url_schemes(self, value):
+        assert is_url(value) is True
+
+    @pytest.mark.parametrize("value", ["/home/user/doc.pdf", "doc.pdf"])
+    def test_plain_paths(self, value):
+        assert is_url(value) is False
+
+
+class TestGuessExtFromUrl:
+    """guess_ext_from_url inspects only the suffix of the URL path."""
+
+    @pytest.mark.parametrize(
+        ("url", "expected"),
+        [
+            ("https://example.com/doc.pdf", ".pdf"),
+            ("https://example.com/report.docx", ".docx"),
+            ("https://example.com/download", None),
+            ("https://example.com/file.pdf?token=abc", ".pdf"),
+        ],
+    )
+    def test_extension(self, url, expected):
+        assert guess_ext_from_url(url) == expected
+
+
+class TestProbeContentType:
+    """probe_content_type maps a HEAD response's Content-Type to an extension."""
+
+    @pytest.mark.parametrize(
+        ("content_type", "expected"),
+        [
+            ("application/pdf", ".pdf"),
+            (DOCX_MIME, ".docx"),
+            ("application/octet-stream", None),
+        ],
+    )
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_content_type_mapping(self, mock_urlopen, content_type, expected):
+        mock_resp = MagicMock()
+        mock_resp.headers = {"Content-Type": content_type}
+        mock_urlopen.return_value.__enter__.return_value = mock_resp
+        assert probe_content_type("https://example.com/doc") == expected
+
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_network_error_returns_none(self, mock_urlopen):
+        mock_urlopen.side_effect = URLError("connection failed")
+        assert probe_content_type("https://example.com/doc") is None
+
+
+class TestResolveFileType:
+    """resolve_file_type prefers the probed type and falls back to the URL suffix."""
+
+    @patch("kbmate_cli.url_downloader.probe_content_type")
+    def test_probe_success(self, mock_probe):
+        mock_probe.return_value = ".pdf"
+        assert resolve_file_type("https://example.com/doc") == ".pdf"
+        mock_probe.assert_called_once()
+
+    @patch("kbmate_cli.url_downloader.probe_content_type")
+    def test_probe_fallback_to_url(self, mock_probe):
+        mock_probe.return_value = None
+        assert resolve_file_type("https://example.com/doc.pdf") == ".pdf"
+
+    @patch("kbmate_cli.url_downloader.probe_content_type")
+    def test_no_match_raises(self, mock_probe):
+        mock_probe.return_value = None
+        with pytest.raises(ValueError, match="cannot determine file type"):
+            resolve_file_type("https://example.com/doc")
+
+
+class TestDownloadToTemp:
+    """download_to_temp writes the response body to a temp file with the given suffix."""
+
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_download_success(self, mock_urlopen):
+        mock_resp = MagicMock()
+        mock_resp.read.return_value = b"%PDF-1.4 fake content"
+        mock_resp.__enter__.return_value = mock_resp
+        mock_urlopen.return_value = mock_resp
+
+        result = download_to_temp("https://example.com/doc.pdf", ".pdf")
+        try:
+            assert isinstance(result, Path)
+            assert result.suffix == ".pdf"
+            assert result.exists()
+            assert result.read_bytes() == b"%PDF-1.4 fake content"
+        finally:
+            # Remove the temp file even when an assertion above fails.
+            result.unlink(missing_ok=True)
+
+    @patch("kbmate_cli.url_downloader.urlopen")
+    def test_network_error_raises(self, mock_urlopen):
+        mock_urlopen.side_effect = URLError("connection refused")
+        with pytest.raises(URLError):
+            download_to_temp("https://example.com/doc.pdf", ".pdf")
+
+
+class TestPrintCleanupHint:
+    """print_cleanup_hint echoes the path together with the manual-delete notice."""
+
+    def test_prints_message(self, capsys):
+        p = Path("/tmp/test_file.pdf")
+        print_cleanup_hint(p)
+        captured = capsys.readouterr()
+        assert "/tmp/test_file.pdf" in captured.out
+        assert "手动删除" in captured.out