Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 48 additions & 23 deletions src/kbmate_cli/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from pathlib import Path
from urllib.error import URLError
from urllib.parse import urlparse

import typer
from pymupdf4llm.helpers.utils import md_path as _md_path
from kbmate_cli.url_downloader import download_to_temp, is_url, print_cleanup_hint, resolve_file_type

app = typer.Typer()

Expand All @@ -13,9 +16,27 @@ def main():

@app.command()
def convert(
source_file: str = typer.Argument(..., help="Path to the .docx or .pdf file"),
source_file: str = typer.Argument(..., help="Path or URL to the .docx or .pdf file"),
output_dir: str = typer.Option("raw", help="Output directory"),
):
temp_path: Path | None = None

if is_url(source_file):
if source_file.startswith("file://"):
source_file = urlparse(source_file).path
else:
try:
suffix = resolve_file_type(source_file)
temp_path = download_to_temp(source_file, suffix)
if temp_path.stat().st_size == 0:
temp_path.unlink()
typer.echo("Error: downloaded file is empty", err=True)
raise typer.Exit(code=1)
source_file = str(temp_path)
except (ValueError, URLError) as e:
typer.echo(f"Error: {e}", err=True)
raise typer.Exit(code=1)

src = Path(source_file)
if not src.exists():
typer.echo(f"Error: file not found: {source_file}", err=True)
Expand All @@ -39,37 +60,41 @@ def convert(

markdown_content: str = ""

if ext == ".pdf":
from kbmate_cli.pdf_converter import convert_pdf
try:
if ext == ".pdf":
from kbmate_cli.pdf_converter import convert_pdf

markdown_content = convert_pdf(str(src), str(assets_dir))
markdown_content = convert_pdf(str(src), str(assets_dir))

from kbmate_cli.image_helper import extract_and_relink_images
from kbmate_cli.image_helper import extract_and_relink_images

markdown_content = extract_and_relink_images(
markdown_content, str(assets_dir), str(assets_dir)
)
markdown_content = extract_and_relink_images(
markdown_content, str(assets_dir), str(assets_dir)
)

elif ext == ".docx":
from kbmate_cli.docx_converter import convert_docx
elif ext == ".docx":
from kbmate_cli.docx_converter import convert_docx

pandoc_output = assets_dir / "pandoc_output"
markdown_content = convert_docx(str(src), str(pandoc_output))
pandoc_output = assets_dir / "pandoc_output"
markdown_content = convert_docx(str(src), str(pandoc_output))

from kbmate_cli.image_helper import normalize_image_refs, extract_and_relink_images
from kbmate_cli.image_helper import normalize_image_refs, extract_and_relink_images

markdown_content = normalize_image_refs(markdown_content)
markdown_content = extract_and_relink_images(
markdown_content, str(pandoc_output), str(assets_dir)
)
if pandoc_output.exists():
import shutil
markdown_content = normalize_image_refs(markdown_content)
markdown_content = extract_and_relink_images(
markdown_content, str(pandoc_output), str(assets_dir)
)
if pandoc_output.exists():
import shutil

shutil.rmtree(pandoc_output)
shutil.rmtree(pandoc_output)

md_path = converts_dir / f"{safe_stem}.md"
md_path.write_text(markdown_content, encoding="utf-8")
typer.echo(f"Converted: {src} -> {md_path}")
md_path = converts_dir / f"{safe_stem}.md"
md_path.write_text(markdown_content, encoding="utf-8")
typer.echo(f"Converted: {src} -> {md_path}")
finally:
if temp_path:
print_cleanup_hint(temp_path)


if __name__ == "__main__":
Expand Down
57 changes: 57 additions & 0 deletions src/kbmate_cli/url_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import Request, urlopen
from urllib.error import URLError
import tempfile
import uuid

import typer


def is_url(s: str) -> bool:
    """Return True when *s* begins with an ``http://``, ``https://`` or ``file://`` prefix.

    The check is a plain, case-sensitive prefix match; the string is not
    parsed or validated as a URL.
    """
    recognised_prefixes = ("http://", "https://", "file://")
    return s.startswith(recognised_prefixes)


def guess_ext_from_url(url: str) -> str | None:
path = urlparse(url).path
ext = Path(path).suffix.lower()
return ext if ext in (".pdf", ".docx") else None


_CONTENT_TYPE_MAP = {
"application/pdf": ".pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
}


def probe_content_type(url: str) -> str | None:
req = Request(url, method="HEAD")
try:
with urlopen(req, timeout=10) as resp:
ct = resp.headers.get("Content-Type", "").split(";")[0].strip()
return _CONTENT_TYPE_MAP.get(ct)
except URLError:
return None


def resolve_file_type(url: str) -> str:
    """Determine which supported extension the document behind *url* has.

    The Content-Type probe is tried first; the URL path suffix is the
    fallback.

    Returns:
        The first non-empty extension produced by a detector.

    Raises:
        ValueError: If neither detector yields an extension.
    """
    # Order matters: server-reported type takes precedence over the URL path.
    for detect in (probe_content_type, guess_ext_from_url):
        found = detect(url)
        if found:
            return found
    raise ValueError(f"cannot determine file type for URL: {url}")


def download_to_temp(url: str, suffix: str) -> Path:
    """Fetch *url* and store the body in a uniquely named file in the temp directory.

    The file is named ``kbmate-<uuid4 hex><suffix>`` and is created directly
    inside ``tempfile.gettempdir()``. The whole response body is read into
    memory before it is written.

    Args:
        url: Location to download from.
        suffix: Extension (including the leading dot) appended to the name.

    Returns:
        Path of the written file. The caller owns the file; nothing here
        deletes it.
    """
    unique_name = f"kbmate-{uuid.uuid4().hex}{suffix}"
    destination = Path(tempfile.gettempdir()) / unique_name
    with urlopen(Request(url), timeout=30) as response:
        payload = response.read()
        destination.write_bytes(payload)
    return destination


def print_cleanup_hint(path: Path) -> None:
    """Echo a notice giving the location of the temp file at *path*.

    The message tells the user to delete the file manually if it is not
    needed; this function itself does not remove anything.
    """
    hint = f"临时文件已保存至: {path},如不需要请手动删除"
    typer.echo(hint)
32 changes: 32 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
from typer.testing import CliRunner
Expand Down Expand Up @@ -42,3 +43,34 @@ def test_convert_pdf_with_spaces_in_filename():
assert assets_dir.exists()
images = list(assets_dir.glob("*"))
assert len(images) > 0, f"No images found in {assets_dir}"


def test_convert_file_url():
    """A file:// URL is resolved to a local path and converts successfully."""
    fixture_pdf = FIXTURE_DIR / "eigent README CN.pdf"
    # Built by hand rather than with as_uri() so the spaces in the fixture
    # name stay unescaped in the URL.
    file_url = f"file://{fixture_pdf.resolve()}"
    cli_args = ["convert", file_url, "--output-dir", "/tmp/test_cli_file_url"]

    result = runner.invoke(app, cli_args)

    assert result.exit_code == 0, f"Failed with output: {result.output}"


@patch("kbmate_cli.url_downloader.urlopen")
def test_convert_http_url(mock_urlopen):
    """An https URL is downloaded to a temp file and then converted."""
    fixture_pdf = FIXTURE_DIR / "eigent README CN.pdf"

    # Response returned for the HEAD content-type probe.
    probe_response = MagicMock()
    probe_response.headers = {"Content-Type": "application/pdf"}
    probe_response.__enter__.return_value = probe_response

    # Response returned for the GET that fetches the document body.
    body_response = MagicMock()
    body_response.read.return_value = fixture_pdf.read_bytes()
    body_response.__enter__.return_value = body_response

    # urlopen is expected to be called twice: probe first, download second.
    mock_urlopen.side_effect = [probe_response, body_response]

    cli_args = [
        "convert",
        "https://example.com/doc.pdf",
        "--output-dir",
        "/tmp/test_cli_http_url",
    ]
    result = runner.invoke(app, cli_args)

    assert result.exit_code == 0, f"Failed with output: {result.output}"
    assert "临时文件已保存至" in result.stdout
119 changes: 119 additions & 0 deletions tests/test_url_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock
from urllib.error import URLError

from kbmate_cli.url_downloader import (
is_url, guess_ext_from_url,
probe_content_type, resolve_file_type,
download_to_temp, print_cleanup_hint,
)


class TestIsUrl:
    """is_url accepts http/https/file prefixes and rejects bare paths."""

    def test_http(self):
        outcome = is_url("http://example.com/doc.pdf")
        assert outcome is True

    def test_https(self):
        outcome = is_url("https://example.com/doc.pdf")
        assert outcome is True

    def test_file_protocol(self):
        outcome = is_url("file:///home/user/doc.pdf")
        assert outcome is True

    def test_local_path(self):
        outcome = is_url("/home/user/doc.pdf")
        assert outcome is False

    def test_relative_path(self):
        outcome = is_url("doc.pdf")
        assert outcome is False


class TestGuessExtFromUrl:
    """guess_ext_from_url reads the extension from the URL path only."""

    def test_pdf(self):
        guessed = guess_ext_from_url("https://example.com/doc.pdf")
        assert guessed == ".pdf"

    def test_docx(self):
        guessed = guess_ext_from_url("https://example.com/report.docx")
        assert guessed == ".docx"

    def test_no_ext(self):
        guessed = guess_ext_from_url("https://example.com/download")
        assert guessed is None

    def test_query_string(self):
        # The query string must not hide the extension in the path.
        guessed = guess_ext_from_url("https://example.com/file.pdf?token=abc")
        assert guessed == ".pdf"


class TestProbeContentType:
    """probe_content_type maps the HEAD response's Content-Type to an extension."""

    @patch("kbmate_cli.url_downloader.urlopen")
    def test_pdf_content_type(self, mock_urlopen):
        response = MagicMock(headers={"Content-Type": "application/pdf"})
        mock_urlopen.return_value.__enter__.return_value = response

        assert probe_content_type("https://example.com/doc") == ".pdf"

    @patch("kbmate_cli.url_downloader.urlopen")
    def test_docx_content_type(self, mock_urlopen):
        docx_media_type = (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )
        response = MagicMock(headers={"Content-Type": docx_media_type})
        mock_urlopen.return_value.__enter__.return_value = response

        assert probe_content_type("https://example.com/doc") == ".docx"

    @patch("kbmate_cli.url_downloader.urlopen")
    def test_unknown_content_type(self, mock_urlopen):
        response = MagicMock(headers={"Content-Type": "application/octet-stream"})
        mock_urlopen.return_value.__enter__.return_value = response

        assert probe_content_type("https://example.com/doc") is None

    @patch("kbmate_cli.url_downloader.urlopen")
    def test_network_error_returns_none(self, mock_urlopen):
        # A failing urlopen must be swallowed and reported as "unknown".
        mock_urlopen.side_effect = URLError("connection failed")

        assert probe_content_type("https://example.com/doc") is None


class TestResolveFileType:
    """resolve_file_type prefers the probe, then the URL suffix, else raises."""

    @patch("kbmate_cli.url_downloader.probe_content_type")
    def test_probe_success(self, mock_probe):
        mock_probe.return_value = ".pdf"

        resolved = resolve_file_type("https://example.com/doc")

        assert resolved == ".pdf"
        mock_probe.assert_called_once()

    @patch("kbmate_cli.url_downloader.probe_content_type")
    def test_probe_fallback_to_url(self, mock_probe):
        # With no answer from the probe, the ".pdf" suffix in the URL decides.
        mock_probe.return_value = None

        resolved = resolve_file_type("https://example.com/doc.pdf")

        assert resolved == ".pdf"

    @patch("kbmate_cli.url_downloader.probe_content_type")
    def test_no_match_raises(self, mock_probe):
        mock_probe.return_value = None

        with pytest.raises(ValueError, match="cannot determine file type"):
            resolve_file_type("https://example.com/doc")


class TestDownloadToTemp:
    """download_to_temp writes the response body to a temp file or propagates errors."""

    @patch("kbmate_cli.url_downloader.urlopen")
    def test_download_success(self, mock_urlopen):
        payload = b"%PDF-1.4 fake content"
        response = MagicMock()
        response.read.return_value = payload
        # The response is used as a context manager that yields itself.
        response.__enter__.return_value = response
        mock_urlopen.return_value = response

        saved = download_to_temp("https://example.com/doc.pdf", ".pdf")

        assert isinstance(saved, Path)
        assert saved.suffix == ".pdf"
        assert saved.exists()
        assert saved.read_bytes() == payload
        # Remove the file the call under test created.
        saved.unlink()

    @patch("kbmate_cli.url_downloader.urlopen")
    def test_network_error_raises(self, mock_urlopen):
        mock_urlopen.side_effect = URLError("connection refused")

        with pytest.raises(URLError):
            download_to_temp("https://example.com/doc.pdf", ".pdf")


class TestPrintCleanupHint:
    """print_cleanup_hint writes the temp-file notice to stdout."""

    def test_prints_message(self, capsys):
        temp_file = Path("/tmp/test_file.pdf")

        print_cleanup_hint(temp_file)

        stdout_text = capsys.readouterr().out
        assert "/tmp/test_file.pdf" in stdout_text
        assert "手动删除" in stdout_text