From c8b1c9d808af285ebbc7ac2459745a6c2cd2d925 Mon Sep 17 00:00:00 2001 From: graycreate Date: Mon, 1 Dec 2025 10:23:28 +0800 Subject: [PATCH 1/2] feat: add support for additional HTML tags in RichView MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for previously unsupported HTML tags: - Table support (table, thead, tbody, tfoot, tr, th, td) - Strikethrough (del, s, strike) - Underline (u, ins) - Superscript/subscript (sup, sub) - Mark/highlight (mark) - Definition lists (dl, dt, dd) - Semantic elements (abbr, cite, kbd, samp, var, small) - Figure elements (figure, figcaption) - Document structure (address, time, details, summary) - Container elements (article, section, nav, aside, header, footer, main, caption) Also added corresponding tests for all new HTML tags and strikethrough/highlight rendering in MarkdownRenderer. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../Converters/HTMLToMarkdownConverter.swift | 197 +++++++++++++++- .../RichView/Renderers/MarkdownRenderer.swift | 136 +++++++++++ .../HTMLToMarkdownConverterTests.swift | 220 ++++++++++++++++++ 3 files changed, 552 insertions(+), 1 deletion(-) diff --git a/V2er/Sources/RichView/Converters/HTMLToMarkdownConverter.swift b/V2er/Sources/RichView/Converters/HTMLToMarkdownConverter.swift index 8f4852c..0bd989b 100644 --- a/V2er/Sources/RichView/Converters/HTMLToMarkdownConverter.swift +++ b/V2er/Sources/RichView/Converters/HTMLToMarkdownConverter.swift @@ -173,8 +173,117 @@ public class HTMLToMarkdownConverter { case "hr": result += "\n---\n" + // Table support + case "table": + result += try convertTable(childElement) + + case "thead", "tbody", "tfoot": + // These are handled by table, but if encountered alone, process children + result += try convertElement(childElement) + + case "tr", "th", "td": + // These should be handled by table, but if encountered alone, process children + result += try convertElement(childElement) + + // Strikethrough + case "del", "s", "strike": + let content = try convertElement(childElement) + result += "~~\(content)~~" + + // Underline - no standard markdown, render as emphasized text + case "u", "ins": + let content = try convertElement(childElement) + result += "_\(content)_" + + // Superscript/subscript - render with markers + case "sup": + let content = try convertElement(childElement) + result += "^\(content)" + + case "sub": + let content = try convertElement(childElement) + result += "~\(content)" + + // Mark/highlight - render with markers + case "mark": + let content = try convertElement(childElement) + result += "==\(content)==" + + // Definition list + case "dl": + result += try convertDefinitionList(childElement) + + case "dt": + let content = try convertElement(childElement) + result += "\n**\(content)**\n" + + case "dd": + let content = try convertElement(childElement) + result += ": \(content)\n" + + // Abbreviation - just show the text with title + case "abbr": + let content = try convertElement(childElement) + if let title = try? childElement.attr("title"), !title.isEmpty { + result += "\(content) (\(title))" + } else { + result += content + } + + // Citation + case "cite": + let content = try convertElement(childElement) + result += "*\(content)*" + + // Keyboard input + case "kbd": + let content = try convertElement(childElement) + result += "`\(content)`" + + // Sample output + case "samp": + let content = try convertElement(childElement) + result += "`\(content)`" + + // Variable + case "var": + let content = try convertElement(childElement) + result += "*\(content)*" + + // Small text + case "small": + let content = try convertElement(childElement) + result += content + + // Figure and figcaption + case "figure": + result += try convertElement(childElement) + + case "figcaption": + let content = try convertElement(childElement) + result += "\n*\(content)*\n" + + // Address + case "address": + let content = try convertElement(childElement) + result += "\n*\(content)*\n" + + // Time - just show the text + case "time": + let content = try convertElement(childElement) + result += content + + // Details/summary - collapsible sections + case "details": + result += try convertElement(childElement) + + case "summary": + let content = try convertElement(childElement) + result += "\n**\(content)**\n" + // Container elements - just process children - case "div", "span", "body", "html": + case "div", "span", "body", "html", "article", "section", "nav", "aside", + "header", "footer", "main", "caption": result += try convertElement(childElement) default: @@ -212,6 +321,92 @@ public class HTMLToMarkdownConverter { return result } + /// Convert table to Markdown + private func convertTable(_ element: Element) throws -> String { + var result = "\n" + var rows: [[String]] = [] + var headerRowCount = 0 + + // Get all rows from thead and tbody + let allRows = try element.select("tr") + + for row in allRows { + var cells: [String] = [] + let thCells = try row.select("th") + let isHeaderRow = row.parent()?.tagName().lowercased() == "thead" + || !thCells.isEmpty() + + // Get th and td cells + for cell in row.children() { + let tagName = cell.tagName().lowercased() + if tagName == "th" || tagName == "td" { + let content = try convertElement(cell) + .replacingOccurrences(of: "\n", with: " ") + .trimmingCharacters(in: .whitespaces) + cells.append(content) + } + } + + if !cells.isEmpty { + rows.append(cells) + if isHeaderRow && headerRowCount == 0 { + headerRowCount = 1 + } + } + } + + guard !rows.isEmpty else { return "" } + + // Calculate column widths + let columnCount = rows.map { $0.count }.max() ?? 0 + guard columnCount > 0 else { return "" } + + // Normalize rows to have the same column count + let normalizedRows = rows.map { row -> [String] in + var normalized = row + while normalized.count < columnCount { + normalized.append("") + } + return normalized + } + + // Build markdown table + for (index, row) in normalizedRows.enumerated() { + result += "| " + row.joined(separator: " | ") + " |\n" + + // Add separator after header row + if index == 0 { + let separator = Array(repeating: "---", count: columnCount) + result += "| " + separator.joined(separator: " | ") + " |\n" + } + } + + result += "\n" + return result + } + + /// Convert definition list to Markdown + private func convertDefinitionList(_ element: Element) throws -> String { + var result = "\n" + + for child in element.children() { + let tagName = child.tagName().lowercased() + let content = try convertElement(child) + + switch tagName { + case "dt": + result += "\n**\(content)**\n" + case "dd": + result += ": \(content)\n" + default: + result += content + } + } + + result += "\n" + return result + } + /// Escape special Markdown characters private func escapeMarkdown(_ text: String) -> String { // Only escape characters that would cause markdown parsing issues diff --git a/V2er/Sources/RichView/Renderers/MarkdownRenderer.swift b/V2er/Sources/RichView/Renderers/MarkdownRenderer.swift index 1d99ac4..81d681e 100644 --- a/V2er/Sources/RichView/Renderers/MarkdownRenderer.swift +++ b/V2er/Sources/RichView/Renderers/MarkdownRenderer.swift @@ -80,6 +80,12 @@ public class MarkdownRenderer { } else if line.starts(with: "---") { // Horizontal rule attributedString.append(AttributedString("—————————————\n")) + } else if line.starts(with: "|") && line.hasSuffix("|") { + // Markdown table + let (tableBlock, linesConsumed) = extractTableBlock(lines, startIndex: index) + attributedString.append(renderTable(tableBlock)) + index += linesConsumed + continue } else { // Regular paragraph with inline formatting attributedString.append(renderInlineMarkdown(line)) @@ -296,6 +302,46 @@ public class MarkdownRenderer { continue } + // Check for strikethrough + if let strikeMatch = currentText.firstMatch(of: /~~(.+?)~~/) { + // Add text before strikethrough + let beforeRange = currentText.startIndex.. ([[String]], Int) { + var rows: [[String]] = [] + var index = startIndex + + while index < lines.count { + let line = lines[index] + + // Check if line is a table row + guard line.starts(with: "|") && line.hasSuffix("|") else { + break + } + + // Skip separator row (| --- | --- |) + if line.contains("---") { + index += 1 + continue + } + + // Parse cells + let cells = line + .trimmingCharacters(in: CharacterSet(charactersIn: "|")) + .components(separatedBy: "|") + .map { $0.trimmingCharacters(in: .whitespaces) } + + if !cells.isEmpty { + rows.append(cells) + } + + index += 1 + } + + return (rows, index - startIndex) + } + + /// Render markdown table + private func renderTable(_ rows: [[String]]) -> AttributedString { + guard !rows.isEmpty else { return AttributedString() } + + var result = AttributedString("\n") + + // Get column count + let columnCount = rows.map { $0.count }.max() ?? 0 + guard columnCount > 0 else { return AttributedString() } + + // Calculate column widths for alignment + var columnWidths: [Int] = Array(repeating: 0, count: columnCount) + for row in rows { + for (i, cell) in row.enumerated() where i < columnCount { + columnWidths[i] = max(columnWidths[i], cell.count) + } + } + + for (rowIndex, row) in rows.enumerated() { + // Render each cell + for (cellIndex, cell) in row.enumerated() { + // Add cell content + var cellText = renderInlineMarkdown(cell) + + // Apply header style for first row + if rowIndex == 0 { + cellText.font = .system(size: stylesheet.body.fontSize, weight: .semibold) + } + + result.append(cellText) + + // Add separator between cells + if cellIndex < row.count - 1 { + var separator = AttributedString(" │ ") + separator.foregroundColor = Color.gray.opacity(0.5) + result.append(separator) + } + } + + result.append(AttributedString("\n")) + + // Add separator line after header + if rowIndex == 0 && rows.count > 1 { + var separatorLine = AttributedString(String(repeating: "─", count: 40) + "\n") + separatorLine.foregroundColor = Color.gray.opacity(0.3) + result.append(separatorLine) + } + } + + result.append(AttributedString("\n")) + return result + } } diff --git a/V2erTests/RichView/HTMLToMarkdownConverterTests.swift b/V2erTests/RichView/HTMLToMarkdownConverterTests.swift index ac37b8e..34f6f7a 100644 --- a/V2erTests/RichView/HTMLToMarkdownConverterTests.swift +++ b/V2erTests/RichView/HTMLToMarkdownConverterTests.swift @@ -233,6 +233,214 @@ class HTMLToMarkdownConverterTests: XCTestCase { XCTAssertTrue(markdown.contains("\\#")) } + // MARK: - Table Tests + + func testBasicTableConversion() throws { + let html = """ + + + +
Header 1Header 2
Cell 1Cell 2
+ """ + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("| Header 1 | Header 2 |")) + XCTAssertTrue(markdown.contains("| --- | --- |")) + XCTAssertTrue(markdown.contains("| Cell 1 | Cell 2 |")) + } + + func testTableWithTheadTbody() throws { + let html = """ + + + +
NameValue
Item100
+ """ + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("| Name | Value |")) + XCTAssertTrue(markdown.contains("| Item | 100 |")) + } + + func testTableWithMultipleRows() throws { + let html = """ + + + + +
功能模块详细说明
多种格式EPUB/MOBI/AZW3
数据同步多端覆盖
+ """ + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("功能模块")) + XCTAssertTrue(markdown.contains("多种格式")) + XCTAssertTrue(markdown.contains("数据同步")) + } + + // MARK: - Strikethrough Tests + + func testDelTagConversion() throws { + let html = "Deleted text" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("~~Deleted text~~")) + } + + func testSTagConversion() throws { + let html = "Strikethrough text" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("~~Strikethrough text~~")) + } + + func testStrikeTagConversion() throws { + let html = "Old strike tag" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("~~Old strike tag~~")) + } + + // MARK: - Underline Tests + + func testUnderlineTagConversion() throws { + let html = "Underlined text" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("_Underlined text_")) + } + + func testInsTagConversion() throws { + let html = "Inserted text" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("_Inserted text_")) + } + + // MARK: - Superscript/Subscript Tests + + func testSuperscriptConversion() throws { + let html = "x2" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("^2")) + } + + func testSubscriptConversion() throws { + let html = "H2O" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("~2")) + } + + // MARK: - Mark/Highlight Tests + + func testMarkTagConversion() throws { + let html = "Highlighted text" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("==Highlighted text==")) + } + + // MARK: - Definition List Tests + + func testDefinitionListConversion() throws { + let html = """ +
+
Term
+
Definition of the term
+
+ """ + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("**Term**")) + XCTAssertTrue(markdown.contains(": Definition of the term")) + } + + // MARK: - Semantic Element Tests + + func testAbbreviationWithTitle() throws { + let html = "HTML" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("HTML")) + XCTAssertTrue(markdown.contains("HyperText Markup Language")) + } + + func testCiteTagConversion() throws { + let html = "Book Title" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("*Book Title*")) + } + + func testKbdTagConversion() throws { + let html = "Press Ctrl+C" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("`Ctrl`")) + XCTAssertTrue(markdown.contains("`C`")) + } + + func testSampTagConversion() throws { + let html = "Error: File not found" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("`Error: File not found`")) + } + + func testVarTagConversion() throws { + let html = "The variable x is used" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("*x*")) + } + + func testFigcaptionConversion() throws { + let html = """ +
+ Image +
Image caption
+
+ """ + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("![Image](image.png)")) + XCTAssertTrue(markdown.contains("*Image caption*")) + } + + func testAddressTagConversion() throws { + let html = "
Contact us at example@email.com
" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("*Contact us at example@email.com*")) + } + + func testTimeTagConversion() throws { + let html = "" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("January 1, 2024")) + } + + func testSummaryTagConversion() throws { + let html = """ +
+ Click to expand +

Hidden content

+
+ """ + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("**Click to expand**")) + XCTAssertTrue(markdown.contains("Hidden content")) + } + + // MARK: - Container Element Tests + + func testArticleContainerProcessing() throws { + let html = "
Article content
" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("Article content")) + } + + func testSectionContainerProcessing() throws { + let html = "
Section content
" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("Section content")) + } + + func testNavContainerProcessing() throws { + let html = "" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("[Link]")) + } + + func testHeaderFooterProcessing() throws { + let html = "
Header
Footer
" + let markdown = try converter.convert(html) + XCTAssertTrue(markdown.contains("Header")) + XCTAssertTrue(markdown.contains("Footer")) + } + // MARK: - Performance Tests func testPerformanceLargeHTML() throws { @@ -242,4 +450,16 @@ class HTMLToMarkdownConverterTests: XCTestCase { _ = try? converter.convert(repeatedHTML) } } + + func testPerformanceComplexTable() throws { + var tableHTML = "" + for i in 1...50 { + tableHTML += "" + } + tableHTML += "
Header 1Header 2Header 3
Row \(i) Col 1Row \(i) Col 2Row \(i) Col 3
" + + measure { + _ = try? converter.convert(tableHTML) + } + } } \ No newline at end of file From a0fc8ab11a8c92f0f9b8ef0dfc78a2d82e802426 Mon Sep 17 00:00:00 2001 From: graycreate Date: Mon, 1 Dec 2025 18:16:11 +0800 Subject: [PATCH 2/2] fix: address Copilot review comments for HTML tag support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Preserve HTML tags for underline instead of converting to _text_ to avoid conflict with italic styling - Preserve / HTML tags for superscript/subscript instead of ^/~ markers to avoid conflicts with regular text - Use regex pattern for table separator detection to avoid false positives when cell content contains "---" - Escape pipe characters in table cells to prevent markdown table structure breakage - Remove unused headerRowCount variable and related code - Update tests to reflect new behavior - Add test for pipe escaping in table cells 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../Converters/HTMLToMarkdownConverter.swift | 18 ++++++----------- .../RichView/Renderers/MarkdownRenderer.swift | 4 ++-- .../HTMLToMarkdownConverterTests.swift | 20 +++++++++++++++---- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/V2er/Sources/RichView/Converters/HTMLToMarkdownConverter.swift b/V2er/Sources/RichView/Converters/HTMLToMarkdownConverter.swift index 0bd989b..67239a5 100644 --- a/V2er/Sources/RichView/Converters/HTMLToMarkdownConverter.swift +++ b/V2er/Sources/RichView/Converters/HTMLToMarkdownConverter.swift @@ -190,19 +190,19 @@ public class HTMLToMarkdownConverter { let content = try convertElement(childElement) result += "~~\(content)~~" - // Underline - no standard markdown, render as emphasized text + // Underline - no standard markdown, preserve as HTML for custom renderer case "u", "ins": let content = try convertElement(childElement) - result += "_\(content)_" + result += "\(content)" - // Superscript/subscript - render with markers + // Superscript/subscript - preserve as HTML for custom renderer case "sup": let content = try convertElement(childElement) - result += "^\(content)" + result += "\(content)" case "sub": let content = try convertElement(childElement) - result += "~\(content)" + result += "\(content)" // Mark/highlight - render with markers case "mark": @@ -325,16 +325,12 @@ public class HTMLToMarkdownConverter { private func convertTable(_ element: Element) throws -> String { var result = "\n" var rows: [[String]] = [] - var headerRowCount = 0 // Get all rows from thead and tbody let allRows = try element.select("tr") for row in allRows { var cells: [String] = [] - let thCells = try row.select("th") - let isHeaderRow = row.parent()?.tagName().lowercased() == "thead" - || !thCells.isEmpty() // Get th and td cells for cell in row.children() { @@ -342,6 +338,7 @@ public class HTMLToMarkdownConverter { if tagName == "th" || tagName == "td" { let content = try convertElement(cell) .replacingOccurrences(of: "\n", with: " ") + .replacingOccurrences(of: "|", with: "\\|") // Escape pipes for Markdown tables .trimmingCharacters(in: .whitespaces) cells.append(content) } @@ -349,9 +346,6 @@ public class HTMLToMarkdownConverter { if !cells.isEmpty { rows.append(cells) - if isHeaderRow && headerRowCount == 0 { - headerRowCount = 1 - } } } diff --git a/V2er/Sources/RichView/Renderers/MarkdownRenderer.swift b/V2er/Sources/RichView/Renderers/MarkdownRenderer.swift index 81d681e..d405aee 100644 --- a/V2er/Sources/RichView/Renderers/MarkdownRenderer.swift +++ b/V2er/Sources/RichView/Renderers/MarkdownRenderer.swift @@ -384,8 +384,8 @@ public class MarkdownRenderer { break } - // Skip separator row (| --- | --- |) - if line.contains("---") { + // Skip separator row (| --- | --- | or with colons for alignment) + if line.range(of: #"^\|\s*(:?-+:?)\s*(\|\s*(:?-+:?)\s*)*\|$"#, options: .regularExpression) != nil { index += 1 continue } diff --git a/V2erTests/RichView/HTMLToMarkdownConverterTests.swift b/V2erTests/RichView/HTMLToMarkdownConverterTests.swift index 34f6f7a..b75393c 100644 --- a/V2erTests/RichView/HTMLToMarkdownConverterTests.swift +++ b/V2erTests/RichView/HTMLToMarkdownConverterTests.swift @@ -274,6 +274,18 @@ class HTMLToMarkdownConverterTests: XCTestCase { XCTAssertTrue(markdown.contains("数据同步")) } + func testTableWithPipeInContent() throws { + let html = """ + + + +
OptionDescription
A | BChoose A or B
+ """ + let markdown = try converter.convert(html) + // Pipes should be escaped in cell content + XCTAssertTrue(markdown.contains("A \\| B")) + } + // MARK: - Strikethrough Tests func testDelTagConversion() throws { @@ -299,13 +311,13 @@ class HTMLToMarkdownConverterTests: XCTestCase { func testUnderlineTagConversion() throws { let html = "Underlined text" let markdown = try converter.convert(html) - XCTAssertTrue(markdown.contains("_Underlined text_")) + XCTAssertTrue(markdown.contains("Underlined text")) } func testInsTagConversion() throws { let html = "Inserted text" let markdown = try converter.convert(html) - XCTAssertTrue(markdown.contains("_Inserted text_")) + XCTAssertTrue(markdown.contains("Inserted text")) } // MARK: - Superscript/Subscript Tests @@ -313,13 +325,13 @@ class HTMLToMarkdownConverterTests: XCTestCase { func testSuperscriptConversion() throws { let html = "x2" let markdown = try converter.convert(html) - XCTAssertTrue(markdown.contains("^2")) + XCTAssertTrue(markdown.contains("2")) } func testSubscriptConversion() throws { let html = "H2O" let markdown = try converter.convert(html) - XCTAssertTrue(markdown.contains("~2")) + XCTAssertTrue(markdown.contains("2")) } // MARK: - Mark/Highlight Tests