fix: split paragraphs before parsing line by line

vzhd1701 · Jan 4, 2022 · b692136 · b692136
1 parent b6cb781
commit b692136
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 3 deletions.
diff --git a/enex2notion/string_extractor.py b/enex2notion/string_extractor.py
@@ -1,8 +1,13 @@
-from bs4 import Tag
+import copy
+from typing import List
+
+from bs4 import NavigableString, Tag
 
 from enex2notion.notion_blocks import TextProp
 from enex2notion.string_extractor_properties import resolve_string_properties
 
+STANDALONES = ["h1", "h2", "h3", "div"]
+
 
 def extract_string(tag: Tag) -> TextProp:
     """Convert a block content into a string with properties
@@ -12,8 +17,9 @@ def extract_string(tag: Tag) -> TextProp:
          [["some text "], ["bold ", ["b"]], ["bold and italic", ["b", "i"]]]
     """
 
-    # Element is either a single div itself or a collection of div or h1-3 "lines"#
-    div_lines = tag.find_all(["h1", "h2", "h3", "div"]) or [tag]
+    # Element is either a single div itself or a collection of div or h1-3 "lines"
+    # it can also contain random inline strings, so we group them in separate lines
+    div_lines = _split_line(copy.copy(tag)) if tag.find_all(STANDALONES) else [tag]
 
     string_blocks = _extract_blocks(div_lines)
 
@@ -22,6 +28,43 @@ def extract_string(tag: Tag) -> TextProp:
     return TextProp(result_string, result_properties)
 
 
+def _split_line(element: Tag):  # split direct children into line-level tags
+    blocks = []  # finished lines, in document order
+    group = []  # pending run of consecutive inline children
+    # inline child: any non-Tag node, or a tag whose name is not in STANDALONES
+    for sub in element.children:
+        is_inline = not isinstance(sub, Tag) or sub.name not in STANDALONES
+
+        if is_inline:
+            # drop whitespace-only strings instead of adding them to the group
+            if isinstance(sub, NavigableString) and not sub.text.strip():
+                continue
+
+            group.append(sub)
+        else:  # standalone tag: flush the pending inline run first
+            if group:
+                blocks.append(_make_block(group))
+                group = []
+
+            blocks.append(sub)  # standalone tag is appended as-is, uncopied
+
+    if group:  # trailing inline run after the last standalone tag
+        blocks.append(_make_block(group))
+
+    return blocks
+
+
+def _make_block(elements: List[Tag]):
+    """Make a single block from a list of elements"""
+    # NOTE: _split_line also passes non-Tag nodes (strings) here, not only Tag
+    block = Tag(name="div")  # freshly created wrapper tag for the grouped nodes
+
+    for element in elements:
+        block.append(copy.copy(element))  # copy: presumably keeps source tree intact
+
+    return block
+
+
 def _extract_blocks(div_lines):
     """Get parent stack for each string in the line and convert them to properties
 

diff --git a/tests/test_string_extractor.py b/tests/test_string_extractor.py
@@ -30,6 +30,14 @@ def test_extract_text_newline_inline(parse_html):
     )
 
 
+def test_extract_text_tagless_strings(parse_html):
+    test_note = parse_html("<td>test1<div>test2</div></td>").find("td")
+    # the bare "test1" string and the <div> line should be joined by a newline
+    assert extract_string(test_note) == TextProp(
+        text="test1\ntest2", properties=[["test1\ntest2"]]
+    )
+
+
 def test_extract_text_overlap(parse_html):
     test_note = parse_html(
         "<div>head <b>middle_head <i>inside</i> middle_tail</b> tail</div>"