Skip to content

Commit

Permalink
fix: split paragraphs before parsing line by line
Browse files Browse the repository at this point in the history
  • Loading branch information
vzhd1701 committed Jan 4, 2022
1 parent b6cb781 commit b692136
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 3 deletions.
49 changes: 46 additions & 3 deletions enex2notion/string_extractor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from bs4 import Tag
import copy
from typing import List

from bs4 import NavigableString, Tag

from enex2notion.notion_blocks import TextProp
from enex2notion.string_extractor_properties import resolve_string_properties

# Names of the tags that are treated as standalone "lines" of their own;
# any other child node is considered inline content when a block is split.
STANDALONES = ["h1", "h2", "h3", "div"]


def extract_string(tag: Tag) -> TextProp:
"""Convert a block content into a string with properties
Expand All @@ -12,8 +17,9 @@ def extract_string(tag: Tag) -> TextProp:
[["some text "], ["bold ", ["b"]], ["bold and italic", ["b", "i"]]]
"""

# Element is either a single div itself or a collection of div or h1-3 "lines"#
div_lines = tag.find_all(["h1", "h2", "h3", "div"]) or [tag]
# Element is either a single div itself or a collection of div or h1-3 "lines"
# it can also contain random inline strings, so we group them in separate lines
div_lines = _split_line(copy.copy(tag)) if tag.find_all(STANDALONES) else [tag]

string_blocks = _extract_blocks(div_lines)

Expand All @@ -22,6 +28,43 @@ def extract_string(tag: Tag) -> TextProp:
return TextProp(result_string, result_properties)


def _split_line(element: Tag):
    """Split the direct children of ``element`` into a list of line blocks.

    Children that are tags named in ``STANDALONES`` are kept as separate
    entries. Every run of consecutive other children (strings or inline
    tags) is wrapped into a single new block via ``_make_block``.
    Whitespace-only strings are dropped and do not start or extend a run.

    Returns a list of blocks in document order.
    """

    lines = []
    pending = []

    for child in element.children:
        is_standalone = isinstance(child, Tag) and child.name in STANDALONES

        if is_standalone:
            # A standalone tag closes whatever inline run was collected so far
            if pending:
                lines.append(_make_block(pending))
                pending = []

            lines.append(child)
            continue

        # Ignore whitespace-only strings sitting between tags
        if isinstance(child, NavigableString) and not child.text.strip():
            continue

        pending.append(child)

    # Flush a trailing inline run that was not followed by a standalone tag
    if pending:
        lines.append(_make_block(pending))

    return lines


def _make_block(elements: List[Tag]):
    """Wrap the given elements into one freshly created ``div`` block.

    Each element is shallow-copied with ``copy.copy`` and the copy is
    appended to the new block, in the order given (presumably so the
    elements stay attached to the tree they came from -- confirm against
    Beautiful Soup's ``append`` semantics).

    Returns the new ``div`` tag.
    """

    wrapper = Tag(name="div")

    for item in elements:
        clone = copy.copy(item)
        wrapper.append(clone)

    return wrapper


def _extract_blocks(div_lines):
"""Get parent stack for each string in the line and convert them to properties
Expand Down
8 changes: 8 additions & 0 deletions tests/test_string_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ def test_extract_text_newline_inline(parse_html):
)


def test_extract_text_tagless_strings(parse_html):
    """Bare text next to a ``div`` inside a cell is extracted as its own line."""

    cell = parse_html("<td>test1<div>test2</div></td>").find("td")

    actual = extract_string(cell)
    expected = TextProp(text="test1\ntest2", properties=[["test1\ntest2"]])

    assert actual == expected


def test_extract_text_overlap(parse_html):
test_note = parse_html(
"<div>head <b>middle_head <i>inside</i> middle_tail</b> tail</div>"
Expand Down

0 comments on commit b692136

Please sign in to comment.