Skip to content

Commit

Permalink
fix: fix parse errors for huge and empty nodes (#102)
Browse files Browse the repository at this point in the history
* Fix parse errors for huge and empty nodes

- Enabled the "huge_tree" option in the XML parser to prevent the
  "xmlSAX2Characters: huge text node" error.
- Fixed a "list index out of range" error that occurred on some notes
  with a title but no content.

Fixes #101.

* refactor: make empty dom check explicit

* test: add big resource note test

* test: add empty note dom test

---------

Co-authored-by: vzhd1701 <vzhd1701@gmail.com>
  • Loading branch information
zzamboni and vzhd1701 committed Oct 27, 2023
1 parent dd29618 commit 0520e93
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 0 deletions.
1 change: 1 addition & 0 deletions enex2notion/enex_parser_xml.py
Expand Up @@ -27,6 +27,7 @@ def iter_process_xml_elements(
recover=True,
strip_cdata=False,
resolve_entities=False,
huge_tree=True,
)

try:
Expand Down
3 changes: 3 additions & 0 deletions enex2notion/note_parser/note.py
Expand Up @@ -44,6 +44,9 @@ def _parse_note_dom(note: EvernoteNote) -> Optional[Tag]:
logger.error(f"Failed to extract DOM from note '{note.title}'")
return None

if len(note_dom.contents) == 0:
return None

return _filter_yinxiang_markdown(note_dom)


Expand Down
67 changes: 67 additions & 0 deletions tests/test_enex_parser.py
@@ -1,3 +1,4 @@
import base64
import datetime
import logging
from pathlib import Path
Expand Down Expand Up @@ -547,6 +548,72 @@ def test_iter_notes_single_with_resource(fs):
assert notes[0].resource_by_md5("000") is None


def test_iter_notes_single_with_huge_resource(fs, caplog):
    """Check that a note with a 10 MB base64 resource is counted and parsed.

    The ENEX file is assembled in three parts: the XML head is created
    through the ``fs`` fixture, then the base64 payload and the XML tail are
    appended. The test asserts that nothing was logged, that exactly one note
    is counted, and that the parsed note exposes the resource by its MD5.
    """
    enex_path = Path("test.enex")

    # XML up to and including the opening <data> tag of the resource.
    test_enex_head = b"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
<en-export export-date="20211218T085932Z" application="Evernote" version="10.25.6">
<note>
<title>test1</title>
<created>20211118T085332Z</created>
<updated>20211118T085920Z</updated>
<note-attributes>
</note-attributes>
<content>test</content>
<resource>
<data encoding="base64">
"""
    # XML that closes the <data> element and the rest of the document.
    test_enex_tail = b"""
</data>
<mime>image/gif</mime>
<resource-attributes>
<file-name>smallest.gif</file-name>
</resource-attributes>
</resource>
</note>
</en-export>
"""
    fs.create_file("test.enex", contents=test_enex_head)

    # 10 MB of zero bytes
    payload_size = 10 * 1024 * 1024
    big_binary = bytes(payload_size)
    # MD5 hex digest the parsed resource is expected to carry for the payload.
    big_binary_hash = "f1c9645dbc14efddc7d8a322685f26eb"

    # Append the encoded payload and the closing XML after the head.
    with enex_path.open("ab+") as enex_file:
        enex_file.write(base64.b64encode(big_binary))
        enex_file.write(test_enex_tail)

    with caplog.at_level(logging.WARNING, logger="enex2notion"):
        notes_count = count_notes(enex_path)

    notes = list(iter_notes(enex_path))

    expected_resource = EvernoteResource(
        data_bin=big_binary,
        size=len(big_binary),
        md5=big_binary_hash,
        mime="image/gif",
        file_name="smallest.gif",
    )
    expected_note = EvernoteNote(
        title="test1",
        created=datetime.datetime(2021, 11, 18, 8, 53, 32, tzinfo=tzutc()),
        updated=datetime.datetime(2021, 11, 18, 8, 59, 20, tzinfo=tzutc()),
        content="test",
        tags=[],
        author="",
        url="",
        is_webclip=False,
        resources=[expected_resource],
    )

    # No log output is expected while handling the large resource.
    assert caplog.text == ""
    assert notes_count == 1
    assert notes == [expected_note]

    parsed_note = notes[0]
    assert parsed_note.resource_by_md5(big_binary_hash) == expected_resource
    assert parsed_note.resource_by_md5("000") is None


def test_iter_notes_single_with_noext_resource(fs):
test_enex = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
Expand Down
16 changes: 16 additions & 0 deletions tests/test_note_parser.py
Expand Up @@ -654,6 +654,22 @@ def test_linebreaks_inside_root(parse_html):
]


def test_empty_note(parse_rules):
    """Check that a note whose ``<en-note>`` has no children parses to ``[]``."""
    # The note uses the same timestamp for both creation and update.
    timestamp = datetime(2021, 11, 18, 0, 0, 0, tzinfo=tzutc())

    empty_note = EvernoteNote(
        title="test1",
        created=timestamp,
        updated=timestamp,
        content="<en-note></en-note>",
        tags=[],
        author="",
        url="",
        is_webclip=False,
        resources=[],
    )

    parsed_blocks = parse_note(empty_note, parse_rules)

    assert parsed_blocks == []


def test_yinxiang_markdown(parse_rules):
test_note = EvernoteNote(
title="test1",
Expand Down

0 comments on commit 0520e93

Please sign in to comment.