Skip to content

Commit

Permalink
End-tag closing element inside list item should not close list
Browse files Browse the repository at this point in the history
A difference between actual wikitext and our parser is that
in actual wikitext, if a list item starts an html-looking element,
like <ref>, you can have newlines inside that ref element
without breaking the list item into separate lists.

This is a partial kludge to fix the very specific circumstance
where you have the </end-tag> at the very start of a line,
without extra newlines inside the element otherwise. This is the
most common of these cases, so hopefully there aren't cases
where people have just gone nuts with the freedom that start
and end tags provide.
  • Loading branch information
kristian-clausal committed Aug 11, 2023
1 parent 43b5355 commit 3952eda
Showing 1 changed file with 39 additions and 9 deletions.
48 changes: 39 additions & 9 deletions wikitextprocessor/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1320,8 +1320,34 @@ def tag_fn(ctx, token):
_parser_have(ctx, NodeKind.PARSER_FN)):
return text_fn(ctx, token)

# If we are at the beginning of a line, close pending list
close_begline_lists(ctx)
# If we are at the beginning of a line, close pending list,
# UNLESS we are closing a tag (</tag>), in which case, if the
# element being closed is inside the newest list item,
# just continue the list item and allow newlines
# between the tags... XXX Double+ newlines break this still.
# """
# # Example <ref> the text...
# </ref> here is still part of the above list item, unexpectedly...
# """
end_tag_name = None
if token.startswith("</"):
# See if this looks like an end-tag
m = re.match(r"</([-a-zA-Z0-9]+)\s*>", token)
if m is None:
close_begline_lists(ctx)
else:
# end_tag_name is also saved for later, reusing the regex output
end_tag_name = m.group(1)
end_tag_name = end_tag_name.lower()
# See if we can find the opening tag from the stack
# or if we bump into a LIST_ITEM first, going from newest to oldest
for i in reversed(range(0, len(ctx.parser_stack))):
node = ctx.parser_stack[i]
if node.kind == NodeKind.HTML and node.args == end_tag_name:
break # do not close_begline_lists
if node.kind == NodeKind.LIST_ITEM:
close_begline_lists(ctx)
break

# Try to parse it as a start tag
m = re.match(r"""<([-a-zA-Z0-9]+)\s*((\b[-a-zA-Z0-9]+(=("[^"]*"|"""
Expand Down Expand Up @@ -1418,12 +1444,16 @@ def tag_fn(ctx, token):
return

# Since it was not a start tag, it should be an end tag
m = re.match(r"</([-a-zA-Z0-9]+)\s*>", token)
if m is None:
print("Could not match end tag token: {!r}".format(token))
assert False
name = m.group(1)
name = name.lower()
if end_tag_name:
# Duplicated code from above
name = end_tag_name
else:
m = re.match(r"</([-a-zA-Z0-9]+)\s*>", token)
if m is None:
print("Could not match end tag token: {!r}".format(token))
assert False
name = m.group(1)
name = name.lower()

# We should never see </section>
if name == "section":
Expand Down Expand Up @@ -1452,7 +1482,7 @@ def tag_fn(ctx, token):
"".format(name), sortid="parser/1320")

# See if we can find the opening tag from the stack
for i in range(0, len(ctx.parser_stack)):
for i in reversed(range(0, len(ctx.parser_stack))):
node = ctx.parser_stack[i]
if node.kind == NodeKind.HTML and node.args == name:
break
Expand Down

0 comments on commit 3952eda

Please sign in to comment.