-
Notifications
You must be signed in to change notification settings - Fork 23
/
parser.py
2237 lines (1967 loc) · 77.5 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Simple WikiMedia markup (WikiText) syntax parser
#
# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
import enum
import html
import re
from collections import defaultdict
from collections.abc import Iterator
from typing import (
TYPE_CHECKING,
Callable,
Dict,
List,
Optional,
Set,
Tuple,
Union,
)
from .common import (
MAGIC_FIRST,
MAGIC_LAST,
MAGIC_NOWIKI_CHAR,
MAGIC_SQUOTE_CHAR,
nowiki_quote,
)
from .parserfns import PARSER_FUNCTIONS
from .wikihtml import ALLOWED_HTML_TAGS
if TYPE_CHECKING:
from .core import Wtp
# Set of tags that can be parents of "flow" content.
HTML_FLOW_PARENTS: Set[str] = {
    k
    for k, v in ALLOWED_HTML_TAGS.items()
    if "flow" in v.get("content", []) or "*" in v.get("content", [])
}

# Set of tags that can be parents of "phrasing" content (includes those
# of flow parents since flow implies phrasing)
HTML_PHRASING_PARENTS: Set[str] = {
    k
    for k, v in ALLOWED_HTML_TAGS.items()
    if "phrasing" in v.get("content", [])
    or "flow" in v.get("content", [])
    or "*" in v.get("content", [])
}

# Mapping from HTML tag or "text" to permitted parent tags
HTML_PERMITTED_PARENTS: Dict[str, Set[str]] = {
    k: (
        (
            HTML_FLOW_PARENTS
            if "flow" in v.get("parents", []) or "*" in v.get("parents", [])
            else set()
        )
        | (
            HTML_PHRASING_PARENTS
            if "phrasing" in v.get("parents", []) or "*" in v.get("parents", [])
            else set()
        )
        | set(v.get("parents", []))
    )
    for k, v in ALLOWED_HTML_TAGS.items()
}
HTML_PERMITTED_PARENTS["text"] = HTML_PHRASING_PARENTS
# Set of HTML tag like names that we treat as literal without any warning
SILENT_HTML_LIKE: Set[str] = {
    "gu",
    "qu",
    "e",
}
# MediaWiki magic words (behavior switches).
# See https://www.mediawiki.org/wiki/Help:Magic_words
MAGIC_WORDS: Set[str] = {
    "__NOTOC__",
    "__FORCETOC__",
    "__TOC__",
    "__NOEDITSECTION__",
    "__NEWSECTIONLINK__",
    "__NONEWSECTIONLINK__",
    "__NOGALLERY__",
    "__HIDDENCAT__",
    "__EXPECTUNUSEDCATEGORY__",
    "__NOCONTENTCONVERT__",
    "__NOCC__",
    "__NOTITLECONVERT__",
    "__NOTC__",
    "__START__",
    "__END__",
    "__INDEX__",
    "__NOINDEX__",
    "__STATICREDIRECT__",
    "__NOGLOBAL__",
    "__DISAMBIG__",
}
@enum.unique
class NodeKind(enum.Flag):
    """Node types in the parse tree.

    This is an ``enum.Flag``, so members can be combined with the "|"
    operator to form a set of kinds (used e.g. as ``target_kinds`` in
    ``WikiNode.find_child()``, where membership is tested with ``in``).
    """

    # Root node of the tree. This represents the parsed document.
    # Its arguments are [pagetitle].
    ROOT = enum.auto()

    # Level 1 title, used in Russian Wiktionary as language title.
    LEVEL1 = enum.auto()

    # Level2 subtitle. Arguments are the title, children are what the section
    # contains.
    LEVEL2 = enum.auto()

    # Level3 subtitle
    LEVEL3 = enum.auto()

    # Level4 subtitle
    LEVEL4 = enum.auto()

    # Level5 subtitle
    LEVEL5 = enum.auto()

    # Level6 subtitle
    LEVEL6 = enum.auto()

    # Content to be rendered in italic. Content is in children.
    ITALIC = enum.auto()

    # Content to be rendered in bold. Content is in children.
    BOLD = enum.auto()

    # Horizontal line. No arguments or children.
    HLINE = enum.auto()

    # A list. Each list will be started with this node, also nested
    # lists. Args is a string that contains the prefix used to open the list.
    # Children will contain LIST_ITEM nodes that belong to this list.
    # For definition lists the prefix ends in ";".
    # Prefixes ending with : are either for items that are just meant
    # to be indented (without numbers or list markers), or are part
    # of a definition after ";". We leave this to be interpreted by
    # the user, because it depends a bit on the Wiki-project itself
    # how ":" is used in general; one of the uses is to concatenate
    # data to the end of the parent list node, instead of creating
    # sublists, but because this loses some data that is useful for
    # parsing and interpretation, we do not perform the concatenation
    # in Wikitextprocessor.
    LIST = enum.auto()  # args = prefix for all items of this list

    # A list item. Nested items will be in children. Items on the same
    # level will be on the same level. There is no explicit node for a list.
    # Args is directly the token for this item (not as a list). Children
    # is what goes in this list item. List items where the prefix ends in
    # ";" are definition list items. For them, children contain the item
    # to be defined and node.definition contains the definition, which has
    # the same format as children (i.e., a list of strings and WikiNode).
    LIST_ITEM = enum.auto()  # args = token for this item

    # Preformatted text were markup is interpreted. Content is in children.
    # Indicated in WikiText by starting lines with a space.
    PREFORMATTED = enum.auto()  # Preformatted inline text

    # Preformatted text where markup is NOT interpreted. Content is in
    # children. Indicated in WikiText by <pre>...</pre>.
    PRE = enum.auto()  # Preformatted text where specials not interpreted

    # An internal Wikimedia link (marked with [[...]]). The link arguments
    # are in args. This tag is also used for media inclusion. Links with
    # trailing word end immediately after the link have the trailing part
    # in link children.
    LINK = enum.auto()

    # A template call (transclusion). Template name is in first argument
    # and template arguments in subsequent args. Children are not used.
    # In WikiText {{name|arg1|...}}.
    TEMPLATE = enum.auto()

    # A template argument expansion. Argument name is in first argument and
    # subsequent arguments in remaining arguments. Children are not used.
    # In WikiText {{{name|...}}}
    TEMPLATE_ARG = enum.auto()

    # A parser function invocation. This is also used for built-in
    # variables such as {{PAGENAME}}. Parser function name is in
    # first argument and subsequent arguments are its parameters.
    # Children are not used. In WikiText {{name:arg1|arg2|...}}.
    PARSER_FN = enum.auto()

    # An external URL. The first argument is the URL. The second optional
    # argument is the display text. Children are not used.
    URL = enum.auto()

    # A table. Content is in children.
    TABLE = enum.auto()

    # A table caption (under TABLE). Content is in children.
    TABLE_CAPTION = enum.auto()

    # A table row (under TABLE). Content is in children.
    TABLE_ROW = enum.auto()

    # A table header cell (under TABLE_ROW). Content is in children.
    # Rows where all cells are header cells are header rows.
    TABLE_HEADER_CELL = enum.auto()

    # A table cell (under TABLE_ROW). Content is in children.
    TABLE_CELL = enum.auto()

    # A MediaWiki magic word. The magic word is assigned directly to args
    # (not as a list). Children are not used.
    MAGIC_WORD = enum.auto()

    # HTML tag (open or close tag). Pairs of open and close tags are
    # merged into a single node and the content between them is stored
    # in the node's children. Args is the name of the tag directly
    # (i.e., not a list and always without a slash). Attrs contains
    # attributes from the HTML start tag. Contents in a paired tag
    # are stored in ``children``.
    HTML = enum.auto()
# Maps a subtitle marker token ("=" .. "======") to its NodeKind.
SUBTITLE_TO_KIND: Dict[str, NodeKind] = {
    "=": NodeKind.LEVEL1,
    "==": NodeKind.LEVEL2,
    "===": NodeKind.LEVEL3,
    "====": NodeKind.LEVEL4,
    "=====": NodeKind.LEVEL5,
    "======": NodeKind.LEVEL6,
}

# Maps subtitle node kind to its level. Keys include all title/subtitle nodes
# (this is also used like a set of all subtitle kinds, including the root).
# The level is simply the length of the marker token ("==" -> 2, etc.).
KIND_TO_LEVEL: Dict[NodeKind, int] = {
    v: len(k) for k, v in SUBTITLE_TO_KIND.items()
}
# ROOT gets level 0 so it is "above" every subtitle when comparing levels.
KIND_TO_LEVEL[NodeKind.ROOT] = 0
# Node types that have arguments separated by the vertical bar (|).
# For these, _parser_pop() moves any remaining children into largs.
HAVE_ARGS_KINDS: Tuple[NodeKind, ...] = (
    NodeKind.LINK,
    NodeKind.TEMPLATE,
    NodeKind.TEMPLATE_ARG,
    NodeKind.PARSER_FN,
    NodeKind.URL,
)

# Node kinds that generate an error if they have not been properly closed
# (checked in _parser_pop() when warn_unclosed is True).
MUST_CLOSE_KINDS: Tuple[NodeKind, ...] = (
    NodeKind.ITALIC,
    NodeKind.BOLD,
    NodeKind.PRE,
    NodeKind.HTML,
    NodeKind.LINK,
    NodeKind.TEMPLATE,
    NodeKind.TEMPLATE_ARG,
    NodeKind.PARSER_FN,
    NodeKind.URL,
    NodeKind.TABLE,
)
# regex for finding html-tags so that we can replace single-quotes
# inside of them with magic characters.
# the (?:) signifies a non-capturing group, which is necessary for
# re.split; if the splitting pattern has capturing groups (like
# the outer parentheses here), those groups are sent out by
# the iterator; otherwise it skips the splitting pattern.
# This means that if you have nesting capturing groups,
# the contents will be repeated partly.
inside_html_tags_re = re.compile(
    r"(<(?:" + r"|".join(ALLOWED_HTML_TAGS.keys()) + r")[^><]*>)", re.IGNORECASE
)

# Type aliases for the node argument/children structures.
# We don't have specs for this, so let's assume...
# HTML nodes have args be strings.
# Others have a list of lists, *at least*.
# Sometimes, args.append(children) happens, so those
# lists can contain at least strings and WikiNodes.
# I think there is no third list layer, maximum is args[x][y].
WikiNodeChildrenList = List[Union[str, "WikiNode"]]
WikiNodeArgsSublist = WikiNodeChildrenList  # XXX Currently identical to above
# WikiNodeArgs = Union[str,  # Just a string
#                List[
#                    Union[
#                        WikiNodeArgsSublist,
#                        WikiNodeChildrenList]
#                ]
#            ]
WikiNodeStrArg = str
WikiNodeListArgs = List[Union[WikiNodeArgsSublist, WikiNodeChildrenList]]
WikiNodeHTMLAttrsDict = Dict[str, str]  # XXX Probably not just HTML...
class WikiNode:
    """Node in the parse tree for WikiMedia text.

    ``sarg`` holds a plain-string argument (e.g. a list-item token or an
    HTML tag name), while ``largs`` holds "|"-separated arguments as a
    list of lists of strings and nested WikiNodes; a node uses one or
    the other depending on its kind.  ``children`` holds the node's
    contents, ``attrs`` holds HTML-style attributes, and ``loc`` records
    the line number where the node started (for debugging).
    ``definition``/``temp_head`` are only used for ";" definition-list
    items (see _parser_pop()).
    """

    __slots__ = (
        "kind",
        "sarg",
        "largs",
        "attrs",
        "children",
        "loc",
        "definition",
        "temp_head",
    )

    def __init__(self, kind: NodeKind, loc: int) -> None:
        assert isinstance(kind, NodeKind)
        assert isinstance(loc, int)
        self.kind = kind
        self.sarg: WikiNodeStrArg = ""
        self.largs: WikiNodeListArgs = []  # String, or a list of lists
        self.attrs: WikiNodeHTMLAttrsDict = {}
        self.children: WikiNodeChildrenList = []
        self.loc = loc  # used for debugging lines
        self.definition: Optional[WikiNodeChildrenList] = None
        self.temp_head: Optional[WikiNodeChildrenList] = None

    def __str__(self) -> str:
        # Show sarg if set, otherwise the largs; then attrs and children.
        return "<{}({}){} {}>".format(
            self.kind.name,
            self.sarg if self.sarg else ", ".join(map(repr, self.largs)),
            self.attrs,
            ", ".join(map(repr, self.children)),
        )

    def __repr__(self) -> str:
        return self.__str__()

    def find_child(
        self,
        target_kinds: NodeKind,
        with_index: bool = False,
    ) -> Iterator[Union["WikiNode", Tuple[int, "WikiNode"]]]:
        """
        Find direct child nodes that match the target node type, also return
        the node index that could be used to divide child node list.
        `target_kinds` could be a single NodeKind enum member or multiple
        NodeKind members combined with the "|"(OR) operator.
        When ``with_index`` is True, yields ``(index, child)`` tuples
        instead of bare nodes.
        """
        for index, child in enumerate(self.children):
            if isinstance(child, WikiNode) and child.kind in target_kinds:
                if with_index:
                    yield index, child
                else:
                    yield child

    def invert_find_child(
        self,
        target_kinds: NodeKind,
        include_empty_str: bool = False,
    ) -> Iterator["WikiNode"]:
        """Find direct child nodes that do NOT match the target node type.

        Non-blank string children are also yielded; blank strings are
        included only when ``include_empty_str`` is True.
        """
        for child in self.children:
            if isinstance(child, str) and (
                include_empty_str or len(child.strip()) > 0
            ):
                yield child
            elif isinstance(child, WikiNode) and child.kind not in target_kinds:
                yield child

    def _find_node_recursively(
        self,
        start_node: "WikiNode",
        current_node: Union["WikiNode", str],
        target_kinds: NodeKind,
    ) -> Iterator["WikiNode"]:
        # Find nodes in WikiNode.children and WikiNode.largs recursively.
        # Search WikiNode.largs probably is not needed, add it because the
        # original `contains_list()` in wiktextract does this.
        # The start node itself is never yielded even if it matches.
        if isinstance(current_node, WikiNode):
            if current_node != start_node and current_node.kind in target_kinds:
                yield current_node
            for child in current_node.children:
                yield from self._find_node_recursively(
                    start_node, child, target_kinds
                )
            for arg_list in current_node.largs:
                for arg in arg_list:
                    yield from self._find_node_recursively(
                        start_node, arg, target_kinds
                    )

    def find_child_recursively(
        self, target_kinds: NodeKind
    ) -> Iterator["WikiNode"]:
        # Similar to `find_child()` but also search nested nodes.
        yield from self._find_node_recursively(self, self, target_kinds)

    def contain_node(self, target_kinds: NodeKind) -> bool:
        """Return True if any descendant node matches ``target_kinds``."""
        for node in self._find_node_recursively(self, self, target_kinds):
            return True
        return False

    def filter_empty_str_child(self) -> Iterator[Union[str, "WikiNode"]]:
        # Remove string child nodes that only contain space or new line.
        for node in self.children:
            if isinstance(node, str):
                if len(node.strip()) > 0:
                    yield node
            else:
                yield node

    def find_html(
        self,
        target_tag: str,
        with_index: bool = False,
        attr_name: str = "",
        attr_value: str = "",
    ) -> Iterator[Union["HTMLNode", Tuple[int, "HTMLNode"]]]:
        # Find direct HTML child nodes that match the target tag and,
        # optionally (when attr_name is non-empty), have attr_value as a
        # substring of that attribute's value.
        for index, node in self.find_child(NodeKind.HTML, True):
            if node.tag == target_tag:
                # attrs.get() default {} makes a missing attribute fail
                # the "in" test, so such nodes are skipped.
                if len(attr_name) > 0 and attr_value not in node.attrs.get(
                    attr_name, {}
                ):
                    continue
                if with_index:
                    yield index, node
                else:
                    yield node

    def find_html_recursively(
        self,
        target_tag: str,
        attr_name: str = "",
        attr_value: str = "",
    ) -> Iterator["HTMLNode"]:
        # Like find_html() but searches nested nodes; never yields indexes.
        for node in self.find_child_recursively(NodeKind.HTML):
            if node.tag == target_tag:
                if len(attr_name) > 0 and attr_value not in node.attrs.get(
                    attr_name, {}
                ):
                    continue
                yield node
class TemplateNode(WikiNode):
    """WikiNode subclass for template calls ({{name|arg|...}}).

    ``largs[0]`` holds the template name and the remaining elements of
    ``largs`` hold the arguments; ``template_parameters`` converts those
    into a dictionary on first access and caches the result.
    """

    def __init__(self, linenum: int):
        super().__init__(NodeKind.TEMPLATE, linenum)
        # Cache for the template_parameters property (computed lazily).
        self._template_parameters: Optional[
            Dict[
                Union[str, int],
                Union[str, WikiNode, List[Union[str, WikiNode]]],
            ]
        ] = None

    @property
    def template_name(self) -> str:
        # First element of the first argument list is the template name.
        # NOTE(review): assumes largs[0][0] is a str; a name containing
        # markup would be a WikiNode and .strip() would fail — confirm.
        return self.largs[0][0].strip()

    @property
    def template_parameters(
        self,
    ) -> Dict[
        Union[str, int], Union[str, WikiNode, List[Union[str, WikiNode]]]
    ]:
        # Convert the list type arguments to a dictionary.
        # Positional parameters get 1-based integer keys; "name=value"
        # parameters get string keys (or int keys when the name is all
        # digits). The result is cached in self._template_parameters.
        if self._template_parameters is not None:
            return self._template_parameters
        parameters = defaultdict(list)
        unnamed_parameter_index = 0
        for parameter_list in self.largs[1:]:
            is_named = False
            parameter_name = ""
            if len(parameter_list) == 0:
                # Completely empty argument, e.g. "{{foo||bar}}".
                unnamed_parameter_index += 1
                parameters[unnamed_parameter_index] = ""
            for index, parameter in enumerate(parameter_list):
                if index == 0:
                    # Only the first piece of an argument can carry the
                    # "name=" part (later pieces may be WikiNodes).
                    if not isinstance(parameter, str):
                        unnamed_parameter_index += 1
                    else:
                        if "=" in parameter:
                            is_named = True
                        else:
                            unnamed_parameter_index += 1
                    if is_named:
                        parameter = parameter.strip()
                        if len(parameter) == 0:
                            continue
                        if "=" in parameter:
                            equal_sign_index = parameter.index("=")
                            parameter_name = parameter[
                                :equal_sign_index
                            ].strip()
                            parameter_value = parameter[
                                equal_sign_index + 1 :
                            ].strip()
                            if parameter_name.isdigit():  # value contains "="
                                parameter_name = int(parameter_name)
                                is_named = False
                            if len(parameter_value) > 0:
                                parameters[parameter_name].append(
                                    parameter_value
                                )
                            continue
                if is_named and len(parameter_name) > 0:
                    # Remaining pieces of a named argument's value.
                    parameters[parameter_name].append(parameter)
                else:
                    parameters[unnamed_parameter_index].append(parameter)
        # Unwrap single-element value lists for convenience; only keys
        # that already exist are reassigned, so iteration stays safe.
        for p_name, p_value in parameters.items():
            if isinstance(p_value, list) and len(p_value) == 1:
                parameters[p_name] = p_value[0]
        self._template_parameters = parameters
        return parameters
class HTMLNode(WikiNode):
    """WikiNode subclass for HTML nodes (NodeKind.HTML).

    The tag name is stored in ``sarg`` (exposed via the ``tag``
    property); attributes from the start tag are in ``attrs``.
    """

    def __init__(self, linenum: int):
        super().__init__(NodeKind.HTML, linenum)

    @property
    def tag(self) -> str:
        # The HTML tag name (without slash), as stored in sarg.
        return self.sarg
class LevelNode(WikiNode):
    """WikiNode subclass for title/subtitle (LEVEL1-LEVEL6) nodes."""

    def __init__(self, level_type: NodeKind, linenum: int):
        super().__init__(level_type, linenum)

    def find_content(self, target_types: NodeKind) -> Iterator[WikiNode]:
        """
        Find WikiNode in `WikiNode.largs`. This method could be used to find
        templates "inside" the level node but not the child nodes under the
        level node.
        """
        for arg_list in self.largs:
            for arg in arg_list:
                if isinstance(arg, WikiNode) and arg.kind in target_types:
                    yield arg
def _parser_push(ctx: "Wtp", kind: NodeKind) -> WikiNode:
    """Create a new node of ``kind``, append it to the children of the
    current stack top, and make it the new top of the parser stack."""
    assert isinstance(kind, NodeKind)
    _parser_merge_str_children(ctx)
    # Pick the most specific WikiNode subclass for this kind.
    node: WikiNode
    if kind == NodeKind.TEMPLATE:
        node = TemplateNode(ctx.linenum)
    elif kind == NodeKind.HTML:
        node = HTMLNode(ctx.linenum)
    elif kind in KIND_TO_LEVEL:
        node = LevelNode(kind, ctx.linenum)
    else:
        node = WikiNode(kind, ctx.linenum)
    parent = ctx.parser_stack[-1]
    parent.children.append(node)
    ctx.parser_stack.append(node)
    ctx.suppress_special = False
    return node
def _parser_merge_str_children(ctx: "Wtp") -> None:
"""Merges multiple consecutive str children into one. We merge them
as a separate step, because this gives linear worst-case time, vs.
quadratic worst case (albeit with lower constant factor) if we just
added to the previously accumulated string in text_fn() instead.
Importantly, this also finalizes string children so that any magic
characters are expanded and nowiki characters removed."""
node = ctx.parser_stack[-1]
new_children: WikiNodeChildrenList = []
strings: List[str] = []
for x in node.children:
if isinstance(x, str):
strings.append(x)
else:
if strings:
s = ctx._finalize_expand("".join(strings))
if s:
new_children.append(s)
strings = []
new_children.append(x)
if strings:
s = ctx._finalize_expand("".join(strings))
if s:
new_children.append(s)
node.children = new_children
def _parser_pop(ctx: "Wtp", warn_unclosed: bool) -> None:
    """Pops a node from the stack. If the node has arguments, this moves
    remaining children of the node into its arguments. If ``warn_unclosed``
    is True, this warns about nodes that should be explicitly closed
    not having been closed. Also performs certain other operations on
    the parse tree; this is a place for various kludges that manipulate
    the nodes when their parsing completes."""
    assert warn_unclosed in (True, False)
    _parser_merge_str_children(ctx)
    node = ctx.parser_stack[-1]

    # Warn about unclosed syntaxes.
    if warn_unclosed and node.kind in MUST_CLOSE_KINDS:
        if node.kind == NodeKind.HTML:
            ctx.debug(
                "HTML tag <{}> not properly closed".format(node.sarg),
                trace="started on line {}, detected on line {}".format(
                    node.loc, ctx.linenum
                ),
                sortid="parser/304",
            )
        elif node.kind == NodeKind.PARSER_FN:
            ctx.debug(
                "parser function invocation {!r} not properly closed".format(
                    node.largs[0]
                ),
                trace="started on line {}, detected on line {}".format(
                    node.loc, ctx.linenum
                ),
                sortid="parser/309",
            )
        elif node.kind == NodeKind.URL and not node.children:
            # This can happen at least when [ is inside template argument.
            # Undo the URL node completely and re-emit "[" as plain text.
            ctx.parser_stack.pop()
            node2 = ctx.parser_stack[-1]
            node3 = node2.children.pop()
            assert node3 is node
            text_fn(ctx, "[")
            return
        elif node.kind in (NodeKind.ITALIC, NodeKind.BOLD):
            # Unbalanced italic/bold annotation is so extremely common
            # in Wiktionary that let's suppress any warnings about
            # them.
            pass
        else:
            ctx.debug(
                "{} not properly closed".format(node.kind.name),
                trace="started on line {}, detected on line {}".format(
                    node.loc, ctx.linenum
                ),
                sortid="parser/328",
            )

    # When popping BOLD and ITALIC nodes, if the node has no children,
    # just remove the node from its parent's children. We may otherwise
    # generate spurious empty BOLD and ITALIC nodes when closing them
    # out-of-order (which happens always with '''''bolditalic''''').
    if node.kind in (NodeKind.BOLD, NodeKind.ITALIC) and not node.children:
        ctx.parser_stack.pop()
        if TYPE_CHECKING:
            assert isinstance(ctx.parser_stack[-1].children[-1], WikiNode)
            assert ctx.parser_stack[-1].children[-1].kind == node.kind
        ctx.parser_stack[-1].children.pop()
        return

    # If the node has arguments, move remaining children to be the last
    # argument
    if node.kind in HAVE_ARGS_KINDS:
        node.largs.append(node.children)
        node.children = []

    # When popping a TEMPLATE, check if its name is a constant that
    # is a known parser function (including predefined variable).
    # If so, turn this node into a PARSER_FN node.
    if (
        node.kind == NodeKind.TEMPLATE
        and node.largs
        and len(node.largs[0]) == 1
        and isinstance(node.largs[0][0], str)
        and node.largs[0][0] in PARSER_FUNCTIONS
    ):
        # Change node type to PARSER_FN. Otherwise it has identical
        # structure to a TEMPLATE.
        node.kind = NodeKind.PARSER_FN

    # When popping description list nodes that have a definition,
    # shuffle WikiNode.temp_head and children to have head in children and
    # definition in WikiNode.definition
    if (
        node.kind == NodeKind.LIST_ITEM
        and node.sarg.endswith(";")
        and node.temp_head
    ):
        head = node.temp_head
        node.temp_head = None
        node.definition = node.children
        node.children = head

    # Remove the topmost node from the stack. It should be on its parent's
    # children list.
    ctx.parser_stack.pop()
def _parser_have(ctx: "Wtp", kind: NodeKind) -> bool:
    """Returns True if any node on the stack is of the given kind."""
    assert isinstance(kind, NodeKind)
    return any(n.kind == kind for n in ctx.parser_stack)
def close_begline_lists(ctx: "Wtp") -> None:
    """Closes currently open list if at the beginning of a line."""
    if ctx.beginning_of_line and ctx.begline_enabled:
        # Pop every open list (and everything inside it) off the stack.
        while _parser_have(ctx, NodeKind.LIST):
            _parser_pop(ctx, True)
def pop_until_nth_list(ctx: "Wtp", list_token: str) -> None:
    """
    Pop nodes in the parser stack until the correct depth.

    ``list_token`` is the prefix of the new list item (e.g. "##:"); its
    length determines how many enclosing LIST nodes should remain open.
    Only acts at the beginning of a line when begline handling is on.
    """
    if not (ctx.beginning_of_line and ctx.begline_enabled):
        return
    list_count = len(list_token)
    passed_nodes = 0
    # Count stack entries up to (and including) the n-th LIST node.
    for node in ctx.parser_stack:
        passed_nodes += 1
        if node.kind == NodeKind.LIST:
            list_count -= 1
            if list_count == 0:
                break
    if list_token.startswith((":", ";")):
        # pop until target list node's item child node is at the stack top
        # in order to add a new nested list node
        passed_nodes += 1
    # pop until the stack top is the target list node
    for _ in range(len(ctx.parser_stack) - passed_nodes):
        _parser_pop(ctx, True)
def text_fn(ctx: "Wtp", token: str) -> None:
    """Inserts the token as raw text into the parse tree.

    Besides appending text this handles several context-dependent cases:
    deciding whether a pending "[" really starts an external link,
    splitting an external link's URL from its display text, auto-closing
    list/preformatted/bold/italic nodes at the beginning of a line,
    starting PREFORMATTED for space-indented lines, and attaching
    link-trail word characters to a preceding link.
    """
    close_begline_lists(ctx)
    node = ctx.parser_stack[-1]

    # NOTE: "<" and ">" are deliberately NOT converted to HTML entities
    # here; doing so breaks tags inside templates, e.g. <math> in
    # "conjugacy class"/English examples.

    # External links [https://...] require some magic. They only seem to
    # be links if the content looks like a URL.
    if node.kind == NodeKind.URL:
        if not node.largs and not node.children:
            if not re.match(r"(https?:|mailto:|//)", token):
                # It does not look like a URL: undo the URL node and
                # re-emit the "[" together with this token as plain text.
                ctx.parser_stack.pop()
                node2 = ctx.parser_stack[-1]
                node3 = node2.children.pop()
                assert node3 is node
                return text_fn(ctx, "[" + token)

        # Whitespaces inside an external link divide its first argument
        # from its second argument. All remaining words go into the
        # second argument.
        if token.isspace() and not node.largs:
            _parser_merge_str_children(ctx)
            node.largs.append(node.children)
            node.children = []
            return

    # Some nodes are automatically popped on newline/text
    if ctx.beginning_of_line and ctx.begline_enabled:
        while True:
            node = ctx.parser_stack[-1]
            if node.kind == NodeKind.LIST_ITEM:
                # An indented continuation line stays inside the item.
                # (Fix: was ``token[0].startswith("\t")``, which raised
                # IndexError for an empty token and was redundant.)
                if token.startswith(" ") or token.startswith("\t"):
                    node.children.append(token)
                    return
                _parser_merge_str_children(ctx)
                # Pop the item once it ends in a newline-terminated,
                # non-trivial string.
                if (
                    node.children
                    and isinstance(node.children[-1], str)
                    and (
                        len(node.children) > 1
                        or not node.children[-1].isspace()
                    )
                    and node.children[-1].endswith("\n")
                ):
                    _parser_pop(ctx, False)
                    continue
            elif node.kind == NodeKind.LIST:
                _parser_pop(ctx, False)
                continue
            elif node.kind == NodeKind.PREFORMATTED:
                _parser_merge_str_children(ctx)
                # A non-indented, non-blank line ends preformatted text.
                if (
                    node.children
                    and isinstance(node.children[-1], str)
                    and node.children[-1].endswith("\n")
                    and not token.startswith(" ")
                    and not token.isspace()
                ):
                    _parser_pop(ctx, False)
                    continue
            elif node.kind in (NodeKind.BOLD, NodeKind.ITALIC):
                _parser_merge_str_children(ctx)
                ctx.debug(
                    "{} not properly closed on the same line".format(
                        node.kind.name
                    ),
                    sortid="parser/449",
                )
                _parser_pop(ctx, False)
            break

        # Spaces at the beginning of a line indicate preformatted text
        if token.startswith(" ") or token.startswith("\t"):
            if ctx.parser_stack[-1].kind in (
                NodeKind.TABLE,
                NodeKind.TABLE_ROW,
            ):
                return
            if node.kind != NodeKind.PREFORMATTED and not ctx.pre_parse:
                node = _parser_push(ctx, NodeKind.PREFORMATTED)

    # If the previous child was a link that doesn't yet have children,
    # and the text to be added starts with valid word characters, assume
    # they are link trail and add them as a child of the link.
    if (
        node.children
        and isinstance(node.children[-1], WikiNode)
        and node.children[-1].kind == NodeKind.LINK
        and not node.children[-1].children
        and not ctx.suppress_special
    ):
        m = re.match(r"(?s)(\w+)(.*)", token)
        if m:
            node.children[-1].children.append(m.group(1))
            token = m.group(2)
            if not token:
                return

    # Add a text child
    node.children.append(token)
def hline_fn(ctx: "Wtp", token: str) -> None:
    """Processes a horizontal line token."""
    close_begline_lists(ctx)
    # Pop nodes until we hit something a horizontal line must not escape:
    # the root, a LEVEL2 section, any table element, or an HTML node
    # (HTML nodes might appear in template definitions).
    barrier_kinds = (
        NodeKind.ROOT,
        NodeKind.LEVEL2,
        NodeKind.TABLE,
        NodeKind.TABLE_CAPTION,
        NodeKind.TABLE_ROW,
        NodeKind.TABLE_HEADER_CELL,
        NodeKind.TABLE_CELL,
        NodeKind.HTML,
    )
    while ctx.parser_stack[-1].kind not in barrier_kinds:
        _parser_pop(ctx, True)
    # Emit the HLINE node; it has no arguments and no children.
    _parser_push(ctx, NodeKind.HLINE)
    _parser_pop(ctx, True)
def subtitle_start_fn(ctx: "Wtp", token: str) -> None:
    """Processes a subtitle start token. The token has < prepended to it."""
    assert isinstance(token, str)
    if ctx.pre_parse:
        # In pre-parse mode subtitles are kept as plain text.
        return text_fn(ctx, token)
    close_begline_lists(ctx)
    kind = SUBTITLE_TO_KIND[token[1:]]  # strip the "<" marker
    level = KIND_TO_LEVEL[kind]
    # Keep popping subtitles and other formats until the next subtitle
    # is of a higher level - but only if there are remaining subtitles.
    # Subtitles sometimes occur inside <noinclude> and similar tags, and we
    # don't want to force closing those.
    while any(x.kind in KIND_TO_LEVEL for x in ctx.parser_stack):
        node = ctx.parser_stack[-1]
        if KIND_TO_LEVEL.get(node.kind, 99) < level:
            break
        if node.kind == NodeKind.HTML and node.sarg not in ("span",):
            break
        _parser_pop(ctx, True)
    # Push the subtitle node. Subtitle start nodes are guaranteed to have
    # a close node, though the close node could have an incorrect level.
    _parser_push(ctx, kind)
    return None
def subtitle_end_fn(ctx: "Wtp", token: str) -> None:
    """Processes a subtitle end token. The token has > prepended to it."""
    assert isinstance(token, str)
    if ctx.pre_parse:
        return text_fn(ctx, token)
    kind = SUBTITLE_TO_KIND[token[1:]]
    # Keep popping formats until a title/subtitle node is on top.
    while ctx.parser_stack[-1].kind not in KIND_TO_LEVEL:
        _parser_pop(ctx, True)
    node = ctx.parser_stack[-1]
    if node.kind != kind:
        ctx.debug(
            "subtitle start and end markers level mismatch", sortid="parser/545"
        )
    # Move the children of the subtitle node to be its first argument.
    _parser_merge_str_children(ctx)
    node.largs.append(node.children)
    node.children = []
def italic_fn(ctx: "Wtp", token: str) -> None:
    """Processes an italic start/end token ('')."""
    if ctx.pre_parse:
        return text_fn(ctx, token)
    close_begline_lists(ctx)
    top = ctx.parser_stack[-1]
    if top.kind in (NodeKind.TEMPLATE, NodeKind.TEMPLATE_ARG):
        # Inside template syntax '' is plain text.
        return text_fn(ctx, token)
    if not _parser_have(ctx, NodeKind.ITALIC) or top.kind in (NodeKind.LINK,):
        # No italic currently open (or we are directly inside a link):
        # start a new formatting node.
        _parser_push(ctx, NodeKind.ITALIC)
        return
    # Close the open italic. If a BOLD was opened after it, reopen the
    # bold afterwards so the two can be closed in either order.
    reopen_bold = False
    while ctx.parser_stack[-1].kind != NodeKind.ITALIC:
        if ctx.parser_stack[-1].kind == NodeKind.BOLD:
            reopen_bold = True
        _parser_pop(ctx, False)
    _parser_pop(ctx, False)  # pop the ITALIC node itself
    if reopen_bold:
        _parser_push(ctx, NodeKind.BOLD)
def bold_fn(ctx: "Wtp", token: str) -> None:
    """Processes a bold start/end token (''')."""
    if ctx.pre_parse:
        return text_fn(ctx, token)
    close_begline_lists(ctx)
    top = ctx.parser_stack[-1]
    if top.kind in (NodeKind.TEMPLATE, NodeKind.TEMPLATE_ARG):
        # Inside template syntax ''' is plain text.
        return text_fn(ctx, token)
    if not _parser_have(ctx, NodeKind.BOLD) or top.kind in (NodeKind.LINK,):
        # No bold currently open (or we are directly inside a link):
        # start a new formatting node.
        _parser_push(ctx, NodeKind.BOLD)
        return
    # Close the open bold. If an ITALIC was opened after it, reopen the
    # italic afterwards so the two can be closed in either order.
    reopen_italic = False
    while ctx.parser_stack[-1].kind != NodeKind.BOLD:
        if ctx.parser_stack[-1].kind == NodeKind.ITALIC:
            reopen_italic = True
        _parser_pop(ctx, False)
    _parser_pop(ctx, False)  # pop the BOLD node itself
    if reopen_italic:
        _parser_push(ctx, NodeKind.ITALIC)
def elink_start_fn(ctx: "Wtp", token: str) -> None:
"""Processes an external link start token "["."""
if ctx.pre_parse:
return text_fn(ctx, token)
close_begline_lists(ctx)