-
Notifications
You must be signed in to change notification settings - Fork 23
/
core.py
1703 lines (1577 loc) · 78.6 KB
/
core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Definition of the processing context for Wikitext processing, and code for
# expanding templates, parser functions, and Lua macros.
#
# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
import os
import re
import sys
import html
import json
import time
import pickle
import platform
import tempfile
import traceback
import collections
import urllib.parse
import pkg_resources
import html.entities
import multiprocessing
from pathlib import Path
from .parserfns import PARSER_FUNCTIONS, call_parser_function, init_namespaces
from .wikihtml import ALLOWED_HTML_TAGS
from .luaexec import call_lua_sandbox
from .parser import parse_encoded, NodeKind
from .common import (MAGIC_FIRST, MAGIC_LAST, MAX_MAGICS, MAGIC_NOWIKI_CHAR)
from .dumpparser import process_dump
from .node_expand import to_wikitext, to_html, to_text
# HTML tags that require an explicit closing tag (i.e., tags not marked
# "no-end-tag" in ALLOWED_HTML_TAGS).
PAIRED_HTML_TAGS = {
    tag
    for tag, props in ALLOWED_HTML_TAGS.items()
    if not props.get("no-end-tag")
}
# Warning: this function is not re-entrant.  We store ctx and page_handler
# in global variables during dump processing, because they may not be
# pickleable.
_global_ctx = None  # Wtp context read by phase2_page_handler in workers
_global_page_handler = None  # per-page callback invoked by phase2_page_handler
_global_page_autoload = True  # if True, page text is read before the callback
def phase2_page_handler(dt):
    """Helper function for calling the Phase2 page handler (see
    reprocess()).  This is a global function so that it can be pickled;
    the consequence is that process() and reprocess() are not re-entrant
    (they cannot safely be called from multiple threads or recursively).

    ``dt`` is a (model, title) pair.  Returns (ok, title, start_time,
    result-or-error-message)."""
    ctx = _global_ctx
    autoload = _global_page_autoload
    model, title = dt
    start_t = time.time()
    # Helps debug extraction hangs.  The title of each page being processed
    # is written into /tmp/wiktextract*/wiktextract-<pid>; if a hang is
    # observed, those files identify the offending page(s).  Check them
    # before aborting, as an interrupt may delete them.
    with tempfile.TemporaryDirectory(prefix="wiktextract") as tmp_dir:
        debug_path = "{}/wiktextract-{}".format(tmp_dir, os.getpid())
        with open(debug_path, "w", encoding="utf-8") as debug_f:
            debug_f.write(title + "\n")
        ctx.start_page(title)
        if not autoload:
            data = None
        else:
            data = ctx.read_by_title(title)
            assert isinstance(data, str)
        try:
            result = _global_page_handler(model, title, data)
        except Exception as exc:
            tb_lines = traceback.format_exception(type(exc), value=exc,
                                                  tb=exc.__traceback__)
            msg = ("=== EXCEPTION while parsing page \"{}\":\n".format(title) +
                   "".join(tb_lines))
            return False, title, start_t, msg
        return True, title, start_t, result
class Wtp:
    """Context used for processing wikitext and for expanding templates,
    parser functions and Lua macros.  The intended usage pattern is to
    initialize this context once (this holds template and module definitions),
    and then using the context for processing many pages."""
    __slots__ = (
        "buf",	 # Buffer for reading/writing tmp_file
        "buf_ofs",	 # Offset into buf
        "buf_used",	 # Number of bytes in the buffer when reading
        "buf_size",	 # Allocated size of buf, in bytes
        "cache_file",	 # Prefix to cache files (instead of temporary file)
        "cache_file_old",  # Using pre-existing cache file
        "cookies",	 # Mapping from magic cookie -> expansion data
        "debugs",	 # List of debug messages (cleared for each new page)
        "errors",	 # List of error messages (cleared for each new page)
        "fullpage",	 # The unprocessed text of the current page (or None)
        "lua",		 # Lua runtime or None if not yet initialized
        "lua_depth",    # Recursion depth in Lua calls
        "lua_invoke",	 # Lua function used to invoke a Lua module
        "lua_reset_env",  # Function to reset Lua environment
        "lua_path",	 # Path to Lua modules
        "modules",	 # Lua code for defined Lua modules
        "need_pre_expand",  # Set of template names to be expanded before parse
        "num_threads",  # Number of parallel threads to use
        "page_contents",  # Full content for selected pages (e.g., Thesaurus)
        "page_seq",	 # All content pages (title, model, ofs, len) in order
        "quiet",	 # If True, don't print any messages during processing
        "redirects",	 # Redirects in the wikimedia project
        "rev_ht",	 # Mapping from text to magic cookie
        "expand_stack",	 # Saved stack before calling Lua function
        "templates",    # dict template name -> definition
        "title",        # current page title
        "tmp_file",	 # Temporary file used to store templates and pages
        "tmp_ofs",	 # Next write offset
        "transient_pages",     # Unsaved pages added by extraction application
        "transient_templates",  # Unsaved templates added by application
        "warnings",	 # List of warning messages (cleared for each new page)
        # Data for parsing
        "beginning_of_line",  # Parser at beginning of line
        "wsp_beginning_of_line",  # Parser at beginning of line + whitespace
        "linenum",	 # Current line number
        "pre_parse",	 # XXX is pre-parsing still needed?
        "parser_stack",	 # Parser stack
        "section",	 # Section within page, for error messages
        "subsection",	 # Subsection within page, for error messages
        "suppress_special",  # XXX never set to True???
        "data_folder",   # Path to per-language data files (namespaces.json)
        "NAMESPACE_DATA",  # Parsed namespaces.json for the project
        "namespaces",    # Namespace info, populated by init_namespaces()
        "LANGUAGES_BY_CODE",  # Mapping language code -> name(s), from caller
        "lang_code",     # Language code of the wiki project (e.g., "en")
        "template_overrides",  # Template bodies overridden by application
    )
def __init__(self, num_threads=None, cache_file=None, quiet=False,
             lang_code="en", languages_by_code=None):
    """Initializes the Wtp context.

    num_threads -- number of parallel processes to use, or None for the
        platform default (forced to 1 on Windows/MacOS, which do not use
        fork() for multiprocessing.Pool())
    cache_file -- path prefix for persistent cache files; None means a
        temporary file is used instead
    quiet -- if True, suppress progress messages
    lang_code -- language code of the wiki project (e.g., "en")
    languages_by_code -- optional mapping from language code to language
        name(s); stored as self.LANGUAGES_BY_CODE
    """
    assert num_threads is None or isinstance(num_threads, int)
    assert cache_file is None or isinstance(cache_file, str)
    assert quiet in (True, False)
    # BUG FIX: the default for languages_by_code was a mutable dict
    # literal, which would be shared between all Wtp instances created
    # without the argument.  Use None as sentinel instead.
    if languages_by_code is None:
        languages_by_code = {}
    if num_threads is None:
        if platform.system() in ("Windows", "Darwin"):
            # Default num_threads to 1 on Windows and MacOS, as they
            # apparently don't use fork() for multiprocessing.Pool()
            num_threads = 1
    self.buf_ofs = 0
    self.buf_size = 4 * 1024 * 1024
    self.buf = bytearray(self.buf_size)
    self.cache_file = cache_file
    self.cache_file_old = False
    self.cookies = []
    self.errors = []
    self.warnings = []
    self.debugs = []
    self.section = None
    self.subsection = None
    self.lua = None
    self.lua_invoke = None
    self.lua_reset_env = None
    self.lua_depth = 0
    self.quiet = quiet
    self.rev_ht = {}
    self.expand_stack = []
    self.parser_stack = None
    self.num_threads = num_threads
    self.transient_pages = {}
    self.transient_templates = {}
    # Some predefined templates
    self.need_pre_expand = None
    self.template_overrides = {}
    self.lang_code = lang_code
    self.data_folder = Path(pkg_resources.resource_filename(
        "wikitextprocessor", "data/")).joinpath(lang_code)
    self.init_namespace_data()
    self.namespaces = {}
    init_namespaces(self)
    self.LANGUAGES_BY_CODE = languages_by_code
    # Open cache file if it exists; otherwise create new cache file or
    # temporary file and reset saved pages.
    self.tmp_file = None
    if self.cache_file:
        try:
            # Load self.templates, self.page_contents, self.page_seq,
            # self.redirects
            with open(self.cache_file + ".pickle", "rb") as f:
                dt = pickle.load(f)
                version, dt = dt
                if version == 1:
                    # Cache file version is compatible
                    self.tmp_file = open(self.cache_file, "rb", buffering=0)
                    self.page_contents, self.page_seq, self.redirects, \
                        self.templates, self.need_pre_expand = dt
                    self.need_pre_expand = set(self.need_pre_expand)
                    self.cache_file_old = True
        except (FileNotFoundError, EOFError):
            pass
    if self.tmp_file is None:
        self._reset_pages()
    self.tmp_ofs = 0
    self.buf_ofs = 0
def init_namespace_data(self):
with self.data_folder.joinpath("namespaces.json") \
.open(encoding="utf-8") as f:
self.NAMESPACE_DATA = json.load(f)
def _reset_pages(self):
"""Resets any stored pages and gets ready to receive more pages."""
self.tmp_file = None
self.page_contents = {}
self.page_seq = []
self.redirects = {}
self.templates = {}
self.need_pre_expand = None
self.cache_file_old = False
# Add predefined templates
self.templates["!"] = "|"
self.templates["!-"] = "|-"
self.templates[self._canonicalize_template_name("((")] = \
"{{" # {{((}}
self.templates[self._canonicalize_template_name("))")] = \
"}}" # {{))}}
# Create cache file or temporary file
if self.cache_file:
# Create new cache file
try:
os.remove(self.cache_file)
except FileNotFoundError:
pass
try:
os.remove(self.cache_file + ".pickle")
except FileNotFoundError:
pass
self.tmp_file = open(self.cache_file, "w+b", buffering=0)
else:
# Create temporary file
self.tmp_file = tempfile.TemporaryFile(mode="w+b", buffering=0)
def _fmt_errmsg(self, kind, msg, trace):
assert isinstance(kind, str)
assert isinstance(msg, str)
assert isinstance(trace, (str, type(None)))
loc = self.title
if self.section:
loc += "/" + self.section
if self.subsection:
loc += "/" + self.subsection
if self.expand_stack:
msg += " at {}".format(self.expand_stack)
if self.parser_stack:
titles = []
for node in self.parser_stack:
if node.kind in (NodeKind.LEVEL2, NodeKind.LEVEL3,
NodeKind.LEVEL4, NodeKind.LEVEL5,
NodeKind.LEVEL6):
if not node.args:
continue
lst = map(lambda x: x if isinstance(x, str) else "???",
node.args[0])
title = "".join(lst)
titles.append(title.strip())
msg += " parsing " + "/".join(titles)
if trace:
msg += "\n" + trace
print("{}: {}: {}".format(loc, kind,msg))
sys.stdout.flush()
def error(self, msg, trace=None, sortid="XYZunsorted"):
"""Prints an error message to stdout. The error is also saved in
self.errors."""
assert isinstance(msg, str)
assert isinstance(trace, (str, type(None)))
assert isinstance(sortid, str)
# sortid should be a static string only used to sort
# error messages into buckets based on where they
# have been called. There was previously some code for
# inspecting the stack trace here that did the same
# thing, but it was a bit costly.
self.errors.append({"msg": msg, "trace": trace,
"title": self.title,
"section": self.section,
"subsection": self.subsection,
"called_from": sortid,
"path": tuple(self.expand_stack)})
self._fmt_errmsg("ERROR", msg, trace)
def warning(self, msg, trace=None, sortid="XYZunsorted"):
"""Prints a warning message to stdout. The error is also saved in
self.warnings."""
assert isinstance(msg, str)
assert isinstance(trace, (str, type(None)))
assert isinstance(sortid, str)
self.warnings.append({"msg": msg, "trace": trace,
"title": self.title,
"section": self.section,
"subsection": self.subsection,
"called_from": sortid,
"path": tuple(self.expand_stack)})
self._fmt_errmsg("WARNING", msg, trace)
def debug(self, msg, trace=None, sortid="XYZunsorted"):
"""Prints a debug message to stdout. The error is also saved in
self.debug."""
assert isinstance(msg, str)
assert isinstance(trace, (str, type(None)))
assert isinstance(sortid, str)
self.debugs.append({"msg": msg, "trace": trace,
"title": self.title,
"section": self.section,
"subsection": self.subsection,
"called_from": sortid,
"path": tuple(self.expand_stack)})
self._fmt_errmsg("DEBUG", msg, trace)
def to_return(self):
"""Returns a dictionary with errors, warnings, and debug messages
from the context. Note that the values are reset whenever starting
processing a new word. The value returned by this function is
JSON-compatible and can easily be returned by a paralle process."""
return {
"errors": self.errors,
"warnings": self.warnings,
"debugs": self.debugs,
}
def _canonicalize_template_name(self, name):
"""Canonicalizes a template name by making its first character
uppercase and replacing underscores by spaces and sequences of
whitespace by a single whitespace."""
assert isinstance(name, str)
if name.lower().startswith(self.NAMESPACE_DATA["Template"]["name"].lower() + ":"):
name = name[len(self.NAMESPACE_DATA["Template"]["name"]) + 1:]
name = re.sub(r"_", " ", name)
name = re.sub(r"\s+", " ", name)
name = re.sub(r"\(", "%28", name)
name = re.sub(r"\)", "%29", name)
name = re.sub(r"&", "%26", name)
name = re.sub(r"\+", "%2B", name)
name = name.strip()
#if name:
# name = name[0].upper() + name[1:]
return name
def _canonicalize_parserfn_name(self, name):
    """Canonicalizes a parser function name: underscores become spaces,
    whitespace runs collapse to a single space, surrounding whitespace
    is stripped, and the name is lowercased unless it already matches a
    known parser function exactly (names are case-insensitive)."""
    assert isinstance(name, str)
    name = re.sub(r"\s+", " ", name.replace("_", " ")).strip()
    if name in PARSER_FUNCTIONS:
        return name
    return name.lower()  # Parser function names are case-insensitive
def _save_value(self, kind, args, nowiki):
    """Saves a value of a particular kind and returns a unique magic
    cookie character for it.  Identical (kind, args, nowiki) values
    share the same cookie.  Returns "" if the magic-character space is
    exhausted (an error is reported in that case)."""
    # Cookie kinds:
    #   "T" - template {{...}} / parser function call
    #   "A" - template argument {{{...}}}
    #   "L" - link, "E" - external link, "N" - nowiki text
    assert kind in ("T", "A", "L", "E", "N")
    assert isinstance(args, (list, tuple))
    assert nowiki in (True, False)
    key = (kind, tuple(args), nowiki)
    cached = self.rev_ht.get(key)
    if cached is not None:
        return cached
    idx = len(self.cookies)
    if idx >= MAX_MAGICS:
        self.error("too many templates, arguments,"
                   "or parser function calls",
                   sortid="core/372")
        return ""
    self.cookies.append(key)
    ch = chr(MAGIC_FIRST + idx)
    self.rev_ht[key] = ch
    return ch
def _encode(self, text):
    """Encode all templates, template arguments, and parser function calls
    in the text, from innermost to outermost, replacing each construct
    with a one-character magic cookie (see _save_value)."""
    def vbar_split(v):
        # Splits on top-level "|" separators.  The first regex alternative
        # consumes a matched HTML-like tag pair (and its contents) as a
        # unit so that a "|" inside such a pair does not split arguments.
        # NOTE(review): the character class "[-a-zA-z0-9]" spans A-z,
        # which also matches [ \ ] ^ _ ` -- confirm this is intended.
        args = list(m.group(1) for m in re.finditer(
            r"(?si)\|((<\s*([-a-zA-z0-9]+)\b[^>]*>[^][{}]*?<\s*/\s*\3\s*>|"
            r"[^|])*)", "|" + v))
        return args
    def repl_arg(m):
        """Replacement function for template arguments."""
        # A magic nowiki character anywhere in the match marks the
        # construct as nowiki-escaped.
        nowiki = m.group(0).find(MAGIC_NOWIKI_CHAR) >= 0
        orig = m.group(1)
        args = vbar_split(orig)
        return self._save_value("A", args, nowiki)
    def repl_arg_err(m):
        """Replacement function for template arguments, with error
        (used when a missing closing }} has been heuristically added)."""
        nowiki = m.group(0).find(MAGIC_NOWIKI_CHAR) >= 0
        prefix = m.group(1)
        orig = m.group(2)
        args = vbar_split(orig)
        self.debug("heuristically added missing }} to template arg {}"
                   # a single "}" needs to be escaped as "}}" with .format
                   .format(args[0].strip()),
                   sortid="core/405")
        return prefix + self._save_value("A", args, nowiki)
    def repl_templ(m):
        """Replacement function for templates {{name|...}} and parser
        functions."""
        nowiki = m.group(0).find(MAGIC_NOWIKI_CHAR) >= 0
        v = m.group(1)
        args = vbar_split(v)
        # print("REPL_TEMPL: args={}".format(args))
        return self._save_value("T", args, nowiki)
    def repl_templ_err(m):
        """Replacement function for templates {{name|...}} and parser
        functions, with error (missing closing }} heuristically added)."""
        nowiki = m.group(0).find(MAGIC_NOWIKI_CHAR) >= 0
        prefix = m.group(1)
        v = m.group(2)
        args = vbar_split(v)
        self.debug("heuristically added missing }} to template {}"
                   # a single "}" needs to be escaped as "}}" with .format
                   .format(args[0].strip()),
                   sortid="core/427")
        return prefix + self._save_value("T", args, nowiki)
    def repl_link(m):
        """Replacement function for links [[...]]."""
        nowiki = m.group(0).find(MAGIC_NOWIKI_CHAR) >= 0
        orig = m.group(1)
        args = vbar_split(orig)
        # print("REPL_LINK: orig={!r}".format(orig))
        return self._save_value("L", args, nowiki)
    def repl_extlink(m):
        """Replacement function for external links [...].  This is also
        used to replace bracketed sections, such as [...]."""
        nowiki = m.group(0).find(MAGIC_NOWIKI_CHAR) >= 0
        orig = m.group(1)
        args = [orig]
        return self._save_value("E", args, nowiki)
    # Main loop of encoding.  We encode repeatedly, always the innermost
    # template, argument, or parser function call first.  We also encode
    # links as they affect the interpretation of templates.
    # As a preprocessing step, remove comments from the text.
    text = re.sub(r"(?s)<!\s*--.*?--\s*>", "", text)
    while True:
        prev = text
        # Encode template arguments.  We repeat this until there are
        # no more matches, because otherwise we could encode the two
        # innermost braces as a template transclusion.
        while True:
            prev2 = text
            # Encode links (innermost first, to a fixpoint).
            while True:
                text = re.sub(r"(?s)\[" + MAGIC_NOWIKI_CHAR +
                              r"?\[(([^][{}]|<[-+*a-zA-Z0-9]*>)+)\]" +
                              MAGIC_NOWIKI_CHAR + r"?\]",
                              repl_link, text)
                if text == prev2:
                    break
                prev2 = text
            # Encode external links.
            text = re.sub(r"(?s)\[([^][{}<>|]+)\]", repl_extlink, text)
            # Encode template arguments
            text = re.sub(r"(?s)\{" + MAGIC_NOWIKI_CHAR +
                          r"?\{" + MAGIC_NOWIKI_CHAR +
                          r"?\{(([^{}]|\{\|[^{}]*\|\})*?)\}" +
                          MAGIC_NOWIKI_CHAR + r"?\}" +
                          MAGIC_NOWIKI_CHAR + r"?\}",
                          repl_arg, text)
            if text == prev2:
                # When everything else has been done, see if we can find
                # template arguments that have one missing closing bracket.
                # This is so common in Wiktionary that I'm suspecting it
                # might be allowed by the MediaWiki parser.
                # This needs to be done before processing templates, as
                # otherwise the argument with a missing closing brace would
                # be interpreted as a template.
                # Note: we don't want to do this for {{{!}}, as that is
                # sometimes used inside {{#if|...}} for table start/end.
                text = re.sub(r"(?s)([^{])\{" + MAGIC_NOWIKI_CHAR +
                              r"?\{" + MAGIC_NOWIKI_CHAR +
                              r"?\{([^{}!]*?)\}" +
                              MAGIC_NOWIKI_CHAR + r"?\}",
                              repl_arg_err, text)
                if text != prev2:
                    continue
                break
        # Replace template invocation
        text = re.sub(r"(?si)\{" + MAGIC_NOWIKI_CHAR +
                      r"?\{(("
                      r"\{\|[^{}]*?\|\}|"
                      r"\}[^{}]|"
                      r"[^{}](\{[^{}|])?"
                      r")+?)\}" +
                      MAGIC_NOWIKI_CHAR + r"?\}",
                      repl_templ, text)
        # We keep looping until there is no change during the iteration
        if text == prev:
            # When everything else has been done, see if we can find
            # template calls that have one missing closing bracket.
            # This is so common in Wiktionary that I'm suspecting it
            # might be allowed by the MediaWiki parser.  We must allow
            # tables {| ... |} inside these.
            text = re.sub(r"(?s)([^{])\{" + MAGIC_NOWIKI_CHAR +
                          r"?\{(([^{}]|\{\|[^{}]*\|\}|\}[^{}])+?)\}",
                          repl_templ_err, text)
            if text != prev:
                continue
            break
        prev = text
    # Replace any remaining braces etc by corresponding character entities
    # NOTE(review): these disabled substitutions appear garbled (pattern
    # and replacement are identical) -- likely HTML entities lost in
    # transit; verify against the original before re-enabling.
    #text = re.sub(r"\{([&|])", r"{\1", text)
    #text = re.sub(r"\{([&|])", r"{\1", text)
    #text = re.sub(r"[^|]\}", r"\1}", text)
    #text = re.sub(r"[^|]\}", r"\1}", text)
    #text = re.sub(r"\|", "|", text)
    return text
def _template_to_body(self, title, text):
"""Extracts the portion to be transcluded from a template body. This
returns an str."""
assert isinstance(title, str)
assert isinstance(text, str)
# Remove all comments
text = re.sub(r"(?s)<!\s*--.*?--\s*>", "", text)
# Remove all text inside <noinclude> ... </noinclude>
text = re.sub(r"(?is)<\s*noinclude\s*>.*?<\s*/\s*noinclude\s*>",
"", text)
# Handle <noinclude> without matching </noinclude> by removing the
# rest of the file. <noinclude/> is handled specially elsewhere, as
# it appears to be used as a kludge to prevent normal interpretation
# of e.g. [[ ... ]] by placing it between the brackets.
text = re.sub(r"(?is)<\s*noinclude\s*>.*", "", text)
# Apparently unclosed <!-- at the end of a template body is ignored
text = re.sub(r"(?s)<!\s*--.*", "", text)
# <onlyinclude> tags, if present, include the only text that will be
# transcluded. All other text is ignored.
onlys = list(re.finditer(r"(?is)<\s*onlyinclude\s*>(.*?)"
r"<\s*/\s*onlyinclude\s*>|"
r"<\s*onlyinclude\s*/\s*>",
text))
if onlys:
text = "".join(m.group(1) or "" for m in onlys)
# Remove <includeonly>. They mark text that is not visible on the page
# itself but is included in transclusion. Also text outside these tags
# is included in transclusion.
text = re.sub(r"(?is)<\s*(/\s*)?includeonly\s*(/\s*)?>", "", text)
return text
def add_page(self, model, title, text, transient=False):
"""Collects information about the page. For templates and modules,
this keeps the content in memory. For other pages, this saves the
content in a temporary file so that it can be accessed later. There
must be enough space on the volume containing the temporary file
to store the entire contents of the uncompressed WikiMedia dump.
The content is saved because it is common for Wiktionary Lua macros
to access content from arbitrary pages. Furthermore, this way we
only need to decompress and parse the dump file once. ``model``
is "wikitext" for normal pages, "redirect" for redirects (in which
case ``text`` is the page pointed to), or "Scribunto" for Lua code;
other values may also be encountered. If ``transient`` is True, then
this page will not be saved but will replace any saved page. This can
be used, for example, to add Lua code for data extraction, or for
debugging Lua modules."""
assert isinstance(model, str)
assert isinstance(title, str)
assert isinstance(text, str)
assert transient in (True, False)
if transient:
self.transient_pages[title] = (title, model, text)
if (title.startswith(self.NAMESPACE_DATA["Template"]["name"] + ":") and
not title.endswith("/documentation") and
not title.endswith("/testcases")):
name = self._canonicalize_template_name(title)
body = self._template_to_body(title, text)
self.transient_templates[name] = body
return
# If we have previously analyzed pages and this is called again,
# reset all previously saved pages (e.g., in case we are to update
# existing cache file).
if self.need_pre_expand is not None:
self._reset_pages()
# Save the page in our temporary file and metadata in memory
rawtext = text.encode("utf-8")
if self.buf_ofs + len(rawtext) > self.buf_size:
bufview = memoryview(self.buf)[0: self.buf_ofs]
self.tmp_file.write(bufview)
self.buf_ofs = 0
ofs = self.tmp_ofs
self.tmp_ofs += len(rawtext)
if len(rawtext) >= self.buf_ofs:
self.tmp_file.write(rawtext)
else:
self.buf[self.buf_ofs: self.buf_ofs + len(rawtext)] = rawtext
# XXX should we canonicalize title in page_contents
self.page_contents[title] = (title, model, ofs, len(rawtext))
self.page_seq.append((model, title))
if not self.quiet and len(self.page_seq) % 10000 == 0:
print(" ... {} raw pages collected"
.format(len(self.page_seq)))
sys.stdout.flush()
if model == "redirect":
self.redirects[title] = text
return
if not title.startswith(self.NAMESPACE_DATA["Template"]["name"] + ":"):
return
if title.endswith("/documentation"):
return
if title.endswith("/testcases"):
return
# It is a template
name = self._canonicalize_template_name(title)
body = self._template_to_body(title, text)
assert isinstance(body, str)
self.templates[name] = body
if self.lang_code == "zh":
self.add_chinese_lower_case_template(name, body)
def add_chinese_lower_case_template(self, name, body):
# Chinese Wiktionary capitalizes the first letter of template name
# in template pages but uses lower case in word pages
lower_case_name = name[0].lower() + name[1:]
if lower_case_name not in self.templates:
self.templates[lower_case_name] = body
def _analyze_template(self, name, body):
    """Analyzes a template body and returns a set of the canonicalized
    names of all other templates it calls and a boolean that is True
    if it should be pre-expanded before final parsing and False if it
    need not be pre-expanded.  The pre-expanded flag is determined
    based on that body only; the caller should propagate it to
    templates that include the given template.  This does not work for
    template and template function calls where the name is generated by
    other expansions."""
    assert isinstance(body, str)
    included_templates = set()
    pre_expand = False
    # Determine if the template starts with a list item
    # XXX should we expand other templates that produce list items???
    contains_list = re.match(r"(?s)^[#*;:]", body) is not None
    # Remove paired tables ({| ... |}) repeatedly until no more match
    prev = body
    while True:
        unpaired_text = re.sub(
            r"(?s)(^|\n)\{\|([^\n]|\n+[^{|]|\n+\|[^}]|\n+\{[^|])*?\n+\|\}",
            r"", prev)
        if unpaired_text == prev:
            break
        prev = unpaired_text
    #print("unpaired_text {!r}".format(unpaired_text))
    # Determine if the template contains an unpaired table
    contains_unpaired_table = re.search(r"(?s)(^|\n)(\{\||\|\})",
                                        unpaired_text) is not None
    # Determine if the template contains table element tokens
    # outside paired table start/end.  We only try to look for
    # these outside templates, as it is common to write each
    # template argument on its own line starting with a "|".
    outside = unpaired_text
    while True:
        #print("=== OUTSIDE ITER")
        prev = outside
        # First eliminate template arguments {{{...}}} to a fixpoint
        while True:
            newt = re.sub(r"(?s)\{\{\{([^{}]|\}[^}]|\}\}[^}])*?\}\}\}",
                          "", prev)
            if newt == prev:
                break
            prev = newt
        #print("After arg elim: {!r}".format(newt))
        # Then eliminate template calls {{...}}
        newt = re.sub(r"(?s)\{\{([^{}]|\}[^}])*?\}\}", "", newt)
        #print("After templ elim: {!r}".format(newt))
        if newt == outside:
            break
        outside = newt
    # Check if the template contains certain table elements
    # (row |-, caption |+, or header ! at start of line; or cell
    # separators || / !! right after <includeonly> or a comment)
    m = re.search(r"(?s)(^|\n)(\|\+|\|-|\!)", outside)
    m2 = re.match(r"(?si)\s*(<includeonly>|<!\s*--.*?--\s*>)(\|\||!!)",
                  outside)
    contains_table_element = m is not None or m2 is not None
    # if contains_table_element:
    #     print("contains_table_element {!r} at {}"
    #           .format(m.group(0), m.start()))
    #     print("... {!r} ...".format(outside[m.start() - 10:m.end() + 10]))
    #     print(repr(outside))
    # Check for unpaired HTML tags: count opens (+1) and closes (-1) of
    # tags that require an explicit end tag; self-closing tags are ignored
    tag_cnts = collections.defaultdict(int)
    for m in re.finditer(r"(?si)<\s*(/\s*)?({})\b\s*[^>]*(/\s*)?>"
                         r"".format("|".join(PAIRED_HTML_TAGS)), outside):
        start_slash = m.group(1)
        tagname = m.group(2)
        end_slash = m.group(3)
        if start_slash:
            tag_cnts[tagname] -= 1
        elif not end_slash:
            tag_cnts[tagname] += 1
    contains_unbalanced_html = any(v != 0 for v in tag_cnts.values())
    # if contains_unbalanced_html:
    #     print(name, "UNBALANCED HTML")
    #     for k, v in tag_cnts.items():
    #         if v != 0:
    #             print("  {} {}".format(v, k))
    # Chinese Wiktionary uses templates for language and POS headings
    # Language templates: https://zh.wiktionary.org/wiki/Category:语言模板
    # POS templates: https://zh.wiktionary.org/wiki/Category:詞類模板
    is_chinese_heading = (self.lang_code == "zh" and
                          name.startswith(("-", "=")))
    # Determine whether this template should be pre-expanded
    pre_expand = (contains_list or contains_unpaired_table or
                  contains_table_element or contains_unbalanced_html or
                  is_chinese_heading)
    # if pre_expand:
    #     print(name,
    #           {"list": contains_list,
    #            "unpaired_table": contains_unpaired_table,
    #            "table_element": contains_table_element,
    #            "unbalanced_html": contains_unbalanced_html,
    #            "pre_expand": pre_expand,
    #           })
    # Determine which other templates are called from unpaired text.
    # None of the flags we currently gather propagate outside a paired
    # table start/end.
    for m in re.finditer(r"(?s)(^|[^{])(\{\{)?\{\{([^{]*?)(\||\}\})",
                         unpaired_text):
        name = m.group(3)
        name = re.sub(r"(?si)<\s*nowiki\s*/\s*>", "", name)
        name = self._canonicalize_template_name(name)
        if not name:
            continue
        included_templates.add(name)
    return included_templates, pre_expand
def analyze_templates(self):
    """Analyzes templates to determine which of them might create elements
    essential to parsing Wikitext syntax, such as table start or end
    tags.  Such templates generally need to be expanded before
    parsing the page.  Populates self.need_pre_expand, copies template
    bodies to template redirects, and writes the cache pickle when a
    (new) cache file is in use."""
    self.need_pre_expand = set()
    expand_q = []
    # included_map: template name -> set of templates whose bodies
    # reference it (a reverse dependency map, used for propagation below)
    included_map = collections.defaultdict(set)
    for name, body in self.templates.items():
        included_templates, pre_expand = self._analyze_template(name, body)
        for x in included_templates:
            included_map[x].add(name)
        if pre_expand:
            self.need_pre_expand.add(name)
            expand_q.append(name)
    # XXX consider encoding template bodies here (also need to save related
    # cookies).  This could speed up their expansion, where the first
    # operation is to encode them.  (Consider whether cookie numbers from
    # nested template expansions could conflict)
    # Propagate pre_expand from lower-level templates to all templates that
    # refer to them (worklist algorithm over the reverse dependency map)
    while expand_q:
        name = expand_q.pop()
        if name not in included_map:
            continue
        for inc in included_map[name]:
            if inc in self.need_pre_expand:
                continue
            #print("propagating EXP {} -> {}".format(name, inc))
            self.need_pre_expand.add(inc)
            expand_q.append(inc)
    # Copy template definitions to redirects to them
    for k, v in self.redirects.items():
        if not k.startswith(self.NAMESPACE_DATA["Template"]["name"] + ":"):
            # print("Unhandled redirect src", k)
            continue
        if not v.startswith(self.NAMESPACE_DATA["Template"]["name"] + ":"):
            # print("Unhandled redirect dst", v)
            continue
        k = self._canonicalize_template_name(k)
        v = self._canonicalize_template_name(v)
        if v not in self.templates:
            # print("{} redirects to non-existent template {}".format(k, v))
            continue
        if k in self.templates:
            # print("{} -> {} is redirect but already in templates"
            #       .format(k, v))
            continue
        self.templates[k] = self.templates[v]
        # The redirect inherits the pre-expand flag of its target (Chinese
        # heading-style names starting with "-"/"=" are also pre-expanded)
        if v in self.need_pre_expand or (self.lang_code == "zh" and
                                         k.startswith(("-", "="))):
            self.need_pre_expand.add(k)
        if self.lang_code == "zh":
            self.add_chinese_lower_case_template(k, self.templates[v])
    # Save cache data (version 1 format; see __init__ for the loader)
    if self.cache_file is not None and not self.cache_file_old:
        with open(self.cache_file + ".pickle", "wb") as f:
            pickle.dump((1, (self.page_contents, self.page_seq,
                             self.redirects, self.templates,
                             list(sorted(self.need_pre_expand)))),
                        f)
def start_page(self, title):
"""Starts a new page for expanding Wikitext. This saves the title and
full page source in the context. Calling this is mandatory
for each page; expand_wikitext() can then be called multiple
times for the same page. This clears the self.errors,
self.warnings, and self.debugs lists and any current section
or subsection."""
assert isinstance(title, str)
self.title = title
self.errors = []
self.warnings = []
self.debugs = []
self.section = None
self.subsection = None
self.cookies = []
self.rev_ht = {}
self.expand_stack = [title]
def start_section(self, title):
"""Starts processing a new section of the current page. Calling this
is optional, but can help provide better error messages. This clears
any current subsection."""
assert title is None or isinstance(title, str)
self.section = title
self.subsection = None
def start_subsection(self, title):
"""Starts processing a new subsection of the current section on the
current page. Calling this is optional, but can help provide better
error messages."""
assert title is None or isinstance(title, str)
self.subsection = title
def _unexpanded_template(self, args, nowiki):
"""Formats an unexpanded template (whose arguments may have been
partially or fully expanded)."""
if nowiki:
return ("{{" +
"|".join(args) +
"}}")
return "{{" + "|".join(args) + "}}"
def _unexpanded_arg(self, args, nowiki):
"""Formats an unexpanded template argument reference."""
if nowiki:
return ("{{{" +
"|".join(args) +
"}}}")
return "{{{" + "|".join(args) + "}}}"
def _unexpanded_link(self, args, nowiki):
"""Formats an unexpanded link."""
if nowiki:
return "[[" + "|".join(args) + "]]"
return "[[" + "|".join(args) + "]]"
def _unexpanded_extlink(self, args, nowiki):
"""Formats an unexpanded external link."""
if nowiki:
return "[" + "|".join(args) + "]"
return "[" + "|".join(args) + "]"
def preprocess_text(self, text):
"""Preprocess the text by handling <nowiki> and comments."""
assert isinstance(text, str)
# print("PREPROCESS_TEXT: {!r}".format(text))
def _nowiki_sub_fn(m):
"""This function escapes the contents of a <nowiki> ... </nowiki>
pair."""
text = m.group(1)
return self._save_value("N", (text,), False)
text = re.sub(r"(?si)<\s*nowiki\s*>(.*?)<\s*/\s*nowiki\s*>",
_nowiki_sub_fn, text)
text = re.sub(r"(?si)<\s*nowiki\s*/\s*>", MAGIC_NOWIKI_CHAR, text)
text = re.sub(r"(?s)<!\s*--.*?--\s*>", "", text)
# print("PREPROCESSED_TEXT: {!r}".format(text))
return text
def expand(self, text, parent=None, pre_expand=False,
template_fn=None, post_template_fn=None,
templates_to_expand=None,
templates_to_not_expand=None,
expand_parserfns=True, expand_invoke=True, quiet=False,
timeout=None):
"""Expands templates and parser functions (and optionally Lua macros)
from ``text`` (which is from page with title ``title``).
``templates_to_expand`` should be None to expand all
templates, or a set or dictionary whose keys are those
canonicalized template names that should be expanded; if
``pre_expand`` is set to True, then only templates needing
pre-expansion before parsing plus those in
``templates_to_expand`` are expanded, ignoring those in
``templates_to_not_expand`` (which will preserve their name,
so that they can be extracted later as a node).
``template_fn``, if given, will be be called as
template_fn(name, args_ht) to expand templates;
if it is not defined or returns None, the
default expansion will be used (it can also be used to capture
template arguments). If ``post_template_fn`` is given, it
will be called as post_template_fn(name, args_ht, expanded)
and if it returns other than None, its return value will
replace the template expansion. This returns the text with
the given templates expanded."""
assert isinstance(text, str)
assert parent is None or (isinstance(parent, (list, tuple)) and
len(parent) == 2)
assert pre_expand in (True, False)
assert template_fn is None or callable(template_fn)
assert post_template_fn is None or callable(post_template_fn)
assert isinstance(templates_to_expand, (set, dict, type(None)))
assert self.title is not None # start_page() must have been called
assert quiet in (False, True)
assert timeout is None or isinstance(timeout, (int, float))
# Handle <nowiki> in a preprocessing step
text = self.preprocess_text(text)
# If requesting to pre_expand, then add templates needing pre-expand
# to those to be expanded (and don't expand everything).
if pre_expand:
if self.need_pre_expand is None:
if self.cache_file and not self.cache_file_old:
raise RuntimeError("You have specified a cache file "
"but have not properly initialized "
"the cache file.")
raise RuntimeError("analyze_templates() must be run first to "
"determine which templates need pre-expand")
if (templates_to_expand is not None and
templates_to_not_expand is not None):
templates_to_expand = (set(templates_to_expand) |
set(self.need_pre_expand))
templates_to_expand = (templates_to_expand -
set(templates_to_not_expand))
elif (templates_to_expand is not None and
templates_to_not_expand is None):
templates_to_expand = (set(templates_to_expand) |
set(self.need_pre_expand))
elif (templates_to_expand is None and
templates_to_not_expand is not None):
templates_to_expand = (set(self.need_pre_expand) -
set(templates_to_not_expand))
else:
templates_to_expand = self.need_pre_expand
# Create set or dict of all defined templates
if self.transient_templates:
all_templates = (set(self.templates) |
set(self.transient_templates))
else:
all_templates = self.templates
# If templates_to_expand is None, then expand all known templates
if templates_to_expand is None:
templates_to_expand = all_templates
def invoke_fn(invoke_args, expander, parent):
"""This is called to expand a #invoke parser function."""
assert isinstance(invoke_args, (list, tuple))
assert callable(expander)
assert isinstance(parent, (tuple, type(None)))
# print("INVOKE_FN", invoke_args, parent)
# sys.stdout.flush()
# Use the Lua sandbox to execute a Lua macro. This will initialize
# the Lua environment and store it in self.lua if it does not
# already exist (it needs to be re-created for each new page).
ret = call_lua_sandbox(self, invoke_args, expander, parent, timeout)
# print("invoke_fn: invoke_args={} parent={} LUA ret={!r}"
# .format(invoke_args, parent, ret))
return ret
def expand_recurse(coded, parent, templates_to_expand):