-
Notifications
You must be signed in to change notification settings - Fork 191
/
replace.py
executable file
·1147 lines (945 loc) · 42.5 KB
/
replace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
r"""
This bot will make direct text replacements.
It will retrieve information on which pages might need changes either from
an XML dump or a text file, or only change a single page.
These command line parameters can be used to specify which pages to work on:
¶ms;
Furthermore, the following command line parameters are supported:
-mysqlquery Retrieve information from a local database mirror.
If no query specified, bot searches for pages with
given replacements.
-xml Retrieve information from a local XML dump
(pages-articles or pages-meta-current, see
https://dumps.wikimedia.org). Argument can also
be given as "-xml:filename".
-regex Make replacements using regular expressions. If this argument
isn't given, the bot will make simple text replacements.
-nocase Use case insensitive regular expressions.
-dotall Make the dot match any character at all, including a newline.
Without this flag, '.' will match anything except a newline.
-multiline '^' and '$' will now match begin and end of each line.
-xmlstart (Only works with -xml) Skip all articles in the XML dump
before the one specified (may also be given as
-xmlstart:Article).
-addcat:cat_name Adds "cat_name" category to every altered page.
-excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex
argument is given, XYZ will be regarded as a regular
expression.
-requiretitle:XYZ Only do pages with titles that contain XYZ. If the -regex
argument is given, XYZ will be regarded as a regular
expression.
-excepttext:XYZ Skip pages which contain the text XYZ. If the -regex
argument is given, XYZ will be regarded as a regular
expression.
-exceptinside:XYZ Skip occurrences of the to-be-replaced text which lie
within XYZ. If the -regex argument is given, XYZ will be
regarded as a regular expression.
-exceptinsidetag:XYZ Skip occurrences of the to-be-replaced text which lie
within an XYZ tag.
-summary:XYZ Set the summary message text for the edit to XYZ, bypassing
the predefined message texts with original and replacements
inserted. To add the replacements to your summary use the
%(description)s placeholder, for example:
-summary:"Bot operated replacement: %(description)s"
Can't be used with -automaticsummary.
-automaticsummary Uses an automatic summary for all replacements which don't
have a summary defined. Can't be used with -summary.
-sleep:123 If you use -fix you can check multiple regexes at the same
time in every page. This can waste a lot of CPU because the
bot checks every regex without pausing, using all available
resources. This option makes the bot sleep for the given
time between one regex and the next to reduce the CPU load.
-fix:XYZ Perform one of the predefined replacements tasks, which are
given in the dictionary 'fixes' defined inside the files
fixes.py and user-fixes.py.
&fixes-help;
-manualinput Request manual replacements via the command line input even
if replacements are already defined. If this option is set
(or no replacements are defined via -fix or the arguments)
it'll ask for additional replacements at start.
-pairsfile Lines from the given file name(s) will be read as replacement
arguments. i.e. a file containing lines "a" and "b", used as:
python pwb.py replace -page:X -pairsfile:file c d
will replace 'a' with 'b' and 'c' with 'd'.
-always Don't prompt you for each replacement
-quiet Don't print a message if a page remains unchanged
-nopreload Do not preload pages. Useful if disabled on a wiki.
-recursive Recurse replacement as long as possible. Be careful, this
might lead to an infinite loop.
-allowoverlap When occurrences of the pattern overlap, replace all of them.
Be careful, this might lead to an infinite loop.
-fullsummary Use one large summary for all command line replacements.
*Replacement parameters*
Replacement parameters are pairs of arguments given to the script.
The first argument is the old text to be replaced, the second
argument is the new text. If the ``-regex`` argument is given, the
first argument will be regarded as a regular expression, and the
second argument might contain expressions like ``\1`` or ``\g<name>``.
The second parameter can also be specified as empty string, usually
``""``. It is possible to introduce more than one pair of
replacement parameters.
.. admonition:: **Empty string arguments with PowerShell**
:class: attention
Using PowerShell as command shell removes empty strings during
PowerShell's command line parsing. To enable empty strings with
PowerShell you either have to escape quotation marks with backtick
characters in front of them like ```"`"`` or to disable command line
parsing with ``--%`` symbol for all following command parts like
:samp:`python pwb replace --% -start:! foo ""` which disables parsing
for all replace options and arguments following this delimiter and
enables empty strings.
Examples
--------
If you want to change templates from the old syntax, e.g.
``{{msg:Stub}}``, to the new syntax, e.g. ``{{Stub}}``, download an XML
dump file (pages-articles) from https://dumps.wikimedia.org, then use
this command:
python pwb.py replace -xml -regex "{{msg:(.*?)}}" "{{\1}}"
If you have a dump called ``foobar.xml`` and want to fix typos in
articles, e.g. Errror -> Error, use this:
python pwb.py replace -xml:foobar.xml "Errror" "Error" -namespace:0
If you want to do more than one replacement at a time, use this:
python pwb.py replace -xml:foobar.xml "Errror" "Error" "Faail" "Fail" \
-namespace:0
If you have a page called 'John Doe' and want to fix the format of ISBNs,
use:
python pwb.py replace -page:John_Doe -fix:isbn
This command will change 'referer' to 'referrer', but not in pages which
talk about HTTP, where the typo has become part of the standard:
python pwb.py replace referer referrer -file:typos.txt -excepttext:HTTP
.. seealso:: :mod:`scripts.template` to modify or remove templates.
.. Please type "python pwb.py replace -help | more" if you can't read
the top of the help.
"""
#
# (C) Pywikibot team, 2004-2024
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations
import re
from collections.abc import Sequence
from contextlib import suppress
from pathlib import Path
from typing import Any
import pywikibot
from pywikibot import editor, fixes, i18n, pagegenerators, textlib
from pywikibot.backports import Generator, Pattern, batched
from pywikibot.bot import ExistingPageBot, SingleSiteBot
from pywikibot.exceptions import InvalidPageError, NoPageError
from pywikibot.tools import chars
# This is required for the text that is shown when you run this script
# with the parameter -help.
# Each key is a placeholder that appears verbatim in the module docstring
# above; it is mapped to help text provided by pagegenerators and fixes.
# NOTE(review): the first key looks like an HTML-entity-mangled '&params;'.
# If it is ever corrected, the placeholder in the module docstring must be
# changed in lockstep, otherwise the two no longer match.
docuReplacements = {
    '¶ms;': pagegenerators.parameterHelp,
    '&fixes-help;': fixes.parameter_help,
}
def precompile_exceptions(exceptions, use_regex, flags) -> None:
    """Compile the exceptions with the given flags.

    The dictionary is modified in place: for each of the keys ``title``,
    ``require-title``, ``text-contains`` and ``inside`` that is present,
    the value is replaced by a list of compiled regular expressions. A
    single string value is treated as a one-element list. Other keys
    (e.g. ``inside-tags``) are left untouched.

    Only string entries are compiled. Entries that are not strings, such
    as patterns compiled by an earlier call, are kept as they are, so the
    function may safely be called more than once on the same dictionary.

    :param exceptions: the exceptions dictionary; nothing is done if it
        is ``None`` or empty
    :param use_regex: if false, string entries are taken literally and
        escaped with :func:`re.escape` before compiling
    :param flags: regular expression flags passed to :func:`re.compile`
    """
    if not exceptions:
        return

    for category in ('title', 'require-title', 'text-contains', 'inside'):
        if category not in exceptions:
            continue

        patterns = exceptions[category]
        if isinstance(patterns, str):
            patterns = [patterns]

        compiled = []
        for pattern in patterns:
            # re.escape() rejects compiled patterns and re.compile()
            # rejects flags for them, so only raw strings are processed.
            if isinstance(pattern, str):
                if not use_regex:
                    pattern = re.escape(pattern)
                pattern = re.compile(pattern, flags)
            compiled.append(pattern)
        exceptions[category] = compiled
def _get_text_exceptions(exceptions):
"""Get exceptions on text (inside exceptions)."""
return exceptions.get('inside-tags', []) + exceptions.get('inside', [])
class ReplacementBase:

    """Common data and compile logic of a replacement instruction.

    :meth:`compile` reads ``self.case_insensitive`` and
    ``self.use_regex``. This class does not define them itself, so they
    have to be provided by whoever uses it (e.g. a subclass).
    """

    def __init__(
        self,
        old,
        new,
        edit_summary=None,
        default_summary=True
    ) -> None:
        """Store the search text, the new text and the summary settings.

        ``old_regex`` starts as ``None`` and is set by :meth:`_compile`.
        """
        self.old, self.new = old, new
        self.old_regex = None
        self._edit_summary = edit_summary
        self.default_summary = default_summary

    @property
    def edit_summary(self) -> str:
        """Return the edit summary given at creation time."""
        return self._edit_summary

    @property
    def description(self) -> str:
        """Short ``-old +new`` description of this replacement.

        This description is used as the default summary of the
        replacement: when no edit summary was specified otherwise, the
        summary of a saved edit includes the descriptions of the
        replacements that were applied to the page.
        """
        return f'-{self.old} +{self.new}'

    @property
    def container(self):
        """Container object which contains this replacement.

        A container groups one or more replacements and provides
        properties common to all of them, for example a common name or
        a common edit summary. Container objects must have a "name"
        attribute.

        A basic replacement has no container, hence ``None``.
        """
        return None

    def _compile(self, use_regex, flags) -> None:
        """Compile the search text with exactly the given settings.

        Unlike :meth:`compile` this does not adjust *use_regex* or
        *flags* to the settings of this instance.
        """
        # Plain text is escaped so that it only matches literally.
        self.old_regex = self.old if use_regex else re.escape(self.old)
        self.old_regex = re.compile(self.old_regex, flags)

    def compile(self, use_regex, flags) -> None:
        """Compile the search text, honouring this instance's settings."""
        # True/False force the IGNORECASE flag on/off, None keeps *flags*
        insensitive = self.case_insensitive
        if insensitive is False:
            flags &= ~re.IGNORECASE
        elif insensitive:
            flags |= re.IGNORECASE

        own_use_regex = self.use_regex
        if own_use_regex is not None:
            use_regex = own_use_regex  # this replacement overrides it
        self._compile(use_regex, flags)
class Replacement(ReplacementBase):

    """A single replacement with its own data."""

    def __init__(self, old, new, use_regex=None, exceptions=None,
                 case_insensitive=None, edit_summary=None,
                 default_summary=True) -> None:
        """Create a single replacement entry unrelated to a fix.

        *use_regex*, *exceptions* and *case_insensitive* are stored on
        the instance; the remaining arguments go to the base class.
        """
        super().__init__(old, new, edit_summary, default_summary)
        self.exceptions = exceptions
        self._use_regex = use_regex
        self._case_insensitive = case_insensitive

    @classmethod
    def from_compiled(cls, old_regex, new, **kwargs):
        """Create an instance from an already compiled regex.

        :raises ValueError: a ``use_regex`` keyword other than ``True``
            was given
        """
        requested = kwargs.get('use_regex', True)
        if requested is not True:
            raise ValueError('The use_regex parameter can only be True.')

        instance = cls(old_regex.pattern, new, **kwargs)
        # keep the caller's compiled object instead of compiling again
        instance.old_regex = old_regex
        return instance

    @property
    def case_insensitive(self):
        """Return the case-insensitivity setting given at creation."""
        return self._case_insensitive

    @property
    def use_regex(self):
        """Return the regex setting given at creation."""
        return self._use_regex

    def _compile(self, use_regex, flags) -> None:
        """Compile the search regex and this replacement's exceptions."""
        super()._compile(use_regex, flags)
        precompile_exceptions(self.exceptions, use_regex, flags)

    def get_inside_exceptions(self):
        """Get exceptions on text (inside exceptions)."""
        own_exceptions = self.exceptions if self.exceptions else {}
        return _get_text_exceptions(own_exceptions)
class ReplacementList(list):

    """A list of replacements which all share some properties.

    The shared properties are:

    * use_regex
    * exceptions
    * case_insensitive

    Each entry in this list should be a ReplacementListEntry. The
    exceptions are compiled only once.
    """

    def __init__(self, use_regex, exceptions, case_insensitive, edit_summary,
                 name) -> None:
        """Create a fix list which can contain multiple replacements.

        The given *exceptions* are kept uncompiled in ``_exceptions``;
        ``exceptions`` stays ``None`` until they are compiled.
        """
        super().__init__()
        self.name = name
        self.edit_summary = edit_summary
        self.use_regex = use_regex
        self.case_insensitive = case_insensitive
        self._exceptions = exceptions
        self.exceptions = None

    def _compile_exceptions(self, use_regex, flags) -> None:
        """Compile the exceptions if not already done."""
        if self.exceptions or self._exceptions is None:
            # either compiled before or there is nothing to compile
            return

        # Compile a copy of the dictionary, not the one given at creation
        self.exceptions = dict(self._exceptions)
        precompile_exceptions(self.exceptions, use_regex, flags)
class ReplacementListEntry(ReplacementBase):

    """A replacement entry for ReplacementList.

    Regex usage, case sensitivity and exceptions are not stored on the
    entry but looked up on the fix set it belongs to.
    """

    def __init__(self, old, new, fix_set, edit_summary=None,
                 default_summary=True) -> None:
        """Create a replacement entry inside a fix set."""
        super().__init__(old, new, edit_summary, default_summary)
        self.fix_set = fix_set

    @property
    def case_insensitive(self):
        """Return whether the fix set is case insensitive."""
        return self.fix_set.case_insensitive

    @property
    def use_regex(self):
        """Return whether the fix set is using regex."""
        return self.fix_set.use_regex

    @property
    def exceptions(self):
        """Return the exceptions of the fix set."""
        return self.fix_set.exceptions

    @property
    def edit_summary(self):
        """Return this entry's edit summary or the fix's summary."""
        own_summary = self._edit_summary
        return self.fix_set.edit_summary if own_summary is None \
            else own_summary

    @property
    def container(self):
        """Container object which contains this replacement.

        A container groups one or more replacements and provides
        properties common to all of them, for example a common name or
        a common edit summary. Container objects must have a "name"
        attribute.

        For a list entry the container is its fix set.
        """
        return self.fix_set

    def _compile(self, use_regex, flags) -> None:
        """Compile the search regex and the fix's exceptions."""
        super()._compile(use_regex, flags)
        self.fix_set._compile_exceptions(use_regex, flags)

    def get_inside_exceptions(self):
        """Get exceptions on text (inside exceptions)."""
        fix_exceptions = self.fix_set.exceptions
        return _get_text_exceptions(fix_exceptions if fix_exceptions else {})
class XmlDumpReplacePageGenerator:

    """Iterator that will yield Pages that might contain text to replace.

    These pages will be retrieved from a local XML dump file.

    :param xmlFilename: The dump's path, either absolute or relative
    :param xmlStart: Skip all articles in the dump before this one
    :param replacements: The replacements to check for. Each item is
        used through its ``old_regex`` and ``new`` attributes and its
        ``get_inside_exceptions()`` method.
    :param exceptions: A dictionary which defines when to ignore an
        occurrence. See docu of the ReplaceRobot initializer below.
    :type exceptions: dict
    :param site: The site the yielded pages belong to; if it is falsy,
        ``pywikibot.Site()`` is used.
    """

    def __init__(self,
                 xmlFilename: str,
                 xmlStart: str,
                 replacements: list[tuple[Any, str]],
                 exceptions: dict[str, Any],
                 site) -> None:
        """Initializer."""
        self.xmlFilename = xmlFilename
        self.xmlStart = xmlStart
        self.replacements = replacements
        self.exceptions = exceptions
        # entries are skipped until the start title has been seen
        self.skipping = bool(xmlStart)

        # Exceptions applying to every replacement: tags first, then
        # the 'inside' entries.
        self.excsInside = []
        for key in ('inside-tags', 'inside'):
            if key in self.exceptions:
                self.excsInside += self.exceptions[key]

        from pywikibot import xmlreader
        self.site = site if site else pywikibot.Site()
        dump = xmlreader.XmlDump(self.xmlFilename, on_error=pywikibot.error)
        self.parser = dump.parse()

    def __iter__(self):
        """Yield a Page for every dump entry some replacement changes."""
        try:
            for entry in self.parser:
                if self.skipping:
                    if entry.title != self.xmlStart:
                        continue
                    self.skipping = False

                excepted = (self.isTitleExcepted(entry.title)
                            or self.isTextExcepted(entry.text))
                if excepted:
                    continue

                candidate = entry.text
                for replacement in self.replacements:
                    # The result is only compared with the dump text to
                    # see whether at least one replacement applies; it
                    # is not kept.
                    inside = (self.excsInside
                              + replacement.get_inside_exceptions())
                    candidate = textlib.replaceExcept(
                        candidate, replacement.old_regex, replacement.new,
                        inside, site=self.site)

                if candidate != entry.text:
                    yield pywikibot.Page(self.site, entry.title)

        except KeyboardInterrupt:
            # 'entry' is unbound if the interrupt arrives before the
            # first entry was read; then there is nothing to resume at.
            with suppress(NameError):
                if not self.skipping:
                    pywikibot.info(
                        'To resume, use "-xmlstart:{}" on the command line.'
                        .format(entry.title))

    def isTitleExcepted(self, title) -> bool:
        """Return True if one of the exceptions applies for the given title.

        That is the case if a ``title`` pattern matches or if a
        ``require-title`` pattern does not match.
        """
        excluded = self.exceptions.get('title', ())
        if any(exc.search(title) for exc in excluded):
            return True

        required = self.exceptions.get('require-title', ())
        # if not all requirements are met the title is excepted as well
        return any(not req.search(title) for req in required)

    def isTextExcepted(self, text) -> bool:
        """Return True if one of the exceptions applies for the given text."""
        patterns = self.exceptions.get('text-contains', ())
        return any(exc.search(text) for exc in patterns)
class ReplaceRobot(SingleSiteBot, ExistingPageBot):

    """A bot that can do text replacements.

    :param generator: generator that yields Page objects
    :type generator: generator
    :param replacements: a list of Replacement instances or sequences of
        length 2 with the original text (as a compiled regular expression)
        and replacement text (as a string).
    :param exceptions: a dictionary which defines when not to change an
        occurrence. This dictionary can have these keys:

        title
            A list of regular expressions. All pages with titles that
            are matched by one of these regular expressions are skipped.
        require-title
            A list of regular expressions. All pages with titles that
            are not matched by every one of these regular expressions
            are skipped.
        text-contains
            A list of regular expressions. All pages with text that
            contains a part which is matched by one of these regular
            expressions are skipped.
        inside
            A list of regular expressions. All occurrences are skipped which
            lie within a text region which is matched by one of these
            regular expressions.
        inside-tags
            A list of strings. These strings must be keys from the
            dictionary in :func:`textlib._create_default_regexes` or must be
            accepted by :func:`textlib.get_regexes`.

    :keyword allowoverlap: when matches overlap, all of them are replaced.
    :type allowoverlap: bool
    :keyword recursive: Recurse replacement as long as possible.
    :type recursive: bool
    :keyword addcat: category to be added to every page touched
    :type addcat: pywikibot.Category or str or None
    :keyword sleep: slow down between processing multiple regexes
    :type sleep: int
    :keyword summary: Set the summary message text bypassing the default
    :type summary: str
    :keyword always: the user won't be prompted before changes are made
    :type always: bool
    :keyword site: Site the bot is working on.

    .. warning::

       - Be careful with `recursive` parameter, this might lead to an
         infinite loop.
       - `site` parameter should be passed to constructor.
         Otherwise the bot takes the current site and warns the operator
         about the missing site
    """

    def __init__(self, generator,
                 replacements: list[tuple[Any, str]],
                 exceptions: dict[str, Any] | None = None,
                 **kwargs) -> None:
        """Initializer.

        Sequence items of *replacements* are converted to Replacement
        instances. Note that this is done in place, i.e. the list given
        by the caller is modified.

        :raises ValueError: a sequence item of *replacements* does not
            have exactly two elements
        """
        self.available_options.update({
            'addcat': None,
            'allowoverlap': False,
            'quiet': False,
            'recursive': False,
            'sleep': 0.0,
            'summary': None,
        })
        super().__init__(generator=generator, **kwargs)

        for i, replacement in enumerate(replacements):
            if isinstance(replacement, Sequence):
                if len(replacement) != 2:
                    raise ValueError('Replacement number {} does not have '
                                     'exactly two elements: {}'.format(
                                         i, replacement))
                # Replacement assumes it gets strings but it's already compiled
                replacements[i] = Replacement.from_compiled(replacement[0],
                                                            replacement[1])
        self.replacements = replacements
        self.exceptions = exceptions or {}

        # a category given by its name is turned into a Category object
        if self.opt.addcat and isinstance(self.opt.addcat, str):
            self.opt.addcat = pywikibot.Category(self.site, self.opt.addcat)

    def isTitleExcepted(self, title, exceptions=None) -> bool:
        """Return True if one of the exceptions applies for the given title.

        :param title: the page title to check
        :param exceptions: the exceptions dictionary to use; if it is
            None, the exceptions of the bot are used
        """
        if exceptions is None:
            exceptions = self.exceptions
        if 'title' in exceptions:
            for exc in exceptions['title']:
                if exc.search(title):
                    return True
        if 'require-title' in exceptions:
            for req in exceptions['require-title']:
                # a single unmet requirement is enough to except the title
                if not req.search(title):
                    return True
        return False

    def isTextExcepted(self, text, exceptions=None) -> bool:
        """Return True iff one of the exceptions applies for the given text.

        :param text: the page text to check
        :param exceptions: the exceptions dictionary to use; if it is
            None, the exceptions of the bot are used
        """
        if exceptions is None:
            exceptions = self.exceptions
        if 'text-contains' in exceptions:
            return any(exc.search(text) for exc in exceptions['text-contains'])
        return False

    def apply_replacements(self, original_text, applied, page=None):
        """Apply all replacements to the given text.

        :param original_text: the text to work on
        :param applied: a set; every replacement which changed the text
            is added to it
        :param page: the page the text belongs to. It is used for the
            title exceptions; omitting it gives a deprecation warning.
        :return: the text after all replacements were applied
        :rtype: str
        """
        if page is None:
            pywikibot.warn(
                'You must pass the target page as the "page" parameter to '
                'apply_replacements().', DeprecationWarning, stacklevel=2)
        new_text = original_text
        exceptions = _get_text_exceptions(self.exceptions)
        # names of containers already found to be excepted for this page
        skipped_containers = set()
        for replacement in self.replacements:
            if self.opt.sleep:
                pywikibot.sleep(self.opt.sleep)
            if (replacement.container
                    and replacement.container.name in skipped_containers):
                continue

            if page is not None and self.isTitleExcepted(
                    page.title(), replacement.exceptions):
                if replacement.container:
                    pywikibot.info(
                        'Skipping fix "{}" on {} because the title is on '
                        'the exceptions list.'.format(
                            replacement.container.name,
                            page.title(as_link=True)))
                    skipped_containers.add(replacement.container.name)
                else:
                    pywikibot.info(
                        'Skipping unnamed replacement ({}) on {} because '
                        'the title is on the exceptions list.'.format(
                            replacement.description, page.title(as_link=True)))
                continue

            # the text exceptions are checked against the unmodified text
            if self.isTextExcepted(original_text, replacement.exceptions):
                continue

            old_text = new_text
            new_text = textlib.replaceExcept(
                new_text, replacement.old_regex, replacement.new,
                exceptions + replacement.get_inside_exceptions(),
                allowoverlap=self.opt.allowoverlap, site=self.site)
            if old_text != new_text:
                applied.add(replacement)

        return new_text

    def generate_summary(self, applied_replacements):
        """Generate a summary message for the replacements.

        Replacements with their own edit summary contribute it as a
        separate part. The others are merged into one default message,
        unless their ``default_summary`` is false.
        """
        # all replacements which are merged into the default message
        default_summaries = set()
        # all message parts
        summary_messages = set()
        for replacement in applied_replacements:
            if replacement.edit_summary:
                summary_messages.add(replacement.edit_summary)
            elif replacement.default_summary:
                default_summaries.add((replacement.old, replacement.new))

        summary_messages = sorted(summary_messages)
        if default_summaries:
            if self.opt.summary:
                msg = self.opt.summary
            else:
                msg = i18n.twtranslate(self.site, 'replace-replacing')

            comma = self.site.mediawiki_message('comma-separator')
            default_summary = comma.join(
                '-{} +{}'.format(*default_summary)
                for default_summary in default_summaries)
            # fills the %(description)s placeholder of the message
            desc = {'description': f' ({default_summary})'}
            summary_messages.insert(0, msg % desc)

        semicolon = self.site.mediawiki_message('semicolon-separator')
        return semicolon.join(summary_messages)

    def skip_page(self, page):
        """Check whether treat should be skipped for the page.

        Besides the checks of the parent classes a page is skipped if
        its title is on the exceptions list or if ``has_permission()``
        is false for it.
        """
        if super().skip_page(page):
            return True

        if self.isTitleExcepted(page.title()):
            pywikibot.warning(
                f'Skipping {page} because the title is on the exceptions list.'
            )
            return True

        if not page.has_permission():
            pywikibot.warning(f"You can't edit page {page}")
            return True

        return False

    def treat(self, page) -> None:
        """Work on each page retrieved from generator.

        The replacements are applied to the page text. If the text was
        changed, the diff is shown and the page is saved after the user
        accepted the changes or directly if the ``always`` option is set.
        """
        try:
            original_text = page.text
        except InvalidPageError as e:
            pywikibot.error(e)
            return

        if self.isTextExcepted(original_text):
            pywikibot.info(f'Skipping {page} because it contains text '
                           f'that is on the exceptions list.')
            return

        applied = set()
        new_text = original_text
        last_text = None
        # A single pass unless the recursive option is set; in that case
        # the replacements are repeated until the text stops changing.
        while new_text != last_text:
            last_text = new_text
            new_text = self.apply_replacements(last_text, applied, page)
            if not self.opt.recursive:
                break

        if new_text == original_text:
            if not self.opt.quiet:
                pywikibot.info(f'No changes were necessary in {page}')
            return

        if self.opt.addcat:
            # Fetch only categories in wikitext, otherwise the others
            # will be explicitly added.
            cats = textlib.getCategoryLinks(new_text, site=page.site)
            if self.opt.addcat not in cats:
                cats.append(self.opt.addcat)
                new_text = textlib.replaceCategoryLinks(new_text, cats,
                                                        site=page.site)

        # number of context lines of the diff; grows with 'More context'
        context = 0
        while True:
            # Show the title of the page we're working on.
            # Highlight the title in purple.
            self.current_page = page
            pywikibot.showDiff(original_text, new_text, context=context)
            if self.opt.always:
                break

            choice = pywikibot.input_choice(
                'Do you want to accept these changes?',
                [('Yes', 'y'), ('No', 'n'), ('Edit original', 'e'),
                 ('edit Latest', 'l'), ('open in Browser', 'b'),
                 ('More context', 'm'), ('All', 'a')],
                default='N')
            if choice == 'm':
                context = context * 3 if context else 3
                continue

            if choice in ('e', 'l'):
                text_editor = editor.TextEditor()
                edit_text = original_text if choice == 'e' else new_text
                as_edited = text_editor.edit(edit_text)
                # if user didn't press Cancel
                if as_edited and as_edited != new_text:
                    new_text = as_edited
                # show the diff and ask again in either case
                continue

            if choice == 'b':
                # open in browser and leave
                pywikibot.bot.open_webbrowser(page)
                try:
                    page.get(get_redirect=True, force=True)
                except NoPageError:
                    pywikibot.info(f'Page {page.title()} has been deleted.')
                return

            if choice == 'n':
                return

            if choice == 'a':
                # no further prompts for the following pages
                self.opt.always = True

            # break if choice is 'y' or 'a' to save
            break

        self.save(page, original_text, new_text, applied, show_diff=False,
                  asynchronous=not self.opt.always)

    def save(self, page, oldtext, newtext, applied, **kwargs) -> None:
        """Save the given page.

        The edit summary is generated from the *applied* replacements.
        """
        self.userPut(page, oldtext, newtext,
                     summary=self.generate_summary(applied),
                     ignore_save_related_errors=True, **kwargs)

    def user_confirm(self, question) -> bool:
        """Always return True due to our own input choice."""
        return True
def prepareRegexForMySQL(pattern: str) -> str:
    """Convert regex to MySQL syntax.

    The shorthand classes ``\\s``, ``\\d`` and ``\\w`` are replaced by
    ``[:space:]``, ``[:digit:]`` and ``[:alnum:]`` and every single
    quote is prefixed with a backslash.
    """
    # NOTE(review): the class names are inserted without an enclosing
    # bracket expression - confirm that this is what MySQL expects for
    # shorthands used outside of ``[...]``.
    substitutions = (
        (r'\s', '[:space:]'),
        (r'\d', '[:digit:]'),
        (r'\w', '[:alnum:]'),
        ("'", '\\' + "'"),
    )
    for python_form, mysql_form in substitutions:
        pattern = pattern.replace(python_form, mysql_form)
    return pattern
# The keys are compared with the part of a command line argument in
# front of the first ':' (see handle_exceptions), so they must not
# contain a colon themselves.
EXC_KEYS = {
    '-excepttitle': 'title',
    '-requiretitle': 'require-title',
    '-excepttext': 'text-contains',
    '-exceptinside': 'inside',
    '-exceptinsidetag': 'inside-tags'
}
"""Dictionary to convert exceptions command line options to exceptions keys.

.. versionadded:: 7.0
"""
def handle_exceptions(*args: str) -> tuple[list[str], dict[str, str]]:
    """Handle exceptions args to ignore pages which contain certain texts.

    Every argument whose option name (the part in front of the first
    ``:``) is a key of ``EXC_KEYS`` has its value collected under the
    corresponding exceptions key. All other arguments are passed
    through unchanged.

    .. versionadded:: 7.0

    :param args: command line arguments
    :return: the arguments not handled here and the exceptions
        dictionary, which maps every exceptions key to a (possibly
        empty) list of values
    """
    remaining = []
    collected = {}
    for exc_key in EXC_KEYS.values():
        collected[exc_key] = []

    for argument in args:
        option, _, value = argument.partition(':')
        exc_key = EXC_KEYS.get(option)
        if exc_key is None:
            remaining.append(argument)
        else:
            collected[exc_key].append(value)
    return remaining, collected
def handle_pairsfile(filename: str) -> list[str] | None:
    """Handle -pairsfile argument.

    The file is read as UTF-8 and every line becomes one replacement
    argument: the first line is an old text, the second the new text
    for it, and so on. Line terminators and a byte order mark at the
    start of the file are not part of the arguments.

    .. versionadded:: 7.0
    .. versionchanged:: 9.2
       replacement patterns are printed if they are incomplete.

    :param filename: name of the file to read; if empty, the user is
        asked for it
    :return: the lines of the file, or None if the file could not be
        read, is empty or has an odd number of lines. An error message
        is printed in these cases.
    """
    if not filename:
        filename = pywikibot.input(
            'Please enter the filename to read replacements from:')

    try:
        with Path(filename).open(encoding='utf-8') as f:
            # Strip the line terminator only; any other whitespace may
            # be a significant part of a pattern or its replacement.
            replacements = [line.rstrip('\n') for line in f]
        if not replacements:
            raise OSError(f'{filename} is empty.')
    except OSError as e:
        pywikibot.error(f'Error loading {filename}: {e}')
        return None

    if len(replacements) % 2:
        pywikibot.error(f'{filename} contains an incomplete pattern '
                        f'replacement pair:\n{replacements}')
        return None

    # Strip BOM from first line. Strings are immutable, so the result
    # has to be stored back into the list.
    replacements[0] = replacements[0].lstrip('\uFEFF')
    return replacements
def handle_manual() -> list[str]:
    """Handle manual input.

    The user is asked for pairs of old and new text until an empty old
    text is entered.

    .. versionadded:: 7.0

    :return: flat list of the entered texts, old and new alternating
    """
    pairs = []
    prompt = 'Please enter the text that should be replaced:'
    while True:
        old = pywikibot.input(prompt)
        if not old:
            break
        new = pywikibot.input('Please enter the new text:')
        pairs.extend((old, new))
        # all further rounds use the follow-up prompt
        prompt = ('Please enter another text that should be replaced,\n'
                  'or press Enter to start:')
    return pairs
def handle_sql(sql: str,
               replacements: list[Pattern],
               exceptions: list[Pattern]) -> Generator:
    """Handle default sql query.

    A given query is used as it is. Otherwise a query is built which
    selects pages whose text matches one of the replacement patterns
    and none of the exception patterns. The built query is limited to
    200 rows.

    .. versionadded:: 7.0

    :param sql: the query to use, or an empty value to build one
    :param replacements: replacements to search for; the pattern of
        each item's ``old_regex`` attribute is used
    :param exceptions: compiled patterns a page text must not match
    :return: a page generator for the query
    """
    if sql:
        return pagegenerators.MySQLPageGenerator(sql)

    def rlike(pattern):
        """Return an RLIKE condition on old_text for the pattern."""
        return "old_text RLIKE '{}'".format(prepareRegexForMySQL(pattern))

    wanted = ' OR '.join(
        rlike(repl.old_regex.pattern) for repl in replacements)
    where_clause = 'WHERE ({})'.format(wanted)

    if exceptions:
        unwanted = ' OR '.join(rlike(exc.pattern) for exc in exceptions)
        except_clause = 'AND NOT ({})'.format(unwanted)
    else:
        except_clause = ''

    sql = f"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
{where_clause}
{except_clause}
LIMIT 200"""
    return pagegenerators.MySQLPageGenerator(sql)
def main(*args: str) -> None: # noqa: C901
"""Process command line arguments and invoke bot.
If args is an empty list, sys.argv is used.
.. versionchanged:: 9.2
replacement patterns are printed it they are incomplete.
:param args: command line arguments
"""
options = {}
gen = None
# summary message
edit_summary = ''
# Array which will collect commandline parameters.
# First element is original text, second element is replacement text.
preload = True # preload pages
commandline_replacements = []
file_replacements = []
# A list of 2-tuples of original text and replacement text.
replacements = []
# Should the elements of 'replacements' and 'exceptions' be interpreted
# as regular expressions?
regex = False
# Predefined fixes from dictionary 'fixes' (see above).
fixes_set = []
# the dump's path, either absolute or relative, which will be used
# if -xml flag is present
xmlFilename = None
xmlStart = None
sql_query: str | None = None
# Set the default regular expression flags
flags = 0
# Request manual replacements even if replacements are already defined
manual_input = False
# Read commandline parameters.
genFactory = pagegenerators.GeneratorFactory(
disabled_options=['mysqlquery'])
local_args = pywikibot.handle_args(args)
local_args = genFactory.handle_args(local_args)
local_args, exceptions = handle_exceptions(*local_args)
for arg in local_args:
opt, _, value = arg.partition(':')
if opt == '-regex':
regex = True
elif opt == '-xmlstart':
xmlStart = value or pywikibot.input(
'Please enter the dumped article to start with:')
elif opt == '-xml':
xmlFilename = value or i18n.input('pywikibot-enter-xml-filename')
elif opt == '-mysqlquery':
sql_query = value
elif opt == '-fix':
fixes_set.append(value)
elif opt == '-sleep':
options['sleep'] = float(value)
elif opt in ('-allowoverlap', '-always', '-quiet', '-recursive'):
options[opt[1:]] = True
elif opt == '-nocase':
flags |= re.IGNORECASE
elif opt == '-dotall':
flags |= re.DOTALL
elif opt == '-multiline':
flags |= re.MULTILINE
elif opt == '-addcat':
options['addcat'] = value
elif opt == '-summary':
edit_summary = value
elif opt == '-automaticsummary':
edit_summary = True
elif opt == '-manualinput':
manual_input = True
elif opt == '-pairsfile':
file_replacements = handle_pairsfile(value)
elif opt == '-nopreload':
preload = False
else:
commandline_replacements.append(arg)
if file_replacements is None:
return