-
Notifications
You must be signed in to change notification settings - Fork 0
/
boilerplate.py
144 lines (123 loc) · 4.7 KB
/
boilerplate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import pickle
import re
import sys
from argparse import ArgumentParser
from pathlib import Path
from typing import Optional, List, Iterable
import pywikibot
import requests
from pywikibot import Page
from pywikibot.bot import SingleSiteBot
from pywikibot.pagegenerators import PreloadingGenerator
import wikitextparser as wtp
from utils.config import get_default_save_params
from utils.utils import search_pages
# Edit summary attached to every page save made by the bot
# (Chinese for "remove preload residue").
BOILERPLATE_BOT_SUMMARY = "去除预加载残留"
# Directory for boilerplate data and the pickle file in which
# download_boilerplate() stores the collected template comments.
BOILERPLATE_PATH = Path("texts/boilerplates")
AUTO_PATH = BOILERPLATE_PATH.joinpath("auto.pickle")

# Set of known boilerplate comments; should_remove() only removes comments
# that match one of these entries exactly.
if AUTO_PATH.exists():
    # NOTE: pickle must only be used on trusted data. This file is expected to
    # be the one written locally by download_boilerplate().
    # The context manager closes the handle, which the bare open() call did not.
    with open(AUTO_PATH, "rb") as _auto_file:
        black_list = set(pickle.load(_auto_file))
else:
    black_list = set()
def should_remove(text: str) -> bool:
    """
    Decide whether a comment is boilerplate residue that should be deleted.
    A comment is deleted only when it equals an entry of the black list.
    Comments containing the keep-marker "保留这里的注释" and the literal "]]"
    are never deleted, whatever the black list says.
    :param text: Comment contents
    :return: True if the comment should be removed; False otherwise
    """
    is_protected = "保留这里的注释" in text or text in {']]'}
    return not is_protected and text in black_list
def treat_boilerplate(text: str) -> str:
    """
    Strip removable boilerplate comments from a page's wikitext.
    Pages mentioning '虚拟UP主' are returned untouched.
    :param text: Wikitext of the page
    :return: Wikitext with every comment accepted by should_remove() emptied out
    """
    # requested by LUO1P
    if '虚拟UP主' not in text:
        document = wtp.parse(text)
        for comment in document.comments:
            stripped = comment.contents.strip()
            if should_remove(stripped):
                # Assigning an empty string deletes the whole comment node.
                comment.string = ""
        text = str(document)
    return text
def find_search_string(text: str) -> Optional[str]:
    """
    Pick a keyword out of a piece of text for use in a search.
    The text is split into runs of CJK ideographs, ASCII letters and digits.
    Among the runs containing at least one CJK ideograph, the longest is chosen
    (the earliest one on ties) and returned if it is longer than 3 characters.
    :param text: Text to extract a keyword from
    :return: The chosen run, or None if no run qualifies
    """
    chs = r'\u4E00-\u9FFF\u3400-\u4DBF'
    runs_with_cjk = [
        run
        for run in re.findall(rf"[{chs}a-zA-Z\d]+", text)
        if re.search(f'[{chs}]', run)
    ]
    if not runs_with_cjk:
        return None
    # max() keeps the first of several equally long runs, which is the same
    # element a stable descending sort by length would put in front.
    longest = max(runs_with_cjk, key=len)
    if len(longest) > 3:
        return longest
    return None
def get_search_strings() -> Iterable[str]:
    """
    Derive search keywords from the black list.
    Each black-list entry is passed through find_search_string(); entries that
    yield no keyword are skipped and duplicates collapse.
    :return: Set of distinct keywords
    """
    return {
        keyword
        for keyword in map(find_search_string, black_list)
        if keyword
    }
class BoilerplateBot(SingleSiteBot):
    """Bot that removes boilerplate comments from every page it is given."""

    def treat(self, page: Page) -> None:
        """
        Clean one page and save it if the cleaning changed anything.
        :param page: Page to process
        :return: None
        """
        cleaned = treat_boilerplate(page.text)
        if cleaned == page.text:
            # Nothing was removed, so there is nothing to save.
            return
        page.text = cleaned
        page.save(summary=BOILERPLATE_BOT_SUMMARY, **get_default_save_params())
def download_boilerplate():
    """
    Download boilerplate templates from subpages of Template:页面格式 (including past versions),
    extract all comments in these templates and store them in a file.
    The non-empty, stripped comments are pickled as a set to AUTO_PATH,
    replacing any previous file.
    :return: None
    :raises requests.HTTPError: if the prefix-index page is answered with an HTTP error status
    """
    # Import the submodule explicitly: a bare "import urllib" does not
    # guarantee that urllib.parse is loaded.
    from urllib.parse import unquote
    from bs4 import BeautifulSoup
    from utils.sites import mgp
    # Don't know the api for subpages, so simply parse all the links in the HTML response
    response = requests.get("https://mzh.moegirl.org.cn/index.php?title=Special%3A%E5%89%8D%E7%BC%80%E7%B4%A2%E5%BC%95"
                            "&prefix=%E9%A1%B5%E9%9D%A2%E6%A0%BC%E5%BC%8F&namespace=10")
    # Stop on an HTTP error; otherwise an error page without template links
    # would end up overwriting the stored black list with an empty set.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    pages = []
    for link in soup.find_all(name="a"):
        raw_href = link.get('href')
        if not raw_href:
            # <a> tags without a (non-empty) href cannot point to a template.
            continue
        href = unquote(raw_href, encoding='utf-8')
        if href.startswith('/'):
            href = href[1:]
        if 'Template:页面格式/' in href and '/doc' not in href:
            pages.append(Page(source=mgp(), title=href))
    # find all revision of all pages and add all comments into black list
    result = set()
    for index, page in enumerate(PreloadingGenerator(pages)):
        print(f"Processing page {index}: " + page.title())
        for revision in page.revisions(content=True):
            try:
                parsed = wtp.parse(revision['*'])
                for comment in parsed.comments:
                    s = comment.contents.strip()
                    if s != "":
                        result.add(s)
            except AttributeError:
                # NOTE(review): presumably raised when a revision carries no
                # text (e.g. hidden content) - confirm.
                print("Skipping a revision for page titled", page.title())
    # store black list in a file
    BOILERPLATE_PATH.mkdir(parents=True, exist_ok=True)
    # The context manager flushes and closes the file deterministically.
    with open(AUTO_PATH, "wb") as auto_file:
        pickle.dump(result, auto_file)
def run_boilerplate_bot():
    """
    Command-line entry point of the boilerplate bot.
    Arguments are read from sys.argv[2:]:
    positional arguments are search keywords;
    -u/--update downloads the boilerplate templates again before running;
    -a/--all replaces the keywords with the ones derived from the black list.
    The bot then runs over the pages found by search_pages() for the keywords.
    :return: None
    """
    # positional arguments are treated as search keywords
    p = ArgumentParser()
    p.add_argument("keywords", nargs="*", default=[])
    p.add_argument("-u", "--update", action="store_true")
    p.add_argument("-a", "--all", action="store_true")
    # NOTE(review): argv[1] is skipped too - presumably a sub-command name
    # consumed by the caller; confirm.
    args = p.parse_args(sys.argv[2:])
    keywords = args.keywords
    if args.update:
        pywikibot.output("Updating boilerplate templates...")
        download_boilerplate()
        # black_list was loaded when the module was imported, so it still holds
        # the pre-download contents. Refresh it in place so that --all and the
        # bot itself work with the data that was just downloaded.
        with open(AUTO_PATH, "rb") as auto_file:
            refreshed = set(pickle.load(auto_file))
        black_list.clear()
        black_list.update(refreshed)
    if args.all:
        pywikibot.output("Using the following as search keyword: ")
        keywords = get_search_strings()
        pywikibot.output(str(keywords))
    if not keywords:
        pywikibot.output("No search keyword provided.")
        return
    if not black_list:
        pywikibot.warning("Black list does not exist! Please download first.")
        # With an empty black list no comment can ever match, so running the
        # bot would only fetch pages without changing any of them.
        return
    bot = BoilerplateBot(generator=search_pages(*keywords, preload=True))
    bot.run()