-
Notifications
You must be signed in to change notification settings - Fork 0
/
isbn.py
74 lines (60 loc) · 2.11 KB
/
isbn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import re
from typing import Set
import wikitextparser as wtp
from re import Match
from pywikibot import Page
from pywikibot.bot import SingleSiteBot
from utils.config import get_default_save_params
from utils.utils import search_pages
# Edit summary passed to ``page.save`` by IsbnBot.treat
# (Chinese for "use [[T:ISBN]]").
ISBN_BOT_SUMMARY = "使用[[T:ISBN]]"
def sub_isbn(text: str, blacklist: Set[str]) -> str:
    """Wrap bare ISBN numbers found in *text* in the ``{{ISBN|...}}`` template.

    A candidate is the word ``ISBN`` in any letter case (not directly
    preceded by ``/`` or ``.``), an optional separator (space, ASCII colon
    or full-width colon U+FF1A), then 10-20 digits/hyphens not ending in a
    hyphen, with an optional trailing ``x``/``X``.

    Every match also consumes the remainder of its line, so at most one
    ISBN per line is converted per call. Callers that need every occurrence
    converted should call this function repeatedly until the text stops
    changing.

    A candidate is left untouched when:

    * the text from the match to the end of the line contains ``.jp``,
      ``.png``, ``.webp`` or ``.gif`` (treated as an image file name), or
    * the number is a substring of any entry of *blacklist*.

    Args:
        text: Wikitext to process.
        blacklist: Strings (e.g. wiki links and URLs) whose contents must
            not be rewritten.

    Returns:
        The text with eligible ISBN numbers wrapped in the template.
    """

    def replace(m: Match) -> str:
        raw = m.group(0)
        lower = raw.lower()
        # skip images
        if any(ext in lower for ext in (".jp", ".png", ".webp", ".gif")):
            return raw
        isbn = m.group(1)
        # skip isbn in wiki links and urls
        if any(isbn in forbid for forbid in blacklist):
            return raw
        prefix = extra_param = ""
        # Keep a literal "ISBN:" prefix (ASCII or full-width colon) in front
        # of the template; the number is then passed a second time as an
        # extra template parameter (presumably the display text - verify
        # against the template's documentation).
        if raw[4] in (":", "\uFF1A"):
            prefix = raw[:5]
            extra_param = "|" + isbn
        return prefix + "{{ISBN|" + isbn + extra_param + "}}" + m.group(2)

    # NOTE: flags must be passed by keyword. The fourth positional parameter
    # of re.sub is ``count``, so a positional re.IGNORECASE (== 2) would cap
    # the number of substitutions at two and leave matching case-sensitive.
    return re.sub(
        r"(?<![/.])ISBN[ :\uFF1A]?([\d-]{10,20}[xX]?)(?<!-)(.*)",
        replace,
        text,
        flags=re.IGNORECASE,
    )
def treat_isbn(text: str) -> str:
    """Convert the bare ISBN numbers in *text*, leaving links untouched.

    A blacklist is built once from the original text and handed to
    ``sub_isbn`` so that numbers inside links are not rewritten. It holds:

    * the source string of every wiki link and external link reported by
      the wikitext parser,
    * every bracketed ``[http://...]`` / ``[https://...]`` span, and
    * for every bare ``http(s)://`` URL, the part after the scheme: the run
      of printable ASCII characters that stops at a space, ``]``, ``{``,
      ``}``, ``<``, ``|`` or a newline.

    Args:
        text: Wikitext to process.

    Returns:
        The text after ``sub_isbn`` has been applied until it no longer
        changes anything.
    """
    parsed = wtp.parse(text)
    blacklist = {wikilink.string for wikilink in parsed.wikilinks}
    blacklist.update(ext.string for ext in parsed.external_links)
    blacklist.update(re.findall(r"\[https?://[^]]+]", text))
    # The pattern has two groups, so findall yields tuples; element 0 is the
    # outer group, i.e. the whole URL remainder after the scheme.
    url_groups = re.findall("https?://" + r"""(((?![ \]{}<|\n])[ -~])*)""", text)
    blacklist.update(groups[0] for groups in url_groups)

    # One sub_isbn pass can leave ISBNs unconverted (each match consumes the
    # rest of its line), so keep applying it until a pass changes nothing.
    converted = sub_isbn(text, blacklist)
    while converted != text:
        text = converted
        converted = sub_isbn(text, blacklist)
    return converted
class IsbnBot(SingleSiteBot):
    """Bot that rewrites bare ISBN numbers on each page it is given."""

    def treat(self, page: Page):
        """Run ``treat_isbn`` on *page* and save it only if the text changed."""
        updated = treat_isbn(page.text)
        if updated == page.text:
            # Nothing was converted; avoid saving an unchanged page.
            return
        page.text = updated
        page.save(summary=ISBN_BOT_SUMMARY, **get_default_save_params())
def isbn_adjust():
    """Run IsbnBot over the pages returned by ``search_pages("ISBN")``."""
    # Imported here rather than at module level, as in the original code.
    from utils.sites import mgp

    pages = search_pages("ISBN", preload=True)
    IsbnBot(site=mgp(), generator=pages).run()