Skip to content

Commit

Permalink
Feature/protect bot (#846)
Browse files Browse the repository at this point in the history
- give it a try as nightly run
  • Loading branch information
the-it committed Sep 8, 2022
1 parent 764b92a commit 628f6b8
Show file tree
Hide file tree
Showing 10 changed files with 182 additions and 115 deletions.
6 changes: 1 addition & 5 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -1,28 +1,24 @@
# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

# Unix-style newlines with a newline ending every file
[*]
indent_style = space
trim_trailing_whitespace = true
end_of_line = lf
charset = utf-8

# 4 space indentation
[*.py]
indent_size = 4
max_line_length = 119
insert_final_newline = true

[*.json]
indent_size = 2

# Tab indentation (no size specified)
[Makefile]
indent_style = tab

# Matches the exact file config.yml
[config.yml]
indent_size = 2

63 changes: 63 additions & 0 deletions service/protect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from datetime import datetime, timedelta

from pywikibot import Site, Page

from tools.bots import BotException
from tools.bots.pi import CanonicalBot
from tools.petscan import PetScan


class Protect(CanonicalBot):
    """Bot that protects finished ("Fertig") de.wikisource pages so that only
    autoconfirmed users can edit or move them."""

    def __init__(self, wiki, debug):
        CanonicalBot.__init__(self, wiki, debug, log_to_wiki=False)
        # Cap a single run at two hours so the nightly job terminates reliably.
        self.timeout: timedelta = timedelta(hours=2)

    def __enter__(self):
        super().__enter__()
        if not self.data:
            # A previous run may have crashed; try to restore its persisted state.
            self.logger.warning("Try to get the deprecated data back.")
            try:
                self.data.get_deprecated()
            except BotException:
                self.logger.warning("There isn't deprecated data to reload.")
        return self

    @staticmethod
    def _prepare_searcher() -> PetScan:
        """Build the PetScan query: pages categorized "Fertig" (finished) that
        are not also in any of the unfinished categories, newest changes first."""
        searcher = PetScan()
        searcher.add_positive_category("Fertig")
        searcher.add_negative_category("Korrigiert")
        searcher.add_negative_category("Unkorrigiert")
        searcher.add_negative_category("Unvollständig")
        searcher.set_sort_criteria("date")
        searcher.set_sortorder_decending()
        searcher.set_search_depth(1)
        searcher.set_timeout(120)
        # Only look at pages touched after the feature went live.
        searcher.last_change_after(datetime(year=2022, month=9, day=3))
        return searcher

    def task(self) -> bool:
        """Protect every currently unprotected finished page.

        Returns True so the framework records a successful run. Each processed
        lemma is timestamped in self.data before the protect call, so a crash
        mid-run does not reprocess already-seen lemmas first.
        """
        searcher = self._prepare_searcher()
        self.logger.info(str(searcher))
        lemma_list = searcher.get_combined_lemma_list(self.data)
        # was a bare print(); route through the bot logger like everything else
        self.logger.info(f"{len(lemma_list)} lemmas to check")
        for idx, lemma_str in enumerate(lemma_list):
            self.data[lemma_str] = datetime.now().strftime("%Y%m%d%H%M%S")
            lemma = Page(self.wiki, lemma_str)
            self.logger.debug(f"check lemma {lemma.title()} for protection")
            # Page.protection() is empty/falsy for an unprotected page.
            if not lemma.protection():
                self.logger.debug(f"protect lemma {lemma.title()}")
                lemma.protect(reason="Schutz fertiger Seiten",
                              protections={'move': 'autoconfirmed', 'edit': 'autoconfirmed'})
            if self._watchdog():
                # idx is zero-based, so idx + 1 lemmas were actually processed
                self.logger.info(f"checked {idx + 1} lemmas")
                break
        return True


# Local-run hint: point pywikibot at the dedicated credential directory, e.g.
# PYWIKIBOT_DIR=/home/esommer/.pywikibot_protect

if __name__ == "__main__":
    # Production entry point: run against de.wikisource with the dedicated
    # protection account, in non-debug mode.
    WS_WIKI = Site(code="de", fam="wikisource", user="THEprotectIT")
    with Protect(wiki=WS_WIKI, debug=False) as bot:
        bot.run()
2 changes: 2 additions & 0 deletions service/starter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ sudo /usr/local/bin/python3.10 -m pip install -r requirements.txt
export PYTHONPATH=${PYTHONPATH}:${BASE_DIR}
source /etc/environment
/usr/local/bin/python3.10 service/runner.py
export PYWIKIBOT_DIR=/home/pi/.pywikibot_protect/
/usr/local/bin/python3.10 service/protect.py
13 changes: 13 additions & 0 deletions service/test_protect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from unittest import TestCase, mock


class TestProtect(TestCase):
    """Test scaffolding for service.protect: stubs out PetScan so no real
    HTTP request is made. (No test methods yet — setUp only.)"""

    def setUp(self):
        self.petscan_patcher = mock.patch("service.protect.PetScan")
        self.petscan_mock = self.petscan_patcher.start()
        self.run_mock = mock.Mock()
        # PetScan() in the code under test yields a mock whose .run is run_mock.
        self.petscan_mock.return_value = mock.Mock(run=self.run_mock)
        # addCleanup also fires when setUp itself fails, which tearDown would
        # not; the previous explicit tearDown calling stopall was redundant.
        self.addCleanup(mock.patch.stopall)
47 changes: 7 additions & 40 deletions service/ws_re/scanner/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import traceback
from contextlib import suppress
from datetime import timedelta, datetime
from operator import itemgetter
from typing import List, Optional, Dict, Callable

import pywikibot
Expand All @@ -17,10 +16,9 @@
from service.ws_re.scanner.tasks.wikidata.task import DATATask
from service.ws_re.template import ReDatenException
from service.ws_re.template.re_page import RePage
from tools._typing import PetscanLemma
from tools.bots import BotException
from tools.bots.pi import CanonicalBot
from tools.petscan import PetScan, PetScanException
from tools.petscan import PetScan


class ReScanner(CanonicalBot):
Expand All @@ -43,41 +41,6 @@ def __enter__(self):
self.logger.warning("There isn't deprecated data to reload.")
return self

def compile_lemma_list(self) -> List[str]:
self.logger.info("Compile the lemma list")
self.logger.info("Searching for lemmas")
raw_lemma_list = self._petscan_search()
self.statistic["len_raw_lemma_list"] = len(raw_lemma_list)
self.logger.info("Filter new_lemma_list")
# all items which wasn't process before
new_lemma_list = []
for lemma in raw_lemma_list:
try:
self.data[lemma]
except KeyError:
new_lemma_list.append(lemma)
self.statistic["len_new_lemma_list"] = len(new_lemma_list)
self.logger.info("Sort old_lemma_list")
# before processed lemmas ordered by last process time
old_lemma_list = [x[0] for x in sorted(self.data.items(), key=itemgetter(1))]
# first iterate new items then the old ones (oldest first)
self.logger.info("Add the two lists")
self.statistic["len_old_lemma_list"] = len(old_lemma_list)
self.logger.info(f"raw: {self.statistic['len_raw_lemma_list']}, "
f"new: {self.statistic['len_new_lemma_list']}, "
f"old: {self.statistic['len_old_lemma_list']}")
return new_lemma_list + old_lemma_list

def _petscan_search(self) -> List[str]:
searcher = self._prepare_searcher()
self.logger.info(f"[{searcher} {searcher}]")
raw_lemma_list: List[PetscanLemma] = []
try:
raw_lemma_list = searcher.run()
except PetScanException:
self.logger.error("Search timed out.")
return [item["nstext"] + ":" + item["title"] for item in raw_lemma_list]

def _prepare_searcher(self) -> PetScan:
searcher = PetScan()
searcher.add_yes_template("REDaten")
Expand All @@ -95,6 +58,11 @@ def _prepare_searcher(self) -> PetScan:
searcher.set_timeout(120)
return searcher

@property
def lemma_list(self) -> list[str]:
searcher = self._prepare_searcher()
return searcher.get_combined_lemma_list(self.data)

def _activate_tasks(self) -> List[ReScannerTask]:
active_tasks = []
for task in self.tasks:
Expand Down Expand Up @@ -135,10 +103,9 @@ def get_oldest_datetime(self) -> datetime:
def task(self) -> bool:
active_tasks = self._activate_tasks()
error_task = ERROTask(wiki=self.wiki, debug=self.debug, logger=self.logger)
lemma_list = self.compile_lemma_list()
self.logger.info("Start processing the lemmas.")
processed_lemmas = 0
for idx, lemma in enumerate(lemma_list):
for idx, lemma in enumerate(self.lemma_list):
self.logger.debug(f"Process [https://de.wikisource.org/wiki/{lemma} {lemma}]")
list_of_done_tasks = []
try:
Expand Down
49 changes: 6 additions & 43 deletions service/ws_re/scanner/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,37 +11,24 @@
from service.ws_re.scanner.tasks.base_task import ReScannerTask
from service.ws_re.template import ReDatenException
from tools.bots.test_pi import setup_data_path, teardown_data_path, _DATA_PATH_TEST
from tools.test import SearchStringChecker


class TestReScanner(TestCase):
def setUp(self):
self.petscan_patcher = mock.patch("service.ws_re.scanner.base.PetScan")
self.petscan_patcher = mock.patch("service.ws_re.scanner.base.PetScan.get_combined_lemma_list")
self.petscan_mock = self.petscan_patcher.start()
self.run_mock = mock.Mock()
self.petscan_mock.return_value = mock.Mock(run=self.run_mock)
setup_data_path(self)
self.addCleanup(mock.patch.stopall)

def tearDown(self):
teardown_data_path()
mock.patch.stopall()

class SearchStringChecker:
def __init__(self, search_string: str):
self.search_string = search_string

def is_part_of_searchstring(self, part: str):
pre_length = len(self.search_string)
self.search_string = "".join(self.search_string.split(part))
return pre_length != len(self.search_string)

def is_empty(self):
return len(self.search_string) == 0

def test_search_prepare_debug(self):
mock.patch.stopall()
with ReScanner(log_to_screen=False, log_to_wiki=False) as bot:
checker = self.SearchStringChecker(str(bot._prepare_searcher()))
checker = SearchStringChecker(str(bot._prepare_searcher()))
self.assertTrue(checker.is_part_of_searchstring(
r"https://petscan.wmflabs.org/?language=de&project=wikisource"))
self.assertTrue(checker.is_part_of_searchstring("&templates_yes=REDaten"))
Expand All @@ -51,7 +38,7 @@ def test_search_prepare_debug(self):
def test_search_prepare(self):
mock.patch.stopall()
with ReScanner(log_to_screen=False, log_to_wiki=False, debug=False) as bot:
checker = self.SearchStringChecker(str(bot._prepare_searcher()))
checker = SearchStringChecker(str(bot._prepare_searcher()))
self.assertTrue(checker.is_part_of_searchstring(
"https://petscan.wmflabs.org/?language=de&project=wikisource"))
self.assertTrue(checker.is_part_of_searchstring(
Expand All @@ -63,30 +50,6 @@ def test_search_prepare(self):
self.assertTrue(checker.is_part_of_searchstring("&sortorder=descending"))
self.assertTrue(checker.is_empty())

result_of_searcher = [{"id": 42, "len": 42, "n": "page", "namespace": 0, "nstext": '',
"title": "RE:Lemma1", "touched": "20010101232359"},
{"id": 42, "len": 42, "n": "page", "namespace": 0, "nstext": '',
"title": "RE:Lemma2", "touched": "20000101232359"},
{"id": 42, "len": 42, "n": "page", "namespace": 0, "nstext": '',
"title": "RE:Lemma3", "touched": "19990101232359"}
]

def test_compile_lemmas_no_old_lemmas(self):
self.run_mock.return_value = self.result_of_searcher
with ReScanner(log_to_screen=False, log_to_wiki=False) as bot:
self.assertEqual([":RE:Lemma1", ":RE:Lemma2", ":RE:Lemma3"], bot.compile_lemma_list())

def test_compile_lemmas_old_lemmas(self):
self.run_mock.return_value = self.result_of_searcher
with ReScanner(log_to_screen=False, log_to_wiki=False) as bot:
with mock.patch.dict(bot.data, {":RE:Lemma1": "20010101232359"}):
self.assertEqual([":RE:Lemma2", ":RE:Lemma3", ":RE:Lemma1"],
bot.compile_lemma_list())
with mock.patch.dict(bot.data, {":RE:Lemma1": "20010101232359",
":RE:Lemma3": "20020101232359"}):
self.assertEqual([":RE:Lemma2", ":RE:Lemma1", ":RE:Lemma3"],
bot.compile_lemma_list())

def test_get_oldest_processed(self):
with ReScanner(log_to_screen=False, log_to_wiki=False) as bot:
with mock.patch.dict(bot.data, {":RE:Lemma1": "20010101000000",
Expand All @@ -113,8 +76,8 @@ def test_activate_tasks(self):

def _mock_surroundings(self):
# pylint: disable=attribute-defined-outside-init
lemma_patcher = mock.patch("service.ws_re.scanner.base.ReScanner.compile_lemma_list",
mock.Mock())
lemma_patcher = mock.patch("service.ws_re.scanner.base.ReScanner.lemma_list",
mock.PropertyMock())
page_patcher = mock.patch("service.ws_re.scanner.base.pywikibot.Page")
page_patcher_error = mock.patch("service.ws_re.scanner.tasks.base_task.pywikibot.Page")
re_page_patcher = mock.patch("service.ws_re.scanner.base.RePage")
Expand Down
4 changes: 2 additions & 2 deletions tools/bots/pi.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import Dict, Any, Iterator, List
from typing import TypedDict # pylint: disable=no-name-in-module

from pywikibot import Page, Site, Category
from pywikibot import Page, Site, Category, BaseSite
from pywikibot.pagegenerators import CategorizedPageGenerator

from tools.bots import BotException
Expand Down Expand Up @@ -196,7 +196,7 @@ def __init__(self, wiki: Site = None, debug: bool = True,
" def task(self):\n"
" do_stuff()")
self.timestamp: PersistedTimestamp = PersistedTimestamp(bot_name=self.bot_name)
self.wiki: Page = wiki
self.wiki: BaseSite = wiki
self.debug: bool = debug
self.timeout: timedelta = timedelta(days=1)
self.logger: WikiLogger = WikiLogger(self.bot_name,
Expand Down
23 changes: 22 additions & 1 deletion tools/petscan.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# pylint: disable=ungrouped-imports
import json
from datetime import datetime
from typing import List, Union
from operator import itemgetter
from typing import List, Union, Mapping
from urllib.parse import quote

import requests
Expand Down Expand Up @@ -242,3 +243,23 @@ def run(self) -> List[PetscanLemma]:
response_byte = response.content
response_dict = json.loads(response_byte.decode("utf8"))
return response_dict["*"][0]["a"]["*"] # type: ignore

def get_combined_lemma_list(self, old_lemmas: Mapping) -> list[str]:
"""
Executes the search. Filters out all preprocessed lemmas from a provided dictionary.
Interlaces this two lists to a combined list sorted by:
* every new lemma
* old lemmas sorted by dictionary value (probably a timestamp)
"""
raw_lemma_list = [item["nstext"] + ":" + item["title"] for item in self.run()]
# all items which wasn't process before
new_lemma_list = []
for lemma in raw_lemma_list:
try:
old_lemmas[lemma]
except KeyError:
new_lemma_list.append(lemma)
# before processed lemmas ordered by last process time
old_lemma_list = [x[0] for x in sorted(old_lemmas.items(), key=itemgetter(1))]
# first iterate new items then the old ones (oldest first)
return new_lemma_list + old_lemma_list
13 changes: 13 additions & 0 deletions tools/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,16 @@
def real_wiki_test(func):
wrapper = skipUnless(REAL_WIKI_TEST, "only execute in test against real wiki")(func)
return wrapper


class SearchStringChecker:
    """Consumes a search/query string piece by piece: each successfully
    matched part is removed, so a fully explained string ends up empty."""

    def __init__(self, search_string: str):
        self.search_string = search_string

    def is_part_of_searchstring(self, part: str):
        # Strip every occurrence of `part`; report whether anything was removed.
        remainder = "".join(self.search_string.split(part))
        removed = len(remainder) != len(self.search_string)
        self.search_string = remainder
        return removed

    def is_empty(self):
        # True once every part of the original string has been accounted for.
        return len(self.search_string) == 0

0 comments on commit 628f6b8

Please sign in to comment.