diff --git a/examples/scoring.py b/examples/scoring.py index 2210bf96..64a96baf 100644 --- a/examples/scoring.py +++ b/examples/scoring.py @@ -9,6 +9,3 @@ user_agent="revscoring demo")) values = extractor.extract(123456789, model.features) print(model.score(values)) - {'prediction': True, - 'probability': {False: 0.4694409344514984, - True: 0.5305590655485017}} diff --git a/revscoring/features/wikitext/__init__.py b/revscoring/features/wikitext/__init__.py index 1b69ee74..3c119dd8 100644 --- a/revscoring/features/wikitext/__init__.py +++ b/revscoring/features/wikitext/__init__.py @@ -39,6 +39,8 @@ The number of punctuation characters **break_chars** : `int` The number of break characters + **longest_repeated_char** : `int` + The length of the longest character repetition :Tokenized features: **tokens** : `int` @@ -63,6 +65,10 @@ The number of punctuation tokens **breaks** : `int` The number of break tokens + **longest_token** : `int` + The length of the longest token + **longest_word** : `int` + The length of the longest word-token :Parsed features: **content_chars** : `int` diff --git a/revscoring/features/wikitext/features/chars.py b/revscoring/features/wikitext/features/chars.py index 02c0f5b6..ffd33336 100644 --- a/revscoring/features/wikitext/features/chars.py +++ b/revscoring/features/wikitext/features/chars.py @@ -66,6 +66,12 @@ def __init__(self, *args, **kwargs): ) "`int` : The number of break characters in the text" + self.longest_repeated_char = \ + Feature(self._name + ".longest_repeated_char", + _process_longest_repeated_char, + returns=int, depends_on=[self.datasources.text]) + "`int` : The length of the longest character repetition in the text" + class Diff: @@ -218,3 +224,11 @@ def _process_longest_repeated_char_added(diff_segments_added): for _, group in groupby(segment.lower())) else: return 1 + + +def _process_longest_repeated_char(text): + if len(text) > 0: + return max(sum(1 for _ in group) + for _, group in groupby(text.lower())) + else: + return 1 diff --git 
a/revscoring/features/wikitext/features/revision_oriented.py b/revscoring/features/wikitext/features/revision_oriented.py index 3a265004..b4af9b92 100644 --- a/revscoring/features/wikitext/features/revision_oriented.py +++ b/revscoring/features/wikitext/features/revision_oriented.py @@ -38,7 +38,7 @@ class Revision(parsed.Revision, chars.Revision, tokenized.Revision, class BaseDiff(DependentSet): - def __init__(self, name, diff_datasources): + def __init__(self, name, diff_datasources, *args, **kwargs): super().__init__(name) self.datasources = diff_datasources diff --git a/revscoring/features/wikitext/features/tokenized.py b/revscoring/features/wikitext/features/tokenized.py index 16c40081..331a7003 100644 --- a/revscoring/features/wikitext/features/tokenized.py +++ b/revscoring/features/wikitext/features/tokenized.py @@ -1,4 +1,4 @@ -from ....datasources.meta import dicts, filters +from ....datasources.meta import dicts, filters, mappers from ...meta import aggregators @@ -30,6 +30,12 @@ def __init__(self, *args, **kwargs): "`int` : The number of punctuation tokens in the revision" self.breaks = aggregators.len(self.datasources.breaks) "`int` : The number of break tokens in the revision" + self.longest_token = aggregators.max( + mappers.map(len, self.datasources.tokens), returns=int) + "`int` : The longest single token in the revision" + self.longest_word = aggregators.max( + mappers.map(len, self.datasources.words), returns=int) + "`int` : The longest single word-token in the revision" class Diff: diff --git a/revscoring/features/wikitext/tests/test_chars.py b/revscoring/features/wikitext/tests/test_chars.py index cbbd642a..bacd8907 100644 --- a/revscoring/features/wikitext/tests/test_chars.py +++ b/revscoring/features/wikitext/tests/test_chars.py @@ -218,16 +218,23 @@ def test_break_chars(): revision.diff.break_chars_removed) -def test_longest_repeated_char_added(): +def test_longest_repeated_char(): cache = {p_text: "This is words.", r_text: "This is aaaa 
words. kkkkkkkkkkkk"} + # Test an addition of a very long repeated char + eq_(solve(revision.longest_repeated_char, cache=cache), 12) + eq_(solve(revision.parent.longest_repeated_char, cache=cache), 1) eq_(solve(revision.diff.longest_repeated_char_added, cache=cache), 12) - eq_(pickle.loads(pickle.dumps(revision.diff.longest_repeated_char_added)), - revision.diff.longest_repeated_char_added) - + # Test the no-change case cache = {p_text: "This is words.", r_text: "This is words."} - eq_(solve(revision.diff.longest_repeated_char_added, cache=cache), 1) + + eq_(pickle.loads(pickle.dumps(revision.longest_repeated_char)), + revision.longest_repeated_char) + eq_(pickle.loads(pickle.dumps(revision.parent.longest_repeated_char)), + revision.parent.longest_repeated_char) + eq_(pickle.loads(pickle.dumps(revision.diff.longest_repeated_char_added)), + revision.diff.longest_repeated_char_added) diff --git a/revscoring/features/wikitext/tests/test_tokenized.py b/revscoring/features/wikitext/tests/test_tokenized.py index 565e3420..e1470eaa 100644 --- a/revscoring/features/wikitext/tests/test_tokenized.py +++ b/revscoring/features/wikitext/tests/test_tokenized.py @@ -110,6 +110,19 @@ def test_punctuations(): {'.': 4, ':': 1, '?': 1, '。': 1}) +def test_longest_token(): + eq_(solve(revision.longest_token, cache={r_text: text}), 20) + + eq_(pickle.loads(pickle.dumps(revision.longest_token)), + revision.longest_token) + +def test_longest_word(): + eq_(solve(revision.longest_word, cache={r_text: text}), 8) + + eq_(pickle.loads(pickle.dumps(revision.longest_word)), + revision.longest_word) + + def test_diff(): diff = revision.diff