Skip to content

Commit

Permalink
Adds basic longest word and longest repeated char features to wikitex…
Browse files Browse the repository at this point in the history
…t.Revision
  • Loading branch information
halfak committed Jan 20, 2016
1 parent 0f759a2 commit 7cc3547
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 10 deletions.
3 changes: 0 additions & 3 deletions examples/scoring.py
Expand Up @@ -9,6 +9,3 @@
user_agent="revscoring demo"))
values = extractor.extract(123456789, model.features)
print(model.score(values))
{'prediction': True,
'probability': {False: 0.4694409344514984,
True: 0.5305590655485017}}
6 changes: 6 additions & 0 deletions revscoring/features/wikitext/__init__.py
Expand Up @@ -39,6 +39,8 @@
The number of punctuation characters
**break_chars** : `int`
The number of break characters
**longest_repeated_char** : `int`
The length of the longest character repetition
:Tokenized features:
**tokens** : `int`
Expand All @@ -63,6 +65,10 @@
The number of punctuation tokens
**breaks** : `int`
The number of break tokens
**longest_token** : `int`
The length of the longest token
**longest_word** : `int`
The length of the longest word-token
:Parsed features:
**content_chars** : `int`
Expand Down
14 changes: 14 additions & 0 deletions revscoring/features/wikitext/features/chars.py
Expand Up @@ -66,6 +66,12 @@ def __init__(self, *args, **kwargs):
)
"`int` : The number of break characters in the text"

self.longest_repeated_char = \
Feature(self._name + ".longest_repeated_char",
_process_longest_repeated_char,
returns=int, depends_on=[self.datasources.text])
"`int` : The length of the longest run of a single repeated character in the text"


class Diff:

Expand Down Expand Up @@ -218,3 +224,11 @@ def _process_longest_repeated_char_added(diff_segments_added):
for _, group in groupby(segment.lower()))
else:
return 1


def _process_longest_repeated_char(text):
if len(text) > 0:
return max(sum(1 for _ in group)
for _, group in groupby(text.lower()))
else:
return 1
2 changes: 1 addition & 1 deletion revscoring/features/wikitext/features/revision_oriented.py
Expand Up @@ -38,7 +38,7 @@ class Revision(parsed.Revision, chars.Revision, tokenized.Revision,

class BaseDiff(DependentSet):

def __init__(self, name, diff_datasources):
def __init__(self, name, diff_datasources, *args, **kwargs):
super().__init__(name)
self.datasources = diff_datasources

Expand Down
8 changes: 7 additions & 1 deletion revscoring/features/wikitext/features/tokenized.py
@@ -1,4 +1,4 @@
from ....datasources.meta import dicts, filters
from ....datasources.meta import dicts, filters, mappers
from ...meta import aggregators


Expand Down Expand Up @@ -30,6 +30,12 @@ def __init__(self, *args, **kwargs):
"`int` : The number of punctuation tokens in the revision"
self.breaks = aggregators.len(self.datasources.breaks)
"`int` : The number of break tokens in the revision"
self.longest_token = aggregators.max(
mappers.map(len, self.datasources.tokens), returns=int)
"`int` : The length of the longest single token in the revision"
self.longest_word = aggregators.max(
mappers.map(len, self.datasources.words), returns=int)
"`int` : The length of the longest single word-token in the revision"


class Diff:
Expand Down
17 changes: 12 additions & 5 deletions revscoring/features/wikitext/tests/test_chars.py
Expand Up @@ -218,16 +218,23 @@ def test_break_chars():
revision.diff.break_chars_removed)


def test_longest_repeated_char_added():
def test_longest_repeated_char():
cache = {p_text: "This is words.",
r_text: "This is aaaa words. kkkkkkkkkkkk"}

# Test an addition of a very long repeated char
eq_(solve(revision.longest_repeated_char, cache=cache), 12)
eq_(solve(revision.parent.longest_repeated_char, cache=cache), 1)
eq_(solve(revision.diff.longest_repeated_char_added, cache=cache), 12)

eq_(pickle.loads(pickle.dumps(revision.diff.longest_repeated_char_added)),
revision.diff.longest_repeated_char_added)

# Test the no-change case
cache = {p_text: "This is words.",
r_text: "This is words."}

eq_(solve(revision.diff.longest_repeated_char_added, cache=cache), 1)

eq_(pickle.loads(pickle.dumps(revision.longest_repeated_char)),
revision.longest_repeated_char)
eq_(pickle.loads(pickle.dumps(revision.parent.longest_repeated_char)),
revision.parent.longest_repeated_char)
eq_(pickle.loads(pickle.dumps(revision.diff.longest_repeated_char_added)),
revision.diff.longest_repeated_char_added)
13 changes: 13 additions & 0 deletions revscoring/features/wikitext/tests/test_tokenized.py
Expand Up @@ -110,6 +110,19 @@ def test_punctuations():
{'.': 4, ':': 1, '?': 1, '。': 1})


def test_longest_token():
    """Check the `longest_token` feature value and its pickle round-trip."""
    feature = revision.longest_token

    # Solve the feature against the shared fixture text.
    eq_(solve(feature, cache={r_text: text}), 20)

    # The feature must compare equal to itself after pickling.
    eq_(pickle.loads(pickle.dumps(feature)), feature)

def test_longest_word():
    """Check the `longest_word` feature value and its pickle round-trip."""
    feature = revision.longest_word

    # Solve the feature against the shared fixture text.
    eq_(solve(feature, cache={r_text: text}), 8)

    # The feature must compare equal to itself after pickling.
    eq_(pickle.loads(pickle.dumps(feature)), feature)


def test_diff():
diff = revision.diff

Expand Down

0 comments on commit 7cc3547

Please sign in to comment.