-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #36 from wikimedia/chtnnh-ptwiki
New model for ptwiki.
- Loading branch information
Showing
7 changed files
with
497 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
sphinx>=1.3 | ||
sphinx==2.4.4 | ||
sphinx-py3doc-enhanced-theme | ||
m2r | ||
-e . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
from revscoring.features import wikitext | ||
from revscoring.features.modifiers import max, sub | ||
from revscoring.languages import portuguese | ||
|
||
# Character-level features: the total character count, a set of
# per-category character counts, and then each of those per-category
# counts divided by the total (floored at 1 to avoid dividing by zero).
_char_counts = [
    wikitext.revision.whitespace_chars,
    wikitext.revision.markup_chars,
    wikitext.revision.cjk_chars,
    wikitext.revision.entity_chars,
    wikitext.revision.url_chars,
    wikitext.revision.word_chars,
    wikitext.revision.uppercase_word_chars,
    wikitext.revision.punctuation_chars,
    wikitext.revision.break_chars,
    wikitext.revision.longest_repeated_char,
]
char_based = (
    [wikitext.revision.chars] +
    _char_counts +
    [count / max(wikitext.revision.chars, 1) for count in _char_counts]
)
|
||
# Token-level features: the total token count, a set of per-category
# token counts/lengths, and then each of those divided by the total
# token count (floored at 1 to avoid dividing by zero).
_token_counts = [
    wikitext.revision.numbers,
    wikitext.revision.whitespaces,
    wikitext.revision.markups,
    wikitext.revision.cjks,
    wikitext.revision.entities,
    wikitext.revision.urls,
    wikitext.revision.words,
    wikitext.revision.uppercase_words,
    wikitext.revision.punctuations,
    wikitext.revision.breaks,
    wikitext.revision.longest_token,
    wikitext.revision.longest_word,
]
token_based = (
    [wikitext.revision.tokens] +
    _token_counts +
    [count / max(wikitext.revision.tokens, 1) for count in _token_counts]
)
|
||
# Features derived from the parsed wikitext: raw counts first, then the
# same counts divided by the token count (floored at 1).
_parse_counts = [
    wikitext.revision.content_chars,
    wikitext.revision.headings,
    wikitext.revision.external_links,
    wikitext.revision.wikilinks,
    wikitext.revision.tags,
    wikitext.revision.ref_tags,
    wikitext.revision.templates,
]
parse_based = _parse_counts + [
    count / max(wikitext.revision.tokens, 1) for count in _parse_counts
]
|
||
# Portuguese bad-word matches: raw count and count per word.
_badword_matches = portuguese.badwords.revision.matches
badwords = [
    _badword_matches,
    _badword_matches / max(wikitext.revision.words, 1),
]
|
||
# Portuguese informal-word matches: raw count and count per word.
_informal_matches = portuguese.informals.revision.matches
informals = [
    _informal_matches,
    _informal_matches / max(wikitext.revision.words, 1),
]
|
||
# Portuguese dictionary lookups: counts of dictionary and non-dictionary
# words, followed by each count divided by the word count (floored at 1).
_in_dictionary = portuguese.dictionary.revision.dict_words
_not_in_dictionary = portuguese.dictionary.revision.non_dict_words
dict_words = [
    _in_dictionary,
    _not_in_dictionary,
    _in_dictionary / max(wikitext.revision.words, 1),
    _not_in_dictionary / max(wikitext.revision.words, 1),
]
|
||
# Templates
# Templates whose name matches the "Info" pattern.
infobox_templates = wikitext.revision.template_names_matching(
    r"Info",
    name="ptwiki.revision.infobox_templates"
)

# The "Carece de fontes" template family; the names are OR-ed together
# into a single regular expression.
CN_TEMPLATES = [
    r"Carece de fontes", r"Carece de fontes2",
    r"Carece de fontes/bloco", r"Carece de fontes/bloco2"
]
_cn_pattern = "|".join(CN_TEMPLATES)
cn_templates = wikitext.revision.template_names_matching(
    _cn_pattern,
    name="ptwiki.revision.cn_templates"
)
# Name patterns for "main article" style templates. The entries are joined
# with "|" into one regular expression below.
MAIN_TEMPLATES = [
    r"Artigo[ _]principal",
    # Fixed: this pattern previously ended in a stray "i" ("principali"),
    # which kept it from matching the intended template name.
    r"Ver[ _]artigo[ _]principal",
    r"Principal",
    r"Ver[ _]também artigo[ _]principal",
    r"Main",
    r"Detalhes",
    r"Mais",
    r"Artigoprincipal",
    r"AP", r"Details", r"Ver[ _]artigo"
]
# NOTE(review): this feature name lacks the ".revision" segment used by the
# sibling features ("ptwiki.revision.*"). It is left unchanged here because
# feature names may be referenced by already-trained models -- confirm
# before renaming.
main_article_templates = wikitext.revision.template_names_matching(
    "|".join(MAIN_TEMPLATES), name="ptwiki.main_article_templates")
# Name patterns for citation templates, OR-ed into one regular expression.
CITE_TEMPLATES = [
    r"Cite",
    r"Citar",
    r"Citar web",
    r"Citar livro",
    r"Harvard[_ ]citation[_ ]no[_ ]brackets",
    r"harvnb",
    r"Harvard citation",
    r"harv",
    r"Harvard citation text",
    r"harvtxt",
    r"Harvcoltxt",
    r"Harvcol",
    r"Harvcolnb",
    r"Harvard citations",
    r"harvs",
    r"Harvp"
]
_cite_pattern = "|".join(CITE_TEMPLATES)
cite_templates = wikitext.revision.template_names_matching(
    _cite_pattern, name="ptwiki.revision.cite_templates")

# Citation templates per <ref> tag (denominator floored at 1).
proportion_of_templated_references = (
    cite_templates / max(wikitext.revision.ref_tags, 1)
)

# <ref> tags minus citation templates, floored at 0.
non_templated_references = max(
    wikitext.revision.ref_tags - cite_templates, 0)

# All templates minus the citation templates.
non_cite_templates = sub(
    wikitext.revision.templates,
    cite_templates,
    name="ptwiki.revision.non_cite_templates"
)
|
||
# Links
# Wikilinks whose title starts with a category namespace prefix (English or
# Portuguese). Fixed: the pattern previously began with a stray "i"
# ("i(Category|Categoria)"), which required titles such as "iCategory:...".
# NOTE(review): correcting the pattern changes this feature's values, so
# models trained with the old pattern presumably need rebuilding -- confirm.
category_links = wikitext.revision.wikilink_titles_matching(
    r"(Category|Categoria)\:", name="ptwiki.revision.category_links")
# Wikilinks whose title starts with a file/image namespace prefix (English
# or Portuguese).
image_links = wikitext.revision.wikilink_titles_matching(
    r"(File|Image|Ficheiro|Arquivo|Imagem)\:",
    name="ptwiki.revision.image_links")
|
||
def _per_content_char(feature):
    """Return *feature* divided by the content character count (min 1)."""
    return feature / max(wikitext.revision.content_chars, 1)


# ptwiki-specific link, template and reference features; most appear both
# as a raw count and as a proportion of the content characters.
local_wiki = [
    image_links,
    _per_content_char(image_links),
    category_links,
    _per_content_char(category_links),
    cite_templates,
    _per_content_char(cite_templates),
    proportion_of_templated_references,
    non_templated_references,
    _per_content_char(non_templated_references),
    non_cite_templates,
    _per_content_char(non_cite_templates),
    infobox_templates,
    cn_templates,
    _per_content_char(cn_templates),
    main_article_templates,
    _per_content_char(main_article_templates),
    _per_content_char(portuguese.stemmed.revision.stem_chars),
]
|
||
|
||
# Full feature list for the draft quality model: every feature group
# above, concatenated in order.
draft_quality = [
    *char_based,
    *token_based,
    *parse_based,
    *badwords,
    *informals,
    *dict_words,
    *local_wiki,
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
Model Information: | ||
- type: GradientBoosting | ||
- version: 0.2.1 | ||
- params: {'scale': False, 'population_rates': None, 'subsample': 1.0, 'loss': 'deviance', 'max_depth': 7, 'max_features': 'log2', 'min_impurity_decrease': 0.0, 'validation_fraction': 0.1, 'n_estimators': 700, 'min_impurity_split': None, 'criterion': 'friedman_mse', 'labels': ['OK', 'spam', 'unsuitable'], 'min_samples_split': 2, 'min_samples_leaf': 1, 'init': None, 'center': False, 'multilabel': False, 'max_leaf_nodes': None, 'learning_rate': 0.01, 'label_weights': None, 'n_iter_no_change': None, 'presort': 'auto', 'verbose': 0, 'random_state': None, 'tol': 0.0001, 'min_weight_fraction_leaf': 0.0, 'warm_start': False} | ||
Environment: | ||
- revscoring_version: '2.6.9' | ||
- platform: 'Linux-4.9.0-8-amd64-x86_64-with-debian-9.4' | ||
- machine: 'x86_64' | ||
- version: '#1 SMP Debian 4.9.144-3.1 (2019-02-19)' | ||
- system: 'Linux' | ||
- processor: '' | ||
- python_build: ('default', 'Sep 27 2018 17:25:39') | ||
- python_compiler: 'GCC 6.3.0 20170516' | ||
- python_branch: '' | ||
- python_implementation: 'CPython' | ||
- python_revision: '' | ||
- python_version: '3.5.3' | ||
- release: '4.9.0-8-amd64' | ||
|
||
Statistics: | ||
counts (n=3947): | ||
label n ~OK ~spam ~unsuitable | ||
------------ ---- --- ----- ------- ------------- | ||
'OK' 1600 --> 1297 71 232 | ||
'spam' 766 --> 72 425 269 | ||
'unsuitable' 1581 --> 191 130 1260 | ||
rates: | ||
'OK' 'spam' 'unsuitable' | ||
---------- ------ -------- -------------- | ||
sample 0.405 0.194 0.401 | ||
population 0.971 0.01 0.02 | ||
match_rate (micro=0.772, macro=0.36): | ||
OK spam unsuitable | ||
---- ------ ------------ | ||
0.79 0.068 0.223 | ||
filter_rate (micro=0.228, macro=0.64): | ||
OK spam unsuitable | ||
---- ------ ------------ | ||
0.21 0.932 0.777 | ||
recall (micro=0.808, macro=0.721): | ||
OK spam unsuitable | ||
----- ------ ------------ | ||
0.811 0.555 0.797 | ||
!recall (micro=0.886, macro=0.871): | ||
OK spam unsuitable | ||
----- ------ ------------ | ||
0.888 0.937 0.788 | ||
precision (micro=0.969, macro=0.381): | ||
OK spam unsuitable | ||
----- ------ ------------ | ||
0.996 0.078 0.07 | ||
!precision (micro=0.149, macro=0.705): | ||
OK spam unsuitable | ||
----- ------ ------------ | ||
0.124 0.995 0.995 | ||
f1 (micro=0.871, macro=0.386): | ||
OK spam unsuitable | ||
----- ------ ------------ | ||
0.894 0.137 0.129 | ||
!f1 (micro=0.237, macro=0.687): | ||
OK spam unsuitable | ||
----- ------ ------------ | ||
0.217 0.965 0.88 | ||
accuracy (micro=0.814, macro=0.845): | ||
OK spam unsuitable | ||
----- ------ ------------ | ||
0.813 0.933 0.788 | ||
fpr (micro=0.114, macro=0.129): | ||
OK spam unsuitable | ||
----- ------ ------------ | ||
0.112 0.063 0.212 | ||
roc_auc (micro=0.91, macro=0.895): | ||
OK spam unsuitable | ||
---- ------ ------------ | ||
0.91 0.899 0.875 | ||
pr_auc (micro=0.973, macro=0.434): | ||
OK spam unsuitable | ||
----- ------ ------------ | ||
0.997 0.119 0.186 | ||
|
||
- score_schema: {'type': 'object', 'title': 'Scikit learn-based classifier score with probability', 'properties': {'probability': {'type': 'object', 'description': 'A mapping of probabilities onto each of the potential output labels', 'properties': {'unsuitable': {'type': 'number'}, 'spam': {'type': 'number'}, 'OK': {'type': 'number'}}}, 'prediction': {'type': 'string', 'description': 'The most likely label predicted by the estimator'}}} | ||
|
Git LFS file not shown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.