Merge pull request #36 from wikimedia/chtnnh-ptwiki
New model for ptwiki.
halfak committed Apr 17, 2020
2 parents 5b235e0 + affed3c commit f2ca783
Showing 7 changed files with 497 additions and 2 deletions.
63 changes: 62 additions & 1 deletion Makefile
@@ -2,10 +2,12 @@
.DELETE_ON_ERROR:

models: \
enwiki_models
enwiki_models \
ptwiki_models

draft_quality_major_minor = 0.2

############################# English Wikipedia ##############################
#datasets/enwiki.draft_quality.50k_stratified.json: \
# datasets/enwiki.draft_quality.201508-201608.tsv.bz2
# ( \
@@ -259,3 +261,62 @@ datasets/enwiki.draft_quality.201608-201701.with_cache.json.bz2: \
articlequality extract_from_text \
draftquality.feature_lists.enwiki.draft_quality \
--verbose | bzip2 -c > $@

#################### Portuguese Wikipedia #######################
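# Labeled ptwiki draft observations, exported as JSON lines from a Quarry query.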
datasets/ptwiki.draft_quality.201903202003.json:
wget -qO- https://quarry.wmflabs.org/run/444292/output/0/json-lines?download=true > $@

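# Down-sample 'OK' drafts to 1600 rows and keep every 'spam'/'unsuitable' draft,
# then shuffle into a more class-balanced training set.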
datasets/ptwiki.draft_quality.balanced_3k.json.bz2: \
datasets/ptwiki.draft_quality.201903202003.json
(cat $< | grep '"draft_quality": "OK"' | shuf -n 1600; \
cat $< | grep -v '"draft_quality": "OK"') | \
shuf | bzip2 -c > $@

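# Fetch the revision text for each sampled draft from the pt.wikipedia.org API.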
datasets/ptwiki.draft_quality.balanced_3k.with_text.json.bz2: \
datasets/ptwiki.draft_quality.balanced_3k.json.bz2
bzcat $< | \
revscoring fetch_text --host https://pt.wikipedia.org --threads 4 \
--verbose | bzip2 -c > $@

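# Extract ptwiki draft_quality feature values from each draft's text and attach
# them to the observations.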
datasets/ptwiki.draft_quality.balanced_3k.with_cache.json.bz2: \
datasets/ptwiki.draft_quality.balanced_3k.with_text.json.bz2
bzcat $< | \
articlequality extract_from_text \
draftquality.feature_lists.ptwiki.draft_quality \
--verbose | bzip2 -c > $@

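# Tune classifier hyperparameters against the feature-extracted dataset,
# optimizing macro ROC-AUC with the target population rates.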
tuning_reports/ptwiki.draft_quality.md: \
datasets/ptwiki.draft_quality.balanced_3k.with_cache.json.bz2
bzcat $< | \
revscoring tune \
config/classifiers.params.yaml \
draftquality.feature_lists.ptwiki.draft_quality \
draft_quality \
roc_auc.macro \
--pop-rate '"OK"=0.97080700532 ' \
--pop-rate '"spam"=0.0095278372' \
--pop-rate '"unsuitable"=0.01966515747 ' \
--scale --center \
--cv-timeout=90 \
--debug > $@

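# Train and cross-validate the gradient boosting model, then write its
# model_info report.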
models/ptwiki.draft_quality.gradient_boosting.model.bz2: \
datasets/ptwiki.draft_quality.balanced_3k.with_cache.json.bz2
bzcat $< | \
revscoring cv_train \
revscoring.scoring.models.GradientBoosting \
draftquality.feature_lists.ptwiki.draft_quality \
draft_quality \
-p 'n_estimators=700' \
-p 'learning_rate=0.01' \
-p 'max_depth=7' \
-p 'max_features="log2"' \
--pop-rate '"OK"=0.97080700532 ' \
--pop-rate '"spam"=0.0095278372' \
--pop-rate '"unsuitable"=0.01966515747 ' \
--version $(draft_quality_major_minor).1 | bzip2 -c > $@

revscoring model_info $@ > model_info/ptwiki.draft_quality.md

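# Convenience target: `make ptwiki_models` builds the ptwiki dataset and model above.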
ptwiki_models: \
models/ptwiki.draft_quality.gradient_boosting.model.bz2
2 changes: 1 addition & 1 deletion docs/requirements.txt
@@ -1,4 +1,4 @@
sphinx>=1.3
sphinx==2.4.4
sphinx-py3doc-enhanced-theme
m2r
-e .
172 changes: 172 additions & 0 deletions draftquality/feature_lists/ptwiki.py
@@ -0,0 +1,172 @@
from revscoring.features import wikitext
from revscoring.features.modifiers import max, sub
from revscoring.languages import portuguese

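# Each feature group below pairs raw counts with length-normalized ratios
# (divided by max(chars, 1) or max(tokens, 1)) so values stay comparable
# across drafts of very different sizes.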
char_based = [
wikitext.revision.chars,
wikitext.revision.whitespace_chars,
wikitext.revision.markup_chars,
wikitext.revision.cjk_chars,
wikitext.revision.entity_chars,
wikitext.revision.url_chars,
wikitext.revision.word_chars,
wikitext.revision.uppercase_word_chars,
wikitext.revision.punctuation_chars,
wikitext.revision.break_chars,
wikitext.revision.longest_repeated_char,
wikitext.revision.whitespace_chars / max(wikitext.revision.chars, 1),
wikitext.revision.markup_chars / max(wikitext.revision.chars, 1),
wikitext.revision.cjk_chars / max(wikitext.revision.chars, 1),
wikitext.revision.entity_chars / max(wikitext.revision.chars, 1),
wikitext.revision.url_chars / max(wikitext.revision.chars, 1),
wikitext.revision.word_chars / max(wikitext.revision.chars, 1),
wikitext.revision.uppercase_word_chars / max(wikitext.revision.chars, 1),
wikitext.revision.punctuation_chars / max(wikitext.revision.chars, 1),
wikitext.revision.break_chars / max(wikitext.revision.chars, 1),
wikitext.revision.longest_repeated_char / max(wikitext.revision.chars, 1)
]

token_based = [
wikitext.revision.tokens,
wikitext.revision.numbers,
wikitext.revision.whitespaces,
wikitext.revision.markups,
wikitext.revision.cjks,
wikitext.revision.entities,
wikitext.revision.urls,
wikitext.revision.words,
wikitext.revision.uppercase_words,
wikitext.revision.punctuations,
wikitext.revision.breaks,
wikitext.revision.longest_token,
wikitext.revision.longest_word,
wikitext.revision.numbers / max(wikitext.revision.tokens, 1),
wikitext.revision.whitespaces / max(wikitext.revision.tokens, 1),
wikitext.revision.markups / max(wikitext.revision.tokens, 1),
wikitext.revision.cjks / max(wikitext.revision.tokens, 1),
wikitext.revision.entities / max(wikitext.revision.tokens, 1),
wikitext.revision.urls / max(wikitext.revision.tokens, 1),
wikitext.revision.words / max(wikitext.revision.tokens, 1),
wikitext.revision.uppercase_words / max(wikitext.revision.tokens, 1),
wikitext.revision.punctuations / max(wikitext.revision.tokens, 1),
wikitext.revision.breaks / max(wikitext.revision.tokens, 1),
wikitext.revision.longest_token / max(wikitext.revision.tokens, 1),
wikitext.revision.longest_word / max(wikitext.revision.tokens, 1)
]

parse_based = [
wikitext.revision.content_chars,
wikitext.revision.headings,
wikitext.revision.external_links,
wikitext.revision.wikilinks,
wikitext.revision.tags,
wikitext.revision.ref_tags,
wikitext.revision.templates,
wikitext.revision.content_chars / max(wikitext.revision.tokens, 1),
wikitext.revision.headings / max(wikitext.revision.tokens, 1),
wikitext.revision.external_links / max(wikitext.revision.tokens, 1),
wikitext.revision.wikilinks / max(wikitext.revision.tokens, 1),
wikitext.revision.tags / max(wikitext.revision.tokens, 1),
wikitext.revision.ref_tags / max(wikitext.revision.tokens, 1),
wikitext.revision.templates / max(wikitext.revision.tokens, 1)
]

badwords = [
portuguese.badwords.revision.matches,
portuguese.badwords.revision.matches / max(wikitext.revision.words, 1)
]

informals = [
portuguese.informals.revision.matches,
portuguese.informals.revision.matches / max(wikitext.revision.words, 1)
]

dict_words = [
portuguese.dictionary.revision.dict_words,
portuguese.dictionary.revision.non_dict_words,
portuguese.dictionary.revision.dict_words /
max(wikitext.revision.words, 1),
(portuguese.dictionary.revision.non_dict_words /
max(wikitext.revision.words, 1))
]

# Templates
infobox_templates = wikitext.revision.template_names_matching(
r"Info", name="ptwiki.revision.infobox_templates")

CN_TEMPLATES = [
r"Carece de fontes",
r"Carece de fontes2",
r"Carece de fontes/bloco",
r"Carece de fontes/bloco2"
]
cn_templates = wikitext.revision.template_names_matching(
"|".join(CN_TEMPLATES), name="ptwiki.revision.cn_templates")
MAIN_TEMPLATES = [
r"Artigo[ _]principal",
r"Ver[ _]artigo[ _]principali",
r"Principal",
r"Ver[ _]também artigo[ _]principal",
r"Main",
r"Detalhes",
r"Mais",
r"Artigoprincipal",
r"AP", r"Details", r"Ver[ _]artigo"
]
main_article_templates = wikitext.revision.template_names_matching(
"|".join(MAIN_TEMPLATES), name="ptwiki.main_article_templates")
CITE_TEMPLATES = [
r"Cite",
r"Citar",
r"Citar web",
r"Citar livro",
r"Harvard[_ ]citation[_ ]no[_ ]brackets", r"harvnb",
r"Harvard citation", r"harv",
r"Harvard citation text", r"harvtxt",
r"Harvcoltxt",
r"Harvcol",
r"Harvcolnb",
r"Harvard citations", r"harvs",
r"Harvp"
]
cite_templates = wikitext.revision.template_names_matching(
"|".join(CITE_TEMPLATES), name="ptwiki.revision.cite_templates")
proportion_of_templated_references = \
cite_templates / max(wikitext.revision.ref_tags, 1)
non_templated_references = max(wikitext.revision.ref_tags - cite_templates, 0)
non_cite_templates = sub(
wikitext.revision.templates, cite_templates,
name="ptwiki.revision.non_cite_templates"
)

# Links
category_links = wikitext.revision.wikilink_titles_matching(
r"i(Category|Categoria)\:", name="ptwiki.revision.category_links")
image_links = wikitext.revision.wikilink_titles_matching(
r"(File|Image|Ficheiro|Arquivo|Imagem)\:",
name="ptwiki.revision.image_links")

local_wiki = [
image_links,
image_links / max(wikitext.revision.content_chars, 1),
category_links,
category_links / max(wikitext.revision.content_chars, 1),
cite_templates,
cite_templates / max(wikitext.revision.content_chars, 1),
proportion_of_templated_references,
non_templated_references,
non_templated_references / max(wikitext.revision.content_chars, 1),
non_cite_templates,
non_cite_templates / max(wikitext.revision.content_chars, 1),
infobox_templates,
cn_templates,
cn_templates / max(wikitext.revision.content_chars, 1),
main_article_templates,
main_article_templates / max(wikitext.revision.content_chars, 1),
portuguese.stemmed.revision.stem_chars /
max(wikitext.revision.content_chars, 1)
]


draft_quality = (char_based + token_based + parse_based +
badwords + informals + dict_words + local_wiki)
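
# Usage sketch: the combined feature list can be solved directly from raw
# wikitext via revscoring's dependency solver. The sample text below is
# illustrative only.
if __name__ == "__main__":
    from revscoring.datasources import revision_oriented
    from revscoring.dependencies import solve

    sample_text = "'''Exemplo''' é um [[artigo]] curto.{{Carece de fontes}}"
    # Solve every feature from the text datasource and print the values.
    values = solve(draft_quality,
                   cache={revision_oriented.revision.text: sample_text})
    for feature, value in zip(draft_quality, values):
        print(feature, value)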
82 changes: 82 additions & 0 deletions model_info/ptwiki.draft_quality.md
@@ -0,0 +1,82 @@
Model Information:
- type: GradientBoosting
- version: 0.2.1
- params: {'scale': False, 'population_rates': None, 'subsample': 1.0, 'loss': 'deviance', 'max_depth': 7, 'max_features': 'log2', 'min_impurity_decrease': 0.0, 'validation_fraction': 0.1, 'n_estimators': 700, 'min_impurity_split': None, 'criterion': 'friedman_mse', 'labels': ['OK', 'spam', 'unsuitable'], 'min_samples_split': 2, 'min_samples_leaf': 1, 'init': None, 'center': False, 'multilabel': False, 'max_leaf_nodes': None, 'learning_rate': 0.01, 'label_weights': None, 'n_iter_no_change': None, 'presort': 'auto', 'verbose': 0, 'random_state': None, 'tol': 0.0001, 'min_weight_fraction_leaf': 0.0, 'warm_start': False}
Environment:
- revscoring_version: '2.6.9'
- platform: 'Linux-4.9.0-8-amd64-x86_64-with-debian-9.4'
- machine: 'x86_64'
- version: '#1 SMP Debian 4.9.144-3.1 (2019-02-19)'
- system: 'Linux'
- processor: ''
- python_build: ('default', 'Sep 27 2018 17:25:39')
- python_compiler: 'GCC 6.3.0 20170516'
- python_branch: ''
- python_implementation: 'CPython'
- python_revision: ''
- python_version: '3.5.3'
- release: '4.9.0-8-amd64'

Statistics:
counts (n=3947):
label n ~OK ~spam ~unsuitable
------------ ---- --- ----- ------- -------------
'OK' 1600 --> 1297 71 232
'spam' 766 --> 72 425 269
'unsuitable' 1581 --> 191 130 1260
rates:
'OK' 'spam' 'unsuitable'
---------- ------ -------- --------------
sample 0.405 0.194 0.401
population 0.971 0.01 0.02
match_rate (micro=0.772, macro=0.36):
OK spam unsuitable
---- ------ ------------
0.79 0.068 0.223
filter_rate (micro=0.228, macro=0.64):
OK spam unsuitable
---- ------ ------------
0.21 0.932 0.777
recall (micro=0.808, macro=0.721):
OK spam unsuitable
----- ------ ------------
0.811 0.555 0.797
!recall (micro=0.886, macro=0.871):
OK spam unsuitable
----- ------ ------------
0.888 0.937 0.788
precision (micro=0.969, macro=0.381):
OK spam unsuitable
----- ------ ------------
0.996 0.078 0.07
!precision (micro=0.149, macro=0.705):
OK spam unsuitable
----- ------ ------------
0.124 0.995 0.995
f1 (micro=0.871, macro=0.386):
OK spam unsuitable
----- ------ ------------
0.894 0.137 0.129
!f1 (micro=0.237, macro=0.687):
OK spam unsuitable
----- ------ ------------
0.217 0.965 0.88
accuracy (micro=0.814, macro=0.845):
OK spam unsuitable
----- ------ ------------
0.813 0.933 0.788
fpr (micro=0.114, macro=0.129):
OK spam unsuitable
----- ------ ------------
0.112 0.063 0.212
roc_auc (micro=0.91, macro=0.895):
OK spam unsuitable
---- ------ ------------
0.91 0.899 0.875
pr_auc (micro=0.973, macro=0.434):
OK spam unsuitable
----- ------ ------------
0.997 0.119 0.186

- score_schema: {'type': 'object', 'title': 'Scikit learn-based classifier score with probability', 'properties': {'probability': {'type': 'object', 'description': 'A mapping of probabilities onto each of the potential output labels', 'properties': {'unsuitable': {'type': 'number'}, 'spam': {'type': 'number'}, 'OK': {'type': 'number'}}}, 'prediction': {'type': 'string', 'description': 'The most likely label predicted by the estimator'}}}

3 changes: 3 additions & 0 deletions models/ptwiki.draft_quality.gradient_boosting.model.bz2
Git LFS file not shown
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,3 +3,4 @@ revscoring >= 2.6.2, < 2.6.999
mysqltsv >= 0.0.7, < 0.0.999
yamlconf >= 0.2.2, < 0.2.999
json2tsv >= 0.1.2
articlequality >=0.4.1,<0.4.999
