Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Japanese Wikipedia damaging/goodfaith config & models #201

Merged
merged 1 commit into from Jun 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
98 changes: 74 additions & 24 deletions Makefile
Expand Up @@ -2258,6 +2258,10 @@ itwiki_tuning_reports: \


############################# Japanese Wikipedia ################################
# Human-labeled jawiki revisions: runs `./utility fetch_labels` against
# labeling campaign 15 on labels.wmflabs.org and writes its output to the target.
datasets/jawiki.human_labeled_revisions.5k_2016.json:
./utility fetch_labels \
https://labels.wmflabs.org/campaigns/jawiki/15 > $@

# Sampled jawiki revisions, from https://quarry.wmflabs.org/query/9927
# Downloads the JSON-lines output of Quarry run 89016 (wget prints it to stdout,
# quietly) and redirects it into the target file.
datasets/jawiki.sampled_revisions.40k_2016.json:
wget -qO- https://quarry.wmflabs.org/run/89016/output/0/json-lines?download=true > $@
Expand All @@ -2271,54 +2275,98 @@ datasets/jawiki.autolabeled_revisions.40k_2016.json: \
--revert-radius=5 \
--verbose > $@

datasets/jawiki.autolabeled_revisions.w_cache.40k_2016.json: \
datasets/jawiki.labeled_revisions.40k_2016.json: \
datasets/jawiki.human_labeled_revisions.5k_2016.json \
datasets/jawiki.autolabeled_revisions.40k_2016.json
./utility merge_labels $^ > $@

datasets/jawiki.labeled_revisions.w_cache.40k_2016.json: \
datasets/jawiki.labeled_revisions.40k_2016.json
cat $< | \
revscoring extract \
editquality.feature_lists.jawiki.reverted \
editquality.feature_lists.jawiki.damaging \
editquality.feature_lists.jawiki.goodfaith \
--host https://ja.wikipedia.org \
--extractors $(max_extractors) \
--verbose > $@

tuning_reports/jawiki.reverted.md: \
datasets/jawiki.autolabeled_revisions.w_cache.40k_2016.json
tuning_reports/jawiki.damaging.md: \
datasets/jawiki.labeled_revisions.w_cache.40k_2016.json
cat $< | \
revscoring tune \
config/classifiers.params.yaml \
editquality.feature_lists.jawiki.reverted \
reverted_for_damage \
$(reverted_tuning_statistic) \
--label-weight $(reverted_weight) \
--pop-rate "true=0.03256945140908635" \
--pop-rate "false=0.9674305485909136" \
editquality.feature_lists.jawiki.damaging \
damaging \
$(damaging_tuning_statistic) \
--label-weight $(damaging_weight) \
--pop-rate "true=0.010758453070269498" \
--pop-rate "false=0.9892415469297305" \
--center --scale \
--cv-timeout 60 \
--debug > $@

models/jawiki.reverted.gradient_boosting.model: \
datasets/jawiki.autolabeled_revisions.w_cache.40k_2016.json
models/jawiki.damaging.gradient_boosting.model: \
datasets/jawiki.labeled_revisions.w_cache.40k_2016.json
cat $< | \
revscoring cv_train \
revscoring.scoring.models.GradientBoosting \
editquality.feature_lists.jawiki.reverted \
reverted_for_damage \
--version=$(reverted_major_minor).0 \
-p 'learning_rate=0.01' \
-p 'max_depth=7' \
editquality.feature_lists.jawiki.damaging \
damaging \
--version=$(damaging_major_minor).0 \
-p 'learning_rate=0.1' \
-p 'max_depth=1' \
-p 'max_features="log2"' \
-p 'n_estimators=700' \
--label-weight $(reverted_weight) \
--pop-rate "true=0.03256945140908635" \
--pop-rate "false=0.9674305485909136" \
-p 'min_samples_leaf=3' \
-p 'n_estimators=300' \
--label-weight $(damaging_weight) \
--pop-rate "true=0.010758453070269498" \
--pop-rate "false=0.9892415469297305" \
--center --scale > $@

revscoring model_info $@ > model_info/jawiki.damaging.md

# Hyperparameter tuning report for the jawiki "goodfaith" classifier.
# Streams the feature-extracted labeled sample into `revscoring tune` using the
# parameter grid in config/classifiers.params.yaml and writes the report to $@.
#
# --pop-rate states the population rate of each label value. Good-faith edits
# are the overwhelming majority (27748 of 27875 labeled observations), so the
# large rate belongs to "true" and the small one to "false". The two values
# were previously swapped, which told the estimator that ~99.5% of edits are
# bad-faith and skewed every population-weighted statistic.
tuning_reports/jawiki.goodfaith.md: \
		datasets/jawiki.labeled_revisions.w_cache.40k_2016.json
	cat $< | \
	revscoring tune \
		config/classifiers.params.yaml \
		editquality.feature_lists.jawiki.goodfaith \
		goodfaith \
		$(goodfaith_tuning_statistic) \
		--label-weight $(goodfaith_weight) \
		--pop-rate "true=0.995389234398456" \
		--pop-rate "false=0.00461076560154407" \
		--center --scale \
		--cv-timeout 60 \
		--debug > $@

# Trains and cross-validates the jawiki "goodfaith" GradientBoosting model.
# Streams the feature-extracted labeled sample into `revscoring cv_train` and
# writes the serialized model to $@.
#
# --pop-rate states the population rate of each label value. Good-faith edits
# are the overwhelming majority (27748 of 27875 labeled observations), so
# "true" takes the large rate and "false" the small one. The two values were
# previously swapped, which produced a model whose reported population rates
# contradicted the sample rates and whose accuracy collapsed to ~0.08.
# The model must be rebuilt after this change for the fix to take effect.
models/jawiki.goodfaith.gradient_boosting.model: \
		datasets/jawiki.labeled_revisions.w_cache.40k_2016.json
	cat $< | \
	revscoring cv_train \
		revscoring.scoring.models.GradientBoosting \
		editquality.feature_lists.jawiki.goodfaith \
		goodfaith \
		--version=$(goodfaith_major_minor).0 \
		-p 'learning_rate=0.1' \
		-p 'max_depth=1' \
		-p 'max_features="log2"' \
		-p 'min_samples_leaf=7' \
		-p 'n_estimators=300' \
		--label-weight $(goodfaith_weight) \
		--pop-rate "true=0.995389234398456" \
		--pop-rate "false=0.00461076560154407" \
		--center --scale > $@

revscoring model_info $@ > model_info/jawiki.reverted.md
revscoring model_info $@ > model_info/jawiki.goodfaith.md

jawiki_models: \
models/jawiki.reverted.gradient_boosting.model
models/jawiki.damaging.gradient_boosting.model \
models/jawiki.goodfaith.gradient_boosting.model

jawiki_tuning_reports: \
tuning_reports/jawiki.reverted.md
tuning_reports/jawiki.damaging.md \
tuning_reports/jawiki.goodfaith.md


############################# Korean Wikipedia ################################
Expand Down Expand Up @@ -3790,6 +3838,7 @@ models/zhwiki.damaging.gradient_boosting.model: \
-p 'learning_rate=0.01' \
-p 'max_depth=3' \
-p 'max_features="log2"' \
-p 'min_samples_leaf=7' \
-p 'n_estimators=700' \
--label-weight $(damaging_weight) \
--pop-rate "true=0.0405" \
Expand Down Expand Up @@ -3824,6 +3873,7 @@ models/zhwiki.goodfaith.gradient_boosting.model: \
-p 'learning_rate=0.01' \
-p 'max_depth=3' \
-p 'max_features="log2"' \
-p 'min_samples_leaf=5' \
-p 'n_estimators=500' \
--label-weight $(goodfaith_weight) \
--pop-rate "true=0.9682" \
Expand Down
43 changes: 33 additions & 10 deletions config/wikis/jawiki.yaml
Expand Up @@ -6,6 +6,8 @@ external_samples:
sampled_revisions.40k_2016:
quarry_page: https://quarry.wmflabs.org/query/9927
quarry_url: https://quarry.wmflabs.org/run/89016/output/0/json-lines?download=true
human_labeled_revisions.5k_2016:
labeling_campaign: https://labels.wmflabs.org/campaigns/jawiki/15

autolabeled_samples:
trusted_edits: 1000
Expand All @@ -22,22 +24,43 @@ autolabeled_samples:
labeled_samples:
autolabeled_revisions.40k_2016: sampled_revisions.40k_2016

# Samples built by merging other samples: labeled_revisions.40k_2016 combines
# the human-labeled campaign sample with the autolabeled sample.
merged_samples:
labeled_revisions.40k_2016:
- human_labeled_revisions.5k_2016
- autolabeled_revisions.40k_2016

extracted_samples:
autolabeled_revisions.w_cache.40k_2016:
sample: autolabeled_revisions.40k_2016
labeled_revisions.w_cache.40k_2016:
sample: labeled_revisions.40k_2016
features_for:
- reverted
- damaging
- goodfaith

models:
reverted:
observations: autolabeled_revisions.w_cache.40k_2016
label: reverted_for_damage
pop_rate_true: 0.03256945140908635
damaging:
observations: labeled_revisions.w_cache.40k_2016
label: damaging
pop_rate_true: 0.010758453070269498
tune: true
cv_train:
algorithm: GradientBoosting
parameters:
learning_rate: 0.01
max_depth: 7
learning_rate: 0.1
max_depth: 1
max_features: log2
n_estimators: 700
n_estimators: 300
min_samples_leaf: 3
# Good-faith classifier trained on the merged, feature-extracted labeled sample.
goodfaith:
  observations: labeled_revisions.w_cache.40k_2016
  label: goodfaith
  # Population rate of goodfaith == true. Good-faith edits are the large
  # majority (27748 of 27875 labeled observations), so this is ~0.995.
  # The previous value, 0.00461076560154407, is the rate of the *false* label.
  pop_rate_true: 0.995389234398456
  tune: true
  cv_train:
    algorithm: GradientBoosting
    parameters:
      learning_rate: 0.1
      max_depth: 1
      max_features: log2
      n_estimators: 300
      min_samples_leaf: 7

81 changes: 81 additions & 0 deletions model_info/jawiki.damaging.md
@@ -0,0 +1,81 @@
Model Information:
- type: GradientBoosting
- version: 0.5.0
- params: {'label_weights': OrderedDict([(True, 10)]), 'subsample': 1.0, 'random_state': None, 'population_rates': None, 'criterion': 'friedman_mse', 'max_leaf_nodes': None, 'multilabel': False, 'min_impurity_split': None, 'loss': 'deviance', 'n_estimators': 300, 'min_samples_leaf': 3, 'presort': 'auto', 'min_weight_fraction_leaf': 0.0, 'learning_rate': 0.1, 'warm_start': False, 'scale': True, 'min_impurity_decrease': 0.0, 'init': None, 'verbose': 0, 'labels': [True, False], 'max_features': 'log2', 'center': True, 'max_depth': 1, 'min_samples_split': 2}
Environment:
- revscoring_version: '2.4.0'
- platform: 'Linux-4.9.0-9-amd64-x86_64-with-debian-9.9'
- machine: 'x86_64'
- version: '#1 SMP Debian 4.9.168-1+deb9u2 (2019-05-13)'
- system: 'Linux'
- processor: ''
- python_build: ('default', 'Sep 27 2018 17:25:39')
- python_compiler: 'GCC 6.3.0 20170516'
- python_branch: ''
- python_implementation: 'CPython'
- python_revision: ''
- python_version: '3.5.3'
- release: '4.9.0-9-amd64'

Statistics:
counts (n=27875):
label n ~True ~False
------- ----- --- ------- --------
True 298 --> 42 256
False 27577 --> 194 27383
rates:
True False
---------- ------ -------
sample 0.011 0.989
population 0.011 0.989
match_rate (micro=0.981, macro=0.5):
False True
------- ------
0.992 0.008
filter_rate (micro=0.019, macro=0.5):
False True
------- ------
0.008 0.992
recall (micro=0.984, macro=0.567):
False True
------- ------
0.993 0.141
!recall (micro=0.15, macro=0.567):
False True
------- ------
0.141 0.993
precision (micro=0.982, macro=0.585):
False True
------- ------
0.991 0.179
!precision (micro=0.188, macro=0.585):
False True
------- ------
0.179 0.991
f1 (micro=0.983, macro=0.575):
False True
------- ------
0.992 0.158
!f1 (micro=0.167, macro=0.575):
False True
------- ------
0.158 0.992
accuracy (micro=0.984, macro=0.984):
False True
------- ------
0.984 0.984
fpr (micro=0.85, macro=0.433):
False True
------- ------
0.859 0.007
roc_auc (micro=0.85, macro=0.85):
False True
------- ------
0.85 0.85
pr_auc (micro=0.988, macro=0.537):
False True
------- ------
0.998 0.075

- score_schema: {'title': 'Scikit learn-based classifier score with probability', 'properties': {'prediction': {'description': 'The most likely label predicted by the estimator', 'type': 'boolean'}, 'probability': {'properties': {'false': {'type': 'number'}, 'true': {'type': 'number'}}, 'description': 'A mapping of probabilities onto each of the potential output labels', 'type': 'object'}}, 'type': 'object'}

81 changes: 81 additions & 0 deletions model_info/jawiki.goodfaith.md
@@ -0,0 +1,81 @@
Model Information:
- type: GradientBoosting
- version: 0.5.0
- params: {'learning_rate': 0.1, 'init': None, 'scale': True, 'labels': [True, False], 'min_samples_leaf': 7, 'random_state': None, 'loss': 'deviance', 'criterion': 'friedman_mse', 'verbose': 0, 'n_estimators': 300, 'presort': 'auto', 'population_rates': None, 'min_impurity_split': None, 'warm_start': False, 'min_weight_fraction_leaf': 0.0, 'max_depth': 1, 'max_leaf_nodes': None, 'center': True, 'max_features': 'log2', 'multilabel': False, 'min_impurity_decrease': 0.0, 'label_weights': OrderedDict([(False, 10)]), 'subsample': 1.0, 'min_samples_split': 2}
Environment:
- revscoring_version: '2.4.0'
- platform: 'Linux-4.9.0-9-amd64-x86_64-with-debian-9.9'
- machine: 'x86_64'
- version: '#1 SMP Debian 4.9.168-1+deb9u2 (2019-05-13)'
- system: 'Linux'
- processor: ''
- python_build: ('default', 'Sep 27 2018 17:25:39')
- python_compiler: 'GCC 6.3.0 20170516'
- python_branch: ''
- python_implementation: 'CPython'
- python_revision: ''
- python_version: '3.5.3'
- release: '4.9.0-9-amd64'

Statistics:
counts (n=27875):
label n ~True ~False
------- ----- --- ------- --------
True 27748 --> 27707 41
False 127 --> 117 10
rates:
True False
---------- ------ -------
sample 0.995 0.005
population 0.005 0.995
match_rate (micro=0.082, macro=0.5):
False True
------- ------
0.078 0.922
filter_rate (micro=0.918, macro=0.5):
False True
------- ------
0.922 0.078
recall (micro=0.083, macro=0.539):
False True
------- ------
0.079 0.999
!recall (micro=0.994, macro=0.539):
False True
------- ------
0.999 0.079
precision (micro=0.995, macro=0.502):
False True
------- ------
1 0.005
!precision (micro=0.01, macro=0.502):
False True
------- ------
0.005 1
f1 (micro=0.145, macro=0.078):
False True
------- ------
0.146 0.01
!f1 (micro=0.011, macro=0.078):
False True
------- ------
0.01 0.146
accuracy (micro=0.083, macro=0.083):
False True
------- ------
0.083 0.083
fpr (micro=0.006, macro=0.461):
False True
------- ------
0.001 0.921
roc_auc (micro=0.833, macro=0.834):
False True
------- ------
0.833 0.834
pr_auc (micro=0.985, macro=0.588):
False True
------- ------
0.989 0.188

- score_schema: {'title': 'Scikit learn-based classifier score with probability', 'type': 'object', 'properties': {'probability': {'description': 'A mapping of probabilities onto each of the potential output labels', 'type': 'object', 'properties': {'false': {'type': 'number'}, 'true': {'type': 'number'}}}, 'prediction': {'description': 'The most likely label predicted by the estimator', 'type': 'boolean'}}}