Skip to content

Commit

Permalink
Template huwiki (#135)
Browse files Browse the repository at this point in the history
That was easy
  • Loading branch information
Ladsgroup authored and adamwight committed Feb 26, 2018
1 parent 0811455 commit d037594
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 73 deletions.
72 changes: 72 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1354,6 +1354,78 @@ hrwiki_models: \
hrwiki_tuning_reports: \
tuning_reports/hrwiki.reverted.md

############################# Hungarian Wikipedia ################################

# Fetch the sampled-revision dataset from Quarry run 79645 (JSON-lines output)
# and store it as the target file.
# The URL is single-quoted so the shell never treats the `?` in the query
# string as a pathname-expansion (glob) character.
datasets/huwiki.sampled_revisions.40k_2016.json:
	wget -qO- 'http://quarry.wmflabs.org/run/79645/output/0/json-lines?download=true' > $@

# Run `./utility autolabel` against hu.wikipedia.org.  The sampled-revisions
# file ($<) is fed on stdin and the command's stdout becomes the target ($@).
datasets/huwiki.autolabeled_revisions.40k_2016.json: datasets/huwiki.sampled_revisions.40k_2016.json
	./utility autolabel \
	  --host=https://hu.wikipedia.org \
	  --trusted-groups=sysop,oversight,trusted,bot,rollbacker,checkuser,abusefilter,bureaucrat \
	  --trusted-edits=1000 \
	  --revert-radius=3 \
	  --revert-window=48 \
	  --verbose < $< > $@

# Assemble the review set from the autolabeled revisions ($<): at most 2500
# random lines containing '"needs_review": true' and at most 2500 containing
# '"needs_review": false', concatenated and then shuffled into the target.
datasets/huwiki.revisions_for_review.5k_2016.json: datasets/huwiki.autolabeled_revisions.40k_2016.json
	{ \
	  grep '"needs_review": true' $< | shuf -n 2500; \
	  grep '"needs_review": false' $< | shuf -n 2500; \
	} | shuf > $@

# Run `revscoring extract` with the huwiki "reverted" feature list.  The
# autolabeled revisions ($<) are read on stdin; stdout is written to $@.
# $(max_extractors) is defined elsewhere in the Makefile.
datasets/huwiki.autolabeled_revisions.w_cache.40k_2016.json: datasets/huwiki.autolabeled_revisions.40k_2016.json
	revscoring extract \
	  editquality.feature_lists.huwiki.reverted \
	  --host https://hu.wikipedia.org \
	  --extractor $(max_extractors) \
	  --verbose < $< > $@

# Produce the tuning report for the huwiki "reverted" model by running
# `revscoring tune` with config/classifiers.params.yaml.  The cached-feature
# dataset ($<) is read on stdin and the report is written to $@.
# $(reverted_weight) is defined elsewhere in the Makefile.
tuning_reports/huwiki.reverted.md: datasets/huwiki.autolabeled_revisions.w_cache.40k_2016.json
	revscoring tune \
	  config/classifiers.params.yaml \
	  editquality.feature_lists.huwiki.reverted \
	  reverted_for_damage \
	  roc_auc.labels.true \
	  --label-weight "true=$(reverted_weight)" \
	  --pop-rate "true=0.014812583163867339" \
	  --pop-rate "false=0.9851874168361326" \
	  --center --scale \
	  --cv-timeout 60 \
	  --debug < $< > $@

# Run `revscoring cv_train` with the RandomForest model class on the
# cached-feature dataset ($<, read on stdin); the command's stdout is
# written to $@.  $(reverted_major_minor) and $(reverted_weight) are defined
# elsewhere in the Makefile.
models/huwiki.reverted.rf.model: datasets/huwiki.autolabeled_revisions.w_cache.40k_2016.json
	revscoring cv_train \
	  revscoring.scoring.models.RandomForest \
	  editquality.feature_lists.huwiki.reverted \
	  reverted_for_damage \
	  --version=$(reverted_major_minor).1 \
	  -p 'criterion=entropy' \
	  -p 'min_samples_leaf=13' \
	  -p 'n_estimators=320' \
	  --label-weight "true=$(reverted_weight)" \
	  --pop-rate "true=0.014812583163867339" \
	  --pop-rate "false=0.9851874168361326" \
	  --center --scale < $< > $@

# Aggregate target: depends on every huwiki model file listed here.
huwiki_models: models/huwiki.reverted.rf.model

# Aggregate target: depends on every huwiki tuning report listed here.
huwiki_tuning_reports: tuning_reports/huwiki.reverted.md

############################# Indonesian Wikipedia ################################

datasets/idwiki.sampled_revisions.100k_2016.json:
Expand Down
71 changes: 0 additions & 71 deletions Makefile.manual
Original file line number Diff line number Diff line change
Expand Up @@ -593,77 +593,6 @@ hewiki_tuning_reports: \
tuning_reports/hewiki.damaging.md \
tuning_reports/hewiki.goodfaith.md

############################### Hungarian Wikipedia ###########################

# Fetch the sampled-revision dataset from Quarry run 79645 (JSON-lines output)
# and store it as the target file.
# The URL is single-quoted so the shell never treats the `?` in the query
# string as a pathname-expansion (glob) character.
datasets/huwiki.sampled_revisions.40k_2016.json:
	wget -qO- 'http://quarry.wmflabs.org/run/79645/output/0/json-lines?download=true' > $@

# Run `./utility autolabel` against hu.wikipedia.org.  The sampled-revisions
# file ($<) is fed on stdin and the command's stdout becomes the target ($@).
datasets/huwiki.autolabeled_revisions.40k_2016.json: datasets/huwiki.sampled_revisions.40k_2016.json
	./utility autolabel \
	  --host=https://hu.wikipedia.org \
	  --trusted-groups=sysop,oversight,trusted,bot,rollbacker,checkuser,abusefilter,bureaucrat \
	  --trusted-edits=1000 \
	  --verbose < $< > $@

# Run `revscoring extract` with the huwiki "reverted" feature list.  The
# autolabeled revisions ($<) are read on stdin; stdout is written to $@.
# $(max_extractors) is defined elsewhere in the Makefile.
datasets/huwiki.autolabeled_revisions.w_cache.40k_2016.json: datasets/huwiki.autolabeled_revisions.40k_2016.json
	revscoring extract \
	  editquality.feature_lists.huwiki.reverted \
	  --host https://hu.wikipedia.org \
	  --extractor $(max_extractors) \
	  --verbose < $< > $@

# Produce the tuning report for the huwiki "reverted" model by running
# `revscoring tune` with config/classifiers.params.yaml.  The cached-feature
# dataset ($<) is read on stdin and the report is written to $@.
# $(reverted_weight) is defined elsewhere in the Makefile.
tuning_reports/huwiki.reverted.md: datasets/huwiki.autolabeled_revisions.w_cache.40k_2016.json
	revscoring tune \
	  config/classifiers.params.yaml \
	  editquality.feature_lists.huwiki.reverted \
	  reverted_for_damage \
	  roc_auc.labels.true \
	  --label-weight "true=$(reverted_weight)" \
	  --pop-rate "true=0.014812583163867339" \
	  --pop-rate "false=0.9851874168361326" \
	  --center --scale \
	  --cv-timeout=60 \
	  --debug < $< > $@

# Run `revscoring cv_train` with the RandomForest model class on the
# cached-feature dataset ($<, read on stdin); the command's stdout is
# written to $@.  $(reverted_major_minor) and $(reverted_weight) are defined
# elsewhere in the Makefile.
models/huwiki.reverted.rf.model: datasets/huwiki.autolabeled_revisions.w_cache.40k_2016.json
	revscoring cv_train \
	  revscoring.scoring.models.RandomForest \
	  editquality.feature_lists.huwiki.reverted \
	  reverted_for_damage \
	  --version=$(reverted_major_minor).1 \
	  -p 'criterion="entropy"' \
	  -p 'max_features="log2"' \
	  -p 'n_estimators=320' \
	  -p 'min_samples_leaf=13' \
	  --label-weight "true=$(reverted_weight)" \
	  --pop-rate "true=0.014812583163867339" \
	  --pop-rate "false=0.9851874168361326" \
	  --center --scale < $< > $@

# Assemble the review set from the autolabeled revisions ($<): at most 2500
# random lines containing '"needs_review": true' and at most 2500 containing
# '"needs_review": false', concatenated and then shuffled into the target.
datasets/huwiki.revisions_for_review.5k_2016.json: datasets/huwiki.autolabeled_revisions.40k_2016.json
	{ \
	  grep '"needs_review": true' $< | shuf -n 2500; \
	  grep '"needs_review": false' $< | shuf -n 2500; \
	} | shuf > $@

# Aggregate target: depends on every huwiki model file listed here.
huwiki_models: models/huwiki.reverted.rf.model

# Aggregate target: depends on every huwiki tuning report listed here.
huwiki_tuning_reports: tuning_reports/huwiki.reverted.md

############################# Polish Wikipedia ############################

datasets/plwiki.sampled_revisions.500k_2015.json:
Expand Down
1 change: 0 additions & 1 deletion config/manual_wikis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ manual_wikis:
- enwiktionary
- fiwiki
- hewiki
- huwiki
- plwiki
- svwiki
- wikidatawiki
Expand Down
1 change: 1 addition & 0 deletions config/wikis/huwiki.yaml.tmp → config/wikis/huwiki.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ models:
n_estimators: 320
min_samples_leaf: 13
pop_rate_true: 0.014812583163867339
build_number: 1
3 changes: 2 additions & 1 deletion editquality/codegen/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def load_wiki(wiki, config):
continue
model = wiki["models"][model_name]
model_defaults = copy.deepcopy(config["model_defaults"])
model = util.deep_update(model_defaults, model)
if not model.get('rf'):
model = util.deep_update(model_defaults, model)
result[model_name] = model

wiki["models"] = result
Expand Down

0 comments on commit d037594

Please sign in to comment.