-
Notifications
You must be signed in to change notification settings - Fork 25
/
Makefile.j2
157 lines (147 loc) · 7.22 KB
/
Makefile.j2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# This file is built automatically using the cg.py script and the Makefile.j2 template.
# Any changes you make to this file will be lost on the next run.
# Remove target files after command failure.
.DELETE_ON_ERROR:
# Aggregate target: depends on the <wiki>_models target of every entry in
# wiki_names. It is declared phony because it names a task rather than a file
# to create; the model rules write their outputs under a models/ directory,
# which must never be mistaken for this target.
# Each prerequisite carries its own leading line continuation, so an empty
# wiki_names list still renders a valid (empty) rule.
.PHONY: models
models:{% for wiki in wiki_names %} \
		{{ wiki }}_models{% endfor %}
# Aggregate target: depends on the <wiki>_tuning_reports target of every entry
# in wiki_names. It is declared phony because it names a task rather than a
# file to create; the report rules write their outputs under a tuning_reports/
# directory, which must never be mistaken for this target.
# Each prerequisite carries its own leading line continuation, so an empty
# wiki_names list still renders a valid (empty) rule.
.PHONY: tuning_reports
tuning_reports:{% for wiki in wiki_names %} \
		{{ wiki }}_tuning_reports{% endfor %}
# Repeated -s options naming the statistics to report, gathered in a single
# variable. Nothing in this template expands it; presumably the rules in
# Makefile.manual do (TODO confirm).
# General-purpose statistics and curves.
test_statistics  = -s 'table' -s 'accuracy' -s 'precision' -s 'recall'
test_statistics += -s 'pr' -s 'roc'
# Recall at a bounded false-positive rate.
test_statistics += -s 'recall_at_fpr(max_fpr=0.10)'
# Filter rate at minimum recall levels.
test_statistics += -s 'filter_rate_at_recall(min_recall=0.9)'
test_statistics += -s 'filter_rate_at_recall(min_recall=0.75)'
# Recall at minimum precision levels, from strictest to loosest.
test_statistics += -s 'recall_at_precision(min_precision=0.995)'
test_statistics += -s 'recall_at_precision(min_precision=0.99)'
test_statistics += -s 'recall_at_precision(min_precision=0.98)'
test_statistics += -s 'recall_at_precision(min_precision=0.90)'
test_statistics += -s 'recall_at_precision(min_precision=0.75)'
test_statistics += -s 'recall_at_precision(min_precision=0.60)'
test_statistics += -s 'recall_at_precision(min_precision=0.45)'
test_statistics += -s 'recall_at_precision(min_precision=0.15)'
# Shared "major.minor" prefix of the model version. The model training rules
# pass $(<model>_major_minor).<build_number> to the --version option.
major_minor = 0.3
# Per-model prefixes default to the shared one. They use recursive (=)
# assignment, so they follow whatever value major_minor holds at the moment
# they are expanded, including a value assigned later in the makefile.
reverted_major_minor = $(major_minor)
damaging_major_minor = $(major_minor)
goodfaith_major_minor = $(major_minor)
include Makefile.manual
{% for wiki in wikis %}
############################# {{ wiki.label }} ################################
{% for sample_name, sample in wiki.samples|dictsort %}{%- if "quarry_url" in sample %}
{%- if "quarry_page" in sample %}
# From {{ sample.quarry_page }}{% endif %}
{#- Download the sampled revisions for this sample from its quarry_url.
    The URL is single-quoted so that the shell passes it to wget verbatim:
    characters such as `?` or `&` in a query string would otherwise be
    treated as a glob pattern or a control operator. #}
datasets/{{ wiki.name }}.sampled_revisions.{{ sample_name }}.json:
	wget -qO- '{{ sample.quarry_url }}' > $@
{#- Run `./utility autolabel` over the sampled revisions: the sample file is
    fed on stdin and the command's stdout becomes the target. #}
datasets/{{ wiki.name }}.autolabeled_revisions.{{ sample_name }}.json: \
		datasets/{{ wiki.name }}.sampled_revisions.{{ sample_name }}.json
	./utility autolabel \
		--host=https://{{ wiki.host }} \
		--trusted-groups={{ wiki.trusted_groups|join(",") }} \
		--trusted-edits=1000 \
		--revert-radius=3 \
		--revert-window=48 \
		--verbose < $< > $@
{% endif %}{% if "labeling_campaign" in sample %}
{#- Run `./utility fetch_labels` for this sample's labeling campaign and
    store its stdout as the target. The rule has no prerequisites. #}
datasets/{{ wiki.name }}.human_labeled_revisions.{{ sample_name }}.json:
	./utility fetch_labels {{ sample.labeling_campaign }} > $@
{% endif %}{% endfor %}{% if wiki.balanced_5k_set_for_review %}
{#- Balanced review set: at most 2500 random lines marked
    "needs_review": true plus at most 2500 random lines marked
    "needs_review": false, shuffled together. The source is the
    default_sample_bigger_sample when the wiki defines one, otherwise the
    default_sample. #}
datasets/{{ wiki.name }}.revisions_for_review.{{ wiki.review_sample }}.json: \
		datasets/{{ wiki.name }}.autolabeled_revisions.{{ wiki.default_sample_bigger_sample or wiki.default_sample }}.json
	( \
		grep '"needs_review": true' $< | shuf -n 2500; \
		grep '"needs_review": false' $< | shuf -n 2500 \
	) | shuf > $@
{% endif %}{% if wiki.unbalanced_5k_set_for_review %}
{#- Unbalanced review set: every line marked "needs_review": true, in random
    order. The source is the sample_to_build_review when the wiki defines
    one, otherwise the default_sample. #}
datasets/{{ wiki.name }}.revisions_for_review.{{ wiki.review_sample }}.json: \
		datasets/{{ wiki.name }}.autolabeled_revisions.{{ wiki.sample_to_build_review or wiki.default_sample }}.json
	grep '"needs_review": true' $< | shuf > $@
{% endif %}{% if wiki.merged_samples %}{% for big_sample_name, big_sample in wiki.merged_samples|dictsort %}
{#- Merge the label files of a merged sample with `./utility merge_labels`,
    which receives every prerequisite as an argument.
    The human-labeled and autolabeled prerequisites are emitted
    independently, each with its own leading line continuation. A merged
    sample that lists only autolabeled revisions therefore still renders a
    valid rule instead of a dangling backslash that would splice the recipe
    into the prerequisite list. #}
datasets/{{ wiki.name }}.labeled_revisions.{{ big_sample_name }}.json:
{%- if big_sample.human_labeled_revisions %} \
		datasets/{{ wiki.name }}.human_labeled_revisions.{{ big_sample.human_labeled_revisions }}.json
{%- endif %}
{%- if big_sample.autolabeled_revisions %} \
		datasets/{{ wiki.name }}.autolabeled_revisions.{{ big_sample.autolabeled_revisions }}.json
{%- endif %}
	./utility merge_labels $^ > $@
{% endfor %}
{%- endif %}{% if wiki.models.reverted %}
{#- Extract the features of the `reverted` feature list for the autolabeled
    revisions. When the wiki sets default_sample_size, that many random lines
    are drawn with shuf; otherwise the whole file is passed through cat.
    $(max_extractors) is not defined in this template; presumably it comes
    from Makefile.manual (TODO confirm). #}
datasets/{{ wiki.name }}.autolabeled_revisions.w_cache.{{ wiki.default_sample }}.json: \
		datasets/{{ wiki.name }}.autolabeled_revisions.{{ wiki.default_sample_bigger_sample or wiki.default_sample }}.json
	{{ ("shuf -n " ~ wiki.default_sample_size) if wiki.default_sample_size else "cat" }} $< | \
		revscoring extract \
			editquality.feature_lists.{{ wiki.name }}.reverted \
			--host https://{{ wiki.host }} \
			--extractor $(max_extractors) \
			--verbose > $@
{% endif %}{% if wiki.merged_samples %}
{%- for big_sample_name, big_sample in wiki.merged_samples|dictsort %}
{#- Extract features for a merged labeled sample. One feature list is passed
    per model configured for the wiki; the labeled revisions are read from
    stdin and stdout becomes the target.
    $(max_extractors) is not defined in this template; presumably it comes
    from Makefile.manual (TODO confirm). #}
datasets/{{ wiki.name }}.labeled_revisions.w_cache.{{ big_sample_name }}.json: \
		datasets/{{ wiki.name }}.labeled_revisions.{{ big_sample_name }}.json
	revscoring extract \
{%- for model_name, model in wiki.models.items() %}
		editquality.feature_lists.{{ wiki.name }}.{{ model_name }} \
{%- endfor %}
		--host https://{{ wiki.host }} \
		--extractor $(max_extractors) \
		--verbose < $< > $@
{% endfor %}{% endif %}{% for model_name, model in wiki.models.items() %}
{#- Tuning report for one model, produced by `revscoring tune`.
    Prerequisites are the w_cache file of every entry in model.datasets, or
    of the wiki's default_sample when the model lists none. The `reverted`
    model reads autolabeled revisions and is tuned on the
    reverted_for_damage label; every other model reads labeled revisions and
    uses its own name as the label.
    The label weight is applied to "false" for goodfaith and to "true"
    otherwise. $(<model>_weight) is not defined in this template; presumably
    it comes from Makefile.manual (TODO confirm). #}
tuning_reports/{{ wiki.name }}.{{ model_name }}.md:
{%- for dataset in (model.datasets or [wiki.default_sample]) %} \
		datasets/{{ wiki.name }}.{{ "auto" if model_name == "reverted" else "" }}labeled_revisions.w_cache.{{ dataset }}.json
{%- endfor %}
	cat {{ "$^" if model.datasets else "$<" }} | \
		revscoring tune \
			config/classifiers.params.yaml \
			editquality.feature_lists.{{ wiki.name }}.{{ model_name }} \
			{{ "reverted_for_damage" if model_name == "reverted" else model_name }} \
			roc_auc.labels.true \
			--label-weight "{{ "false" if model_name == "goodfaith" else "true" }}=$({{ model_name }}_weight)" \
			--pop-rate "true={{ model.pop_rate_true }}" \
			--pop-rate "false={{ 1 - model.pop_rate_true }}" \
			--center --scale \
			--cv-timeout 60 \
			--debug > $@
{#- Train one model with `revscoring cv_train`.
    The file name and the classifier class follow model.rf: `rf` with
    RandomForest when it is set, `gradient_boosting` with GradientBoosting
    otherwise. Prerequisites are the w_cache file of every entry in
    model.datasets, or of the wiki's default_sample when the model lists
    none. The version is $(<model>_major_minor) followed by the model's
    build_number, and each entry of model.tuning_params is passed as a -p
    option in key order.
    $(<model>_weight) is not defined in this template; presumably it comes
    from Makefile.manual (TODO confirm). #}
models/{{ wiki.name }}.{{ model_name }}.{{ "rf" if model.rf else "gradient_boosting" }}.model:
{%- for dataset in (model.datasets or [wiki.default_sample]) %} \
		datasets/{{ wiki.name }}.{{ "auto" if model_name == "reverted" else "" }}labeled_revisions.w_cache.{{ dataset }}.json
{%- endfor %}
	cat {{ "$^" if model.datasets else "$<" }} | \
		revscoring cv_train \
			revscoring.scoring.models.{{ "RandomForest" if model.rf else "GradientBoosting" }} \
			editquality.feature_lists.{{ wiki.name }}.{{ model_name }} \
			{{ "reverted_for_damage" if model_name == "reverted" else model_name }} \
			--version=$({{ model_name }}_major_minor).{{ model.build_number }} \
{%- for key, value in model.tuning_params|dictsort %}
			-p '{{ key }}={{ value }}' \
{%- endfor %}
			--label-weight "{{ "false" if model_name == "goodfaith" else "true" }}=$({{ model_name }}_weight)" \
			--pop-rate "true={{ model.pop_rate_true }}" \
			--pop-rate "false={{ 1 - model.pop_rate_true }}" \
			--center --scale > $@
{% endfor %}
{#- Per-wiki aggregate targets, emitted only for wikis that configure models:
    <wiki>_models depends on every model file of the wiki and
    <wiki>_tuning_reports on every tuning report. Both are declared phony
    because they name tasks and no recipe ever creates files with those
    names. Each prerequisite carries its own leading line continuation. #}
{% if wiki.models %}.PHONY: {{ wiki.name }}_models {{ wiki.name }}_tuning_reports
{{ wiki.name }}_models:
{%- for model_name, model in wiki.models.items() %} \
		models/{{ wiki.name }}.{{ model_name }}.{{ "rf" if model.rf else "gradient_boosting" }}.model
{%- endfor %}
{{ wiki.name }}_tuning_reports:
{%- for model_name, model in wiki.models.items() %} \
		tuning_reports/{{ wiki.name }}.{{ model_name }}.md
{%- endfor %}
{%- endif -%}
{% endfor %}