wikimedia · groceryheist · Jun 7, 2019 · Jun 4, 2019
diff --git a/Makefile b/Makefile
@@ -2258,6 +2258,10 @@ itwiki_tuning_reports: \
 
 
 ############################# Japanese Wikipedia ################################
+datasets/jawiki.human_labeled_revisions.5k_2016.json:
+	./utility fetch_labels \
+		https://labels.wmflabs.org/campaigns/jawiki/15 > $@
+
 # From https://quarry.wmflabs.org/query/9927
 datasets/jawiki.sampled_revisions.40k_2016.json:
 	wget -qO- https://quarry.wmflabs.org/run/89016/output/0/json-lines?download=true > $@
@@ -2271,54 +2275,98 @@ datasets/jawiki.autolabeled_revisions.40k_2016.json: \
 		--revert-radius=5 \
 		--verbose > $@
 
-datasets/jawiki.autolabeled_revisions.w_cache.40k_2016.json: \
+datasets/jawiki.labeled_revisions.40k_2016.json: \
+		datasets/jawiki.human_labeled_revisions.5k_2016.json \
 		datasets/jawiki.autolabeled_revisions.40k_2016.json
+	./utility merge_labels $^ > $@
+
+datasets/jawiki.labeled_revisions.w_cache.40k_2016.json: \
+		datasets/jawiki.labeled_revisions.40k_2016.json
 	cat $< | \
 	revscoring extract \
-		editquality.feature_lists.jawiki.reverted \
+		editquality.feature_lists.jawiki.damaging \
+		editquality.feature_lists.jawiki.goodfaith \
 		--host https://ja.wikipedia.org \
 		--extractors $(max_extractors) \
 		--verbose > $@
 
-tuning_reports/jawiki.reverted.md: \
-		datasets/jawiki.autolabeled_revisions.w_cache.40k_2016.json
+tuning_reports/jawiki.damaging.md: \
+		datasets/jawiki.labeled_revisions.w_cache.40k_2016.json
 	cat $< | \
 	revscoring tune \
 		config/classifiers.params.yaml \
-		editquality.feature_lists.jawiki.reverted \
-		reverted_for_damage \
-		$(reverted_tuning_statistic) \
-		--label-weight $(reverted_weight) \
-		--pop-rate "true=0.03256945140908635" \
-		--pop-rate "false=0.9674305485909136" \
+		editquality.feature_lists.jawiki.damaging \
+		damaging \
+		$(damaging_tuning_statistic) \
+		--label-weight $(damaging_weight) \
+		--pop-rate "true=0.010758453070269498" \
+		--pop-rate "false=0.9892415469297305" \
 		--center --scale \
 		--cv-timeout 60 \
 		--debug > $@
 
-models/jawiki.reverted.gradient_boosting.model: \
-		datasets/jawiki.autolabeled_revisions.w_cache.40k_2016.json
+models/jawiki.damaging.gradient_boosting.model: \
+		datasets/jawiki.labeled_revisions.w_cache.40k_2016.json
 	cat $< | \
 	revscoring cv_train \
 		revscoring.scoring.models.GradientBoosting \
-		editquality.feature_lists.jawiki.reverted \
-		reverted_for_damage \
-		--version=$(reverted_major_minor).0 \
-		-p 'learning_rate=0.01' \
-		-p 'max_depth=7' \
+		editquality.feature_lists.jawiki.damaging \
+		damaging \
+		--version=$(damaging_major_minor).0 \
+		-p 'learning_rate=0.1' \
+		-p 'max_depth=1' \
 		-p 'max_features="log2"' \
-		-p 'n_estimators=700' \
-		--label-weight $(reverted_weight) \
-		--pop-rate "true=0.03256945140908635" \
-		--pop-rate "false=0.9674305485909136" \
+		-p 'min_samples_leaf=3' \
+		-p 'n_estimators=300' \
+		--label-weight $(damaging_weight) \
+		--pop-rate "true=0.010758453070269498" \
+		--pop-rate "false=0.9892415469297305" \
+		--center --scale > $@
+
+	revscoring model_info $@ > model_info/jawiki.damaging.md
+
+tuning_reports/jawiki.goodfaith.md: \
+		datasets/jawiki.labeled_revisions.w_cache.40k_2016.json
+	cat $< | \
+	revscoring tune \
+		config/classifiers.params.yaml \
+		editquality.feature_lists.jawiki.goodfaith \
+		goodfaith \
+		$(goodfaith_tuning_statistic) \
+		--label-weight $(goodfaith_weight) \
+		--pop-rate "true=0.995389234398456" \
+		--pop-rate "false=0.00461076560154407" \
+		--center --scale \
+		--cv-timeout 60 \
+		--debug > $@
+
+models/jawiki.goodfaith.gradient_boosting.model: \
+		datasets/jawiki.labeled_revisions.w_cache.40k_2016.json
+	cat $< | \
+	revscoring cv_train \
+		revscoring.scoring.models.GradientBoosting \
+		editquality.feature_lists.jawiki.goodfaith \
+		goodfaith \
+		--version=$(goodfaith_major_minor).0 \
+		-p 'learning_rate=0.1' \
+		-p 'max_depth=1' \
+		-p 'max_features="log2"' \
+		-p 'min_samples_leaf=7' \
+		-p 'n_estimators=300' \
+		--label-weight $(goodfaith_weight) \
+		--pop-rate "true=0.995389234398456" \
+		--pop-rate "false=0.00461076560154407" \
 		--center --scale > $@
 
-	revscoring model_info $@ > model_info/jawiki.reverted.md
+	revscoring model_info $@ > model_info/jawiki.goodfaith.md
 
 jawiki_models: \
-	models/jawiki.reverted.gradient_boosting.model
+	models/jawiki.damaging.gradient_boosting.model \
+	models/jawiki.goodfaith.gradient_boosting.model
 
 jawiki_tuning_reports: \
-	tuning_reports/jawiki.reverted.md
+	tuning_reports/jawiki.damaging.md \
+	tuning_reports/jawiki.goodfaith.md
 
 
 ############################# Korean Wikipedia ################################
@@ -3790,6 +3838,7 @@ models/zhwiki.damaging.gradient_boosting.model: \
 		-p 'learning_rate=0.01' \
 		-p 'max_depth=3' \
 		-p 'max_features="log2"' \
+		-p 'min_samples_leaf=7' \
 		-p 'n_estimators=700' \
 		--label-weight $(damaging_weight) \
 		--pop-rate "true=0.0405" \
@@ -3824,6 +3873,7 @@ models/zhwiki.goodfaith.gradient_boosting.model: \
 		-p 'learning_rate=0.01' \
 		-p 'max_depth=3' \
 		-p 'max_features="log2"' \
+		-p 'min_samples_leaf=5' \
 		-p 'n_estimators=500' \
 		--label-weight $(goodfaith_weight) \
 		--pop-rate "true=0.9682" \

diff --git a/config/wikis/jawiki.yaml b/config/wikis/jawiki.yaml
@@ -6,6 +6,8 @@ external_samples:
   sampled_revisions.40k_2016:
     quarry_page: https://quarry.wmflabs.org/query/9927
     quarry_url: https://quarry.wmflabs.org/run/89016/output/0/json-lines?download=true
+  human_labeled_revisions.5k_2016:
+    labeling_campaign: https://labels.wmflabs.org/campaigns/jawiki/15
 
 autolabeled_samples:
   trusted_edits: 1000
@@ -22,22 +24,43 @@ autolabeled_samples:
   labeled_samples:
     autolabeled_revisions.40k_2016: sampled_revisions.40k_2016
 
+merged_samples:
+  labeled_revisions.40k_2016:
+    - human_labeled_revisions.5k_2016
+    - autolabeled_revisions.40k_2016
+
 extracted_samples:
-  autolabeled_revisions.w_cache.40k_2016:
-    sample: autolabeled_revisions.40k_2016
+  labeled_revisions.w_cache.40k_2016:
+    sample: labeled_revisions.40k_2016
     features_for:
-      - reverted
+      - damaging
+      - goodfaith
 
 models:
-  reverted:
-    observations: autolabeled_revisions.w_cache.40k_2016
-    label: reverted_for_damage
-    pop_rate_true: 0.03256945140908635
+  damaging:
+    observations: labeled_revisions.w_cache.40k_2016
+    label: damaging
+    pop_rate_true: 0.010758453070269498
     tune: true
     cv_train:
       algorithm: GradientBoosting
       parameters:
-        learning_rate: 0.01
-        max_depth: 7
+        learning_rate: 0.1
+        max_depth: 1
         max_features: log2
-        n_estimators: 700
+        n_estimators: 300
+        min_samples_leaf: 3
+  goodfaith:
+    observations: labeled_revisions.w_cache.40k_2016
+    label: goodfaith
+    pop_rate_true: 0.995389234398456  # rate of goodfaith=true edits (the majority class, ~0.995 of the labeled sample); false rate is the complement
+    tune: true
+    cv_train:
+      algorithm: GradientBoosting
+      parameters:
+        learning_rate: 0.1
+        max_depth: 1
+        max_features: log2
+        n_estimators: 300
+        min_samples_leaf: 7
+
diff --git a/model_info/jawiki.damaging.md b/model_info/jawiki.damaging.md
@@ -0,0 +1,81 @@
+Model Information:
+	 - type: GradientBoosting
+	 - version: 0.5.0
+	 - params: {'label_weights': OrderedDict([(True, 10)]), 'subsample': 1.0, 'random_state': None, 'population_rates': None, 'criterion': 'friedman_mse', 'max_leaf_nodes': None, 'multilabel': False, 'min_impurity_split': None, 'loss': 'deviance', 'n_estimators': 300, 'min_samples_leaf': 3, 'presort': 'auto', 'min_weight_fraction_leaf': 0.0, 'learning_rate': 0.1, 'warm_start': False, 'scale': True, 'min_impurity_decrease': 0.0, 'init': None, 'verbose': 0, 'labels': [True, False], 'max_features': 'log2', 'center': True, 'max_depth': 1, 'min_samples_split': 2}
+	Environment:
+	 - revscoring_version: '2.4.0'
+	 - platform: 'Linux-4.9.0-9-amd64-x86_64-with-debian-9.9'
+	 - machine: 'x86_64'
+	 - version: '#1 SMP Debian 4.9.168-1+deb9u2 (2019-05-13)'
+	 - system: 'Linux'
+	 - processor: ''
+	 - python_build: ('default', 'Sep 27 2018 17:25:39')
+	 - python_compiler: 'GCC 6.3.0 20170516'
+	 - python_branch: ''
+	 - python_implementation: 'CPython'
+	 - python_revision: ''
+	 - python_version: '3.5.3'
+	 - release: '4.9.0-9-amd64'
+
+	Statistics:
+	counts (n=27875):
+		label        n         ~True    ~False
+		-------  -----  ---  -------  --------
+		True       298  -->       42       256
+		False    27577  -->      194     27383
+	rates:
+		              True    False
+		----------  ------  -------
+		sample       0.011    0.989
+		population   0.011    0.989
+	match_rate (micro=0.981, macro=0.5):
+		  False    True
+		-------  ------
+		  0.992   0.008
+	filter_rate (micro=0.019, macro=0.5):
+		  False    True
+		-------  ------
+		  0.008   0.992
+	recall (micro=0.984, macro=0.567):
+		  False    True
+		-------  ------
+		  0.993   0.141
+	!recall (micro=0.15, macro=0.567):
+		  False    True
+		-------  ------
+		  0.141   0.993
+	precision (micro=0.982, macro=0.585):
+		  False    True
+		-------  ------
+		  0.991   0.179
+	!precision (micro=0.188, macro=0.585):
+		  False    True
+		-------  ------
+		  0.179   0.991
+	f1 (micro=0.983, macro=0.575):
+		  False    True
+		-------  ------
+		  0.992   0.158
+	!f1 (micro=0.167, macro=0.575):
+		  False    True
+		-------  ------
+		  0.158   0.992
+	accuracy (micro=0.984, macro=0.984):
+		  False    True
+		-------  ------
+		  0.984   0.984
+	fpr (micro=0.85, macro=0.433):
+		  False    True
+		-------  ------
+		  0.859   0.007
+	roc_auc (micro=0.85, macro=0.85):
+		  False    True
+		-------  ------
+		   0.85    0.85
+	pr_auc (micro=0.988, macro=0.537):
+		  False    True
+		-------  ------
+		  0.998   0.075
+
+	 - score_schema: {'title': 'Scikit learn-based classifier score with probability', 'properties': {'prediction': {'description': 'The most likely label predicted by the estimator', 'type': 'boolean'}, 'probability': {'properties': {'false': {'type': 'number'}, 'true': {'type': 'number'}}, 'description': 'A mapping of probabilities onto each of the potential output labels', 'type': 'object'}}, 'type': 'object'}
+
diff --git a/model_info/jawiki.goodfaith.md b/model_info/jawiki.goodfaith.md
@@ -0,0 +1,81 @@
+Model Information:
+	 - type: GradientBoosting
+	 - version: 0.5.0
+	 - params: {'learning_rate': 0.1, 'init': None, 'scale': True, 'labels': [True, False], 'min_samples_leaf': 7, 'random_state': None, 'loss': 'deviance', 'criterion': 'friedman_mse', 'verbose': 0, 'n_estimators': 300, 'presort': 'auto', 'population_rates': None, 'min_impurity_split': None, 'warm_start': False, 'min_weight_fraction_leaf': 0.0, 'max_depth': 1, 'max_leaf_nodes': None, 'center': True, 'max_features': 'log2', 'multilabel': False, 'min_impurity_decrease': 0.0, 'label_weights': OrderedDict([(False, 10)]), 'subsample': 1.0, 'min_samples_split': 2}
+	Environment:
+	 - revscoring_version: '2.4.0'
+	 - platform: 'Linux-4.9.0-9-amd64-x86_64-with-debian-9.9'
+	 - machine: 'x86_64'
+	 - version: '#1 SMP Debian 4.9.168-1+deb9u2 (2019-05-13)'
+	 - system: 'Linux'
+	 - processor: ''
+	 - python_build: ('default', 'Sep 27 2018 17:25:39')
+	 - python_compiler: 'GCC 6.3.0 20170516'
+	 - python_branch: ''
+	 - python_implementation: 'CPython'
+	 - python_revision: ''
+	 - python_version: '3.5.3'
+	 - release: '4.9.0-9-amd64'
+
+	Statistics:
+	counts (n=27875):
+		label        n         ~True    ~False
+		-------  -----  ---  -------  --------
+		True     27748  -->    27707        41
+		False      127  -->      117        10
+	rates:
+		              True    False
+		----------  ------  -------
+		sample       0.995    0.005
+		population   0.005    0.995
+	match_rate (micro=0.082, macro=0.5):
+		  False    True
+		-------  ------
+		  0.078   0.922
+	filter_rate (micro=0.918, macro=0.5):
+		  False    True
+		-------  ------
+		  0.922   0.078
+	recall (micro=0.083, macro=0.539):
+		  False    True
+		-------  ------
+		  0.079   0.999
+	!recall (micro=0.994, macro=0.539):
+		  False    True
+		-------  ------
+		  0.999   0.079
+	precision (micro=0.995, macro=0.502):
+		  False    True
+		-------  ------
+		      1   0.005
+	!precision (micro=0.01, macro=0.502):
+		  False    True
+		-------  ------
+		  0.005       1
+	f1 (micro=0.145, macro=0.078):
+		  False    True
+		-------  ------
+		  0.146    0.01
+	!f1 (micro=0.011, macro=0.078):
+		  False    True
+		-------  ------
+		   0.01   0.146
+	accuracy (micro=0.083, macro=0.083):
+		  False    True
+		-------  ------
+		  0.083   0.083
+	fpr (micro=0.006, macro=0.461):
+		  False    True
+		-------  ------
+		  0.001   0.921
+	roc_auc (micro=0.833, macro=0.834):
+		  False    True
+		-------  ------
+		  0.833   0.834
+	pr_auc (micro=0.985, macro=0.588):
+		  False    True
+		-------  ------
+		  0.989   0.188
+
+	 - score_schema: {'title': 'Scikit learn-based classifier score with probability', 'type': 'object', 'properties': {'probability': {'description': 'A mapping of probabilities onto each of the potential output labels', 'type': 'object', 'properties': {'false': {'type': 'number'}, 'true': {'type': 'number'}}}, 'prediction': {'description': 'The most likely label predicted by the estimator', 'type': 'boolean'}}}
+