From 11a1baf7d1afaefa5f93eea94d0b277746eba785 Mon Sep 17 00:00:00 2001 From: Amir Sarabadani Date: Fri, 28 Aug 2020 14:10:32 +0200 Subject: [PATCH 1/3] Add is_astronomical_object feature for wikidatawiki Bug: T260770 --- articlequality/feature_lists/wikidatawiki.py | 17 +++++++-- .../wikidatawiki_data/__init__.py | 0 .../wikidatawiki_data/items_lists.py | 10 ++++++ .../property_datatypes.py | 16 +++++---- tests/feature_lists/test_wikidatawiki.py | 36 +++++++++++++++++++ 5 files changed, 70 insertions(+), 9 deletions(-) create mode 100644 articlequality/feature_lists/wikidatawiki_data/__init__.py create mode 100644 articlequality/feature_lists/wikidatawiki_data/items_lists.py rename articlequality/feature_lists/{ => wikidatawiki_data}/property_datatypes.py (98%) diff --git a/articlequality/feature_lists/wikidatawiki.py b/articlequality/feature_lists/wikidatawiki.py index c10001c..0c2c304 100644 --- a/articlequality/feature_lists/wikidatawiki.py +++ b/articlequality/feature_lists/wikidatawiki.py @@ -7,7 +7,8 @@ from revscoring.features.meta import aggregators from revscoring.features.modifiers import not_ -from . import wikibase, property_datatypes +from . import wikibase +from .wikidatawiki_data import property_datatypes, items_lists name = "wikidatawiki" @@ -139,6 +140,14 @@ def _process_important_description_translations(item_descriptions): "`int` : A count of unique sources in the revision" +def _process_is_astronomical_object(entity): + statements = entity.properties.get(properties.INSTANCE_OF, []) + for s in statements: + if str(s.claim.datavalue) in items_lists.ASTRONOMICAL_OBJECTS: + return True + return False + + def _process_item_completeness(current_properties, properties_suggested): current_properties = set(current_properties.keys()) @@ -173,7 +182,11 @@ def _process_item_completeness(current_properties, properties_suggested): items.SCHOLARLY_ARTICLE, name=name + '.revision.is_scholarlyarticle' ) - +is_astronomical_object = Feature( + name + '.revision.page.is_astronomical_object', + _process_is_astronomical_object, + returns=bool, + depends_on=[wikibase_.revision.datasources.entity]) local_wiki = [ is_scholarlyarticle, diff --git a/articlequality/feature_lists/wikidatawiki_data/__init__.py b/articlequality/feature_lists/wikidatawiki_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/articlequality/feature_lists/wikidatawiki_data/items_lists.py b/articlequality/feature_lists/wikidatawiki_data/items_lists.py new file mode 100644 index 0000000..186ed16 --- /dev/null +++ b/articlequality/feature_lists/wikidatawiki_data/items_lists.py @@ -0,0 +1,10 @@ +ASTRONOMICAL_OBJECTS = [ + 'Q523', 'Q318', 'Q1931185', 'Q1457376', 'Q2247863', 'Q3863', 'Q83373', + 'Q2154519', 'Q726242', 'Q1153690', 'Q204107', 'Q71963409', 'Q67206691', + 'Q1151284', 'Q67206701', 'Q66619666', 'Q72802727', 'Q2168098', 'Q6243', + 'Q72802508', 'Q11282', 'Q72803170', 'Q1332364', 'Q72802977', 'Q6999', + 'Q1491746', 'Q272447', 'Q497654', 'Q204194', 'Q130019', 'Q744691', + 'Q71798532', 'Q46587', 'Q11276', 'Q71965429', 'Q5871', 'Q72803622', + 'Q72803426', 'Q3937', 'Q72803708', 'Q168845', 'Q24452', 'Q67201574', + 'Q2557101', 'Q691269', 'Q13632', 'Q10451997', 'Q28738741', 'Q22247' +] diff --git a/articlequality/feature_lists/property_datatypes.py b/articlequality/feature_lists/wikidatawiki_data/property_datatypes.py similarity index 98% rename from articlequality/feature_lists/property_datatypes.py rename to articlequality/feature_lists/wikidatawiki_data/property_datatypes.py index 0b94d2e..95c9db6 100644 --- a/articlequality/feature_lists/property_datatypes.py +++ b/articlequality/feature_lists/wikidatawiki_data/property_datatypes.py @@ -1,14 +1,16 @@ # Wikidata feature lists -# To add/delete new properties from the `wikidatawiki.py` feature list we need to manually -# update this file. +# To add/delete new properties from the `wikidatawiki.py` feature list we need +# to manually update this file. -# Currently the file contains a list of `NON_EXTERNAL_IDENTIFIERS` that we want to extract -# as features. `NON_EXTERNAL_IDENTIFIERS` is used instead of `EXTERNAL_IDENTIFIERS` because -# external identifiers comprise about 5000+ of the total 8000+ currently on the database. -# Using NON_EXTERNAL_IDENTIFIERS (~2500) reduces the processing time. +# Currently the file contains a list of `NON_EXTERNAL_IDENTIFIERS` that we +# want to extract as features. `NON_EXTERNAL_IDENTIFIERS` is used instead of +# `EXTERNAL_IDENTIFIERS` because external identifiers comprise about 5000+ of +# the total 8000+ currently on the database. +# Using NON_EXTERNAL_IDENTIFIERS (~2500) reduces the processing time. -# This list was obtained by making this query: https://quarry.wmflabs.org/query/47645 +# This list was obtained by making this query: +# https://quarry.wmflabs.org/query/47645 # and then doing a search and replace to format it as a python list. NONEXTERNAL_IDENTIFIERS = [ diff --git a/tests/feature_lists/test_wikidatawiki.py b/tests/feature_lists/test_wikidatawiki.py index 96a48b1..eab3da8 100644 --- a/tests/feature_lists/test_wikidatawiki.py +++ b/tests/feature_lists/test_wikidatawiki.py @@ -57,3 +57,39 @@ def test_references_features(q7251): def test_external_identifiers(q7251): assert solve(aggregators.len(wikidatawiki.external_identifiers), cache={entity: q7251}) == 79 + + +def test_is_astronomical_object(q7251): + crab_nebula = { + 'title': 'Q207436', + 'id': 'Q207436', + 'claims': { + 'P31': [ + { + 'mainsnak': { + 'snaktype': 'value', + 'property': 'P31', + 'datavalue': { + 'value': { + 'entity-type': 'item', + 'numeric-id': 1931185, + 'id': 'Q1931185' + }, + 'type': 'wikibase-entityid' + }, + 'datatype': 'wikibase-item' + }, + 'type': 'statement', + 'id': 'Q10934$46D9E951-DD5A-4832-A2D2-DB0CE0425E0E', + 'rank': 'normal' + } + ] + } + } + + crab_nebula = mwbase.Entity.from_json(crab_nebula) + + assert solve(wikidatawiki.is_astronomical_object, + cache={entity: crab_nebula}) is True + assert solve(wikidatawiki.is_astronomical_object, + cache={entity: q7251}) is False From 4d80418e690ebda7c14a3590cec498e9ed7d9aef Mon Sep 17 00:00:00 2001 From: Amir Sarabadani Date: Fri, 28 Aug 2020 16:59:50 +0200 Subject: [PATCH 2/3] Add documentation for list of astronomical objects --- articlequality/feature_lists/wikidatawiki_data/items_lists.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/articlequality/feature_lists/wikidatawiki_data/items_lists.py b/articlequality/feature_lists/wikidatawiki_data/items_lists.py index 186ed16..e62690f 100644 --- a/articlequality/feature_lists/wikidatawiki_data/items_lists.py +++ b/articlequality/feature_lists/wikidatawiki_data/items_lists.py @@ -1,3 +1,5 @@ +# The list is generated using https://w.wiki/aaD +# Picking the top 50 ones only because it covers 97% of cases ASTRONOMICAL_OBJECTS = [ 'Q523', 'Q318', 'Q1931185', 'Q1457376', 'Q2247863', 'Q3863', 'Q83373', 'Q2154519', 'Q726242', 'Q1153690', 'Q204107', 'Q71963409', 'Q67206691', From 67d413b3cd19ff85e49823bf78af21580fb0c7f9 Mon Sep 17 00:00:00 2001 From: Amir Sarabadani Date: Fri, 28 Aug 2020 21:21:17 +0200 Subject: [PATCH 3/3] Add the feature to the list --- articlequality/feature_lists/wikidatawiki.py | 1 + 1 file changed, 1 insertion(+) diff --git a/articlequality/feature_lists/wikidatawiki.py b/articlequality/feature_lists/wikidatawiki.py index 0c2c304..12484dd 100644 --- a/articlequality/feature_lists/wikidatawiki.py +++ b/articlequality/feature_lists/wikidatawiki.py @@ -190,6 +190,7 @@ def _process_item_completeness(current_properties, properties_suggested): local_wiki = [ is_scholarlyarticle, + is_astronomical_object, is_human, is_blp, aggregators.len(complete_translations),