-
Notifications
You must be signed in to change notification settings - Fork 20
/
ArticleTopicFeature.php
162 lines (149 loc) · 6.32 KB
/
ArticleTopicFeature.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
<?php
namespace CirrusSearch\Query;
use CirrusSearch\Search\SearchContext;
use CirrusSearch\WarningCollector;
use CirrusSearch\Wikimedia\WeightedTagsHooks;
use Elastica\Query\DisMax;
use Elastica\Query\Term;
use MediaWiki\Message\Message;
/**
 * Finds pages based on how well they match a given topic, based on scores provided by the
 * (Wikimedia-specific) articletopic ORES model.
 *
 * Implements the `articletopic:` and `drafttopic:` search keywords. The keyword value is a
 * `|`-separated list of topic terms (the keys of self::TERMS_TO_LABELS); each valid term becomes
 * a term query against the weighted tags field, and the per-topic queries are combined in a
 * DisMax query.
 *
 * @package CirrusSearch\Query
 * @see WeightedTagsHooks
 * @see https://www.mediawiki.org/wiki/Help:CirrusSearch#Articletopic
 */
class ArticleTopicFeature extends SimpleKeywordFeature {
	/** Weighted-tag prefix under which articletopic labels are looked up. */
	public const ARTICLE_TOPIC_TAG_PREFIX = 'classification.ores.articletopic';

	/** Weighted-tag prefix under which drafttopic labels are looked up. */
	public const DRAFT_TOPIC_TAG_PREFIX = 'classification.ores.drafttopic';

	/** Search keyword => weighted-tag prefix queried for that keyword. */
	private const PREFIX_PER_KEYWORD = [
		'articletopic' => self::ARTICLE_TOPIC_TAG_PREFIX,
		'drafttopic' => self::DRAFT_TOPIC_TAG_PREFIX
	];

	/**
	 * Search term (as typed by the user after the keyword) => ORES topic label.
	 *
	 * Labels are used verbatim when building term queries; a trailing `*` is part of the
	 * label text and is not expanded as a wildcard by this class.
	 */
	public const TERMS_TO_LABELS = [
		'biography' => 'Culture.Biography.Biography*',
		'women' => 'Culture.Biography.Women',
		'food-and-drink' => 'Culture.Food and drink',
		'internet-culture' => 'Culture.Internet culture',
		'linguistics' => 'Culture.Linguistics',
		'literature' => 'Culture.Literature',
		'books' => 'Culture.Media.Books',
		'entertainment' => 'Culture.Media.Entertainment',
		'films' => 'Culture.Media.Films',
		'media' => 'Culture.Media.Media*',
		'music' => 'Culture.Media.Music',
		'radio' => 'Culture.Media.Radio',
		'software' => 'Culture.Media.Software',
		'television' => 'Culture.Media.Television',
		'video-games' => 'Culture.Media.Video games',
		'performing-arts' => 'Culture.Performing arts',
		'philosophy-and-religion' => 'Culture.Philosophy and religion',
		'sports' => 'Culture.Sports',
		'architecture' => 'Culture.Visual arts.Architecture',
		'comics-and-anime' => 'Culture.Visual arts.Comics and Anime',
		'fashion' => 'Culture.Visual arts.Fashion',
		'visual-arts' => 'Culture.Visual arts.Visual arts*',
		'geographical' => 'Geography.Geographical',
		'africa' => 'Geography.Regions.Africa.Africa*',
		'central-africa' => 'Geography.Regions.Africa.Central Africa',
		'eastern-africa' => 'Geography.Regions.Africa.Eastern Africa',
		'northern-africa' => 'Geography.Regions.Africa.Northern Africa',
		'southern-africa' => 'Geography.Regions.Africa.Southern Africa',
		'western-africa' => 'Geography.Regions.Africa.Western Africa',
		'central-america' => 'Geography.Regions.Americas.Central America',
		'north-america' => 'Geography.Regions.Americas.North America',
		'south-america' => 'Geography.Regions.Americas.South America',
		'asia' => 'Geography.Regions.Asia.Asia*',
		'central-asia' => 'Geography.Regions.Asia.Central Asia',
		'east-asia' => 'Geography.Regions.Asia.East Asia',
		'north-asia' => 'Geography.Regions.Asia.North Asia',
		'south-asia' => 'Geography.Regions.Asia.South Asia',
		'southeast-asia' => 'Geography.Regions.Asia.Southeast Asia',
		'west-asia' => 'Geography.Regions.Asia.West Asia',
		'eastern-europe' => 'Geography.Regions.Europe.Eastern Europe',
		'europe' => 'Geography.Regions.Europe.Europe*',
		'northern-europe' => 'Geography.Regions.Europe.Northern Europe',
		'southern-europe' => 'Geography.Regions.Europe.Southern Europe',
		'western-europe' => 'Geography.Regions.Europe.Western Europe',
		'oceania' => 'Geography.Regions.Oceania',
		'business-and-economics' => 'History and Society.Business and economics',
		'education' => 'History and Society.Education',
		'history' => 'History and Society.History',
		'military-and-warfare' => 'History and Society.Military and warfare',
		'politics-and-government' => 'History and Society.Politics and government',
		'society' => 'History and Society.Society',
		'transportation' => 'History and Society.Transportation',
		'biology' => 'STEM.Biology',
		'chemistry' => 'STEM.Chemistry',
		'computing' => 'STEM.Computing',
		'earth-and-environment' => 'STEM.Earth and environment',
		'engineering' => 'STEM.Engineering',
		'libraries-and-information' => 'STEM.Libraries & Information',
		'mathematics' => 'STEM.Mathematics',
		'medicine-and-health' => 'STEM.Medicine & Health',
		'physics' => 'STEM.Physics',
		'stem' => 'STEM.STEM*',
		'space' => 'STEM.Space',
		'technology' => 'STEM.Technology',
	];

	/**
	 * Helper method for turning raw ORES score data (as stored in the Cirrus document) into
	 * search terms, for analytics/debugging.
	 *
	 * Each entry is expected to look like `<ORES label>|<integer score>`. Entries that are not
	 * strings, that lack the `|<score>` part, or whose label is not listed in
	 * self::TERMS_TO_LABELS are skipped rather than triggering undefined-key warnings and
	 * being recorded under a bogus key.
	 *
	 * @param array $rawTopicData The unprefixed content of the document's weighted_tags field
	 * @return array corresponding search term => ORES score (the stored integer divided by
	 *  1000, i.e. at most three decimals)
	 */
	public static function getTopicScores( array $rawTopicData ): array {
		$labelsToTerms = array_flip( self::TERMS_TO_LABELS );
		$topicScores = [];
		foreach ( $rawTopicData as $rawTopic ) {
			if ( !is_string( $rawTopic ) ) {
				continue;
			}
			$parts = explode( '|', $rawTopic );
			if ( count( $parts ) < 2 ) {
				// No score attached; nothing meaningful to report for this entry.
				continue;
			}
			[ $oresLabel, $scaledScore ] = $parts;
			if ( !isset( $labelsToTerms[$oresLabel] ) ) {
				// Label unknown to this class (e.g. not present in TERMS_TO_LABELS).
				continue;
			}
			$topicScores[$labelsToTerms[$oresLabel]] = (int)$scaledScore / 1000;
		}
		return $topicScores;
	}

	/**
	 * Splits the keyword value on `|` and translates each known topic term into its ORES label.
	 *
	 * Unknown terms are reported through a single 'cirrussearch-articletopic-invalid-topic'
	 * warning and are left out of the result. The returned topic list is de-duplicated and
	 * sequentially indexed, in order of first appearance.
	 *
	 * @inheritDoc
	 * @phan-return array{topics:string[],tag_prefix:string}
	 */
	public function parseValue(
		$key, $value, $quotedValue, $valueDelimiter, $suffix, WarningCollector $warningCollector
	) {
		$validTopics = [];
		$invalidTopics = [];
		foreach ( explode( '|', $value ) as $topic ) {
			if ( isset( self::TERMS_TO_LABELS[$topic] ) ) {
				$validTopics[] = self::TERMS_TO_LABELS[$topic];
			} else {
				$invalidTopics[] = $topic;
			}
		}
		// A repeated term would only add an identical clause to the query; keep one of each.
		$validTopics = array_values( array_unique( $validTopics ) );

		if ( $invalidTopics !== [] ) {
			$warningCollector->addWarning( 'cirrussearch-articletopic-invalid-topic',
				Message::listParam( $invalidTopics, 'comma' ), count( $invalidTopics ) );
		}
		return [ 'topics' => $validTopics, 'tag_prefix' => self::PREFIX_PER_KEYWORD[$key] ];
	}

	/**
	 * The keywords handled by this feature: the keys of self::PREFIX_PER_KEYWORD.
	 *
	 * @inheritDoc
	 */
	protected function getKeywords() {
		return array_keys( self::PREFIX_PER_KEYWORD );
	}

	/**
	 * Builds a DisMax query with one term query per requested topic.
	 *
	 * - No valid topic: marks the context as unable to produce results and returns
	 *   [ null, true ].
	 * - Not negated: the query is registered on the context as a non-text query and
	 *   [ null, false ] is returned.
	 * - Negated: the query is handed back as the first element, [ $query, false ], and is not
	 *   registered on the context (presumably the base class applies the negation — see
	 *   SimpleKeywordFeature::doApply() for the contract of the returned pair).
	 *
	 * @inheritDoc
	 */
	protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
		// The context doubles as the warning collector for invalid topic names.
		$parsed = $this->parseValue( $key, $value, $quotedValue, '', '', $context );
		$topics = $parsed['topics'];
		$tagPrefix = $parsed['tag_prefix'];

		if ( $topics === [] ) {
			$context->setResultsPossible( false );
			return [ null, true ];
		}

		$query = new DisMax();
		foreach ( $topics as $topic ) {
			$topicQuery = new Term();
			// Weighted tags are matched as "<prefix>/<label>".
			$topicQuery->setTerm( WeightedTagsHooks::FIELD_NAME, $tagPrefix . '/' . $topic );
			$query->addQuery( $topicQuery );
		}

		if ( $negated ) {
			return [ $query, false ];
		}
		$context->addNonTextQuery( $query );
		return [ null, false ];
	}
}