-
Notifications
You must be signed in to change notification settings - Fork 26
/
MoveLeadParagraphTransform.php
317 lines (292 loc) · 10.2 KB
/
MoveLeadParagraphTransform.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
<?php
namespace MobileFrontend\Transforms;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMXPath;
use MediaWiki\MediaWikiServices;
use MediaWiki\Title\Title;
use Wikimedia\Parsoid\Utils\DOMCompat;
/**
 * Transform that moves the lead paragraph of the first section so that it
 * appears before the first infobox (or thumbnail / figure) of that section.
 */
class MoveLeadParagraphTransform implements IMobileTransform {
	/**
	 * @var Title|string Page title, only used in log messages
	 */
	private $title;

	/**
	 * @var int Revision ID, only used in log messages
	 */
	private $revId;

	/**
	 * @param Title|string $title for logging purposes
	 * @param int $revId for logging purposes
	 */
	public function __construct( $title, $revId ) {
		$this->title = $title;
		$this->revId = $revId;
	}

	/**
	 * Rearranges content so that text in the lead paragraph is prioritised to appear
	 * before the infobox. Only the first `section` element found below $node is
	 * processed; when there is none, nothing happens.
	 *
	 * @param DOMElement $node to be transformed
	 */
	public function apply( DOMElement $node ) {
		$section = DOMCompat::querySelector( $node, 'section' );
		if ( $section ) {
			$this->moveFirstParagraphBeforeInfobox( $section, $section->ownerDocument );
		}
	}

	/**
	 * Helper function to verify that the passed $node matches the tag name and carries
	 * at least one class matching the required pattern.
	 *
	 * @param DOMElement $node Node to verify
	 * @param string|bool $requiredTagName Required tag name, has to be lowercase;
	 *  if false it is ignored and only $requiredClass is used.
	 * @param string $requiredClass Regular expression (with delimiters) that is tested
	 *  against every individual class name of $node
	 * @return bool
	 */
	private static function matchElement( DOMElement $node, $requiredTagName, $requiredClass ): bool {
		if ( $requiredTagName !== false && strtolower( $node->tagName ) !== $requiredTagName ) {
			return false;
		}
		// Class names may be separated by any whitespace (tabs, newlines, ...), not only
		// by a single space. This keeps the check consistent with the normalize-space()
		// based XPath class tests used in identifyInfoboxElement().
		$classes = preg_split( '/\s+/', $node->getAttribute( 'class' ), -1, PREG_SPLIT_NO_EMPTY ) ?: [];
		return (bool)preg_grep( $requiredClass, $classes );
	}

	/**
	 * Walk up the DOM tree from $node until reaching the node whose parent is $parent.
	 *
	 * It is assumed that $node is a descendant of $parent. If it is not, the walk only
	 * stops at the top of the tree and the topmost ancestor of $node is returned.
	 *
	 * @param DOMNode $node
	 * @param DOMNode $parent
	 * @return DOMNode Either $node itself or the ancestor of $node that is a direct
	 *  child of $parent
	 */
	private static function findParentWithParent( DOMNode $node, DOMNode $parent ): DOMNode {
		$current = $node;
		while ( $current->parentNode !== null && !$current->parentNode->isSameNode( $parent ) ) {
			$current = $current->parentNode;
		}
		return $current;
	}

	/**
	 * Extract the first infobox in the section.
	 *
	 * Elements with the class `infobox` or `thumb` and `figure` elements are all
	 * treated as infoboxes; the first match in document order wins. If the match sits
	 * inside a `mw-stack` / `collapsible` wrapper, the outermost such wrapper (below
	 * $section) is returned instead. Images are expanded to their top-level container
	 * within $section.
	 *
	 * @param DOMXPath $xPath XPath object to execute the query
	 * @param DOMElement $section Where to search for an infobox
	 * @return DOMElement|null The first infobox
	 */
	private function identifyInfoboxElement( DOMXPath $xPath, DOMElement $section ): ?DOMElement {
		$paths = [
			// Infoboxes: *.infobox
			'.//*[contains(concat(" ",normalize-space(@class)," ")," infobox ")]',
			// Thumbnail images: .thumb, figure (Parsoid)
			'.//*[contains(concat(" ",normalize-space(@class)," ")," thumb ")]',
			'.//figure',
		];
		$query = '(' . implode( '|', $paths ) . ')';
		$infobox = $xPath->query( $query, $section )->item( 0 );
		if ( !( $infobox instanceof DOMElement ) ) {
			return null;
		}

		// Check if the infobox is inside a container. The walk stops at $section:
		// the section itself and its ancestors are outside the search scope and must
		// never be reported as the infobox, even if they carry a wrapper class.
		$wrapperClass = '/^(mw-stack|collapsible)$/';
		for (
			$node = $infobox;
			$node instanceof DOMElement && !$node->isSameNode( $section );
			$node = $node->parentNode
		) {
			if ( self::matchElement( $node, false, $wrapperClass ) ) {
				// Keep overwriting so that the outermost wrapper wins
				$infobox = $node;
			}
		}

		// For images, include any containers.
		// We don't need to check if the parent is an infobox, because it
		// would've matched first in the XPath query.
		$isImage = strtolower( $infobox->tagName ) === 'figure' ||
			strpos( $infobox->getAttribute( 'class' ), 'thumb' ) !== false;
		if ( $isImage ) {
			// The instanceof guard guarantees termination at the top of the tree
			// instead of looping on a null parent.
			while (
				$infobox->parentNode instanceof DOMElement &&
				!$infobox->parentNode->isSameNode( $section )
			) {
				$infobox = $infobox->parentNode;
			}
		}

		return $infobox;
	}

	/**
	 * Find the first paragraph that has text content, i.e. paragraphs that are not empty.
	 * This function will also filter out the paragraphs that have nodes containing
	 * whitespace only.
	 * example: `<p> <span> </span> </p>` is not a lead paragraph
	 *
	 * Only paragraphs that are direct children of $section are considered.
	 *
	 * Keep in sync with mobile.init/identifyLeadParagraph.js.
	 *
	 * @param DOMXPath $xPath XPath object to execute the query
	 * @param DOMElement $section Where to search for paragraphs
	 * @return DOMElement|null The lead paragraph
	 */
	private function identifyLeadParagraph( DOMXPath $xPath, DOMElement $section ): ?DOMElement {
		foreach ( $xPath->query( './p', $section ) as $paragraph ) {
			if ( $paragraph instanceof DOMElement && !$this->isNonLeadParagraph( $xPath, $paragraph ) ) {
				return $paragraph;
			}
		}
		return null;
	}

	/**
	 * Move the first paragraph in the lead section above the infobox
	 *
	 * In order for a paragraph to be moved the following conditions must be met:
	 *   - the lead section contains at least one infobox;
	 *   - the infobox is a direct child of the lead section;
	 *   - the paragraph doesn't already appear before the first infobox
	 *     if any in the DOM;
	 *   - the paragraph contains visible text content
	 *
	 * Additionally if the paragraph's immediate sibling is a list (ol or ul element),
	 * the list is also moved along with the paragraph above the infobox.
	 *
	 * Note that the first paragraph is not moved before hatnotes, or mbox or other
	 * elements that are not infoboxes.
	 *
	 * When the infobox is nested inside another element nothing is moved; the
	 * situation is only (optionally) logged.
	 *
	 * @param DOMElement $leadSection
	 * @param ?DOMDocument $doc Document to which the section belongs
	 */
	private function moveFirstParagraphBeforeInfobox( DOMElement $leadSection, ?DOMDocument $doc ): void {
		if ( $doc === null ) {
			return;
		}
		$xPath = new DOMXPath( $doc );
		$infobox = $this->identifyInfoboxElement( $xPath, $leadSection );
		if ( $infobox === null ) {
			return;
		}

		$isTopLevelInfobox = $infobox->parentNode !== null &&
			$infobox->parentNode->isSameNode( $leadSection );
		if ( !$isTopLevelInfobox ) {
			$this->logWrappedInfoboxIfMisplaced( $xPath, $infobox, $leadSection );
			return;
		}

		$leadParagraph = $this->identifyLeadParagraph( $xPath, $leadSection );
		// Only act when the paragraph currently comes after the infobox
		if ( $leadParagraph === null || !$this->isPreviousSibling( $infobox, $leadParagraph ) ) {
			return;
		}

		// Determine the element following the paragraph *before* the paragraph is
		// moved, since moving it changes its siblings.
		$nextElement = $xPath->query( 'following-sibling::*[1]', $leadParagraph )->item( 0 );
		$listAfterParagraph = null;
		if (
			$nextElement instanceof DOMElement &&
			( $nextElement->tagName === 'ol' || $nextElement->tagName === 'ul' )
		) {
			$listAfterParagraph = $nextElement;
		}

		$leadSection->insertBefore( $leadParagraph, $infobox );
		if ( $listAfterParagraph !== null ) {
			$leadSection->insertBefore( $listAfterParagraph, $infobox );
		}
	}

	/**
	 * Log pages whose infobox is wrapped in a container that is not preceded by any
	 * non-empty paragraph, provided the MFLogWrappedInfoboxes setting is enabled.
	 *
	 * @see https://phabricator.wikimedia.org/T149884
	 * @todo remove after research is done
	 *
	 * @param DOMXPath $xPath
	 * @param DOMElement $infobox Infobox that is not a direct child of $leadSection
	 * @param DOMElement $leadSection
	 */
	private function logWrappedInfoboxIfMisplaced(
		DOMXPath $xPath, DOMElement $infobox, DOMElement $leadSection
	): void {
		$container = self::findParentWithParent( $infobox, $leadSection );
		if ( !( $container instanceof DOMElement ) ) {
			// Only elements can be inspected for preceding paragraphs
			return;
		}
		$isInWrongPlace = $this->hasNoNonEmptyPrecedingParagraphs( $xPath, $container );
		$loggingEnabled = MediaWikiServices::getInstance()
			->getService( 'MobileFrontend.Config' )->get( 'MFLogWrappedInfoboxes' );
		if ( $isInWrongPlace && $loggingEnabled ) {
			$this->logInfoboxesWrappedInContainers();
		}
	}

	/**
	 * Check if the node contains any non-whitespace characters
	 *
	 * The `u` modifier makes `\S` Unicode-aware, so text consisting solely of
	 * Unicode whitespace such as non-breaking spaces counts as empty.
	 *
	 * Keep in sync with mobile.init/identifyLeadParagraph.js.
	 * NOTE(review): confirm the JS counterpart treats the same characters as whitespace.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	private function isNotEmptyNode( DOMNode $node ): bool {
		return (bool)preg_match( '/\S/u', $node->textContent );
	}

	/**
	 * Checks whether the node must NOT be considered the lead paragraph of the
	 * article, i.e. it is not a `p` element or it has no visible content.
	 *
	 * Keep in sync with mobile.init/identifyLeadParagraph.js.
	 *
	 * @param DOMXPath $xPath An XPath query
	 * @param DOMNode $node DOM Node to verify
	 * @return bool True if $node cannot be the lead paragraph
	 */
	private function isNonLeadParagraph( DOMXPath $xPath, DOMNode $node ): bool {
		$isParagraphWithText = $node instanceof DOMElement &&
			$node->tagName === 'p' &&
			$this->isNotEmptyNode( $node );
		if ( !$isParagraphWithText ) {
			return true;
		}

		// Work on a clone so the real document is left untouched
		$clone = $node->cloneNode( true );
		// Remove any TemplateStyle tags, or coordinate wrappers...
		$ignorable = $xPath->query( '(.//style|.//span[@id="coordinates"])', $clone );
		foreach ( $ignorable as $element ) {
			$element->parentNode->removeChild( $element );
		}
		// ...and check again for emptiness
		return !$this->isNotEmptyNode( $clone );
	}

	/**
	 * Check if $first is a previous sibling of $second
	 *
	 * Both nodes ($first and $second) most probably will be located in the beginning of
	 * the article, because of that it's better to loop backward from $second to $first.
	 * Usually those two elements should be in order, which means that only one
	 * `isSameNode()` check is needed. If those elements are not in order, we will quickly
	 * reach a null previousSibling and return false instead of traversing the whole document.
	 *
	 * @param DOMNode $first
	 * @param DOMNode $second
	 * @return bool
	 */
	private function isPreviousSibling( DOMNode $first, DOMNode $second ): bool {
		for ( $node = $second->previousSibling; $node !== null; $node = $node->previousSibling ) {
			if ( $node->isSameNode( $first ) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Check that none of the siblings before $element is a non-empty paragraph
	 *
	 * @param DOMXPath $xPath
	 * @param DOMElement $element
	 * @return bool True if no preceding sibling qualifies as a lead paragraph,
	 *  false as soon as one does
	 */
	private function hasNoNonEmptyPrecedingParagraphs( DOMXPath $xPath, DOMElement $element ): bool {
		for ( $node = $element->previousSibling; $node !== null; $node = $node->previousSibling ) {
			if ( !$this->isNonLeadParagraph( $xPath, $node ) ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Log the page title and revision of a page on which an infobox wrapped in a
	 * container was found. The message is written to the `mobile` log channel.
	 *
	 * @see https://phabricator.wikimedia.org/T149884
	 */
	private function logInfoboxesWrappedInContainers(): void {
		\MediaWiki\Logger\LoggerFactory::getInstance( 'mobile' )->info(
			"Found infobox wrapped with container on {$this->title} (rev:{$this->revId})"
		);
	}
}