Skip to content

Commit

Permalink
Don't remove child nodes of figures and images.
Browse files Browse the repository at this point in the history
When processing img elements, this was replacing figure and
figure-inline with an img element. This was discarding any
figcaption or other contents of the node.

It doesn't look like there's any reason to do this replacement,
as we're already modifying the img element to our requirements.

Also switch to ctype_xdigit to check for the thumb URL segments
being valid hex digits, because using is_numeric to do this was
deprecated as of PHP 7.0.

Also check for any contained img elements in figure etc. and break
if none found.

Bug: https://phabricator.wikimedia.org/T272307
Bug: https://phabricator.wikimedia.org/T272180
  • Loading branch information
samwilson committed Feb 3, 2021
1 parent 5e82731 commit 3371f96
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 8 deletions.
29 changes: 21 additions & 8 deletions src/PageParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -144,14 +144,17 @@ public function getFullChaptersList( $title, $pageList, $namespaces ) {
}

/**
* return the pictures of the file, for all handled <img, the alt
* attribute is set to the title of the image so the backend can
* use it to retrieve the src name without relying on the src= attrib.
* @return array
* Get all pictures in the page, for all handled img elements.
* Picture objects have their title set to a unique string, different for each size of a given image, and their name
* set to the actual image file name. A data-title attribute is added, so that the BookCleanerEpub class can
* retrieve the filename without parsing it from the src attribute.
*
* @return Picture[]
*/
public function getPicturesList() {
public function getPicturesList(): array {
$pictures = [];

// First go through all img elements that are not described as such or that are not links.
$list = $this->xPath->query( '//a[not(contains(@class,"image"))]/img | //img[not(parent::a)]' );
/** @var DOMElement $img */
foreach ( $list as $img ) {
Expand All @@ -163,11 +166,13 @@ public function getPicturesList() {
if ( strpos( $url, '/svg/' ) !== false ) {
$picture->title .= '.svg';
}
$picture->name = $picture->title;
$pictures[$picture->title] = $picture;
$img->setAttribute( 'data-title', $picture->title );

}

// Then go through all non-img elements that are used to contain images.
$list = $this->xPath->query( '
//a[contains(@class,"image")] |
//figure[contains(@typeof,"mw:Image")] |
Expand All @@ -176,7 +181,13 @@ public function getPicturesList() {
/** @var DOMElement $node */
foreach ( $list as $node ) {
/** @var DOMElement $img */
$img = $node->getElementsByTagName( 'img' )->item( 0 );
$imgs = $node->getElementsByTagName( 'img' );
if ( $imgs->count() === 0 ) {
// Image file doesn't exist, but MediaWiki still includes a figure, e.g.
// <figure-inline typeof="mw:Error mw:Image"><a href="…">File:Example.png</span></a></figure-inline>
continue;
}
$img = $imgs->item( 0 );
$picture = new Picture();
$url = $img->getAttribute( 'src' );
$segments = explode( '/', $url );
Expand All @@ -192,7 +203,10 @@ public function getPicturesList() {
// key, so we need only to extract the File:name. This
// is kludgy as we need to rely on the path format,
// either the 6/62 part is at pos -4/-3 or -3/-2.
if ( count( $segments ) >= 4 && is_numeric( "0x" . $segments[count( $segments ) - 4] ) && is_numeric( "0x" . $segments[count( $segments ) - 3] ) ) {
if ( count( $segments ) >= 4
&& ctype_xdigit( $segments[count( $segments ) - 4] )
&& ctype_xdigit( $segments[count( $segments ) - 3] )
) {
$picture->name = $segments[count( $segments ) - 2];
$picture->title = $segments[count( $segments ) - 2] . '-' . $segments[count( $segments ) - 1];
} else {
Expand All @@ -203,7 +217,6 @@ public function getPicturesList() {

$pictures[$picture->title] = $picture;
$img->setAttribute( 'data-title', $picture->title );
$node->parentNode->replaceChild( $img, $node );
}

return $pictures;
Expand Down
42 changes: 42 additions & 0 deletions tests/Book/PageParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -147,4 +147,46 @@ public function testSubpagesEncodedHrefs() {
$pageParser = new PageParser( $doc );
$this->assertCount( 1, $pageParser->getFullChaptersList( 'F(o)o♥', [], [] ) );
}

/**
* @dataProvider provideGetPicturesList
*/
public function testGetPicturesList( string $in, string $out, string $picTitle, string $picName ): void {
$doc1 = new DOMDocument();
$doc1->loadXML( $in );
$pageParser1 = new PageParser( $doc1 );
$pictures = $pageParser1->getPicturesList();
$this->assertStringContainsString( $out, $pageParser1->getContent( false )->saveXML() );
$this->assertSame( $picTitle, $pictures[$picTitle]->title );
$this->assertSame( $picName, $pictures[$picTitle]->name );
}

public function provideGetPicturesList(): array {
return [
'Image gets a data-title attribute' => [
'<p><img src="foo/bar.jpg" /></p>',
'<p><img src="foo/bar.jpg" data-title="bar.jpg"/></p>',
'bar.jpg',
'bar.jpg',
],
'Figure caption is not removed' => [
'<figure><img src="foo/bar.jpg" /><figcaption>Lorem</figcaption></figure>',
'<figure><img src="foo/bar.jpg" data-title="bar.jpg"/><figcaption>Lorem</figcaption></figure>',
'bar.jpg',
'bar.jpg',
],
'Title extracted from MediaWiki non-thumb URL' => [
'<p><img src="//example.org/0/00/example.jpg" /></p>',
'<p><img src="//example.org/0/00/example.jpg" data-title="example.jpg"/></p>',
'example.jpg',
'example.jpg',
],
'Title extracted from MediaWiki thumb URL' => [
'<p><a class="image"><img src="//example.org/thumb/2/9A/example.jpg/500px-example.jpg" /></a></p>',
'<p><a class="image"><img src="//example.org/thumb/2/9A/example.jpg/500px-example.jpg" data-title="example.jpg-500px-example.jpg"/></a></p>',
'example.jpg-500px-example.jpg',
'example.jpg',
],
];
}
}

0 comments on commit 3371f96

Please sign in to comment.