Don't remove child nodes of figures and images.

When processing img elements, this was replacing figure and figure-inline with an img element. This was discarding any figcaption or other contents of the node. It doesn't look like there's any reason to do this replacement, as we're already modifying the img element to our requirements. Also switch to ctype_xdigit to check for the thumb URL segments being valid hex digits, because using is_numeric to do this was deprecated as of PHP 7.0. Also check for any contained img elements in figure etc. and break if none found. Bug: https://phabricator.wikimedia.org/T272307 Bug: https://phabricator.wikimedia.org/T272180
wikimedia · Feb 3, 2021 · 3371f96 · 3371f96
1 parent 5e82731
commit 3371f96
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 8 deletions.
diff --git a/src/PageParser.php b/src/PageParser.php
@@ -144,14 +144,17 @@ public function getFullChaptersList( $title, $pageList, $namespaces ) {
 	}
 
 	/**
-	 * return the pictures of the file, for all handled <img, the alt
-	 * attribute is set to the title of the image so the backend can
-	 * use it to retrieve the src name without relying on the src= attrib.
-	 * @return array
+	 * Get all pictures in the page, for all handled img elements.
+	 * Picture objects have their title set to a unique string, different for each size of a given image, and their name
+	 * set to the actual image file name. A data-title attribute is added, so that the BookCleanerEpub class can
+	 * retrieve the filename without parsing it from the src attribute.
+	 *
+	 * @return Picture[]
 	 */
-	public function getPicturesList() {
+	public function getPicturesList(): array {
 		$pictures = [];
 
+		// First go through all img elements that are not described as such or that are not links.
 		$list = $this->xPath->query( '//a[not(contains(@class,"image"))]/img | //img[not(parent::a)]' );
 		/** @var DOMElement $img */
 		foreach ( $list as $img ) {
@@ -163,11 +166,13 @@ public function getPicturesList() {
 			if ( strpos( $url, '/svg/' ) !== false ) {
 				$picture->title .= '.svg';
 			}
+			$picture->name = $picture->title;
 			$pictures[$picture->title] = $picture;
 			$img->setAttribute( 'data-title', $picture->title );
 
 		}
 
+		// Then go through all non-img elements that are used to contain images.
 		$list = $this->xPath->query( '
 			//a[contains(@class,"image")] |
 			//figure[contains(@typeof,"mw:Image")] |
@@ -176,7 +181,13 @@ public function getPicturesList() {
 		/** @var DOMElement $node */
 		foreach ( $list as $node ) {
 			/** @var DOMElement $img */
-			$img = $node->getElementsByTagName( 'img' )->item( 0 );
+			$imgs = $node->getElementsByTagName( 'img' );
+			if ( $imgs->count() === 0 ) {
+				// Image file doesn't exist, but MediaWiki still includes a figure, e.g.
+				// <figure-inline typeof="mw:Error mw:Image"><a href="…">File:Example.png</span></a></figure-inline>
+				continue;
+			}
+			$img = $imgs->item( 0 );
 			$picture = new Picture();
 			$url = $img->getAttribute( 'src' );
 			$segments = explode( '/', $url );
@@ -192,7 +203,10 @@ public function getPicturesList() {
 			// key, so we need only to extract the File:name. This
 			// is kludgy as we need to rely on the path format,
 			// either the 6/62 part is at pos -4/-3 or -3/-2.
-			if ( count( $segments ) >= 4 && is_numeric( "0x" . $segments[count( $segments ) - 4] ) && is_numeric( "0x" . $segments[count( $segments ) - 3] ) ) {
+			if ( count( $segments ) >= 4
+				&& ctype_xdigit( $segments[count( $segments ) - 4] )
+				&& ctype_xdigit( $segments[count( $segments ) - 3] )
+			) {
 				$picture->name = $segments[count( $segments ) - 2];
 				$picture->title = $segments[count( $segments ) - 2] . '-' . $segments[count( $segments ) - 1];
 			} else {
@@ -203,7 +217,6 @@ public function getPicturesList() {
 
 			$pictures[$picture->title] = $picture;
 			$img->setAttribute( 'data-title', $picture->title );
-			$node->parentNode->replaceChild( $img, $node );
 		}
 
 		return $pictures;

diff --git a/tests/Book/PageParserTest.php b/tests/Book/PageParserTest.php
@@ -147,4 +147,46 @@ public function testSubpagesEncodedHrefs() {
 		$pageParser = new PageParser( $doc );
 		$this->assertCount( 1, $pageParser->getFullChaptersList( 'F(o)o♥', [], [] ) );
 	}
+
+	/**
+	 * @dataProvider provideGetPicturesList
+	 */
+	public function testGetPicturesList( string $in, string $out, string $picTitle, string $picName ): void {
+		$doc1 = new DOMDocument();
+		$doc1->loadXML( $in );
+		$pageParser1 = new PageParser( $doc1 );
+		$pictures = $pageParser1->getPicturesList();
+		$this->assertStringContainsString( $out, $pageParser1->getContent( false )->saveXML() );
+		$this->assertSame( $picTitle, $pictures[$picTitle]->title );
+		$this->assertSame( $picName, $pictures[$picTitle]->name );
+	}
+
+	public function provideGetPicturesList(): array {
+		return [
+			'Image gets a data-title attribute'  => [
+				'<p><img src="foo/bar.jpg" /></p>',
+				'<p><img src="foo/bar.jpg" data-title="bar.jpg"/></p>',
+				'bar.jpg',
+				'bar.jpg',
+			],
+			'Figure caption is not removed' => [
+				'<figure><img src="foo/bar.jpg" /><figcaption>Lorem</figcaption></figure>',
+				'<figure><img src="foo/bar.jpg" data-title="bar.jpg"/><figcaption>Lorem</figcaption></figure>',
+				'bar.jpg',
+				'bar.jpg',
+			],
+			'Title extracted from MediaWiki non-thumb URL' => [
+				'<p><img src="//example.org/0/00/example.jpg" /></p>',
+				'<p><img src="//example.org/0/00/example.jpg" data-title="example.jpg"/></p>',
+				'example.jpg',
+				'example.jpg',
+			],
+			'Title extracted from MediaWiki thumb URL' => [
+				'<p><a class="image"><img src="//example.org/thumb/2/9A/example.jpg/500px-example.jpg" /></a></p>',
+				'<p><a class="image"><img src="//example.org/thumb/2/9A/example.jpg/500px-example.jpg" data-title="example.jpg-500px-example.jpg"/></a></p>',
+				'example.jpg-500px-example.jpg',
+				'example.jpg',
+			],
+		];
+	}
 }