Skip to content

Commit

Permalink
[DomCrawler] Improve handling of charsets
Browse files Browse the repository at this point in the history
  • Loading branch information
tgalopin committed Apr 3, 2019
1 parent 279d362 commit e0ca69a
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 33 deletions.
50 changes: 25 additions & 25 deletions src/Symfony/Component/DomCrawler/Crawler.php
Expand Up @@ -601,7 +601,7 @@ public function html(/* $default = null */)
throw new \InvalidArgumentException('The current node list is empty.');
}

if ($this->useHtml5Parser) {
if (null !== $this->html5Parser) {
$html = '';
foreach ($this->getNode(0)->childNodes as $child) {
$html .= $this->html5Parser->saveHTML($child);
Expand Down Expand Up @@ -1116,34 +1116,12 @@ protected function sibling($node, $siblingDir = 'nextSibling')

private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
set_error_handler(function () { throw new \Exception(); });

// Convert to UTF-8
try {
$htmlContent = mb_convert_encoding($htmlContent, 'UTF-8', $charset);
} catch (\Exception $e) {
// Handle invalid charsets by falling back to Windows-1252
$htmlContent = mb_convert_encoding($htmlContent, 'UTF-8', 'Windows-1252');
} finally {
restore_error_handler();
}

return $this->html5Parser->parse($htmlContent, [], 'UTF-8');
return $this->html5Parser->parse($this->convertToHtmlEntities($htmlContent, $charset), [], $charset);
}

private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
set_error_handler(function () { throw new \Exception(); });

try {
// Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
$htmlContent = mb_convert_encoding($htmlContent, 'HTML-ENTITIES', $charset);
} catch (\Exception $e) {
// Handle invalid charsets by falling back to Windows-1252
$htmlContent = mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'Windows-1252');
} finally {
restore_error_handler();
}
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);

$internalErrors = libxml_use_internal_errors(true);
$disableEntities = libxml_disable_entity_loader(true);
Expand All @@ -1161,6 +1139,28 @@ private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DO
return $dom;
}

/**
 * Converts the content to HTML entities to ensure valid parsing.
 *
 * The conversion is best-effort and never throws because of a bad charset:
 *  1. mbstring is asked to convert directly from $charset;
 *  2. if that fails, iconv converts $charset to UTF-8 and mbstring then
 *     converts the UTF-8 result to entities;
 *  3. if that fails too, the content is returned as far as it could be
 *     converted (possibly untouched).
 *
 * @param string $htmlContent The markup to convert, encoded in $charset
 * @param string $charset     The charset $htmlContent is expected to be in
 *
 * @return string The converted markup, or the last successfully converted form of it
 */
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    // Turn every warning/notice emitted by mbstring or iconv into an exception so that a
    // failed conversion is detectable; the handler is always removed in the finally block.
    set_error_handler(function () { throw new \Exception(); });

    try {
        return mb_convert_encoding($htmlContent, 'HTML-ENTITIES', $charset);
    } catch (\Exception | \ValueError $e) {
        // \ValueError is what mbstring throws on PHP 8+ for an unknown charset; on older
        // versions the same situation surfaces as a warning, i.e. the \Exception above.
        try {
            $utf8Content = iconv($charset, 'UTF-8', $htmlContent);

            // iconv() signals failure with false; never let that leak into the string result.
            if (false !== $utf8Content) {
                $htmlContent = $utf8Content;
                $htmlContent = mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'UTF-8');
            }
        } catch (\Exception | \ValueError $e) {
            // Deliberately ignored: fall through and return what we have.
        }

        return $htmlContent;
    } finally {
        restore_error_handler();
    }
}

/**
* @throws \InvalidArgumentException
*/
Expand Down
Expand Up @@ -1162,6 +1162,14 @@ public function testInheritedClassCallChildrenWithoutArgument()
$crawlerChild->children();
}

/**
 * Loads the Windows-1250 fixture through addHtmlContent() (the "unsupported charset"
 * case this test is named for) and checks that the paragraph text comes out intact.
 */
public function testAddHtmlContentUnsupportedCharset()
{
    $crawler = $this->createCrawler();
    $crawler->addHtmlContent(file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');

    // Strict comparison: the extracted text must be exactly this string.
    $this->assertSame('Žťčýů', $crawler->filterXPath('//p')->text());
}

public function createTestCrawler($uri = null)
{
$dom = new \DOMDocument();
Expand Down
Expand Up @@ -67,12 +67,4 @@ public function testAddXmlContentWithErrors()
libxml_clear_errors();
libxml_use_internal_errors($internalErrors);
}

/**
 * Loads the Windows-1250 fixture through addHtmlContent() (the "unsupported charset"
 * case this test is named for) and checks that the paragraph text comes out intact.
 */
public function testAddHtmlContentUnsupportedCharset()
{
    $crawler = $this->createCrawler();
    $crawler->addHtmlContent(file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');

    // Strict comparison: the extracted text must be exactly this string.
    $this->assertSame('Žťčýů', $crawler->filterXPath('//p')->text());
}
}
3 changes: 3 additions & 0 deletions src/Symfony/Component/DomCrawler/composer.json
Expand Up @@ -24,6 +24,9 @@
"symfony/css-selector": "~3.4|~4.0",
"masterminds/html5": "^2.6"
},
"conflict": {
"masterminds/html5": "<2.6"
},
"suggest": {
"symfony/css-selector": ""
},
Expand Down

0 comments on commit e0ca69a

Please sign in to comment.