diff --git a/Crawler.php b/Crawler.php index 3fd87bd7..0ab6b982 100644 --- a/Crawler.php +++ b/Crawler.php @@ -166,20 +166,43 @@ public function addHtmlContent($content, $charset = 'UTF-8') $dom = new \DOMDocument('1.0', $charset); $dom->validateOnParse = true; - if (function_exists('mb_convert_encoding')) { - $hasError = false; - set_error_handler(function () use (&$hasError) { - $hasError = true; - }); - $tmpContent = @mb_convert_encoding($content, 'HTML-ENTITIES', $charset); - - restore_error_handler(); - - if (!$hasError) { - $content = $tmpContent; + set_error_handler(function () {throw new \Exception();}); + + try { + // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML() + + if (function_exists('mb_convert_encoding')) { + $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset); + } elseif (function_exists('iconv')) { + $content = preg_replace_callback( + '/[\x80-\xFF]+/', + function ($m) { + $m = unpack('C*', $m[0]); + $i = 1; + $entities = ''; + + while (isset($m[$i])) { + if (0xF0 <= $m[$i]) { + $c = (($m[$i++] - 0xF0) << 18) + (($m[$i++] - 0x80) << 12) + (($m[$i++] - 0x80) << 6) + $m[$i++] - 0x80; + } elseif (0xE0 <= $m[$i]) { + $c = (($m[$i++] - 0xE0) << 12) + (($m[$i++] - 0x80) << 6) + $m[$i++] - 0x80; + } else { + $c = (($m[$i++] - 0xC0) << 6) + $m[$i++] - 0x80; + } + + $entities .= '&#'.$c.';'; + } + + return $entities; + }, + iconv($charset, 'UTF-8', $content) + ); } + } catch (\Exception $e) { } + restore_error_handler(); + if ('' !== trim($content)) { @$dom->loadHTML($content); } diff --git a/Tests/CrawlerTest.php b/Tests/CrawlerTest.php index 3c281eb4..9560d069 100755 --- a/Tests/CrawlerTest.php +++ b/Tests/CrawlerTest.php @@ -81,6 +81,7 @@ public function testAddHtmlContent() /** * @covers Symfony\Component\DomCrawler\Crawler::addHtmlContent + * @requires extension mbstring */ public function testAddHtmlContentCharset() { @@ -115,6 +116,7 @@ public function testAddHtmlContentUnsupportedCharset() /** * @covers Symfony\Component\DomCrawler\Crawler::addHtmlContent + * @requires extension mbstring */ public function testAddHtmlContentCharsetGbk() { @@ -235,7 +237,7 @@ public function testAddContent() $this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset'); $crawler = new Crawler(); - $crawler->addContent(mb_convert_encoding('日本語', 'SJIS', 'UTF-8')); + $crawler->addContent(iconv('UTF-8', 'SJIS', '日本語')); $this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag'); }