Skip to content

Commit

Permalink
[DomCrawler] Fixed charset detection in html5 meta charset tag
Browse files Browse the repository at this point in the history
  • Loading branch information
77web authored and fabpot committed May 26, 2014
1 parent 29341fa commit 172e752
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/Symfony/Component/DomCrawler/Crawler.php
Expand Up @@ -108,8 +108,10 @@ public function addContent($content, $type = null)
}
}

// http://www.w3.org/TR/encoding/#encodings
// http://www.w3.org/TR/REC-xml/#NT-EncName
if (null === $charset &&
preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9]+)/i', $content, $matches)) {
preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
$charset = $matches[1];
}

Expand Down
4 changes: 4 additions & 0 deletions src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php
Expand Up @@ -232,6 +232,10 @@ public function testAddContent()
$crawler = new Crawler();
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');

$crawler = new Crawler();
$crawler->addContent(mb_convert_encoding('<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>', 'SJIS', 'UTF-8'));
$this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag');
}

/**
Expand Down

0 comments on commit 172e752

Please sign in to comment.