Skip to content
Permalink
Browse files

[DomCrawler] Optionally use html5-php to parse HTML

  • Loading branch information...
tgalopin authored and fabpot committed Nov 24, 2018
1 parent 47242e3 commit 4050ec42576ab3b5c01c7773e4fbed4552f4ea50
@@ -101,6 +101,7 @@
"doctrine/orm": "~2.4,>=2.4.5",
"doctrine/reflection": "~1.0",
"doctrine/doctrine-bundle": "~1.4",
"masterminds/html5": "^2.6",
"monolog/monolog": "~1.11",
"nyholm/psr7": "^1.0",
"ocramius/proxy-manager": "~0.4|~1.0|~2.0",
@@ -112,6 +113,7 @@
"phpdocumentor/reflection-docblock": "^3.0|^4.0"
},
"conflict": {
"masterminds/html5": "<2.6",
"phpdocumentor/reflection-docblock": "<3.0||>=3.2.0,<3.2.2",
"phpdocumentor/type-resolver": "<0.3.0",
"phpunit/phpunit": "<5.4.3"
@@ -28,19 +28,19 @@ public function welcomeAction(Request $request, $name = null)
// new session case
if (!$session->has('name')) {
if (!$name) {
return new Response('You are new here and gave no name.');
return new Response('<html><body>You are new here and gave no name.</body></html>');
}
// remember name
$session->set('name', $name);
return new Response(sprintf('Hello %s, nice to meet you.', $name));
return new Response(sprintf('<html><body>Hello %s, nice to meet you.</body></html>', $name));
}
// existing session
$name = $session->get('name');
return new Response(sprintf('Welcome back %s, nice to meet you.', $name));
return new Response(sprintf('<html><body>Welcome back %s, nice to meet you.</body></html>', $name));
}
public function cacheableAction()
@@ -55,7 +55,7 @@ public function logoutAction(Request $request)
{
$request->getSession()->invalidate();
return new Response('Session cleared.');
return new Response('<html><body>Session cleared.</body></html>');
}
public function setFlashAction(Request $request, $message)
@@ -76,6 +76,6 @@ public function showFlashAction(Request $request)
$output = 'No flash was set.';
}
return new Response($output);
return new Response('<html><body>'.$output.'</body></html>');
}
}
@@ -54,11 +54,11 @@ public function secureAction()
/**
 * Renders the profile page.
 *
 * The body is a minimal but complete HTML document rather than bare text,
 * so that it can be parsed as markup.
 *
 * @return Response
 */
public function profileAction()
{
    return new Response('<html><body>Profile</body></html>');
}
/**
 * Renders the homepage.
 *
 * The body is a minimal but complete HTML document rather than bare text,
 * so that it can be parsed as markup.
 *
 * @return Response
 */
public function homepageAction()
{
    return new Response('<html><body>Homepage</body></html>');
}
}
@@ -6,6 +6,8 @@ CHANGELOG

* Added return of element name (`_name`) in `extract()` method.
* Added ability to return a default value in `text()` and `html()` instead of throwing an exception when node is empty.
* When available, the [html5-php library](https://github.com/Masterminds/html5-php) is used to
parse HTML added to a Crawler for better support of HTML5 tags.

4.2.0
-----
@@ -11,6 +11,7 @@
namespace Symfony\Component\DomCrawler;
use Masterminds\HTML5;
use Symfony\Component\CssSelector\CssSelectorConverter;
/**
@@ -55,15 +56,29 @@ class Crawler implements \Countable, \IteratorAggregate
private $isHtml = true;
/**
* @param mixed $node A Node to use as the base for the crawling
* @param string $uri The current URI
* @param string $baseHref The base href value
* @var HTML5|null
*/
public function __construct($node = null, string $uri = null, string $baseHref = null)
private $html5Parser;
/**
 * Initialises the crawler and, when applicable, its HTML5 parser.
 *
 * The third state of $useHtml5Parser (null) means "auto-detect": the HTML5
 * parser is used if and only if the html5-php library can be loaded.
 *
 * @param mixed     $node           A Node to use as the base for the crawling
 * @param string    $uri            The current URI
 * @param string    $baseHref       The base href value (falls back to $uri when empty)
 * @param bool|null $useHtml5Parser true to force the HTML5 parser, false to force the
 *                                  native DOM parser, null to pick automatically
 *
 * @throws \LogicException When the HTML5 parser is explicitly requested but html5-php is missing
 */
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
{
    $this->uri = $uri;
    $this->baseHref = $baseHref ?: $uri;

    if (null === $useHtml5Parser) {
        // Auto-detection: availability of the library decides.
        $useHtml5Parser = class_exists(HTML5::class);
    } elseif ($useHtml5Parser && !class_exists(HTML5::class)) {
        // Explicit opt-in without the library installed is a setup error.
        throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
    }

    if ($useHtml5Parser) {
        $this->html5Parser = new HTML5(['disable_html_ns' => true]);
    }

    $this->add($node);
}
@@ -183,29 +198,7 @@ public function addContent($content, $type = null)
*/
public function addHtmlContent($content, $charset = 'UTF-8')
{
$internalErrors = libxml_use_internal_errors(true);
$disableEntities = libxml_disable_entity_loader(true);
$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;
set_error_handler(function () { throw new \Exception(); });
try {
// Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
} catch (\Exception $e) {
}
restore_error_handler();
if ('' !== trim($content)) {
@$dom->loadHTML($content);
}
libxml_use_internal_errors($internalErrors);
libxml_disable_entity_loader($disableEntities);
$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
$this->addDocument($dom);
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@@ -608,6 +601,15 @@ public function html(/* $default = null */)
throw new \InvalidArgumentException('The current node list is empty.');
}
if (null !== $this->html5Parser) {
$html = '';
foreach ($this->getNode(0)->childNodes as $child) {
$html .= $this->html5Parser->saveHTML($child);
}
return $html;
}
$html = '';
foreach ($this->getNode(0)->childNodes as $child) {
$html .= $child->ownerDocument->saveHTML($child);
@@ -1112,6 +1114,53 @@ protected function sibling($node, $siblingDir = 'nextSibling')
return $nodes;
}
/**
 * Builds a DOM document from HTML content using the configured HTML5 parser.
 *
 * The content is first passed through convertToHtmlEntities() and the
 * charset is forwarded to the parser as its third argument.
 */
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $encodedContent = $this->convertToHtmlEntities($htmlContent, $charset);

    return $this->html5Parser->parse($encodedContent, [], $charset);
}
/**
 * Builds a DOM document from HTML content using the native DOM extension.
 *
 * libxml's global error-handling (and, on old libxml builds, entity-loader)
 * settings are changed for the duration of the parse and are always restored
 * afterwards, even if parsing throws.
 *
 * @param string $htmlContent The HTML to parse; blank content yields an empty document
 * @param string $charset     The charset of $htmlContent
 */
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);

    $internalErrors = libxml_use_internal_errors(true);

    // libxml >= 2.9.0 no longer substitutes external entities by default, and
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so the
    // loader is only toggled on older libxml builds that still need it.
    $toggleEntityLoader = \LIBXML_VERSION < 20900;
    $disableEntities = false;
    if ($toggleEntityLoader) {
        $disableEntities = libxml_disable_entity_loader(true);
    }

    try {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        // loadHTML() rejects empty input, so blank content is left as an empty document.
        if ('' !== trim($htmlContent)) {
            @$dom->loadHTML($htmlContent);
        }
    } finally {
        // Restore the process-wide libxml state no matter how parsing ended.
        libxml_use_internal_errors($internalErrors);
        if ($toggleEntityLoader) {
            libxml_disable_entity_loader($disableEntities);
        }
    }

    return $dom;
}
/**
 * Converts every non-ASCII character to a numeric HTML entity to ensure valid parsing.
 *
 * Conversion is best-effort:
 *  1. encode directly from $charset;
 *  2. if mbstring rejects $charset, transcode to UTF-8 with iconv and encode from there;
 *  3. if that fails too, return the content as far as it could be converted
 *     (unchanged when iconv also rejected $charset).
 *
 * mb_encode_numericentity() is used instead of the "HTML-ENTITIES"
 * pseudo-encoding, which is deprecated as of PHP 8.2; with the throwing error
 * handler installed below, that deprecation notice would abort the conversion.
 *
 * @param string $htmlContent The content to encode
 * @param string $charset     The charset $htmlContent is encoded in
 *
 * @return string The content with characters >= U+0080 replaced by numeric entities
 */
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    // Encode code points 0x80..0x10FFFF, no offset, full mask.
    $entityMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

    // Turn warnings/notices from mbstring and iconv into exceptions so that a
    // failed conversion can be detected and the next strategy tried.
    set_error_handler(static function () { throw new \Exception(); });

    try {
        try {
            return mb_encode_numericentity($htmlContent, $entityMap, $charset);
        } catch (\Exception $e) {
            // Warning converted by the handler above: fall through to iconv.
        } catch (\ValueError $e) {
            // PHP >= 8.0 throws ValueError for an unknown encoding: fall through to iconv.
        }

        try {
            $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
            $htmlContent = mb_encode_numericentity($htmlContent, $entityMap, 'UTF-8');
        } catch (\Exception $e) {
            // Best effort only: keep whatever was converted so far.
        } catch (\ValueError $e) {
            // Best effort only: keep whatever was converted so far.
        }

        return $htmlContent;
    } finally {
        restore_error_handler();
    }
}
/**
* @throws \InvalidArgumentException
*/
Oops, something went wrong.

0 comments on commit 4050ec4

Please sign in to comment.
You can’t perform that action at this time.