Skip to content

Commit

Permalink
[+]: try to fix "DOM auto remove HTML closing tag in <script> tag whe…
Browse files Browse the repository at this point in the history
  • Loading branch information
voku committed Mar 3, 2019
1 parent fc06c35 commit 4877411
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 9 deletions.
39 changes: 30 additions & 9 deletions src/voku/helper/HtmlDomParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -379,12 +379,16 @@ private function createDOMDocument(string $html, $libXMLExtraOptions = null): \D
$this->isDOMDocumentCreatedWithFakeEndScript = true;
}

if (
\strpos($html, 'type="text/html"') !== false
||
\strpos($html, 'type=\'text/html\'') !== false
) {
$this->keepSpecialScriptTags($html);
if (\strpos($html, '<script') !== false) {
$this->html5FallbackForScriptTags($html);

if (
\strpos($html, 'type="text/html"') !== false
||
\strpos($html, 'type=\'text/html\'') !== false
) {
$this->keepSpecialScriptTags($html);
}
}

// set error level
Expand Down Expand Up @@ -458,20 +462,37 @@ private function createDOMDocument(string $html, $libXMLExtraOptions = null): \D
return $this->document;
}

/**
 * Workaround for bug: https://bugs.php.net/bug.php?id=74628
 *
 * Escapes every "</" found inside the content of a <script> element as "<\/",
 * so that markup embedded in JavaScript strings (e.g. "</div>") no longer
 * contains a literal closing-tag sequence when the HTML is handed to the DOM
 * parser. The script tag itself is re-emitted in lowercase ("<script ...>" and
 * "</script>"), its attributes are kept as matched.
 *
 * The string is modified in place. If the regular expression engine fails
 * (e.g. a PCRE limit is hit), the input is left untouched instead of being
 * replaced by null.
 *
 * @param string $html the HTML to process, passed by reference
 */
protected function html5FallbackForScriptTags(string &$html)
{
    // regEx for e.g.: [<script id="elements-image-2">...</script>]
    $regExSpecialScript = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

    $htmlTmp = \preg_replace_callback(
        $regExSpecialScript,
        static function (array $scripts) {
            // only the script body is escaped, the surrounding tag is rebuilt as-is
            $escapedContent = \str_replace('</', '<\/', $scripts['content']);

            return '<script' . $scripts['attr'] . '>' . $escapedContent . '</script>';
        },
        $html
    );

    // preg_replace_callback() returns null on a PCRE error; keep the original
    // HTML in that case so the caller never ends up with an empty document
    if ($htmlTmp !== null) {
        $html = $htmlTmp;
    }
}

/**
* @param string $html
*/
protected function keepSpecialScriptTags(string &$html)
{
$specialScripts = [];
// regEx for e.g.: [<script id="elements-image-1" type="text/html">...<script>]
// regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
$regExSpecialScript = '/<(script) [^>]*type=(["|\'])text\/html\2([^>]*)>.*<\/\1>/isU';
\preg_match_all($regExSpecialScript, $html, $specialScripts);

if (isset($specialScripts[0])) {
foreach ($specialScripts[0] as $specialScript) {
$specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . substr($specialScript, strlen('<script'));
$specialNonScript = substr($specialNonScript, 0, -strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';

$specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($specialScript, \strlen('<script'));
$specialNonScript = \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
// remove the html5 fallback
$specialNonScript = \str_replace('<\/', '</', $specialNonScript);

$html = \str_replace($specialScript, $specialNonScript, $html);
}
Expand Down
9 changes: 9 additions & 0 deletions tests/HtmlDomParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -1384,6 +1384,15 @@ public function testScriptInCommentHtml()
static::assertSame('<script class="script_1" type="text/javascript">someCode</script>', (string) $script);
}

/**
 * A script element whose JavaScript contains an HTML closing tag inside a
 * string must be returned with that closing tag escaped as "<\/".
 */
public function testHtmlAndJavaScriptMix()
{
    // input: paragraph, script with embedded markup in a JS string, paragraph
    $input = '<p>Text 1</p><script>$(".second-column-mobile-inner").wrapAll("<div class=\'collapse\' id=\'second-column\'></div>");</script><p>Text 2</p>';
    $expectedScript = '<script>$(".second-column-mobile-inner").wrapAll("<div class=\'collapse\' id=\'second-column\'><\/div>");</script>';

    $scriptElements = HtmlDomParser::str_get_html($input)->find('script');

    static::assertSame($expectedScript, (string) $scriptElements);
}

public function testSpecialCharsAndPlaintext()
{
$file = __DIR__ . '/fixtures/test_page_plaintext.html';
Expand Down

0 comments on commit 4877411

Please sign in to comment.