diff --git a/README.mdown b/README.mdown index 23d46a5..9b3ce86 100644 --- a/README.mdown +++ b/README.mdown @@ -40,7 +40,13 @@ $html = '
text

'; + +// shortcut. return minified html $minify = HTMLMinify::minify($html); + +// detail +$HTMLMinify = new HTMLMinify($html); +$minify = $HTMLMinify->process(); ``` output html diff --git a/composer.json b/composer.json index d812387..5bf86cb 100644 --- a/composer.json +++ b/composer.json @@ -18,8 +18,7 @@ } ], "require": { - "php": ">=5.3.0", - "composer/installers": "*" + "php": ">=5.3.0" }, "require-dev": { "satooshi/php-coveralls": "dev-master" diff --git a/src/zz/Html/HTMLMinify.php b/src/zz/Html/HTMLMinify.php index 362d196..e9c134e 100644 --- a/src/zz/Html/HTMLMinify.php +++ b/src/zz/Html/HTMLMinify.php @@ -8,8 +8,6 @@ namespace zz\Html; class HTMLMinify { - const ENCODING = 'UTF-8'; - const DOCTYPE_HTML4 = 'HTML4.01'; const DOCTYPE_XHTML1 = 'XHTML1.0'; const DOCTYPE_HTML5 = 'html5'; @@ -226,7 +224,7 @@ public function __construct($html, $options = array()) { * * 'excludeComment' * example : content - * array()(default) => content + * array('//')(default) => content * array('//') => content * * 'removeDuplicateAttribute' @@ -389,60 +387,62 @@ protected function beforeFilter() { protected function removeWhitespaceFromComment() { $tokens = $this->tokens; $regexps = $this->options['excludeComment']; - - for ($i = 0, $len = count($tokens); $i < $len; $i++) { + $HTMLTokenStartTag = HTMLToken::StartTag; + $HTMLTokenComment = HTMLToken::Comment; + $HTMLTokenCharacter = HTMLToken::Character; + $HTMLNamesScriptTag = HTMLNames::scriptTag; + $HTMLNamesStyleTag = HTMLNames::styleTag; + $removes = array(); + $combineIndex = null; + + $len = count($tokens); + for ($i = 0; $i < $len; $i++) { $token = $tokens[$i]; $type = $token->getType(); - if ($type === HTMLToken::StartTag) { + if ($type === $HTMLTokenStartTag) { + $combineIndex = null; $tagName = $token->getTagName(); - if ($tagName === HTMLNames::scriptTag || $tagName === HTMLNames::styleTag) { + if ($tagName === $HTMLNamesScriptTag || $tagName === $HTMLNamesStyleTag) { $i++; - continue; } - } else if 
($this->_isConditionalComment($token)) { + continue; + } else if ($type === $HTMLTokenCharacter) { + if ($combineIndex > 0) { + $tokens[$combineIndex]->setData($tokens[$combineIndex] . $token); + $removes[] = $i; + } + continue; + } else if ($type !== $HTMLTokenComment) { + $combineIndex = null; continue; } - if ($type !== HTMLToken::Comment) { + + $comment = $token->getData(); + if ($this->_isConditionalComment($comment)) { + $combineIndex = null; continue; } if ($regexps) { - $comment = $token->getData(); foreach ($regexps as $regexp) { if (preg_match($regexp, $comment)) { + $combineIndex = null; continue 2; } } } - - unset($tokens[$i]); - $tokens = array_merge($tokens, array()); - $len = count($tokens); - $i--; + $combineIndex = $i - 1; + $removes[] = $i; } - /** - * @var HTMLToken[] $tokens - */ - $tokens = array_merge($tokens, array()); + foreach ($removes as $remove) { + unset($tokens[$remove]); + } - // combine chars - for ($i = 1, $len = count($tokens); $i < $len; $i++) { - $token = $tokens[$i]; - if ($token->getType() !== HTMLToken::Character) { - continue; - } - $token_before = $tokens[$i - 1]; - if ($token_before->getType() !== HTMLToken::Character) { - continue; - } - $tokens[$i]->setData($token_before . 
$token->getData()); - unset($tokens[$i - 1]); - $len = count($tokens); - $tokens = array_merge($tokens, array()); - $i--; + if ($len !== count($tokens)) { + $tokens = array_merge($tokens,array()); } - $tokens = array_merge($tokens, array()); $this->tokens = $tokens; + return true; } protected function isInlineTag($tag) { @@ -470,22 +470,24 @@ protected function removeWhitespaceFromCharacter() { $token = $tokens[$i]; $type = $token->getType(); if ($type === HTMLToken::StartTag) { - $isBeforeInline = $this->isInlineTag($token->getTagName()); - switch ($token->getTagName()) { + $tagName = $token->getName(); + $isBeforeInline = $this->isInlineTag($tagName); + switch ($tagName) { case HTMLNames::scriptTag: case HTMLNames::styleTag: case HTMLNames::textareaTag: case HTMLNames::preTag: $isEditable = false; - $uneditableTag = $token->getTagName(); + $uneditableTag = $tagName; continue 2; break; default: break; } } else if ($type === HTMLToken::EndTag) { - $isBeforeInline = $this->isInlineTag($token->getTagName()); - if (!$isEditable && $token->getTagName() === $uneditableTag) { + $tagName = $token->getName(); + $isBeforeInline = $this->isInlineTag($tagName); + if (!$isEditable && $tagName === $uneditableTag) { $uneditableTag = null; $isEditable = true; continue; @@ -535,12 +537,12 @@ protected function _removeWhitespaceFromCharacter($characters) { $compactCharacters = ''; $hasWhiteSpace = false; - for ($i = 0, $len = mb_strlen($characters, static::ENCODING); $i < $len; $i++) { - $char = mb_substr($characters, $i, 1, static::ENCODING); + for ($i = 0, $len = strlen($characters); $i < $len; $i++) { + $char = $characters[$i]; if ($char === "\x0A") { // remove before whitespace char if ($hasWhiteSpace) { - $compactCharacters = mb_substr($compactCharacters, 0, -1, static::ENCODING); + $compactCharacters = substr($compactCharacters, 0, -1); } $compactCharacters .= $char; $hasWhiteSpace = true; @@ -567,7 +569,7 @@ protected function optimizeStartTagAttributes() { } $attributes_old 
= $token->getAttributes(); - $attributes_new = array(); + $attributes_new =array(); $attributes_name = array(); foreach ($attributes_old as $attribute) { @@ -586,15 +588,10 @@ protected function optimizeStartTagAttributes() { /** * downlevel-hidden : * downlevel-revealed : HTML - * @param HTMLToken $token + * @param string $comment * @return bool */ - protected function _isConditionalComment(HTMLToken $token) { - if ($token->getType() !== HTMLToken::Comment) { - return false; - } - - $comment = $this->_buildElement($token); + protected function _isConditionalComment($comment) { $pattern = '/\A/s'; if (preg_match($pattern, $comment)) { return true; diff --git a/src/zz/Html/HTMLToken.php b/src/zz/Html/HTMLToken.php index 76ae0e3..67bbbd1 100644 --- a/src/zz/Html/HTMLToken.php +++ b/src/zz/Html/HTMLToken.php @@ -235,7 +235,8 @@ public function setState($states) { } public function getTagName() { - if ($this->getType() !== static::StartTag && $this->getType() !== static::EndTag) { + $type = $this->getType(); + if ($type !== static::StartTag && $type !== static::EndTag) { return false; } return $this->getName(); diff --git a/src/zz/Html/HTMLTokenizer.php b/src/zz/Html/HTMLTokenizer.php index e97316a..30a1cdf 100644 --- a/src/zz/Html/HTMLTokenizer.php +++ b/src/zz/Html/HTMLTokenizer.php @@ -175,6 +175,7 @@ public function getState() { } /** + * @throws \InvalidArgumentException * @return HtmlToken[] */ public function tokenizer() { @@ -195,12 +196,13 @@ public function tokenizer() { $startState = $this->_startState; // In other than `DataState`, `nextToken` return the type of Character, it contains the type of EndTag. // SegmentedString go back to the end of the type of Character position. 
- if ($this->_Token->getType() === HTMLToken::Character && $this->_bufferedEndTagName !== '' && ($startState === static::RAWTEXTState || $startState === static::RCDATAState || $startState === static::ScriptDataState)) { + $type = $this->_Token->getType(); + if ($type === HTMLToken::Character && $this->_bufferedEndTagName !== '' && ($startState === static::RAWTEXTState || $startState === static::RCDATAState || $startState === static::ScriptDataState)) { $length = strlen($this->_Token->getData()); // HTMLToken::Character $this->_buffer = array_slice($this->_buffer, 0, $length); - $this->_compactBuffer($startPos, $startPos + $length); + $this->_compactBuffer($startPos, $startPos + $length, $type); $token = $this->_Token; $this->_tokens[] = $token; @@ -208,11 +210,11 @@ public function tokenizer() { $this->_SegmentedString->seek($startPos + $length); $this->_state = $startState; } else { - $this->_compactBuffer($startPos, $endPos); + $this->_compactBuffer($startPos, $endPos, $type); $token = $this->_Token; $this->_tokens[] = $token; // FIXME: The tokenizer should do this work for us. 
- if ($this->_Token->getType() === HTMLToken::StartTag) { + if ($type === HTMLToken::StartTag) { $this->_updateStateFor($token->getTagName()); } else { $this->_state = static::DataState; @@ -239,7 +241,7 @@ public function getTokensAsArray() { return $result; } - protected function _compactBuffer($startPos, $endPos) { + protected function _compactBuffer($startPos, $endPos, $type) { $compactBuffer = array(); $before = static::kEndOfFileMarker; $html = $this->_SegmentedString->substr($startPos, $endPos - $startPos); @@ -248,7 +250,7 @@ protected function _compactBuffer($startPos, $endPos) { $before = $compactBuffer[$i] = $state; } } - switch ($this->_Token->getType()) { + switch ($type) { case HTMLToken::Uninitialized: case HTMLToken::EndOfFile: case HTMLToken::Character: @@ -260,7 +262,7 @@ protected function _compactBuffer($startPos, $endPos) { if ($this->_debug) { $this->_Token->setHtmlOrigin($html); $this->_Token->setState($compactBuffer); - } else if ($this->_Token->getType() === HTMLToken::DOCTYPE) { + } else if ($type === HTMLToken::DOCTYPE) { $this->_Token->setHtmlOrigin($html); } $this->_Token->clean(); @@ -360,10 +362,10 @@ protected function nextToken(SegmentedString $source) { $this->_HTML_ADVANCE_TO(static::MarkupDeclarationOpenState); } else if ($char === '/') { $this->_HTML_ADVANCE_TO(static::EndTagOpenState); - } else if ($this->_isASCIIUpper($char)) { + } else if (ctype_upper($char)) { $this->_Token->beginStartTag(strtolower($char)); $this->_HTML_ADVANCE_TO(static::TagNameState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_Token->beginStartTag(strtolower($char)); $this->_HTML_ADVANCE_TO(static::TagNameState); } else if ($char === '?') { @@ -380,10 +382,10 @@ protected function nextToken(SegmentedString $source) { break; case static::EndTagOpenState: - if ($this->_isASCIIUpper($char)) { + if (ctype_upper($char)) { $this->_Token->beginEndTag(strtolower($char)); $this->_HTML_ADVANCE_TO(static::TagNameState); - 
} else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_Token->beginEndTag(strtolower($char)); $this->_HTML_ADVANCE_TO(static::TagNameState); } else if ($char === '>') { @@ -407,7 +409,7 @@ protected function nextToken(SegmentedString $source) { $this->_HTML_ADVANCE_TO(static::SelfClosingStartTagState); } else if ($char === '>') { return $this->_emitAndResumeIn(); - } else if ($this->_isASCIIUpper($char)) { + } else if (ctype_upper($char)) { $this->_Token->appendToName(strtolower($char)); $this->_HTML_ADVANCE_TO(static::TagNameState); } else if ($char === static::kEndOfFileMarker) { @@ -430,11 +432,11 @@ protected function nextToken(SegmentedString $source) { break; case static::RCDATAEndTagOpenState: - if ($this->_isASCIIUpper($char)) { + if (ctype_upper($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= strtolower($char); $this->_HTML_ADVANCE_TO(static::RCDATAEndTagNameState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= $char; $this->_HTML_ADVANCE_TO(static::RCDATAEndTagNameState); @@ -446,11 +448,11 @@ protected function nextToken(SegmentedString $source) { break; case static::RCDATAEndTagNameState: - if ($this->_isASCIIUpper($char)) { + if (ctype_upper($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= strtolower($char); $this->_HTML_ADVANCE_TO(static::RCDATAEndTagNameState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= $char; $this->_HTML_ADVANCE_TO(static::RCDATAEndTagNameState); @@ -499,11 +501,11 @@ protected function nextToken(SegmentedString $source) { break; case static::RAWTEXTEndTagOpenState: - if ($this->_isASCIIUpper($char)) { + if (ctype_upper($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= strtolower($char); 
$this->_HTML_ADVANCE_TO(static::RAWTEXTEndTagNameState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= $char; $this->_HTML_ADVANCE_TO(static::RAWTEXTEndTagNameState); @@ -515,11 +517,11 @@ protected function nextToken(SegmentedString $source) { break; case static::RAWTEXTEndTagNameState: - if ($this->_isASCIIUpper($char)) { + if (ctype_upper($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= strtolower($char); $this->_HTML_ADVANCE_TO(static::RAWTEXTEndTagNameState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= $char; $this->_HTML_ADVANCE_TO(static::RAWTEXTEndTagNameState); @@ -572,11 +574,11 @@ protected function nextToken(SegmentedString $source) { break; case static::ScriptDataEndTagOpenState: - if ($this->_isASCIIUpper($char)) { + if (ctype_upper($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= strtolower($char); $this->_HTML_ADVANCE_TO(static::ScriptDataEndTagNameState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= $char; $this->_HTML_ADVANCE_TO(static::ScriptDataEndTagNameState); @@ -588,11 +590,11 @@ protected function nextToken(SegmentedString $source) { break; case static::ScriptDataEndTagNameState: - if ($this->_isASCIIUpper($char)) { + if (ctype_upper($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= strtolower($char); $this->_HTML_ADVANCE_TO(static::ScriptDataEndTagNameState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= $char; $this->_HTML_ADVANCE_TO(static::ScriptDataEndTagNameState); @@ -700,13 +702,13 @@ protected function nextToken(SegmentedString $source) { if ($char === '/') { 
$this->_temporaryBuffer = ''; $this->_HTML_ADVANCE_TO(static::ScriptDataEscapedEndTagOpenState); - } else if ($this->_isASCIIUpper($char)) { + } else if (ctype_upper($char)) { $this->_bufferCharacter('<'); $this->_bufferCharacter($char); $this->_temporaryBuffer = ''; $this->_temporaryBuffer = strtolower($char); $this->_HTML_ADVANCE_TO(static::ScriptDataDoubleEscapeStartState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_bufferCharacter('<'); $this->_bufferCharacter($char); $this->_temporaryBuffer = ''; @@ -719,11 +721,11 @@ protected function nextToken(SegmentedString $source) { break; case static::ScriptDataEscapedEndTagOpenState: - if ($this->_isASCIIUpper($char)) { + if (ctype_upper($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= strtolower($char); $this->_HTML_ADVANCE_TO(static::ScriptDataEscapedEndTagNameState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= $char; $this->_HTML_ADVANCE_TO(static::ScriptDataEscapedEndTagNameState); @@ -735,11 +737,11 @@ protected function nextToken(SegmentedString $source) { break; case static::ScriptDataEscapedEndTagNameState: - if ($this->_isASCIIUpper($char)) { + if (ctype_upper($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= strtolower($char); $this->_HTML_ADVANCE_TO(static::ScriptDataEscapedEndTagNameState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_temporaryBuffer .= $char; $this->_bufferedEndTagName .= $char; $this->_HTML_ADVANCE_TO(static::ScriptDataEscapedEndTagNameState); @@ -780,11 +782,11 @@ protected function nextToken(SegmentedString $source) { } else { $this->_HTML_ADVANCE_TO(static::ScriptDataEscapedState); } - } else if ($this->_isASCIIUpper($char)) { + } else if (ctype_upper($char)) { $this->_bufferCharacter($char); $this->_temporaryBuffer .= strtolower($char); 
$this->_HTML_ADVANCE_TO(static::ScriptDataDoubleEscapeStartState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_bufferCharacter($char); $this->_temporaryBuffer .= $char; $this->_HTML_ADVANCE_TO(static::ScriptDataDoubleEscapeStartState); @@ -861,11 +863,11 @@ protected function nextToken(SegmentedString $source) { } else { $this->_HTML_ADVANCE_TO(static::ScriptDataDoubleEscapedState); } - } else if ($this->_isASCIIUpper($char)) { + } else if (ctype_upper($char)) { $this->_bufferCharacter($char); $this->_temporaryBuffer .= strtolower($char); $this->_HTML_ADVANCE_TO(static::ScriptDataDoubleEscapeEndState); - } else if ($this->_isASCIILower($char)) { + } else if (ctype_lower($char)) { $this->_bufferCharacter($char); $this->_temporaryBuffer .= $char; $this->_HTML_ADVANCE_TO(static::ScriptDataDoubleEscapeEndState); @@ -881,7 +883,7 @@ protected function nextToken(SegmentedString $source) { $this->_HTML_ADVANCE_TO(static::SelfClosingStartTagState); } else if ($char === '>') { return $this->_emitAndResumeIn(); - } else if ($this->_isASCIIUpper($char)) { + } else if (ctype_upper($char)) { $this->_Token->addNewAttribute(); $this->_Token->beginAttributeName($source->numberOfCharactersConsumed()); $this->_Token->appendToAttributeName(strtolower($char)); @@ -913,7 +915,7 @@ protected function nextToken(SegmentedString $source) { } else if ($char === '>') { $this->_Token->endAttributeName($source->numberOfCharactersConsumed()); return $this->_emitAndResumeIn(); - } else if ($this->_isASCIIUpper($char)) { + } else if (ctype_upper($char)) { $this->_Token->appendToAttributeName(strtolower($char)); $this->_HTML_ADVANCE_TO(static::AttributeNameState); } else if ($char === static::kEndOfFileMarker) { @@ -938,7 +940,7 @@ protected function nextToken(SegmentedString $source) { $this->_HTML_ADVANCE_TO(static::BeforeAttributeValueState); } else if ($char === '>') { return $this->_emitAndResumeIn(); - } else if ($this->_isASCIIUpper($char)) { + } 
else if (ctype_upper($char)) { $this->_Token->addNewAttribute(); $this->_Token->beginAttributeName($source->numberOfCharactersConsumed()); $this->_Token->appendToAttributeName(strtolower($char)); @@ -1265,7 +1267,7 @@ protected function nextToken(SegmentedString $source) { case static::BeforeDOCTYPENameState: if ($this->_isTokenizerWhitespace($char)) { $this->_HTML_ADVANCE_TO(static::BeforeDOCTYPENameState); - } else if ($this->_isASCIIUpper($char)) { + } else if (ctype_upper($char)) { $this->_Token->beginDOCTYPE(strtolower($char)); $this->_HTML_ADVANCE_TO(static::DOCTYPENameState); } else if ($char === '>') { @@ -1289,7 +1291,7 @@ protected function nextToken(SegmentedString $source) { $this->_HTML_ADVANCE_TO(static::AfterDOCTYPENameState); } else if ($char === '>') { return $this->_emitAndResumeIn(); - } else if ($this->_isASCIIUpper($char)) { + } else if (ctype_upper($char)) { $this->_Token->appendToName(strtolower($char)); $this->_HTML_ADVANCE_TO(static::DOCTYPENameState); } else if ($char === static::kEndOfFileMarker) { @@ -1636,14 +1638,6 @@ protected function _notImplemented() { // logger } - protected function _isASCIIUpper($char) { - return preg_match('/\A[A-Z]\Z/', $char); - } - - protected function _isASCIILower($char) { - return preg_match('/\A[a-z]\Z/', $char); - } - protected function _temporaryBufferIs($expectedString) { return $this->_vectorEqualsString($this->_temporaryBuffer, $expectedString); } @@ -1690,15 +1684,15 @@ protected function _emitAndResumeIn() { return true; } - protected function _flushEmitAndResumeIn($source, $state) { + protected function _flushEmitAndResumeIn(SegmentedString $source, $state) { // m_state = state; $this->_state = $state; $this->_flushBufferedEndTag($source); return true; } - protected function _flushBufferedEndTag($source) { - $this->_SegmentedString->advance(); + protected function _flushBufferedEndTag(SegmentedString $source) { + $source->advance(); if ($this->_Token->getType() === HTMLToken::Character) { return 
true; } diff --git a/src/zz/Html/SegmentedString.php b/src/zz/Html/SegmentedString.php index 2dbc0fa..6989989 100644 --- a/src/zz/Html/SegmentedString.php +++ b/src/zz/Html/SegmentedString.php @@ -48,7 +48,11 @@ public function __construct($str) { * @return bool|string */ public function getCurrentChar() { - return substr($this->str, $this->i, 1); + $i = $this->i; + if ($this->len <= $i) { + return false; + } + return $this->str[$i]; } public function advance() { @@ -78,6 +82,8 @@ public function substr($startPos, $length) { /** * @param int $offset + * @param int $whence + * @throws \InvalidArgumentException * @return bool */ public function seek($offset, $whence = self::begin) {