From e27b57dc3a5e7977ce12d6aebc34d178da98988b Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Mon, 14 Dec 2015 18:36:20 +0900 Subject: [PATCH 01/27] Conform ampersand-error reporting to HTML spec --- .../validator/htmlparser/impl/Tokenizer.java | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 13fd56b1..84c59e01 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -221,6 +221,8 @@ public class Tokenizer implements Locator, Locator2 { public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74; + public static final int AMBIGUOUS_AMPERSAND = 75; + /** * Magic value for UTF-16 operations. */ @@ -3054,6 +3056,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException { case '<': case '&': case '\u0000': + case ';': emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; @@ -3082,17 +3085,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { firstCharKey = c - 'A'; } else { // No match - /* - * If no match can be made, then this is a parse - * error. - */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; } reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } // Didn't fail yet @@ -3153,17 +3151,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { } } if (hilo == 0) { - /* - * If no match can be made, then this is a parse - * error. 
- */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; } reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } // Didn't fail yet @@ -3246,16 +3239,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException { if (candidate == -1) { // reconsume deals with CR, LF or nul - /* - * If no match can be made, then this is a parse error. - */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = pos; } reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } else { // c can't be CR, LF or nul if we got here @@ -3293,10 +3282,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * after the U+0026 AMPERSAND (&) must be * unconsumed, and nothing is returned. */ - errNoNamedCharacterMatch(); appendCharRefBufToStrBuf(); reconsume = true; - state = transition(state, returnState, reconsume, pos); + state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos); continue stateloop; } } @@ -3359,6 +3347,28 @@ private void ensureBufferSpace(int inputLength) throws SAXException { * I'm ∉ I tell you. 
*/ } + // XXX reorder point + case AMBIGUOUS_AMPERSAND: + ampersandloop: for (;;) { + if (reconsume) { + if (++pos == endPos) { + break stateloop; + } + pos--; + c = checkChar(buf, pos); + } + if (c == ';') { + errNoNamedCharacterMatch(); + } else if ((c >= '0' && c <= '9') + || (c >= 'A' && c <= 'Z') + || (c >= 'a' && c <= 'z')) { + appendStrBuf(c); + pos++; + continue; + } + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } case CONSUME_NCR: if (++pos == endPos) { break stateloop; @@ -6449,7 +6459,6 @@ public void eof() throws SAXException { state = returnState; continue; case CHARACTER_REFERENCE_HILO_LOOKUP: - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); state = returnState; continue; @@ -6503,10 +6512,6 @@ public void eof() throws SAXException { } if (candidate == -1) { - /* - * If no match can be made, then this is a parse error. - */ - errNoNamedCharacterMatch(); emitOrAppendCharRefBuf(returnState); state = returnState; continue eofloop; @@ -6544,7 +6549,6 @@ public void eof() throws SAXException { * after the U+0026 AMPERSAND (&) must be * unconsumed, and nothing is returned. 
*/ - errNoNamedCharacterMatch(); appendCharRefBufToStrBuf(); state = returnState; continue eofloop; From 82ef614c00e0872ebf122af489a451fb9b327c2c Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Fri, 10 Jun 2016 21:39:28 +0900 Subject: [PATCH 02/27] Make consecutive hyphens in comments a non-error Also allow `<!-->` at (IE conditional) comment end See https://github.com/whatwg/html/pull/1356 See https://github.com/whatwg/html/pull/1456 --- .../impl/ErrorReportingTokenizer.java | 10 +- .../validator/htmlparser/impl/Tokenizer.java | 199 ++++++++++++++++-- 2 files changed, 190 insertions(+), 19 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java b/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java index 19fbe7a6..77dc9090 100644 --- a/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java +++ b/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2013 Mozilla Foundation + * Copyright (c) 2009-2017 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -395,8 +395,8 @@ private boolean isAstralPrivateUse(int c) { err("Nameless doctype."); } - @Override protected void errConsecutiveHyphens() throws SAXException { - err("Consecutive hyphens did not terminate a comment. \u201C--\u201D is not permitted inside a comment, but e.g. 
\u201C- -\u201D is."); + @Override protected void errNestedComment() throws SAXException { + err("Saw \u201C @@ -71,13 +69,30 @@ scm:hg:http://hg.mozilla.org/projects/htmlparser/ http://hg.mozilla.org/projects/htmlparser/ + + UTF-8 + ${project.build.directory}/src ${basedir}/test-src + + org.apache.maven.plugins + maven-source-plugin + 3.2.1 + + + attach-sources + + jar-no-fork + + + + org.apache.maven.plugins maven-compiler-plugin + 3.8.1 1.7 1.7 @@ -128,13 +143,41 @@ + + org.apache.maven.plugins + maven-jar-plugin + 3.2.0 + + + false + + + org.apache.maven.plugins maven-surefire-plugin + 3.0.0-M5 true + + org.apache.maven.plugins + maven-javadoc-plugin + 3.1.0 + + 7 + true + + + + attach-javadocs + + jar + + + + From 85705859a5f01f7ef0e2f29f052a41eab0a66861 Mon Sep 17 00:00:00 2001 From: Carlos Amengual Date: Sun, 5 Jul 2020 16:06:31 +0200 Subject: [PATCH 23/27] Maven POM: update dependency version for jsontools. Not required for modularization, but no reason to stay on the old version. 
--- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 39fdb1ae..da54d8f8 100644 --- a/pom.xml +++ b/pom.xml @@ -205,7 +205,7 @@ com.sdicons.jsontools jsontools-core - 1.4 + 1.7 test From 2ad928a24e7401e9e31c0ccc925da16dceb48c5f Mon Sep 17 00:00:00 2001 From: Carlos Amengual Date: Sun, 5 Jul 2020 16:22:33 +0200 Subject: [PATCH 24/27] Maven POM: update the 'scm' information --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index da54d8f8..789fa5ee 100644 --- a/pom.xml +++ b/pom.xml @@ -66,8 +66,8 @@ - scm:hg:http://hg.mozilla.org/projects/htmlparser/ - http://hg.mozilla.org/projects/htmlparser/ + scm:git:https://github.com/validator/htmlparser.git + https://github.com/validator/htmlparser UTF-8 From 4e33cbcaf7f79baf70aba1fd8d4fb6f69f282ef3 Mon Sep 17 00:00:00 2001 From: Carlos Amengual Date: Sun, 5 Jul 2020 16:33:06 +0200 Subject: [PATCH 25/27] Modularize the library, modifying the POM and adding a module-info Requires JDK 12 or higher (due to bug in JDK 11), but produces bytecode for Java 7 and 11. 
--- pom.xml | 47 +++++++++++++++++++++++++++++++++++++------- src/module-info.java | 39 ++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 src/module-info.java diff --git a/pom.xml b/pom.xml index 789fa5ee..4331a6da 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ + [64.2,) compile true diff --git a/src/module-info.java b/src/module-info.java new file mode 100644 index 00000000..729b91c0 --- /dev/null +++ b/src/module-info.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Carlos Amengual + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +module nu.validator.htmlparser { + exports nu.validator.htmlparser.annotation; + exports nu.validator.htmlparser.common; + exports nu.validator.htmlparser.dom; + exports nu.validator.htmlparser.extra; + exports nu.validator.htmlparser.impl; + exports nu.validator.htmlparser.io; + exports nu.validator.htmlparser.rewindable; + exports nu.validator.htmlparser.sax; + exports nu.validator.htmlparser.xom; + exports nu.validator.saxtree; + + requires transitive java.xml; + requires static nu.xom; + requires static com.ibm.icu; + requires static jchardet; +} From b047f935b2f7f702102c51344a897e99f2c0976d Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Mon, 6 Jul 2020 13:25:33 +0900 Subject: [PATCH 26/27] Maven POM: Use <doclint>none</doclint> for javadoc Without this change, the Maven javadoc build fails with fatal errors. --- pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/pom.xml b/pom.xml index 4331a6da..355bae5e 100644 --- a/pom.xml +++ b/pom.xml @@ -195,6 +195,7 @@ 7 true + none From cf8e4a41fe1c1b7b02c5d88e9b9a27dabce0c8af Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Mon, 6 Jul 2020 14:08:06 +0900 Subject: [PATCH 27/27] Maven POM: Use https URLs --- pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 355bae5e..31b2bace 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ htmlparser 1.4 htmlparser - http://about.validator.nu/htmlparser/ + https://about.validator.nu/htmlparser/ The Validator.nu HTML Parser is an implementation of the HTML5 parsing algorithm in Java for applications. The parser is designed to work as a drop-in replacement for the XML parser in applications that already support XHTML 1.x content with an XML parser and use SAX, DOM or XOM to interface with the parser.