Permalink
Browse files

Added DefaultHandler option "enforceEmptyTags" so that XML can be par…

…sed correctly
  • Loading branch information...
tautologistics committed May 25, 2010
1 parent e8b2910 commit b954e7fec9da6a969bfcc1d3e40e1e1d57369fbb
Showing with 63 additions and 25 deletions.
  1. +9 −0 CHANGELOG
  2. +24 −1 README.md
  3. +6 −3 node-htmlparser.js
  4. +19 −18 node-htmlparser.min.js
  5. +2 −0 runtests.min.html
  6. +3 −3 snippet.js
View
@@ -0,0 +1,9 @@
+
+v1.5.0
+ * Added DefaultHandler option "enforceEmptyTags" so that XML can be parsed correctly
+
+v1.4.2
+ * Added tests for parsing XML with namespaces
+
+v1.4.1
+ * Added minified version
View
@@ -1,5 +1,5 @@
#NodeHtmlParser
-A forgiving HTML parser written in JS for both the browser and NodeJS (yes, despite the name it works just fine in any modern browser). The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output.
+A forgiving HTML/XML parser written in JS for both the browser and NodeJS (yes, despite the name it works just fine in any modern browser). The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output.
##Running Tests
@@ -141,6 +141,29 @@ becomes:
}
]
+###Option: enforceEmptyTags
+Indicates whether the DOM should prevent children on tags marked as empty in the HTML spec. Typically this should be set to "true" for HTML parsing and "false" for XML parsing. The default value is "true".
+
+####Example: true
+The following HTML:
+ <link>text</link>
+becomes:
+ [ { raw: 'link', data: 'link', type: 'tag', name: 'link' }
+ , { raw: 'text', data: 'text', type: 'text' }
+ ]
+
+####Example: false
+The following HTML:
+ <link>text</link>
+becomes:
+ [ { raw: 'link'
+ , data: 'link'
+ , type: 'tag'
+ , name: 'link'
+ , children: [ { raw: 'text', data: 'text', type: 'text' } ]
+ }
+ ]
+
##DomUtils
###TBD (see utils_example.js for now)
View
@@ -18,6 +18,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
***********************************************/
+/* v1.5.0 */
(function () {
@@ -416,6 +417,8 @@ function DefaultHandler (callback, options) {
this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
if (this._options.verbose == undefined)
this._options.verbose = true; //Keep data property for tags and raw property for all
+ if (this._options.enforceEmptyTags == undefined)
+ this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
if ((typeof callback) == "function")
this._callback = callback;
}
@@ -508,7 +511,7 @@ function DefaultHandler (callback, options) {
if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag
this.dom.push(element);
- if (!DefaultHandler._emptyTags[element.name]) { //Don't add tags to the tag stack that can't have children
+ if (!this._options.enforceEmptyTags || !DefaultHandler._emptyTags[element.name]) { //Don't add tags to the tag stack that can't have children
this._tagStack.push(element);
}
}
@@ -524,7 +527,7 @@ function DefaultHandler (callback, options) {
//This is a closing tag, scan the tagStack to find the matching opening tag
//and pop the stack up to the opening tag's parent
var baseName = element.name.substring(1);
- if (!DefaultHandler._emptyTags[baseName]) {
+ if (!this._options.enforceEmptyTags || !DefaultHandler._emptyTags[baseName]) {
var pos = this._tagStack.length - 1;
while (pos > -1 && this._tagStack[pos--].name != baseName) { }
if (pos > -1 || this._tagStack[0].name == baseName)
@@ -536,7 +539,7 @@ function DefaultHandler (callback, options) {
if (!this._tagStack.last().children)
this._tagStack.last().children = [];
this._tagStack.last().children.push(element);
- if (!DefaultHandler._emptyTags[element.name]) //Don't add tags to the tag stack that can't have children
+ if (!this._options.enforceEmptyTags || !DefaultHandler._emptyTags[element.name]) //Don't add tags to the tag stack that can't have children
this._tagStack.push(element);
}
}
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -39,6 +39,8 @@
<script language="JavaScript" src="tests/15-non-verbose.js"></script>
<script language="JavaScript" src="tests/16-ignore_whitespace.js"></script>
<script language="JavaScript" src="tests/17-xml_namespace.js"></script>
+ <script language="JavaScript" src="tests/18-enforce_empty_tags.js"></script>
+ <script language="JavaScript" src="tests/19-ignore_empty_tags.js"></script>
<!-- //TODO: dynamic loading of test files -->
</head>
<body style="font-size: small; font-family:Arial, Helvetica, sans-serif;">
View
@@ -3,13 +3,13 @@
var sys = require("sys");
var htmlparser = require("./node-htmlparser");
-var html = "<a href=\"test.html\">xxx</a>";
+var html = "<link>text</link>";
var handler = new htmlparser.DefaultHandler(function(err, dom) {
if (err)
sys.debug("Error: " + err);
else
sys.debug(sys.inspect(dom, false, null));
-}, { verbose: false });
+}, { enforceEmptyTags: true });
var parser = new htmlparser.Parser(handler);
-parser.ParseComplete(html);
+parser.parseComplete(html);

0 comments on commit b954e7f

Please sign in to comment.