patch to support doctype with test #48

Open
wants to merge 3 commits into from
View
64 lib/htmlparser.js
@@ -49,6 +49,7 @@ var Mode = {
Tag: 'tag',
Attr: 'attr',
CData: 'cdata',
+ Doctype: 'doctype',
Comment: 'comment'
};
@@ -136,6 +137,8 @@ function Parser (builder, options) {
return this._parseAttr(this._state);
case Mode.CData:
return this._parseCData(this._state);
+ case Mode.Doctype:
+ return this._parseDoctype(this._state);
case Mode.Comment:
return this._parseComment(this._state);
}
@@ -224,6 +227,11 @@ function Parser (builder, options) {
state.pos += 8;
return;
}
+ if (!match[1] && match[2].substr(0, 8) === '!DOCTYPE') {
+ state.mode = Mode.Doctype;
+ state.pos += 8;
+ return;
+ }
if (!state.done && (state.pos + match[0].length) === state.data.length) {
//We're at the end of the data, might be incomplete
state.needData = true;
@@ -341,6 +349,7 @@ function Parser (builder, options) {
}
state.pos += name_data.match.length;
var value_data = this._parseAttr_findValue(state);
+ var end = state.data.indexOf(' ', state.pos);
if (value_data) {
if (!state.done && state.pos + value_data.match.length === state.data.length) {
state.needData = true;
@@ -349,16 +358,24 @@ function Parser (builder, options) {
}
state.pos += value_data.match.length;
} else {
- Parser.re_parseAttr_splitValue.lastIndex = state.pos;
- if (Parser.re_parseAttr_splitValue.exec(state.data)) {
- state.needData = true;
- state.pos -= name_data.match.length;
- return;
+ if (state.data.indexOf(' ', state.pos-1)) {
+ value_data = {
+ match: ''
+ , value: name_data.name
+ };
+
+ } else {
+ Parser.re_parseAttr_splitValue.lastIndex = state.pos;
+ if (Parser.re_parseAttr_splitValue.exec(state.data)) {
+ state.needData = true;
+ state.pos -= name_data.match.length;
+ return;
+ }
+ value_data = {
+ match: ''
+ , value: null
+ };
}
- value_data = {
- match: ''
- , value: null
- };
}
state.lastTag.raw += name_data.match + value_data.match;
@@ -400,6 +417,35 @@ function Parser (builder, options) {
}
};
+ Parser.prototype._parseDoctype = function Parser$_parseDoctype () { // Consumes doctype text up to the next '>' and emits it as a single 'doctype' element.
+ var state = this._state;
+ var foundPos = state.data.indexOf('>', state.pos); // index of the first '>' at or after the current position, or -1
+ if (foundPos < 0 && state.done) {
+ foundPos = state.data.length; // input is finished: treat the end of the data as the terminator
+ }
+ if (foundPos < 0) { // no terminator yet and more data may arrive
+ Parser.re_parseCData_findEnding.lastIndex = state.pos; // NOTE(review): this regex is never read in this function; looks like a leftover from the CDATA parser - confirm before removing
+ if (!state.pendingText) {
+ state.pendingText = [];
+ }
+ state.pendingText.push(state.data.substr(state.pos, state.data.length)); // second argument of substr is a length, so this takes everything from pos to the end of data
+ state.pos = state.data.length; // everything buffered so far has been consumed
+ state.needData = true;
+ } else {
+ var text;
+ if (state.pendingText) { // earlier calls buffered partial text: append the final piece and join
+ state.pendingText.push(state.data.substring(state.pos, foundPos));
+ text = state.pendingText.join('');
+ state.pendingText = null;
+ } else {
+ text = state.data.substring(state.pos, foundPos);
+ }
+ this._write({ type: Mode.Doctype, data: text }); // emitted text excludes the closing '>'
+ state.mode = Mode.Text;
+ state.pos = foundPos + 1; // skip past '>'; NOTE(review): when foundPos was set to data.length above, this is one past the end of data - confirm callers tolerate that
+ }
+ };
+
Parser.re_parseComment_findEnding = /\-{1,2}$/;
Parser.prototype._parseComment = function Parser$_parseComment () {
var state = this._state;
View
9 tests/parser.js
@@ -367,6 +367,15 @@ exports['html inside comment'] = {
, expected: [{ type: 'comment', data: ' <div>foo</div> '}]
};
+exports['transitional doctype'] = { // Fixture: a doctype followed by an empty html element; the expected doctype data omits the leading '<!DOCTYPE' and the closing '>'.
+ data: ['<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html></html>']
+ , expected: [
+ { type: 'doctype', data: ' HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"'},
+ { type: 'tag', name: 'html', raw: 'html' },
+ { type: 'tag', name: '/html', raw: '/html' }
+ ]
+};
+
exports['html inside cdata'] = {
data: ['<![CDATA[ <div>foo</div> ]]>']
, expected: [{ type: 'cdata', data: ' <div>foo</div> '}]