Skip to content

Commit

Permalink
use new haxe.xml.Parser for JS : faster, not based on Regexp but stat…
Browse files Browse the repository at this point in the history
…e machine

git-svn-id: http://haxe.googlecode.com/svn/trunk@4486 f16182fa-f095-11de-8f43-4547254af6c6
  • Loading branch information
ncannasse committed Apr 20, 2012
1 parent 7743b22 commit 8bae7d8
Show file tree
Hide file tree
Showing 3 changed files with 286 additions and 111 deletions.
1 change: 1 addition & 0 deletions doc/CHANGES.txt
Expand Up @@ -8,6 +8,7 @@
js : forbid static 'length' (issue since object is a Function)
all : does not allow overriding var/prop
flash : removed wrapping for Xml nodes, use instead specific compare when comparing two typed nodes
js : use new haxe.xml.Parser (faster, not based on Regexp)

2012-04-14: 2.09
all : optimized const == const and const != const (with different const types)
Expand Down
284 changes: 284 additions & 0 deletions std/haxe/xml/Parser.hx
@@ -0,0 +1,284 @@
package haxe.xml;

using StringTools;

/*
	Poor man's enum: the parser states are plain Int constants, which reduces
	code size and is a bit faster since the values are inlined.
	Each constant names one state of the state machine in Parser.doParse.
*/
extern private class S {
// Skip '\n', '\r', '\t' and ' ', then switch to the pending `next` state.
public static inline var IGNORE_SPACES = 0;
// Start of content: '<' opens markup, any other character starts text (PCDATA).
public static inline var BEGIN = 1;
// First character after '<': '!' (CDATA / DOCTYPE / comment), '?' (header),
// '/' (closing tag), anything else starts a tag name.
public static inline var BEGIN_NODE = 2;
// Reading an element name; ends at the first character that is not a name character.
public static inline var TAG_NAME = 3;
// Inside an opening tag after the name or an attribute: expects '/', '>' or an attribute name.
public static inline var BODY = 4;
// Reading an attribute name.
public static inline var ATTRIB_NAME = 5;
// Expects the '=' between an attribute name and its value.
public static inline var EQUALS = 6;
// Expects the opening quote (single or double) of an attribute value.
public static inline var ATTVAL_BEGIN = 7;
// Reading an attribute value up to the quote character that opened it.
public static inline var ATTRIB_VAL = 8;
// Just after the '>' of an opening tag: children are parsed by a recursive doParse call.
public static inline var CHILDS = 9;
// Reading the element name of a closing tag ("</name").
public static inline var CLOSE = 10;
// After the '/' of a self-closing tag: expects '>'.
public static inline var WAIT_END = 11;
// After the name of a closing tag: expects '>', then doParse returns to its caller.
public static inline var WAIT_END_RET = 12;
// Reading text up to the next '<'.
public static inline var PCDATA = 13;
// Reading a "<? ... ?>" header up to "?>".
public static inline var HEADER = 14;
// Reading a comment up to "-->".
public static inline var COMMENT = 15;
// Reading a DOCTYPE up to the '>' found outside any '[' ... ']' pair.
public static inline var DOCTYPE = 16;
// Reading a CDATA section up to "]]>".
public static inline var CDATA = 17;
}

/**
	XML parser implemented as a hand-written state machine (no regular
	expressions). Errors are reported by throwing a String message.
**/
class Parser
{
	/**
		Parses `str` and returns a new document node holding every
		top-level node found in it.

		Throws a String when the input is malformed (unexpected character,
		mismatched, unopened or unclosed tag, duplicate attribute, or
		premature end of input).
	**/
	static public function parse(str:String)
	{
		var doc = Xml.createDocument();
		doParse(str, 0, doc);
		return doc;
	}

	/**
		Parses the content of `parent` starting at index `p` of `str` and
		appends every node found to `parent`.

		The function calls itself for the children of each element. A nested
		call returns the index of the '>' that ends the closing tag of
		`parent`; the document-level call returns the index at which the
		input ended.
	**/
	static function doParse(str:String, ?p:Int = 0, ?parent:Xml):Int
	{
		var xml:Xml = null;          // element whose opening tag is being read
		var state = S.BEGIN;
		var next = S.BEGIN;          // state to enter once IGNORE_SPACES is done
		var aname:String = null;     // name of the attribute being read
		var start = 0;               // start index of the token being read
		var nsubs = 0;               // number of children added so far (see WAIT_END_RET)
		var nbrackets = 0;           // '[' ... ']' nesting depth inside a DOCTYPE
		var c = str.fastCodeAt(p);

		while (!c.isEOF())
		{
			// A `continue` below re-examines the same character in the new
			// state; falling out of the switch advances to the next character.
			switch(state)
			{
				case S.IGNORE_SPACES:
					switch(c)
					{
						case
							'\n'.code,
							'\r'.code,
							'\t'.code,
							' '.code:
						default:
							state = next;
							continue;
					}
				case S.BEGIN:
					switch(c)
					{
						case '<'.code:
							state = S.IGNORE_SPACES;
							next = S.BEGIN_NODE;
						default:
							start = p;
							state = S.PCDATA;
							continue;
					}
				case S.PCDATA:
					if (c == '<'.code)
					{
						var child = Xml.createPCData(str.substr(start, p - start));
						parent.addChild(child);
						nsubs++;
						state = S.IGNORE_SPACES;
						next = S.BEGIN_NODE;
					}
				case S.CDATA:
					if (c == ']'.code && str.fastCodeAt(p + 1) == ']'.code && str.fastCodeAt(p + 2) == '>'.code)
					{
						var child = Xml.createCData(str.substr(start, p - start));
						parent.addChild(child);
						nsubs++;
						p += 2; // skip "]]"; the loop increment then skips '>'
						state = S.BEGIN;
					}
				case S.BEGIN_NODE:
					switch(c)
					{
						case '!'.code:
							if (str.fastCodeAt(p + 1) == '['.code)
							{
								p += 2;
								if (str.substr(p, 6).toUpperCase() != "CDATA[")
									throw "Expected <![CDATA[";
								p += 5;
								state = S.CDATA;
								start = p + 1;
							}
							else if (str.fastCodeAt(p + 1) == 'D'.code || str.fastCodeAt(p + 1) == 'd'.code)
							{
								if (str.substr(p + 2, 6).toUpperCase() != "OCTYPE")
									throw "Expected <!DOCTYPE";
								p += 8;
								state = S.DOCTYPE;
								start = p + 1;
							}
							else if (str.fastCodeAt(p + 1) != '-'.code || str.fastCodeAt(p + 2) != '-'.code)
								throw "Expected <!--";
							else
							{
								p += 2;
								state = S.COMMENT;
								start = p + 1;
							}
						case '?'.code:
							state = S.HEADER;
							start = p;
						case '/'.code:
							if (parent == null)
								throw "Expected node name";
							start = p + 1;
							state = S.IGNORE_SPACES;
							next = S.CLOSE;
						default:
							state = S.TAG_NAME;
							start = p;
							continue;
					}
				case S.TAG_NAME:
					if (!isValidChar(c))
					{
						if (p == start)
							throw "Expected node name";
						xml = Xml.createElement(str.substr(start, p - start));
						parent.addChild(xml);
						state = S.IGNORE_SPACES;
						next = S.BODY;
						continue;
					}
				case S.BODY:
					switch(c)
					{
						case '/'.code:
							state = S.WAIT_END;
							nsubs++;
						case '>'.code:
							state = S.CHILDS;
							nsubs++;
						default:
							state = S.ATTRIB_NAME;
							start = p;
							continue;
					}
				case S.ATTRIB_NAME:
					if (!isValidChar(c))
					{
						if (start == p)
							throw "Expected attribute name";
						aname = str.substr(start, p - start);
						if (xml.exists(aname))
							throw "Duplicate attribute";
						state = S.IGNORE_SPACES;
						next = S.EQUALS;
						continue;
					}
				case S.EQUALS:
					switch(c)
					{
						case '='.code:
							state = S.IGNORE_SPACES;
							next = S.ATTVAL_BEGIN;
						default:
							throw "Expected =";
					}
				case S.ATTVAL_BEGIN:
					switch(c)
					{
						case '"'.code, '\''.code:
							state = S.ATTRIB_VAL;
							start = p; // index of the opening quote
						default:
							throw "Expected \"";
					}
				case S.ATTRIB_VAL:
					// The value ends at the same quote character that opened it.
					if (c == str.fastCodeAt(start))
					{
						var val = str.substr(start + 1, p - start - 1);
						xml.set(aname, val);
						state = S.IGNORE_SPACES;
						next = S.BODY;
					}
				case S.CHILDS:
					// The nested call returns the index of the '>' closing `xml`.
					p = doParse(str, p, xml);
					start = p;
					state = S.BEGIN;
				case S.WAIT_END:
					switch(c)
					{
						case '>'.code:
							state = S.BEGIN;
						default:
							throw "Expected >";
					}
				case S.WAIT_END_RET:
					switch(c)
					{
						case '>'.code:
							// An element closed without any child gets an empty text node.
							if (nsubs == 0)
								parent.addChild(Xml.createPCData(""));
							return p;
						default:
							throw "Expected >";
					}
				case S.CLOSE:
					if (!isValidChar(c))
					{
						if (start == p)
							throw "Expected node name";

						var v = str.substr(start, p - start);
						// A closing tag is only legal while an element is open.
						// Without this check a stray "</x>" at document level
						// would read nodeName on the document node.
						if (parent == null || parent.nodeType != Xml.Element)
							throw "Unexpected </" + v + ">, tag is not open";
						if (v != parent.nodeName)
							throw "Expected </" + parent.nodeName + ">";

						state = S.IGNORE_SPACES;
						next = S.WAIT_END_RET;
						continue;
					}
				case S.COMMENT:
					if (c == '-'.code && str.fastCodeAt(p + 1) == '-'.code && str.fastCodeAt(p + 2) == '>'.code)
					{
						parent.addChild(Xml.createComment(str.substr(start, p - start)));
						p += 2;
						state = S.BEGIN;
					}
				case S.DOCTYPE:
					if (c == '['.code)
						nbrackets++;
					else if (c == ']'.code)
						nbrackets--;
					else if (c == '>'.code && nbrackets == 0)
					{
						parent.addChild(Xml.createDocType(str.substr(start, p - start)));
						state = S.BEGIN;
					}
				case S.HEADER:
					if (c == '?'.code && str.fastCodeAt(p + 1) == '>'.code)
					{
						p++;
						// `start` is the index of the opening '?'; drop both '?'.
						var prolog = str.substr(start + 1, p - start - 2);
						parent.addChild(Xml.createProlog(prolog));
						state = S.BEGIN;
					}
			}
			c = str.fastCodeAt(++p);
		}

		// End of input.
		if (state == S.BEGIN)
		{
			start = p;
			state = S.PCDATA;
		}

		if (state == S.PCDATA)
		{
			// A nested call only returns normally through WAIT_END_RET, so
			// running out of input while inside an element means its closing
			// tag is missing.
			if (parent != null && parent.nodeType == Xml.Element)
				throw "Unclosed node <" + parent.nodeName + ">";
			if (p != start || nsubs == 0)
				parent.addChild(Xml.createPCData(str.substr(start, p - start)));
			return p;
		}

		throw "Unexpected end";
	}

	/**
		Tells whether character code `c` may appear in an element or
		attribute name: ASCII letters, digits, ':', '.', '_' and '-'.
	**/
	static inline function isValidChar(c) {
		return (c >= 'a'.code && c <= 'z'.code) || (c >= 'A'.code && c <= 'Z'.code) || (c >= '0'.code && c <= '9'.code) || c == ':'.code || c == '.'.code || c == '_'.code || c == '-'.code;
	}
}

0 comments on commit 8bae7d8

Please sign in to comment.