Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

More work:

* move parser into lib
* import entities from html5lib
* get more stuff working in the tokenizer
  • Loading branch information...
commit 6af006b12a549feffc591aadc092f80a7eadeff9 1 parent 9af9585
@wycats authored
View
27 Rakefile
@@ -1 +1,26 @@
-require "bundler/setup"
+require "json"
+require "./constants"
+
+task :default do
+ File.open("lib/entities.js", "w") do |file|
+ file.puts "return {\n named: {\n"
+
+ out = []
+ HTML5::ENTITIES.each do |entity, string|
+ out << %{ #{entity.dup.force_encoding('UTF-8').to_json}: #{string.force_encoding('UTF-8').to_json}}
+ end
+ file.puts out.join(",\n")
+ file.puts " },\n"
+
+ out = []
+ file.puts " windows: [\n"
+ HTML5::ENTITIES_WINDOWS1252.each do |entity|
+ out << %{ #{entity}}
+ end
+ file.puts out.join(",\n")
+ file.puts " ]\n"
+
+ file.puts "}"
+ end
+end
+
View
1,061 constants.rb
@@ -0,0 +1,1061 @@
+module HTML5
+
+ class EOF < Exception; end # NOTE(review): subclasses Exception, not StandardError, so a bare `rescue` will not catch it -- presumably intentional for an end-of-input signal; confirm.
+
+ def self._(str); str end # Returns str unchanged; every message in E is wrapped in this call. NOTE(review): presumably a placeholder for a gettext-style translation hook -- confirm.
+
+ CONTENT_MODEL_FLAGS = [
+ :PCDATA,
+ :RCDATA,
+ :CDATA,
+ :PLAINTEXT
+ ]
+
+ SCOPING_ELEMENTS = %w[
+ applet
+ button
+ caption
+ html
+ marquee
+ object
+ table
+ td
+ th
+ ]
+
+ FORMATTING_ELEMENTS = %w[
+ a
+ b
+ big
+ em
+ font
+ i
+ nobr
+ s
+ small
+ strike
+ strong
+ tt
+ u
+ ]
+
+ SPECIAL_ELEMENTS = %w[
+ address
+ area
+ base
+ basefont
+ bgsound
+ blockquote
+ body
+ br
+ center
+ col
+ colgroup
+ dd
+ dir
+ div
+ dl
+ dt
+ embed
+ fieldset
+ form
+ frame
+ frameset
+ h1
+ h2
+ h3
+ h4
+ h5
+ h6
+ head
+ hr
+ iframe
+ image
+ img
+ input
+ isindex
+ li
+ link
+ listing
+ menu
+ meta
+ noembed
+ noframes
+ noscript
+ ol
+ optgroup
+ option
+ p
+ param
+ plaintext
+ pre
+ script
+ select
+ spacer
+ style
+ tbody
+ textarea
+ tfoot
+ thead
+ title
+ tr
+ ul
+ wbr
+ ]
+
+ SPACE_CHARACTERS = %W[
+ \t
+ \n
+ \x0B
+ \x0C
+ \x20
+ \r
+ ]
+
+ TABLE_INSERT_MODE_ELEMENTS = %w[
+ table
+ tbody
+ tfoot
+ thead
+ tr
+ ]
+
+ ASCII_LOWERCASE = ('a'..'z').to_a.join('') # String "abc...z"
+ ASCII_UPPERCASE = ('A'..'Z').to_a.join('') # String "ABC...Z"
+ ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE # String: lowercase letters followed by uppercase letters
+ DIGITS = '0'..'9' # a Range, not a String like the constants above
+ HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a # Array of one-character Strings: 0-9, a-f, A-F
+
+ # Heading elements need to be ordered
+ HEADING_ELEMENTS = %w[
+ h1
+ h2
+ h3
+ h4
+ h5
+ h6
+ ]
+
+ # XXX What about event-source and command?
+ VOID_ELEMENTS = %w[
+ base
+ link
+ meta
+ hr
+ br
+ img
+ embed
+ param
+ area
+ col
+ input
+ ]
+
+ CDATA_ELEMENTS = %w[title textarea] # NOTE(review): the HTML spec tokenizes title/textarea as RCDATA and the elements listed under RCDATA_ELEMENTS below as raw text; the two names look swapped relative to the spec -- confirm how consumers use them before renaming.
+
+ RCDATA_ELEMENTS = %w[
+ style
+ script
+ xmp
+ iframe
+ noembed
+ noframes
+ noscript
+ ]
+
+ BOOLEAN_ATTRIBUTES = {
+ :global => %w[irrelevant],
+ 'style' => %w[scoped],
+ 'img' => %w[ismap],
+ 'audio' => %w[autoplay controls],
+ 'video' => %w[autoplay controls],
+ 'script' => %w[defer async],
+ 'details' => %w[open],
+ 'datagrid' => %w[multiple disabled],
+ 'command' => %w[hidden disabled checked default],
+ 'menu' => %w[autosubmit],
+ 'fieldset' => %w[disabled readonly],
+ 'option' => %w[disabled readonly selected],
+ 'optgroup' => %w[disabled readonly],
+ 'button' => %w[disabled autofocus],
+ 'input' => %w[disabled readonly required autofocus checked ismap],
+ 'select' => %w[disabled readonly autofocus multiple],
+ 'output' => %w[disabled readonly]
+
+ }
+
+ # ENTITIES_WINDOWS1252 must stay ordered: the entry at index i is the replacement code point for byte 0x80 + i.
+ ENTITIES_WINDOWS1252 = [
+ 8364, # 0x80 0x20AC EURO SIGN
+ 65533, # 0x81 UNDEFINED
+ 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
+ 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
+ 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
+ 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
+ 8224, # 0x86 0x2020 DAGGER
+ 8225, # 0x87 0x2021 DOUBLE DAGGER
+ 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
+ 8240, # 0x89 0x2030 PER MILLE SIGN
+ 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
+ 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
+ 65533, # 0x8D UNDEFINED
+ 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
+ 65533, # 0x8F UNDEFINED
+ 65533, # 0x90 UNDEFINED
+ 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
+ 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
+ 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
+ 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
+ 8226, # 0x95 0x2022 BULLET
+ 8211, # 0x96 0x2013 EN DASH
+ 8212, # 0x97 0x2014 EM DASH
+ 732, # 0x98 0x02DC SMALL TILDE
+ 8482, # 0x99 0x2122 TRADE MARK SIGN
+ 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
+ 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
+ 65533, # 0x9D UNDEFINED
+ 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
+ 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
+ ]
+
+ # ENTITIES was generated from Python using the following code:
+ #
+ # import constants
+ # entities = constants.entities.items()
+ # entities.sort()
+ # list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
+ # repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
+ # for entity, value in entities]
+ # print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
+
+ ENTITIES = {
+ 'AElig' => "\xc3\x86",
+ 'AElig;' => "\xc3\x86",
+ 'AMP' => '&',
+ 'AMP;' => '&',
+ 'Aacute' => "\xc3\x81",
+ 'Aacute;' => "\xc3\x81",
+ 'Acirc' => "\xc3\x82",
+ 'Acirc;' => "\xc3\x82",
+ 'Agrave' => "\xc3\x80",
+ 'Agrave;' => "\xc3\x80",
+ 'Alpha;' => "\xce\x91",
+ 'Aring' => "\xc3\x85",
+ 'Aring;' => "\xc3\x85",
+ 'Atilde' => "\xc3\x83",
+ 'Atilde;' => "\xc3\x83",
+ 'Auml' => "\xc3\x84",
+ 'Auml;' => "\xc3\x84",
+ 'Beta;' => "\xce\x92",
+ 'COPY' => "\xc2\xa9",
+ 'COPY;' => "\xc2\xa9",
+ 'Ccedil' => "\xc3\x87",
+ 'Ccedil;' => "\xc3\x87",
+ 'Chi;' => "\xce\xa7",
+ 'Dagger;' => "\xe2\x80\xa1",
+ 'Delta;' => "\xce\x94",
+ 'ETH' => "\xc3\x90",
+ 'ETH;' => "\xc3\x90",
+ 'Eacute' => "\xc3\x89",
+ 'Eacute;' => "\xc3\x89",
+ 'Ecirc' => "\xc3\x8a",
+ 'Ecirc;' => "\xc3\x8a",
+ 'Egrave' => "\xc3\x88",
+ 'Egrave;' => "\xc3\x88",
+ 'Epsilon;' => "\xce\x95",
+ 'Eta;' => "\xce\x97",
+ 'Euml' => "\xc3\x8b",
+ 'Euml;' => "\xc3\x8b",
+ 'GT' => '>',
+ 'GT;' => '>',
+ 'Gamma;' => "\xce\x93",
+ 'Iacute' => "\xc3\x8d",
+ 'Iacute;' => "\xc3\x8d",
+ 'Icirc' => "\xc3\x8e",
+ 'Icirc;' => "\xc3\x8e",
+ 'Igrave' => "\xc3\x8c",
+ 'Igrave;' => "\xc3\x8c",
+ 'Iota;' => "\xce\x99",
+ 'Iuml' => "\xc3\x8f",
+ 'Iuml;' => "\xc3\x8f",
+ 'Kappa;' => "\xce\x9a",
+ 'LT' => '<',
+ 'LT;' => '<',
+ 'Lambda;' => "\xce\x9b",
+ 'Mu;' => "\xce\x9c",
+ 'Ntilde' => "\xc3\x91",
+ 'Ntilde;' => "\xc3\x91",
+ 'Nu;' => "\xce\x9d",
+ 'OElig;' => "\xc5\x92",
+ 'Oacute' => "\xc3\x93",
+ 'Oacute;' => "\xc3\x93",
+ 'Ocirc' => "\xc3\x94",
+ 'Ocirc;' => "\xc3\x94",
+ 'Ograve' => "\xc3\x92",
+ 'Ograve;' => "\xc3\x92",
+ 'Omega;' => "\xce\xa9",
+ 'Omicron;' => "\xce\x9f",
+ 'Oslash' => "\xc3\x98",
+ 'Oslash;' => "\xc3\x98",
+ 'Otilde' => "\xc3\x95",
+ 'Otilde;' => "\xc3\x95",
+ 'Ouml' => "\xc3\x96",
+ 'Ouml;' => "\xc3\x96",
+ 'Phi;' => "\xce\xa6",
+ 'Pi;' => "\xce\xa0",
+ 'Prime;' => "\xe2\x80\xb3",
+ 'Psi;' => "\xce\xa8",
+ 'QUOT' => '"',
+ 'QUOT;' => '"',
+ 'REG' => "\xc2\xae",
+ 'REG;' => "\xc2\xae",
+ 'Rho;' => "\xce\xa1",
+ 'Scaron;' => "\xc5\xa0",
+ 'Sigma;' => "\xce\xa3",
+ 'THORN' => "\xc3\x9e",
+ 'THORN;' => "\xc3\x9e",
+ 'TRADE;' => "\xe2\x84\xa2",
+ 'Tau;' => "\xce\xa4",
+ 'Theta;' => "\xce\x98",
+ 'Uacute' => "\xc3\x9a",
+ 'Uacute;' => "\xc3\x9a",
+ 'Ucirc' => "\xc3\x9b",
+ 'Ucirc;' => "\xc3\x9b",
+ 'Ugrave' => "\xc3\x99",
+ 'Ugrave;' => "\xc3\x99",
+ 'Upsilon;' => "\xce\xa5",
+ 'Uuml' => "\xc3\x9c",
+ 'Uuml;' => "\xc3\x9c",
+ 'Xi;' => "\xce\x9e",
+ 'Yacute' => "\xc3\x9d",
+ 'Yacute;' => "\xc3\x9d",
+ 'Yuml;' => "\xc5\xb8",
+ 'Zeta;' => "\xce\x96",
+ 'aacute' => "\xc3\xa1",
+ 'aacute;' => "\xc3\xa1",
+ 'acirc' => "\xc3\xa2",
+ 'acirc;' => "\xc3\xa2",
+ 'acute' => "\xc2\xb4",
+ 'acute;' => "\xc2\xb4",
+ 'aelig' => "\xc3\xa6",
+ 'aelig;' => "\xc3\xa6",
+ 'agrave' => "\xc3\xa0",
+ 'agrave;' => "\xc3\xa0",
+ 'alefsym;' => "\xe2\x84\xb5",
+ 'alpha;' => "\xce\xb1",
+ 'amp' => '&',
+ 'amp;' => '&',
+ 'and;' => "\xe2\x88\xa7",
+ 'ang;' => "\xe2\x88\xa0",
+ 'apos;' => "'",
+ 'aring' => "\xc3\xa5",
+ 'aring;' => "\xc3\xa5",
+ 'asymp;' => "\xe2\x89\x88",
+ 'atilde' => "\xc3\xa3",
+ 'atilde;' => "\xc3\xa3",
+ 'auml' => "\xc3\xa4",
+ 'auml;' => "\xc3\xa4",
+ 'bdquo;' => "\xe2\x80\x9e",
+ 'beta;' => "\xce\xb2",
+ 'brvbar' => "\xc2\xa6",
+ 'brvbar;' => "\xc2\xa6",
+ 'bull;' => "\xe2\x80\xa2",
+ 'cap;' => "\xe2\x88\xa9",
+ 'ccedil' => "\xc3\xa7",
+ 'ccedil;' => "\xc3\xa7",
+ 'cedil' => "\xc2\xb8",
+ 'cedil;' => "\xc2\xb8",
+ 'cent' => "\xc2\xa2",
+ 'cent;' => "\xc2\xa2",
+ 'chi;' => "\xcf\x87",
+ 'circ;' => "\xcb\x86",
+ 'clubs;' => "\xe2\x99\xa3",
+ 'cong;' => "\xe2\x89\x85",
+ 'copy' => "\xc2\xa9",
+ 'copy;' => "\xc2\xa9",
+ 'crarr;' => "\xe2\x86\xb5",
+ 'cup;' => "\xe2\x88\xaa",
+ 'curren' => "\xc2\xa4",
+ 'curren;' => "\xc2\xa4",
+ 'dArr;' => "\xe2\x87\x93",
+ 'dagger;' => "\xe2\x80\xa0",
+ 'darr;' => "\xe2\x86\x93",
+ 'deg' => "\xc2\xb0",
+ 'deg;' => "\xc2\xb0",
+ 'delta;' => "\xce\xb4",
+ 'diams;' => "\xe2\x99\xa6",
+ 'divide' => "\xc3\xb7",
+ 'divide;' => "\xc3\xb7",
+ 'eacute' => "\xc3\xa9",
+ 'eacute;' => "\xc3\xa9",
+ 'ecirc' => "\xc3\xaa",
+ 'ecirc;' => "\xc3\xaa",
+ 'egrave' => "\xc3\xa8",
+ 'egrave;' => "\xc3\xa8",
+ 'empty;' => "\xe2\x88\x85",
+ 'emsp;' => "\xe2\x80\x83",
+ 'ensp;' => "\xe2\x80\x82",
+ 'epsilon;' => "\xce\xb5",
+ 'equiv;' => "\xe2\x89\xa1",
+ 'eta;' => "\xce\xb7",
+ 'eth' => "\xc3\xb0",
+ 'eth;' => "\xc3\xb0",
+ 'euml' => "\xc3\xab",
+ 'euml;' => "\xc3\xab",
+ 'euro;' => "\xe2\x82\xac",
+ 'exist;' => "\xe2\x88\x83",
+ 'fnof;' => "\xc6\x92",
+ 'forall;' => "\xe2\x88\x80",
+ 'frac12' => "\xc2\xbd",
+ 'frac12;' => "\xc2\xbd",
+ 'frac14' => "\xc2\xbc",
+ 'frac14;' => "\xc2\xbc",
+ 'frac34' => "\xc2\xbe",
+ 'frac34;' => "\xc2\xbe",
+ 'frasl;' => "\xe2\x81\x84",
+ 'gamma;' => "\xce\xb3",
+ 'ge;' => "\xe2\x89\xa5",
+ 'gt' => '>',
+ 'gt;' => '>',
+ 'hArr;' => "\xe2\x87\x94",
+ 'harr;' => "\xe2\x86\x94",
+ 'hearts;' => "\xe2\x99\xa5",
+ 'hellip;' => "\xe2\x80\xa6",
+ 'iacute' => "\xc3\xad",
+ 'iacute;' => "\xc3\xad",
+ 'icirc' => "\xc3\xae",
+ 'icirc;' => "\xc3\xae",
+ 'iexcl' => "\xc2\xa1",
+ 'iexcl;' => "\xc2\xa1",
+ 'igrave' => "\xc3\xac",
+ 'igrave;' => "\xc3\xac",
+ 'image;' => "\xe2\x84\x91",
+ 'infin;' => "\xe2\x88\x9e",
+ 'int;' => "\xe2\x88\xab",
+ 'iota;' => "\xce\xb9",
+ 'iquest' => "\xc2\xbf",
+ 'iquest;' => "\xc2\xbf",
+ 'isin;' => "\xe2\x88\x88",
+ 'iuml' => "\xc3\xaf",
+ 'iuml;' => "\xc3\xaf",
+ 'kappa;' => "\xce\xba",
+ 'lArr;' => "\xe2\x87\x90",
+ 'lambda;' => "\xce\xbb",
+ 'lang;' => "\xe2\x9f\xa8",
+ 'laquo' => "\xc2\xab",
+ 'laquo;' => "\xc2\xab",
+ 'larr;' => "\xe2\x86\x90",
+ 'lceil;' => "\xe2\x8c\x88",
+ 'ldquo;' => "\xe2\x80\x9c",
+ 'le;' => "\xe2\x89\xa4",
+ 'lfloor;' => "\xe2\x8c\x8a",
+ 'lowast;' => "\xe2\x88\x97",
+ 'loz;' => "\xe2\x97\x8a",
+ 'lrm;' => "\xe2\x80\x8e",
+ 'lsaquo;' => "\xe2\x80\xb9",
+ 'lsquo;' => "\xe2\x80\x98",
+ 'lt' => '<',
+ 'lt;' => '<',
+ 'macr' => "\xc2\xaf",
+ 'macr;' => "\xc2\xaf",
+ 'mdash;' => "\xe2\x80\x94",
+ 'micro' => "\xc2\xb5",
+ 'micro;' => "\xc2\xb5",
+ 'middot' => "\xc2\xb7",
+ 'middot;' => "\xc2\xb7",
+ 'minus;' => "\xe2\x88\x92",
+ 'mu;' => "\xce\xbc",
+ 'nabla;' => "\xe2\x88\x87",
+ 'nbsp' => "\xc2\xa0",
+ 'nbsp;' => "\xc2\xa0",
+ 'ndash;' => "\xe2\x80\x93",
+ 'ne;' => "\xe2\x89\xa0",
+ 'ni;' => "\xe2\x88\x8b",
+ 'not' => "\xc2\xac",
+ 'not;' => "\xc2\xac",
+ 'notin;' => "\xe2\x88\x89",
+ 'nsub;' => "\xe2\x8a\x84",
+ 'ntilde' => "\xc3\xb1",
+ 'ntilde;' => "\xc3\xb1",
+ 'nu;' => "\xce\xbd",
+ 'oacute' => "\xc3\xb3",
+ 'oacute;' => "\xc3\xb3",
+ 'ocirc' => "\xc3\xb4",
+ 'ocirc;' => "\xc3\xb4",
+ 'oelig;' => "\xc5\x93",
+ 'ograve' => "\xc3\xb2",
+ 'ograve;' => "\xc3\xb2",
+ 'oline;' => "\xe2\x80\xbe",
+ 'omega;' => "\xcf\x89",
+ 'omicron;' => "\xce\xbf",
+ 'oplus;' => "\xe2\x8a\x95",
+ 'or;' => "\xe2\x88\xa8",
+ 'ordf' => "\xc2\xaa",
+ 'ordf;' => "\xc2\xaa",
+ 'ordm' => "\xc2\xba",
+ 'ordm;' => "\xc2\xba",
+ 'oslash' => "\xc3\xb8",
+ 'oslash;' => "\xc3\xb8",
+ 'otilde' => "\xc3\xb5",
+ 'otilde;' => "\xc3\xb5",
+ 'otimes;' => "\xe2\x8a\x97",
+ 'ouml' => "\xc3\xb6",
+ 'ouml;' => "\xc3\xb6",
+ 'para' => "\xc2\xb6",
+ 'para;' => "\xc2\xb6",
+ 'part;' => "\xe2\x88\x82",
+ 'permil;' => "\xe2\x80\xb0",
+ 'perp;' => "\xe2\x8a\xa5",
+ 'phi;' => "\xcf\x86",
+ 'pi;' => "\xcf\x80",
+ 'piv;' => "\xcf\x96",
+ 'plusmn' => "\xc2\xb1",
+ 'plusmn;' => "\xc2\xb1",
+ 'pound' => "\xc2\xa3",
+ 'pound;' => "\xc2\xa3",
+ 'prime;' => "\xe2\x80\xb2",
+ 'prod;' => "\xe2\x88\x8f",
+ 'prop;' => "\xe2\x88\x9d",
+ 'psi;' => "\xcf\x88",
+ 'quot' => '"',
+ 'quot;' => '"',
+ 'rArr;' => "\xe2\x87\x92",
+ 'radic;' => "\xe2\x88\x9a",
+ 'rang;' => "\xe2\x9f\xa9",
+ 'raquo' => "\xc2\xbb",
+ 'raquo;' => "\xc2\xbb",
+ 'rarr;' => "\xe2\x86\x92",
+ 'rceil;' => "\xe2\x8c\x89",
+ 'rdquo;' => "\xe2\x80\x9d",
+ 'real;' => "\xe2\x84\x9c",
+ 'reg' => "\xc2\xae",
+ 'reg;' => "\xc2\xae",
+ 'rfloor;' => "\xe2\x8c\x8b",
+ 'rho;' => "\xcf\x81",
+ 'rlm;' => "\xe2\x80\x8f",
+ 'rsaquo;' => "\xe2\x80\xba",
+ 'rsquo;' => "\xe2\x80\x99",
+ 'sbquo;' => "\xe2\x80\x9a",
+ 'scaron;' => "\xc5\xa1",
+ 'sdot;' => "\xe2\x8b\x85",
+ 'sect' => "\xc2\xa7",
+ 'sect;' => "\xc2\xa7",
+ 'shy' => "\xc2\xad",
+ 'shy;' => "\xc2\xad",
+ 'sigma;' => "\xcf\x83",
+ 'sigmaf;' => "\xcf\x82",
+ 'sim;' => "\xe2\x88\xbc",
+ 'spades;' => "\xe2\x99\xa0",
+ 'sub;' => "\xe2\x8a\x82",
+ 'sube;' => "\xe2\x8a\x86",
+ 'sum;' => "\xe2\x88\x91",
+ 'sup1' => "\xc2\xb9",
+ 'sup1;' => "\xc2\xb9",
+ 'sup2' => "\xc2\xb2",
+ 'sup2;' => "\xc2\xb2",
+ 'sup3' => "\xc2\xb3",
+ 'sup3;' => "\xc2\xb3",
+ 'sup;' => "\xe2\x8a\x83",
+ 'supe;' => "\xe2\x8a\x87",
+ 'szlig' => "\xc3\x9f",
+ 'szlig;' => "\xc3\x9f",
+ 'tau;' => "\xcf\x84",
+ 'there4;' => "\xe2\x88\xb4",
+ 'theta;' => "\xce\xb8",
+ 'thetasym;' => "\xcf\x91",
+ 'thinsp;' => "\xe2\x80\x89",
+ 'thorn' => "\xc3\xbe",
+ 'thorn;' => "\xc3\xbe",
+ 'tilde;' => "\xcb\x9c",
+ 'times' => "\xc3\x97",
+ 'times;' => "\xc3\x97",
+ 'trade;' => "\xe2\x84\xa2",
+ 'uArr;' => "\xe2\x87\x91",
+ 'uacute' => "\xc3\xba",
+ 'uacute;' => "\xc3\xba",
+ 'uarr;' => "\xe2\x86\x91",
+ 'ucirc' => "\xc3\xbb",
+ 'ucirc;' => "\xc3\xbb",
+ 'ugrave' => "\xc3\xb9",
+ 'ugrave;' => "\xc3\xb9",
+ 'uml' => "\xc2\xa8",
+ 'uml;' => "\xc2\xa8",
+ 'upsih;' => "\xcf\x92",
+ 'upsilon;' => "\xcf\x85",
+ 'uuml' => "\xc3\xbc",
+ 'uuml;' => "\xc3\xbc",
+ 'weierp;' => "\xe2\x84\x98",
+ 'xi;' => "\xce\xbe",
+ 'yacute' => "\xc3\xbd",
+ 'yacute;' => "\xc3\xbd",
+ 'yen' => "\xc2\xa5",
+ 'yen;' => "\xc2\xa5",
+ 'yuml' => "\xc3\xbf",
+ 'yuml;' => "\xc3\xbf",
+ 'zeta;' => "\xce\xb6",
+ 'zwj;' => "\xe2\x80\x8d",
+ 'zwnj;' => "\xe2\x80\x8c"
+ }
+
+ ENCODINGS = %w[
+ ansi_x3.4-1968
+ iso-ir-6
+ ansi_x3.4-1986
+ iso_646.irv:1991
+ ascii
+ iso646-us
+ us-ascii
+ us
+ ibm367
+ cp367
+ csascii
+ ks_c_5601-1987
+ korean
+ iso-2022-kr
+ csiso2022kr
+ euc-kr
+ iso-2022-jp
+ csiso2022jp
+ iso-2022-jp-2
+ iso-ir-58
+ chinese
+ csiso58gb231280
+ iso_8859-1:1987
+ iso-ir-100
+ iso_8859-1
+ iso-8859-1
+ latin1
+ l1
+ ibm819
+ cp819
+ csisolatin1
+ iso_8859-2:1987
+ iso-ir-101
+ iso_8859-2
+ iso-8859-2
+ latin2
+ l2
+ csisolatin2
+ iso_8859-3:1988
+ iso-ir-109
+ iso_8859-3
+ iso-8859-3
+ latin3
+ l3
+ csisolatin3
+ iso_8859-4:1988
+ iso-ir-110
+ iso_8859-4
+ iso-8859-4
+ latin4
+ l4
+ csisolatin4
+ iso_8859-6:1987
+ iso-ir-127
+ iso_8859-6
+ iso-8859-6
+ ecma-114
+ asmo-708
+ arabic
+ csisolatinarabic
+ iso_8859-7:1987
+ iso-ir-126
+ iso_8859-7
+ iso-8859-7
+ elot_928
+ ecma-118
+ greek
+ greek8
+ csisolatingreek
+ iso_8859-8:1988
+ iso-ir-138
+ iso_8859-8
+ iso-8859-8
+ hebrew
+ csisolatinhebrew
+ iso_8859-5:1988
+ iso-ir-144
+ iso_8859-5
+ iso-8859-5
+ cyrillic
+ csisolatincyrillic
+ iso_8859-9:1989
+ iso-ir-148
+ iso_8859-9
+ iso-8859-9
+ latin5
+ l5
+ csisolatin5
+ iso-8859-10
+ iso-ir-157
+ l6
+ iso_8859-10:1992
+ csisolatin6
+ latin6
+ hp-roman8
+ roman8
+ r8
+ ibm037
+ cp037
+ csibm037
+ ibm424
+ cp424
+ csibm424
+ ibm437
+ cp437
+ 437
+ cspc8codepage437
+ ibm500
+ cp500
+ csibm500
+ ibm775
+ cp775
+ cspc775baltic
+ ibm850
+ cp850
+ 850
+ cspc850multilingual
+ ibm852
+ cp852
+ 852
+ cspcp852
+ ibm855
+ cp855
+ 855
+ csibm855
+ ibm857
+ cp857
+ 857
+ csibm857
+ ibm860
+ cp860
+ 860
+ csibm860
+ ibm861
+ cp861
+ 861
+ cp-is
+ csibm861
+ ibm862
+ cp862
+ 862
+ cspc862latinhebrew
+ ibm863
+ cp863
+ 863
+ csibm863
+ ibm864
+ cp864
+ csibm864
+ ibm865
+ cp865
+ 865
+ csibm865
+ ibm866
+ cp866
+ 866
+ csibm866
+ ibm869
+ cp869
+ 869
+ cp-gr
+ csibm869
+ ibm1026
+ cp1026
+ csibm1026
+ koi8-r
+ cskoi8r
+ koi8-u
+ big5-hkscs
+ ptcp154
+ csptcp154
+ pt154
+ cp154
+ utf-7
+ utf-16be
+ utf-16le
+ utf-16
+ utf-8
+ iso-8859-13
+ iso-8859-14
+ iso-ir-199
+ iso_8859-14:1998
+ iso_8859-14
+ latin8
+ iso-celtic
+ l8
+ iso-8859-15
+ iso_8859-15
+ iso-8859-16
+ iso-ir-226
+ iso_8859-16:2001
+ iso_8859-16
+ latin10
+ l10
+ gbk
+ cp936
+ ms936
+ gb18030
+ shift_jis
+ ms_kanji
+ csshiftjis
+ euc-jp
+ gb2312
+ big5
+ csbig5
+ windows-1250
+ windows-1251
+ windows-1252
+ windows-1253
+ windows-1254
+ windows-1255
+ windows-1256
+ windows-1257
+ windows-1258
+ tis-620
+ hz-gb-2312
+ ]
+
+ E = {
+ "null-character" =>
+ _("Null character in input stream, replaced with U+FFFD."),
+ "incorrectly-placed-solidus" =>
+ _("Solidus (/) incorrectly placed in tag."),
+ "incorrect-cr-newline-entity" =>
+ _("Incorrect CR newline entity, replaced with LF."),
+ "illegal-windows-1252-entity" =>
+ _("Entity used with illegal number (windows-1252 reference)."),
+ "cant-convert-numeric-entity" =>
+ _("Numeric entity couldn't be converted to character " +
+ "(codepoint U+%(charAsInt)08x)."),
+ "illegal-codepoint-for-numeric-entity" =>
+ _("Numeric entity represents an illegal codepoint=> " +
+ "U+%(charAsInt)08x."),
+ "numeric-entity-without-semicolon" =>
+ _("Numeric entity didn't end with ';'."),
+ "expected-numeric-entity-but-got-eof" =>
+ _("Numeric entity expected. Got end of file instead."),
+ "expected-numeric-entity" =>
+ _("Numeric entity expected but none found."),
+ "named-entity-without-semicolon" =>
+ _("Named entity didn't end with ';'."),
+ "expected-named-entity" =>
+ _("Named entity expected. Got none."),
+ "attributes-in-end-tag" =>
+ _("End tag contains unexpected attributes."),
+ "expected-tag-name-but-got-right-bracket" =>
+ _("Expected tag name. Got '>' instead."),
+ "expected-tag-name-but-got-question-mark" =>
+ _("Expected tag name. Got '?' instead. (HTML doesn't " +
+ "support processing instructions.)"),
+ "expected-tag-name" =>
+ _("Expected tag name. Got something else instead"),
+ "expected-closing-tag-but-got-right-bracket" =>
+ _("Expected closing tag. Got '>' instead. Ignoring '</>'."),
+ "expected-closing-tag-but-got-eof" =>
+ _("Expected closing tag. Unexpected end of file."),
+ "expected-closing-tag-but-got-char" =>
+ _("Expected closing tag. Unexpected character '%(data)' found."),
+ "eof-in-tag-name" =>
+ _("Unexpected end of file in the tag name."),
+ "expected-attribute-name-but-got-eof" =>
+ _("Unexpected end of file. Expected attribute name instead."),
+ "eof-in-attribute-name" =>
+ _("Unexpected end of file in attribute name."),
+ "duplicate-attribute" =>
+ _("Dropped duplicate attribute on tag."),
+ "expected-end-of-tag-name-but-got-eof" =>
+ _("Unexpected end of file. Expected = or end of tag."),
+ "expected-attribute-value-but-got-eof" =>
+ _("Unexpected end of file. Expected attribute value."),
+ "eof-in-attribute-value-double-quote" =>
+ _("Unexpected end of file in attribute value (\")."),
+ "eof-in-attribute-value-single-quote" =>
+ _("Unexpected end of file in attribute value (')."),
+ "eof-in-attribute-value-no-quotes" =>
+ _("Unexpected end of file in attribute value."),
+ "expected-dashes-or-doctype" =>
+ _("Expected '--' or 'DOCTYPE'. Not found."),
+ "incorrect-comment" =>
+ _("Incorrect comment."),
+ "eof-in-comment" =>
+ _("Unexpected end of file in comment."),
+ "eof-in-comment-end-dash" =>
+ _("Unexpected end of file in comment (-)"),
+ "unexpected-dash-after-double-dash-in-comment" =>
+ _("Unexpected '-' after '--' found in comment."),
+ "eof-in-comment-double-dash" =>
+ _("Unexpected end of file in comment (--)."),
+ "unexpected-char-in-comment" =>
+ _("Unexpected character in comment found."),
+ "need-space-after-doctype" =>
+ _("No space after literal string 'DOCTYPE'."),
+ "expected-doctype-name-but-got-right-bracket" =>
+ _("Unexpected > character. Expected DOCTYPE name."),
+ "expected-doctype-name-but-got-eof" =>
+ _("Unexpected end of file. Expected DOCTYPE name."),
+ "eof-in-doctype-name" =>
+ _("Unexpected end of file in DOCTYPE name."),
+ "eof-in-doctype" =>
+ _("Unexpected end of file in DOCTYPE."),
+ "expected-space-or-right-bracket-in-doctype" =>
+ _("Expected space or '>'. Got '%(data)'"),
+ "unexpected-end-of-doctype" =>
+ _("Unexpected end of DOCTYPE."),
+ "unexpected-char-in-doctype" =>
+ _("Unexpected character in DOCTYPE."),
+ "eof-in-bogus-doctype" =>
+ _("Unexpected end of file in bogus doctype."),
+ "eof-in-innerhtml" =>
+ _("Unexpected EOF in inner html mode."),
+ "unexpected-doctype" =>
+ _("Unexpected DOCTYPE. Ignored."),
+ "non-html-root" =>
+ _("html needs to be the first start tag."),
+ "expected-doctype-but-got-eof" =>
+ _("Unexpected End of file. Expected DOCTYPE."),
+ "unknown-doctype" =>
+ _("Erroneous DOCTYPE."),
+ "expected-doctype-but-got-chars" =>
+ _("Unexpected non-space characters. Expected DOCTYPE."),
+ "expected-doctype-but-got-start-tag" =>
+ _("Unexpected start tag (%(name)). Expected DOCTYPE."),
+ "expected-doctype-but-got-end-tag" =>
+ _("Unexpected end tag (%(name)). Expected DOCTYPE."),
+ "end-tag-after-implied-root" =>
+ _("Unexpected end tag (%(name)) after the (implied) root element."),
+ "expected-named-closing-tag-but-got-eof" =>
+ _("Unexpected end of file. Expected end tag (%(name))."),
+ "two-heads-are-not-better-than-one" =>
+ _("Unexpected start tag head in existing head. Ignored."),
+ "unexpected-end-tag" =>
+ _("Unexpected end tag (%(name)). Ignored."),
+ "unexpected-start-tag-out-of-my-head" =>
+ _("Unexpected start tag (%(name)) that can be in head. Moved."),
+ "unexpected-start-tag" =>
+ _("Unexpected start tag (%(name))."),
+ "missing-end-tag" =>
+ _("Missing end tag (%(name))."),
+ "missing-end-tags" =>
+ _("Missing end tags (%(name))."),
+ "unexpected-start-tag-implies-end-tag" =>
+ _("Unexpected start tag (%(startName)) " +
+ "implies end tag (%(endName))."),
+ "unexpected-start-tag-treated-as" =>
+ _("Unexpected start tag (%(originalName)). Treated as %(newName)."),
+ "deprecated-tag" =>
+ _("Unexpected start tag %(name). Don't use it!"),
+ "unexpected-start-tag-ignored" =>
+ _("Unexpected start tag %(name). Ignored."),
+ "expected-one-end-tag-but-got-another" =>
+ _("Unexpected end tag (%(gotName)). " +
+ "Missing end tag (%(expectedName))."),
+ "end-tag-too-early" =>
+ _("End tag (%(name)) seen too early. Expected other end tag."),
+ "end-tag-too-early-named" =>
+ _("Unexpected end tag (%(gotName)). Expected end tag (%(expectedName))."),
+ "end-tag-too-early-ignored" =>
+ _("End tag (%(name)) seen too early. Ignored."),
+ "adoption-agency-1.1" =>
+ _("End tag (%(name)) violates step 1, " +
+ "paragraph 1 of the adoption agency algorithm."),
+ "adoption-agency-1.2" =>
+ _("End tag (%(name)) violates step 1, " +
+ "paragraph 2 of the adoption agency algorithm."),
+ "adoption-agency-1.3" =>
+ _("End tag (%(name)) violates step 1, " +
+ "paragraph 3 of the adoption agency algorithm."),
+ "unexpected-end-tag-treated-as" =>
+ _("Unexpected end tag (%(originalName)). Treated as %(newName)."),
+ "no-end-tag" =>
+ _("This element (%(name)) has no end tag."),
+ "unexpected-implied-end-tag-in-table" =>
+ _("Unexpected implied end tag (%(name)) in the table phase."),
+ "unexpected-implied-end-tag-in-table-body" =>
+ _("Unexpected implied end tag (%(name)) in the table body phase."),
+ "unexpected-char-implies-table-voodoo" =>
+ _("Unexpected non-space characters in " +
+ "table context caused voodoo mode."),
+ "unpexted-hidden-input-in-table" => # NOTE(review): key is spelled "unpexted"; presumably meant "unexpected" -- check the lookup sites before renaming.
+ _("Unexpected input with type hidden in table context."),
+ "unexpected-start-tag-implies-table-voodoo" =>
+ _("Unexpected start tag (%(name)) in " +
+ "table context caused voodoo mode."),
+ "unexpected-end-tag-implies-table-voodoo" =>
+ _("Unexpected end tag (%(name)) in " +
+ "table context caused voodoo mode."),
+ "unexpected-cell-in-table-body" =>
+ _("Unexpected table cell start tag (%(name)) " +
+ "in the table body phase."),
+ "unexpected-cell-end-tag" =>
+ _("Got table cell end tag (%(name)) " +
+ "while required end tags are missing."),
+ "unexpected-end-tag-in-table-body" =>
+ _("Unexpected end tag (%(name)) in the table body phase. Ignored."),
+ "unexpected-implied-end-tag-in-table-row" =>
+ _("Unexpected implied end tag (%(name)) in the table row phase."),
+ "unexpected-end-tag-in-table-row" =>
+ _("Unexpected end tag (%(name)) in the table row phase. Ignored."),
+ "unexpected-select-in-select" =>
+ _("Unexpected select start tag in the select phase " +
+ "treated as select end tag."),
+ "unexpected-input-in-select" =>
+ _("Unexpected input start tag in the select phase."),
+ "unexpected-start-tag-in-select" =>
+ _("Unexpected start tag token (%(name)) in the select phase. " +
+ "Ignored."),
+ "unexpected-end-tag-in-select" =>
+ _("Unexpected end tag (%(name)) in the select phase. Ignored."),
+ "unexpected-table-element-start-tag-in-select-in-table" =>
+ _("Unexpected table element start tag (%(name)s) in the select in table phase."),
+ "unexpected-table-element-end-tag-in-select-in-table" =>
+ _("Unexpected table element end tag (%(name)s) in the select in table phase."),
+ "unexpected-char-after-body" =>
+ _("Unexpected non-space characters in the after body phase."),
+ "unexpected-start-tag-after-body" =>
+ _("Unexpected start tag token (%(name))" +
+ " in the after body phase."),
+ "unexpected-end-tag-after-body" =>
+ _("Unexpected end tag token (%(name))" +
+ " in the after body phase."),
+ "unexpected-char-in-frameset" =>
+ _("Unepxected characters in the frameset phase. Characters ignored."),
+ "unexpected-start-tag-in-frameset" =>
+ _("Unexpected start tag token (%(name))" +
+ " in the frameset phase. Ignored."),
+ "unexpected-frameset-in-frameset-innerhtml" =>
+ _("Unexpected end tag token (frameset) " +
+ "in the frameset phase (innerHTML)."),
+ "unexpected-end-tag-in-frameset" =>
+ _("Unexpected end tag token (%(name))" +
+ " in the frameset phase. Ignored."),
+ "unexpected-char-after-frameset" =>
+ _("Unexpected non-space characters in the " +
+ "after frameset phase. Ignored."),
+ "unexpected-start-tag-after-frameset" =>
+ _("Unexpected start tag (%(name))" +
+ " in the after frameset phase. Ignored."),
+ "unexpected-end-tag-after-frameset" =>
+ _("Unexpected end tag (%(name))" +
+ " in the after frameset phase. Ignored."),
+ "expected-eof-but-got-char" =>
+ _("Unexpected non-space characters. Expected end of file."),
+ "expected-eof-but-got-start-tag" =>
+ _("Unexpected start tag (%(name))" +
+ ". Expected end of file."),
+ "expected-eof-but-got-end-tag" =>
+ _("Unexpected end tag (%(name))" +
+ ". Expected end of file."),
+ "unexpected-end-table-in-caption" =>
+ _("Unexpected end table tag in caption. Generates implied end caption."),
+ "end-html-in-innerhtml" => _("Unexpected html end tag in inner html mode."),
+ "expected-self-closing-tag" => _("Expected a > after the /."),
+ "self-closing-end-tag" => _("Self closing end tag."),
+ "eof-in-table" => _("Unexpected end of file. Expected table content."),
+ "html-in-foreign-content" => _("HTML start tag \"%(name)\" in a foreign namespace context."),
+ "unexpected-start-tag-in-table" => _("Unexpected %(name). Expected table content."),
+ }
+
+end
View
404 lib/entities.js
@@ -0,0 +1,404 @@
+return {
+ named: {
+ "AElig": "Æ",
+ "AElig;": "Æ",
+ "AMP": "&",
+ "AMP;": "&",
+ "Aacute": "Á",
+ "Aacute;": "Á",
+ "Acirc": "Â",
+ "Acirc;": "Â",
+ "Agrave": "À",
+ "Agrave;": "À",
+ "Alpha;": "Α",
+ "Aring": "Å",
+ "Aring;": "Å",
+ "Atilde": "Ã",
+ "Atilde;": "Ã",
+ "Auml": "Ä",
+ "Auml;": "Ä",
+ "Beta;": "Β",
+ "COPY": "©",
+ "COPY;": "©",
+ "Ccedil": "Ç",
+ "Ccedil;": "Ç",
+ "Chi;": "Χ",
+ "Dagger;": "",
+ "Delta;": "Δ",
+ "ETH": "Ð",
+ "ETH;": "Ð",
+ "Eacute": "É",
+ "Eacute;": "É",
+ "Ecirc": "Ê",
+ "Ecirc;": "Ê",
+ "Egrave": "È",
+ "Egrave;": "È",
+ "Epsilon;": "Ε",
+ "Eta;": "Η",
+ "Euml": "Ë",
+ "Euml;": "Ë",
+ "GT": ">",
+ "GT;": ">",
+ "Gamma;": "Γ",
+ "Iacute": "Í",
+ "Iacute;": "Í",
+ "Icirc": "Î",
+ "Icirc;": "Î",
+ "Igrave": "Ì",
+ "Igrave;": "Ì",
+ "Iota;": "Ι",
+ "Iuml": "Ï",
+ "Iuml;": "Ï",
+ "Kappa;": "Κ",
+ "LT": "<",
+ "LT;": "<",
+ "Lambda;": "Λ",
+ "Mu;": "Μ",
+ "Ntilde": "Ñ",
+ "Ntilde;": "Ñ",
+ "Nu;": "Ν",
+ "OElig;": "Œ",
+ "Oacute": "Ó",
+ "Oacute;": "Ó",
+ "Ocirc": "Ô",
+ "Ocirc;": "Ô",
+ "Ograve": "Ò",
+ "Ograve;": "Ò",
+ "Omega;": "Ω",
+ "Omicron;": "Ο",
+ "Oslash": "Ø",
+ "Oslash;": "Ø",
+ "Otilde": "Õ",
+ "Otilde;": "Õ",
+ "Ouml": "Ö",
+ "Ouml;": "Ö",
+ "Phi;": "Φ",
+ "Pi;": "Π",
+ "Prime;": "",
+ "Psi;": "Ψ",
+ "QUOT": "\"",
+ "QUOT;": "\"",
+ "REG": "®",
+ "REG;": "®",
+ "Rho;": "Ρ",
+ "Scaron;": "Š",
+ "Sigma;": "Σ",
+ "THORN": "Þ",
+ "THORN;": "Þ",
+ "TRADE;": "",
+ "Tau;": "Τ",
+ "Theta;": "Θ",
+ "Uacute": "Ú",
+ "Uacute;": "Ú",
+ "Ucirc": "Û",
+ "Ucirc;": "Û",
+ "Ugrave": "Ù",
+ "Ugrave;": "Ù",
+ "Upsilon;": "Υ",
+ "Uuml": "Ü",
+ "Uuml;": "Ü",
+ "Xi;": "Ξ",
+ "Yacute": "Ý",
+ "Yacute;": "Ý",
+ "Yuml;": "Ÿ",
+ "Zeta;": "Ζ",
+ "aacute": "á",
+ "aacute;": "á",
+ "acirc": "â",
+ "acirc;": "â",
+ "acute": "´",
+ "acute;": "´",
+ "aelig": "æ",
+ "aelig;": "æ",
+ "agrave": "à",
+ "agrave;": "à",
+ "alefsym;": "",
+ "alpha;": "α",
+ "amp": "&",
+ "amp;": "&",
+ "and;": "",
+ "ang;": "",
+ "apos;": "'",
+ "aring": "å",
+ "aring;": "å",
+ "asymp;": "",
+ "atilde": "ã",
+ "atilde;": "ã",
+ "auml": "ä",
+ "auml;": "ä",
+ "bdquo;": "",
+ "beta;": "β",
+ "brvbar": "¦",
+ "brvbar;": "¦",
+ "bull;": "",
+ "cap;": "",
+ "ccedil": "ç",
+ "ccedil;": "ç",
+ "cedil": "¸",
+ "cedil;": "¸",
+ "cent": "¢",
+ "cent;": "¢",
+ "chi;": "χ",
+ "circ;": "ˆ",
+ "clubs;": "",
+ "cong;": "",
+ "copy": "©",
+ "copy;": "©",
+ "crarr;": "",
+ "cup;": "",
+ "curren": "¤",
+ "curren;": "¤",
+ "dArr;": "",
+ "dagger;": "",
+ "darr;": "",
+ "deg": "°",
+ "deg;": "°",
+ "delta;": "δ",
+ "diams;": "",
+ "divide": "÷",
+ "divide;": "÷",
+ "eacute": "é",
+ "eacute;": "é",
+ "ecirc": "ê",
+ "ecirc;": "ê",
+ "egrave": "è",
+ "egrave;": "è",
+ "empty;": "",
+ "emsp;": "",
+ "ensp;": "",
+ "epsilon;": "ε",
+ "equiv;": "",
+ "eta;": "η",
+ "eth": "ð",
+ "eth;": "ð",
+ "euml": "ë",
+ "euml;": "ë",
+ "euro;": "",
+ "exist;": "",
+ "fnof;": "ƒ",
+ "forall;": "",
+ "frac12": "½",
+ "frac12;": "½",
+ "frac14": "¼",
+ "frac14;": "¼",
+ "frac34": "¾",
+ "frac34;": "¾",
+ "frasl;": "",
+ "gamma;": "γ",
+ "ge;": "",
+ "gt": ">",
+ "gt;": ">",
+ "hArr;": "",
+ "harr;": "",
+ "hearts;": "",
+ "hellip;": "",
+ "iacute": "í",
+ "iacute;": "í",
+ "icirc": "î",
+ "icirc;": "î",
+ "iexcl": "¡",
+ "iexcl;": "¡",
+ "igrave": "ì",
+ "igrave;": "ì",
+ "image;": "",
+ "infin;": "",
+ "int;": "",
+ "iota;": "ι",
+ "iquest": "¿",
+ "iquest;": "¿",
+ "isin;": "",
+ "iuml": "ï",
+ "iuml;": "ï",
+ "kappa;": "κ",
+ "lArr;": "",
+ "lambda;": "λ",
+ "lang;": "",
+ "laquo": "«",
+ "laquo;": "«",
+ "larr;": "",
+ "lceil;": "",
+ "ldquo;": "",
+ "le;": "",
+ "lfloor;": "",
+ "lowast;": "",
+ "loz;": "",
+ "lrm;": "",
+ "lsaquo;": "",
+ "lsquo;": "",
+ "lt": "<",
+ "lt;": "<",
+ "macr": "¯",
+ "macr;": "¯",
+ "mdash;": "",
+ "micro": "µ",
+ "micro;": "µ",
+ "middot": "·",
+ "middot;": "·",
+ "minus;": "",
+ "mu;": "μ",
+ "nabla;": "",
+ "nbsp": " ",
+ "nbsp;": " ",
+ "ndash;": "",
+ "ne;": "",
+ "ni;": "",
+ "not": "¬",
+ "not;": "¬",
+ "notin;": "",
+ "nsub;": "",
+ "ntilde": "ñ",
+ "ntilde;": "ñ",
+ "nu;": "ν",
+ "oacute": "ó",
+ "oacute;": "ó",
+ "ocirc": "ô",
+ "ocirc;": "ô",
+ "oelig;": "œ",
+ "ograve": "ò",
+ "ograve;": "ò",
+ "oline;": "",
+ "omega;": "ω",
+ "omicron;": "ο",
+ "oplus;": "",
+ "or;": "",
+ "ordf": "ª",
+ "ordf;": "ª",
+ "ordm": "º",
+ "ordm;": "º",
+ "oslash": "ø",
+ "oslash;": "ø",
+ "otilde": "õ",
+ "otilde;": "õ",
+ "otimes;": "",
+ "ouml": "ö",
+ "ouml;": "ö",
+ "para": "",
+ "para;": "",
+ "part;": "",
+ "permil;": "",
+ "perp;": "",
+ "phi;": "φ",
+ "pi;": "π",
+ "piv;": "ϖ",
+ "plusmn": "±",
+ "plusmn;": "±",
+ "pound": "£",
+ "pound;": "£",
+ "prime;": "",
+ "prod;": "",
+ "prop;": "",
+ "psi;": "ψ",
+ "quot": "\"",
+ "quot;": "\"",
+ "rArr;": "",
+ "radic;": "",
+ "rang;": "",
+ "raquo": "»",
+ "raquo;": "»",
+ "rarr;": "",
+ "rceil;": "",
+ "rdquo;": "",
+ "real;": "",
+ "reg": "®",
+ "reg;": "®",
+ "rfloor;": "",
+ "rho;": "ρ",
+ "rlm;": "",
+ "rsaquo;": "",
+ "rsquo;": "",
+ "sbquo;": "",
+ "scaron;": "š",
+ "sdot;": "",
+ "sect": "§",
+ "sect;": "§",
+ "shy": "­",
+ "shy;": "­",
+ "sigma;": "σ",
+ "sigmaf;": "ς",
+ "sim;": "",
+ "spades;": "",
+ "sub;": "",
+ "sube;": "",
+ "sum;": "",
+ "sup1": "¹",
+ "sup1;": "¹",
+ "sup2": "²",
+ "sup2;": "²",
+ "sup3": "³",
+ "sup3;": "³",
+ "sup;": "",
+ "supe;": "",
+ "szlig": "ß",
+ "szlig;": "ß",
+ "tau;": "τ",
+ "there4;": "",
+ "theta;": "θ",
+ "thetasym;": "ϑ",
+ "thinsp;": "",
+ "thorn": "þ",
+ "thorn;": "þ",
+ "tilde;": "˜",
+ "times": "×",
+ "times;": "×",
+ "trade;": "",
+ "uArr;": "",
+ "uacute": "ú",
+ "uacute;": "ú",
+ "uarr;": "",
+ "ucirc": "û",
+ "ucirc;": "û",
+ "ugrave": "ù",
+ "ugrave;": "ù",
+ "uml": "¨",
+ "uml;": "¨",
+ "upsih;": "ϒ",
+ "upsilon;": "υ",
+ "uuml": "ü",
+ "uuml;": "ü",
+ "weierp;": "",
+ "xi;": "ξ",
+ "yacute": "ý",
+ "yacute;": "ý",
+ "yen": "¥",
+ "yen;": "¥",
+ "yuml": "ÿ",
+ "yuml;": "ÿ",
+ "zeta;": "ζ",
+ "zwj;": "",
+ "zwnj;": ""
+ },
+ windows: [
+ 8364,
+ 65533,
+ 8218,
+ 402,
+ 8222,
+ 8230,
+ 8224,
+ 8225,
+ 710,
+ 8240,
+ 352,
+ 8249,
+ 338,
+ 65533,
+ 381,
+ 65533,
+ 65533,
+ 8216,
+ 8217,
+ 8220,
+ 8221,
+ 8226,
+ 8211,
+ 8212,
+ 732,
+ 8482,
+ 353,
+ 8250,
+ 339,
+ 65533,
+ 382,
+ 376
+ ]
+}
View
145 parser.js → lib/parser.js
@@ -7,6 +7,8 @@ var FORMFEED = "\u000c";
var SPACE = " ";
var ANYSPACE = /[\t\n\u000c ]/;
+var ENTITIES = require("entities.js");
+
var Tk = function() {};
Tk.prototype = {
toString: function() {
@@ -173,7 +175,14 @@ states.charRefInData = {
toString: function() { return "charRefInData"; },
consume: function(lexer) { // handles the character-reference-in-data state for `lexer`
- throw "Not implemented";
+ lexer.setState('data'); // switch back to the data state before attempting the reference
+ var token = lexer.consumeCharacterReference(); // falsy when no reference was produced
+
+ if (token) {
+ lexer.pushToken(token); // an already-built token instance is pushed as-is
+ } else {
+ lexer.pushToken(TkChar, "&"); // nothing was produced: emit the ampersand as a literal character
+ }
}
};
@@ -1346,7 +1355,12 @@ Tokenizer.prototype = {
},
pushToken: function(TokenClass, param) { // accepts either a token constructor plus its argument, or a ready-made token
- this.token = new TokenClass(param);
+ if (TokenClass instanceof Tk) { // first argument is already a token instance
+ this.token = TokenClass;
+ } else { // otherwise treat it as a constructor and build the token from `param`
+ this.token = new TokenClass(param);
+ }
+
this.emitCurrentToken(); // emit whatever was just stored in this.token
},
@@ -1385,7 +1399,7 @@ Tokenizer.prototype = {
},
consumeCharacterReference: function(allowed) {
- var next = this.getChar();
+ var next = this.peek();
if (next === undefined) { return; }
switch (next) {
@@ -1399,10 +1413,135 @@ Tokenizer.prototype = {
case allowed:
return;
case "#":
+ this.getChar();
return this.consumeCharacterReferenceNumberSign();
default:
return this.consumeCharacterReferenceNamed();
}
+ },
+
+ consumeCharacterReferenceNumberSign: function() {
+ var next = this.peek();
+
+ switch (next) {
+ case "x":
+ case "X":
+ this.getChar();
+ return this.consumeCharacterReferenceNumberPart(/[0-9A-Fa-f]/, 16);
+ default:
+ return this.consumeCharacterReferenceNumberPart(/[0-9]/, 10);
+ }
+ },
+
+ consumeCharacterReferenceNumberPart: function(range, radix) {
+ var next = this.peek(), out = "";
+
+ if (!next || !range.test(next)) {
+ this.reconsume();
+
+ // if an "X" was provided, reconsume it
+ if (radix === 16) { this.reconsume(); }
+
+ this.parseError();
+ return;
+ }
+
+ while (next && range.test(next)) {
+ this.getChar();
+ out += next;
+ next = this.peek();
+ }
+
+ var final = this.peek();
+
+ if (final === ";") {
+ this.getChar();
+ } else {
+ this.parseError();
+ }
+
+ var number = parseInt(out, radix), char;
+
+ if (number === 0) {
+ this.parseError();
+ return new TkChar(REPLACEMENT);
+ } else if (number === 13) {
+ this.parseError();
+ return new TkChar(String.fromCharCode(13));
+ } else if (number >= 128 && number <= 159) {
+ this.parseError();
+ return new TkChar(String.fromCharCode(ENTITIES.windows[number - 128]));
+ } else if (number >= 0xD800 && number <= 0xDFFF || number > 0x10FFFF) {
+ this.parseError();
+ return new TkChar(REPLACEMENT);
+ }
+
+ this.assertValidChar(number);
+ return new TkChar(String.fromCharCode(number));
+ },
+
+ assertValidChar: function(number) {
+ if ((number >= 0x0001 && number <= 0x0008) ||
+ (number >= 0x000E && number <= 0x001F) ||
+ (number >= 0x007F && number <= 0x009F) ||
+ (number >= 0xFDD0 && number <= 0xFDEF)) {
+ this.parseError();
+ }
+
+ switch (number) {
+ case 0x000B:
+ case 0xFFFE:
+ case 0xFFFF:
+ case 0x1FFFE:
+ case 0x1FFFF:
+ case 0x2FFFE:
+ case 0x2FFFF:
+ case 0x3FFFE:
+ case 0x3FFFF:
+ case 0x4FFFE:
+ case 0x4FFFF:
+ case 0x5FFFE:
+ case 0x5FFFF:
+ case 0x6FFFE:
+ case 0x6FFFF:
+ case 0x7FFFE:
+ case 0x7FFFF:
+ case 0x8FFFE:
+ case 0x8FFFF:
+ case 0x9FFFE:
+ case 0x9FFFF:
+ case 0xAFFFE:
+ case 0xAFFFF:
+ case 0xBFFFE:
+ case 0xBFFFF:
+ case 0xCFFFE:
+ case 0xCFFFF:
+ case 0xDFFFE:
+ case 0xDFFFF:
+ case 0xEFFFE:
+ case 0xEFFFF:
+ case 0xFFFFE:
+ case 0xFFFFF:
+ case 0x10FFFE:
+ case 0x10FFFF:
+ this.parseError();
+ }
+ },
+
+ consumeCharacterReferenceNamed: function() {
+ var chars = this.peek(9);
+
+ for (var i=9; i>1; i--) {
+ if (ENTITIES.named.hasOwnProperty(chars)) {
+ if (chars.substr(-1) !== ";") { this.parseError(); }
+ this.getChars(chars.length);
+
+ return new TkChar(ENTITIES.named[chars]);
+ }
+ chars = chars.slice(0, -1);
+ }
+
+ this.parseError();
}
};
View
10 spec/spec_helper.rb
@@ -5,8 +5,14 @@ def initialize
@context = V8::Context.new
@context['console'] = { "log" => lambda { |string| puts string } }
- parser = File.expand_path("../../parser.js", __FILE__)
- exports = @context.eval "(function(exports) { #{File.read(parser)}; return exports; })({})", "parser.js"
+ load_path = File.expand_path("../../lib", __FILE__)
+
+ js_require = @context['require'] = lambda do |name|
+ contents = File.read(File.join(load_path, name))
+ @context.eval "(function(exports) { #{contents}; return exports; })({})", name
+ end
+
+ exports = js_require.call("parser.js")
@tokenizer = exports['Tokenizer']
end
View
6 spec/tokenizer_spec.rb
@@ -1,6 +1,7 @@
require "spec_helper"
require "json"
require "digest"
+require "timeout"
def load_test_data(file)
test_data = File.expand_path("../testdata/tokenizer", __FILE__)
@@ -43,7 +44,10 @@ def normalize_expected(expected)
pending if description =~ /doctype/i
expected = normalize_expected(output)
- @tokenizer.tokenize(input).should == expected
+
+ Timeout.timeout(1) do
+ @tokenizer.tokenize(input).should == expected
+ end
end
end
end
Please sign in to comment.
Something went wrong with that request. Please try again.