Skip to content

Commit

Permalink
better HTML decoding (and encoding) - deal with numeric entities, rem…
Browse files Browse the repository at this point in the history
…ove bad entities, also don't, by default, encode regular UTF-8 chars
  • Loading branch information
ejones committed Oct 26, 2010
1 parent aba1cc2 commit 7c3f0ed
Showing 1 changed file with 42 additions and 23 deletions.
65 changes: 42 additions & 23 deletions lib/jsdom/browser/htmlencoding.js
Expand Up @@ -128,18 +128,24 @@ var htmlentities = function(string, quote_style) {

string = string || "";

var hash_map = {}, symbol = '', tmp_str = '', entity = '';
var hash_map = {}, symbol, tmp_str = '', entity = '';
tmp_str = string.toString();

if (false === (hash_map = get_html_translation_table('HTML_ENTITIES', quote_style))) {

if (false === (hash_map = get_html_translation_table('HTML_SPECIALCHARS', quote_style))) {
return false;
}
hash_map["'"] = '&#039;';
delete hash_map["&"];
tmp_str = tmp_str.split("&").join('&amp;');

// amended by ejones - get_html_translation_table no longer translates
// char codes by default

//hash_map["'"] = '&#039;';
delete hash_map['38']; // &
tmp_str = tmp_str.split('&').join('&amp;');
for (symbol in hash_map) {
entity = hash_map[symbol];
tmp_str = tmp_str.split(symbol).join(entity);
tmp_str = tmp_str
.split(String.fromCharCode(symbol))
.join(hash_map[symbol]);
}

return tmp_str;
Expand All @@ -166,24 +172,32 @@ var html_entity_decode = function(string, quote_style) {

string = string || "";

var hash_map = {}, symbol = '', tmp_str = '', entity = '';
var hash_map = {}, symbol, tmp_str = '',
entity = '', entity_to_char_code = {};
tmp_str = string.toString();

if (false === (hash_map = get_html_translation_table('HTML_ENTITIES', quote_style))) {
return false;
}

// fix &amp; problem
// http://phpjs.org/functions/get_html_translation_table:416#comment_97660
delete(hash_map['&']);
hash_map['&'] = '&amp;';

tmp_str = tmp_str.split('&#039;').join("'");
tmp_str = tmp_str.split('&#039;').join("'");

// amended by ejones - more flexible recognition of entities
for (symbol in hash_map) {
entity = hash_map[symbol];
tmp_str = tmp_str.split(entity).join(symbol);
entity_to_char_code[hash_map[symbol]] = symbol;
}
tmp_str = tmp_str.replace(/&([#a-z0-9]+);/gi, function( ent, ename ) {
var char = entity_to_char_code[ent];
if (char == null && ename === 'apos') char = 39;
if (char == null && ename[0] === '#') {
if (ename[1].toLowerCase() === 'x') {
char = parseInt(ename.substring(2), 16);
} else {
char = parseInt(ename.substring(1), 10);
};
if (isNaN(char)) char = null;
};
return char != null ? String.fromCharCode(char) : '';
});

return tmp_str;
};
Expand Down Expand Up @@ -339,13 +353,18 @@ var get_html_translation_table = function(table, quote_style) {
entities['62'] = '&gt;';


// ascii decimals to real symbols
for (decimal in entities) {
symbol = String.fromCharCode(decimal);
hash_map[symbol] = entities[decimal];
}

// removed by ejones - translation now done in encoding/decoding

//// ascii decimals to real symbols
//for (decimal in entities) {
// symbol = String.fromCharCode(decimal);
// hash_map[symbol] = entities[decimal];
//}

//return hash_map;

return hash_map;
return entities;
};

/* }}} */
Expand Down

0 comments on commit 7c3f0ed

Please sign in to comment.