Skip to content

Commit 66f5709

Browse files
committedMar 16, 2025
Refactor traverse_for_entities (used in unescape_html_entities): Optimize scanning for '&' and ';' using memchr
Use memcpy instead of character-by-character copying
1 parent e954bf6 commit 66f5709

File tree

1 file changed

+133
-96
lines changed

1 file changed

+133
-96
lines changed
 

‎ext/standard/html.c

+133-96
Original file line numberDiff line numberDiff line change
@@ -809,112 +809,149 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
809809
/* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */
810810
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
811811
static void traverse_for_entities(
812-
const char *old,
813-
size_t oldlen,
814-
zend_string *ret, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
812+
const char *input,
813+
size_t input_len,
814+
zend_string *output, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(input_len) */
815815
int all,
816816
int flags,
817817
const entity_ht *inv_map,
818818
enum entity_charset charset)
819819
{
820-
const char *p,
821-
*lim;
822-
char *q;
823-
int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
824-
825-
lim = old + oldlen; /* terminator address */
826-
assert(*lim == '\0');
827-
828-
for (p = old, q = ZSTR_VAL(ret); p < lim;) {
829-
unsigned code, code2 = 0;
830-
const char *next = NULL; /* when set, next > p, otherwise possible inf loop */
831-
832-
/* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
833-
* ASCII range byte can be part of a multi-byte sequence.
834-
* However, they start at 0x40, therefore if we find a 0x26 byte,
835-
* we're sure it represents the '&' character. */
836-
837-
/* assumes there are no single-char entities */
838-
if (p[0] != '&' || (p + 3 >= lim)) {
839-
*(q++) = *(p++);
840-
continue;
841-
}
842-
843-
/* now p[3] is surely valid and is no terminator */
844-
845-
/* numerical entity */
846-
if (p[1] == '#') {
847-
next = &p[2];
848-
if (process_numeric_entity(&next, &code) == FAILURE)
849-
goto invalid_code;
850-
851-
/* If we're in htmlspecialchars_decode, we're only decoding entities
820+
const char *current_ptr = input;
821+
const char *input_end = input + input_len; /* terminator address */
822+
char *output_ptr = ZSTR_VAL(output);
823+
int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
824+
825+
assert(*input_end == '\0');
826+
827+
while (current_ptr < input_end) {
828+
const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr);
829+
if (!ampersand_ptr) {
830+
size_t tail_len = input_end - current_ptr;
831+
if (tail_len > 0) {
832+
memcpy(output_ptr, current_ptr, tail_len);
833+
output_ptr += tail_len;
834+
}
835+
break;
836+
}
837+
838+
/* Copy everything up to the found '&' */
839+
size_t chunk_len = ampersand_ptr - current_ptr;
840+
if (chunk_len > 0) {
841+
memcpy(output_ptr, current_ptr, chunk_len);
842+
output_ptr += chunk_len;
843+
}
844+
845+
/* Now current_ptr points to the '&' character. */
846+
current_ptr = ampersand_ptr;
847+
848+
/* If fewer than 4 bytes remain, there is not enough room for an entity - copy '&' as a normal character */
849+
if (current_ptr + 3 >= input_end) {
850+
*output_ptr++ = *current_ptr++;
851+
continue;
852+
}
853+
854+
unsigned code = 0, code2 = 0;
855+
const char *entity_end_ptr = NULL;
856+
int valid_entity = 1;
857+
858+
if (current_ptr[1] == '#') {
859+
/* Processing numeric entity */
860+
const char *num_start = current_ptr + 2;
861+
entity_end_ptr = num_start;
862+
if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) {
863+
valid_entity = 0;
864+
}
865+
/* If we're in htmlspecialchars_decode, we're only decoding entities
852866
* that represent &, <, >, " and '. Is this one of them? */
853-
if (!all && (code > 63U ||
854-
stage3_table_be_apos_00000[code].data.ent.entity == NULL))
855-
goto invalid_code;
856-
857-
/* are we allowed to decode this entity in this document type?
867+
if (valid_entity && !all &&
868+
(code > 63U ||
869+
stage3_table_be_apos_00000[code].data.ent.entity == NULL))
870+
{
871+
valid_entity = 0;
872+
}
873+
/* are we allowed to decode this entity in this document type?
858874
* HTML 5 is the only that has a character that cannot be used in
859875
* a numeric entity but is allowed literally (U+000D). The
860876
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
861-
if (!unicode_cp_is_allowed(code, doctype) ||
862-
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))
863-
goto invalid_code;
864-
} else {
865-
const char *start;
866-
size_t ent_len;
867-
868-
next = &p[1];
869-
start = next;
870-
871-
if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
872-
goto invalid_code;
873-
874-
if (resolve_named_entity_html(start, ent_len, inv_map, &code, &code2) == FAILURE) {
875-
if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start[0] == 'a'
876-
&& start[1] == 'p' && start[2] == 'o' && start[3] == 's') {
877-
/* uses html4 inv_map, which doesn't include apos;. This is a
878-
* hack to support it */
879-
code = (unsigned) '\'';
880-
} else {
881-
goto invalid_code;
882-
}
883-
}
884-
}
885-
886-
assert(*next == ';');
887-
888-
if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
889-
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
890-
/* && code2 == '\0' always true for current maps */)
891-
goto invalid_code;
892-
893-
/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
877+
if (valid_entity && (!unicode_cp_is_allowed(code, doctype) ||
878+
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D)))
879+
{
880+
valid_entity = 0;
881+
}
882+
} else {
883+
/* Processing named entity */
884+
const char *name_start = current_ptr + 1;
885+
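/* Search for the terminating ';' within the longest possible entity name.
 * NOTE(review): the search length below is a fixed LONGEST_ENTITY_LENGTH + 1 and is
 * not clamped to (input_end - name_start); confirm this cannot read past the end of
 * the input buffer when fewer bytes remain. */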
886+
const char *semi_colon_ptr = memchr(name_start, ';', LONGEST_ENTITY_LENGTH + 1);
887+
if (!semi_colon_ptr) {
888+
valid_entity = 0;
889+
} else {
890+
size_t name_len = semi_colon_ptr - name_start;
891+
if (name_len == 0) {
892+
valid_entity = 0;
893+
} else {
894+
if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) {
895+
if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
896+
name_start[0] == 'a' && name_start[1] == 'p' &&
897+
name_start[2] == 'o' && name_start[3] == 's')
898+
{
899+
/* uses html4 inv_map, which doesn't include apos;. This is a
900+
* hack to support it */
901+
code = (unsigned)'\'';
902+
} else {
903+
valid_entity = 0;
904+
}
905+
}
906+
entity_end_ptr = semi_colon_ptr;
907+
}
908+
}
909+
}
910+
911+
/* If the entity failed validation, or entity_end_ptr is unset or does not point to ';', copy the '&' literally and resume scanning right after it */
912+
if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') {
913+
*output_ptr++ = *current_ptr++;
914+
continue;
915+
}
916+
917+
/* Check if quotes are allowed for entities representing ' or " */
918+
if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
919+
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
920+
{
921+
valid_entity = 0;
922+
}
923+
924+
/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
894925
* the call is needed to ensure the codepoint <= U+00FF) */
895-
if (charset != cs_utf_8) {
896-
/* replace unicode code point */
897-
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
898-
goto invalid_code; /* not representable in target charset */
899-
}
900-
901-
q += write_octet_sequence((unsigned char*)q, charset, code);
902-
if (code2) {
903-
q += write_octet_sequence((unsigned char*)q, charset, code2);
904-
}
905-
906-
/* jump over the valid entity; may go beyond size of buffer; np */
907-
p = next + 1;
908-
continue;
909-
910-
invalid_code:
911-
for (; p < next; p++) {
912-
*(q++) = *p;
913-
}
914-
}
915-
916-
*q = '\0';
917-
ZSTR_LEN(ret) = (size_t)(q - ZSTR_VAL(ret));
926+
if (valid_entity && charset != cs_utf_8) {
927+
/* replace unicode code point */
928+
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
929+
valid_entity = 0;
930+
}
931+
932+
if (valid_entity) {
933+
/* Write the parsed entity into the output buffer */
934+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code);
935+
if (code2) {
936+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2);
937+
}
938+
/* Move current_ptr past the semicolon */
939+
current_ptr = entity_end_ptr + 1;
940+
} else {
941+
/* If the entity is invalid, copy the bytes from current_ptr up to (but not including) entity_end_ptr; the ';' itself is handled by the next loop iteration */
942+
if (entity_end_ptr) {
943+
size_t len = entity_end_ptr - current_ptr;
944+
memcpy(output_ptr, current_ptr, len);
945+
output_ptr += len;
946+
current_ptr = entity_end_ptr;
947+
} else {
948+
*output_ptr++ = *current_ptr++;
949+
}
950+
}
951+
}
952+
953+
*output_ptr = '\0';
954+
ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output));
918955
}
919956
/* }}} */
920957

0 commit comments

Comments
 (0)
Failed to load comments.