@@ -809,112 +809,149 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
809
809
/* Size of the buffer that must be allocated for the output of
 * traverse_for_entities(), given an input of oldlen bytes.
 * The trailing +2 is: 1 byte of slack for the rest (probably unnecessary)
 * and 1 byte for the terminating 0.
 * NOTE: there must be no whitespace between the macro name and '(';
 * otherwise this becomes an object-like macro and every use breaks. */
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
811
811
static void traverse_for_entities (
812
- const char * old ,
813
- size_t oldlen ,
814
- zend_string * ret , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
812
+ const char * input ,
813
+ size_t input_len ,
814
+ zend_string * output , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
815
815
int all ,
816
816
int flags ,
817
817
const entity_ht * inv_map ,
818
818
enum entity_charset charset )
819
819
{
820
- const char * p ,
821
- * lim ;
822
- char * q ;
823
- int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
824
-
825
- lim = old + oldlen ; /* terminator address */
826
- assert (* lim == '\0' );
827
-
828
- for (p = old , q = ZSTR_VAL (ret ); p < lim ;) {
829
- unsigned code , code2 = 0 ;
830
- const char * next = NULL ; /* when set, next > p, otherwise possible inf loop */
831
-
832
- /* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
833
- * ASCII range byte can be part of a multi-byte sequence.
834
- * However, they start at 0x40, therefore if we find a 0x26 byte,
835
- * we're sure it represents the '&' character. */
836
-
837
- /* assumes there are no single-char entities */
838
- if (p [0 ] != '&' || (p + 3 >= lim )) {
839
- * (q ++ ) = * (p ++ );
840
- continue ;
841
- }
842
-
843
- /* now p[3] is surely valid and is no terminator */
844
-
845
- /* numerical entity */
846
- if (p [1 ] == '#' ) {
847
- next = & p [2 ];
848
- if (process_numeric_entity (& next , & code ) == FAILURE )
849
- goto invalid_code ;
850
-
851
- /* If we're in htmlspecialchars_decode, we're only decoding entities
820
+ const char * current_ptr = input ;
821
+ const char * input_end = input + input_len ; /* terminator address */
822
+ char * output_ptr = ZSTR_VAL (output );
823
+ int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
824
+
825
+ assert (* input_end == '\0' );
826
+
827
+ while (current_ptr < input_end ) {
828
+ const char * ampersand_ptr = memchr (current_ptr , '&' , input_end - current_ptr );
829
+ if (!ampersand_ptr ) {
830
+ size_t tail_len = input_end - current_ptr ;
831
+ if (tail_len > 0 ) {
832
+ memcpy (output_ptr , current_ptr , tail_len );
833
+ output_ptr += tail_len ;
834
+ }
835
+ break ;
836
+ }
837
+
838
+ /* Copy everything up to the found '&' */
839
+ size_t chunk_len = ampersand_ptr - current_ptr ;
840
+ if (chunk_len > 0 ) {
841
+ memcpy (output_ptr , current_ptr , chunk_len );
842
+ output_ptr += chunk_len ;
843
+ }
844
+
845
+ /* Now current_ptr points to the '&' character. */
846
+ current_ptr = ampersand_ptr ;
847
+
848
+ /* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */
849
+ if (current_ptr + 3 >= input_end ) {
850
+ * output_ptr ++ = * current_ptr ++ ;
851
+ continue ;
852
+ }
853
+
854
+ unsigned code = 0 , code2 = 0 ;
855
+ const char * entity_end_ptr = NULL ;
856
+ int valid_entity = 1 ;
857
+
858
+ if (current_ptr [1 ] == '#' ) {
859
+ /* Processing numeric entity */
860
+ const char * num_start = current_ptr + 2 ;
861
+ entity_end_ptr = num_start ;
862
+ if (process_numeric_entity (& entity_end_ptr , & code ) == FAILURE ) {
863
+ valid_entity = 0 ;
864
+ }
865
+ /* If we're in htmlspecialchars_decode, we're only decoding entities
852
866
* that represent &, <, >, " and '. Is this one of them? */
853
- if (!all && (code > 63U ||
854
- stage3_table_be_apos_00000 [code ].data .ent .entity == NULL ))
855
- goto invalid_code ;
856
-
857
- /* are we allowed to decode this entity in this document type?
867
+ if (valid_entity && !all &&
868
+ (code > 63U ||
869
+ stage3_table_be_apos_00000 [code ].data .ent .entity == NULL ))
870
+ {
871
+ valid_entity = 0 ;
872
+ }
873
+ /* are we allowed to decode this entity in this document type?
858
874
* HTML 5 is the only that has a character that cannot be used in
859
875
* a numeric entity but is allowed literally (U+000D). The
860
876
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
861
- if (!unicode_cp_is_allowed (code , doctype ) ||
862
- (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D ))
863
- goto invalid_code ;
864
- } else {
865
- const char * start ;
866
- size_t ent_len ;
867
-
868
- next = & p [1 ];
869
- start = next ;
870
-
871
- if (process_named_entity_html (& next , & start , & ent_len ) == FAILURE )
872
- goto invalid_code ;
873
-
874
- if (resolve_named_entity_html (start , ent_len , inv_map , & code , & code2 ) == FAILURE ) {
875
- if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start [0 ] == 'a'
876
- && start [1 ] == 'p' && start [2 ] == 'o' && start [3 ] == 's' ) {
877
- /* uses html4 inv_map, which doesn't include apos;. This is a
878
- * hack to support it */
879
- code = (unsigned ) '\'' ;
880
- } else {
881
- goto invalid_code ;
882
- }
883
- }
884
- }
885
-
886
- assert (* next == ';' );
887
-
888
- if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
889
- (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE )))
890
- /* && code2 == '\0' always true for current maps */ )
891
- goto invalid_code ;
892
-
893
- /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
877
+ if (valid_entity && (!unicode_cp_is_allowed (code , doctype ) ||
878
+ (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D )))
879
+ {
880
+ valid_entity = 0 ;
881
+ }
882
+ } else {
883
+ /* Processing named entity */
884
+ const char * name_start = current_ptr + 1 ;
885
+ /* Search for ';' */
886
+ const char * semi_colon_ptr = memchr (name_start , ';' , LONGEST_ENTITY_LENGTH + 1 );
887
+ if (!semi_colon_ptr ) {
888
+ valid_entity = 0 ;
889
+ } else {
890
+ size_t name_len = semi_colon_ptr - name_start ;
891
+ if (name_len == 0 ) {
892
+ valid_entity = 0 ;
893
+ } else {
894
+ if (resolve_named_entity_html (name_start , name_len , inv_map , & code , & code2 ) == FAILURE ) {
895
+ if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
896
+ name_start [0 ] == 'a' && name_start [1 ] == 'p' &&
897
+ name_start [2 ] == 'o' && name_start [3 ] == 's' )
898
+ {
899
+ /* uses html4 inv_map, which doesn't include apos;. This is a
900
+ * hack to support it */
901
+ code = (unsigned )'\'' ;
902
+ } else {
903
+ valid_entity = 0 ;
904
+ }
905
+ }
906
+ entity_end_ptr = semi_colon_ptr ;
907
+ }
908
+ }
909
+ }
910
+
911
+ /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
912
+ if (!valid_entity || entity_end_ptr == NULL || * entity_end_ptr != ';' ) {
913
+ * output_ptr ++ = * current_ptr ++ ;
914
+ continue ;
915
+ }
916
+
917
+ /* Check if quotes are allowed for entities representing ' or " */
918
+ if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
919
+ (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE ))))
920
+ {
921
+ valid_entity = 0 ;
922
+ }
923
+
924
+ /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
894
925
* the call is needed to ensure the codepoint <= U+00FF) */
895
- if (charset != cs_utf_8 ) {
896
- /* replace unicode code point */
897
- if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
898
- goto invalid_code ; /* not representable in target charset */
899
- }
900
-
901
- q += write_octet_sequence ((unsigned char * )q , charset , code );
902
- if (code2 ) {
903
- q += write_octet_sequence ((unsigned char * )q , charset , code2 );
904
- }
905
-
906
- /* jump over the valid entity; may go beyond size of buffer; np */
907
- p = next + 1 ;
908
- continue ;
909
-
910
- invalid_code :
911
- for (; p < next ; p ++ ) {
912
- * (q ++ ) = * p ;
913
- }
914
- }
915
-
916
- * q = '\0' ;
917
- ZSTR_LEN (ret ) = (size_t )(q - ZSTR_VAL (ret ));
926
+ if (valid_entity && charset != cs_utf_8 ) {
927
+ /* replace unicode code point */
928
+ if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
929
+ valid_entity = 0 ;
930
+ }
931
+
932
+ if (valid_entity ) {
933
+ /* Write the parsed entity into the output buffer */
934
+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
935
+ if (code2 ) {
936
+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
937
+ }
938
+ /* Move current_ptr past the semicolon */
939
+ current_ptr = entity_end_ptr + 1 ;
940
+ } else {
941
+ /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
942
+ if (entity_end_ptr ) {
943
+ size_t len = entity_end_ptr - current_ptr ;
944
+ memcpy (output_ptr , current_ptr , len );
945
+ output_ptr += len ;
946
+ current_ptr = entity_end_ptr ;
947
+ } else {
948
+ * output_ptr ++ = * current_ptr ++ ;
949
+ }
950
+ }
951
+ }
952
+
953
+ * output_ptr = '\0' ;
954
+ ZSTR_LEN (output ) = (size_t )(output_ptr - ZSTR_VAL (output ));
918
955
}
919
956
/* }}} */
920
957
0 commit comments