From 7a313cd25a85569d4cfa5e8ef3b101a9be802d9a Mon Sep 17 00:00:00 2001 From: Mike Kaminski Date: Mon, 21 Jun 2021 20:23:52 -0400 Subject: [PATCH 1/2] Reduce memory allocations This commit aims to reduce the number of times emalloc() is called. Instead of allocating arrays and strings each time every time a literal or node list is compared in the interpreter, create literals and node list once at parse time. In addition, use a stack-allocated memory pool for AST nodes. This has the benefit of eliminating calls to emalloc(), allocating contiguous blocks of memory, and reducing the risk of future memory leaks. The tradeoff we're making here is that there is a fixed number of AST nodes (64), so there's a limit imposed on the size of the JSONPath query. We'll see in the future if this poses a problem, in which case we can implement Viktor's dynamic memory pool solution. (6cf186b) --- jsonpath.c | 6 +- src/jsonpath/interpreter.c | 69 ++++----------- src/jsonpath/parser.c | 162 ++++++++++++++++------------------- src/jsonpath/parser.h | 17 ++-- tests/bounds_checks/001.phpt | 22 ----- 5 files changed, 106 insertions(+), 170 deletions(-) delete mode 100644 tests/bounds_checks/001.phpt diff --git a/jsonpath.c b/jsonpath.c index 73aaa55..d3d328a 100644 --- a/jsonpath.c +++ b/jsonpath.c @@ -62,10 +62,12 @@ PHP_METHOD(JsonPath, find) { /* assemble an array of query execution instructions from parsed tokens */ int i = 0; - struct ast_node* head = parse_jsonpath(lex_tok, &i, lex_tok_count); + struct node_pool pool = {0}; + struct ast_node* head = parse_jsonpath(lex_tok, &i, lex_tok_count, &pool); if (head == NULL) { free(j_path_work_copy); + free_php_objects(&pool); return; } @@ -79,8 +81,8 @@ PHP_METHOD(JsonPath, find) { eval_ast(search_target, search_target, head, return_value); - free_ast_nodes(head); free(j_path_work_copy); + free_php_objects(&pool); /* return false if no results were found by the JSON-path query */ diff --git a/src/jsonpath/interpreter.c b/src/jsonpath/interpreter.c index 3af8fe8..77854a2 100644 --- a/src/jsonpath/interpreter.c +++ b/src/jsonpath/interpreter.c @@ -23,8 +23,6 @@ void exec_selector(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* re void exec_slice(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* return_value); void exec_wildcard(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* return_value); zval* evaluate_primary(struct ast_node* src, zval* tmp_dest, zval* arr_head, zval* arr_cur); -void copy_index_list_to_array(struct ast_node* tok, zval* val); -void free_primary_zvals(struct ast_node* tok, zval* val); bool break_if_result_found(zval* return_value); void copy_result_or_continue(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* return_value); BOOL_OR_ERR evaluate_unary(zval* arr_head, zval* arr_cur, struct ast_node* tok); @@ -128,17 +126,16 @@ void exec_recursive_descent(zval* arr_head, zval* arr_cur, struct ast_node* tok, } void exec_node_filter(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* return_value) { - int i; zend_ulong idx; zval* data; - for (i = 0; i < tok->data.d_nodes.count; i++) { - if (ZEND_HANDLE_NUMERIC_STR(tok->data.d_nodes.str[i], tok->data.d_nodes.len[i], idx)) { + ZEND_HASH_FOREACH_VAL(tok->data.d_nodes.ht, data) { + if (ZEND_HANDLE_NUMERIC_STR(Z_STRVAL_P(data), Z_STRLEN_P(data), idx)) { /* look up numeric index */ data = zend_hash_index_find(HASH_OF(arr_cur), idx); } else { /* look up string index */ - data = zend_hash_str_find(HASH_OF(arr_cur), tok->data.d_nodes.str[i], tok->data.d_nodes.len[i]); + data = zend_hash_str_find(HASH_OF(arr_cur), Z_STRVAL_P(data), Z_STRLEN_P(data)); } if (data != NULL) { copy_result_or_continue(arr_head, data, tok, return_value); @@ -147,6 +144,7 @@ void exec_node_filter(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* } } } + ZEND_HASH_FOREACH_END(); } void exec_index_filter(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* return_value) { @@ -308,7 +306,7 @@ zval* evaluate_primary(struct ast_node* src, zval* tmp_dest, zval* arr_head, zva ZVAL_DOUBLE(tmp_dest, src->data.d_double.value); return tmp_dest; case AST_LITERAL: - ZVAL_STRINGL(tmp_dest, src->data.d_literal.val, src->data.d_literal.len); + ZVAL_NEW_STR(tmp_dest, src->data.d_literal.str); return tmp_dest; case AST_LONG: ZVAL_LONG(tmp_dest, src->data.d_long.value); @@ -334,7 +332,7 @@ zval* evaluate_primary(struct ast_node* src, zval* tmp_dest, zval* arr_head, zva } return Z_INDIRECT_P(tmp_dest); case AST_NODE_LIST: - copy_index_list_to_array(src, tmp_dest); + ZVAL_ARR(tmp_dest, src->data.d_nodes.ht); return tmp_dest; default: zend_throw_exception(spl_ce_RuntimeException, "Unsupported expression operand", 0); @@ -342,23 +340,6 @@ zval* evaluate_primary(struct ast_node* src, zval* tmp_dest, zval* arr_head, zva } } -void copy_index_list_to_array(struct ast_node* tok, zval* val) { - array_init(val); - int i; - for (i = 0; i < tok->data.d_nodes.count; i++) { - add_index_stringl(val, i, tok->data.d_nodes.str[i], tok->data.d_nodes.len[i]); - } -} - -/* free zvals created by evaluate_primary() */ -void free_primary_zvals(struct ast_node* tok, zval* val) { - if (tok->type == AST_LITERAL) { - zval_ptr_dtor(val); - } else if (tok->type == AST_NODE_LIST) { - zend_array_destroy(Z_ARR_P(val)); - } -} - bool break_if_result_found(zval* return_value) { return Z_TYPE_P(return_value) == IS_INDIRECT && Z_INDIRECT_P(return_value) != NULL; } @@ -479,54 +460,36 @@ BOOL_OR_ERR evaluate_binary(zval* arr_head, zval* arr_cur, struct ast_node* tok) zval* val_rh = evaluate_operand(arr_head, arr_cur, tok, rh_operand, &tmp_rh); if (val_rh == NULL) { - free_primary_zvals(lh_operand, val_lh); return BOOL_ERR; } if (can_shortcut_binary_evaluation(tok, val_rh)) { - free_primary_zvals(lh_operand, val_lh); return false; } - bool ret = false; - switch (tok->type) { case AST_EQ: - ret = fast_is_identical_function(val_lh, val_rh); - break; + return fast_is_identical_function(val_lh, val_rh); case AST_NE: - ret = !fast_is_identical_function(val_lh, val_rh); - break; + return !fast_is_identical_function(val_lh, val_rh); case AST_LT: - ret = can_check_inequality(val_lh, val_rh) && compare(val_lh, val_rh) < 0; - break; + return can_check_inequality(val_lh, val_rh) && compare(val_lh, val_rh) < 0; case AST_LTE: - ret = can_check_inequality(val_lh, val_rh) && compare(val_lh, val_rh) <= 0; - break; + return can_check_inequality(val_lh, val_rh) && compare(val_lh, val_rh) <= 0; case AST_OR: - ret = (Z_TYPE_P(val_lh) == IS_TRUE) || (Z_TYPE_P(val_rh) == IS_TRUE); - break; + return (Z_TYPE_P(val_lh) == IS_TRUE) || (Z_TYPE_P(val_rh) == IS_TRUE); case AST_AND: - ret = (Z_TYPE_P(val_lh) == IS_TRUE) && (Z_TYPE_P(val_rh) == IS_TRUE); - break; + return (Z_TYPE_P(val_lh) == IS_TRUE) && (Z_TYPE_P(val_rh) == IS_TRUE); case AST_GT: - ret = can_check_inequality(val_lh, val_rh) && compare(val_lh, val_rh) > 0; - break; + return can_check_inequality(val_lh, val_rh) && compare(val_lh, val_rh) > 0; case AST_GTE: - ret = can_check_inequality(val_lh, val_rh) && compare(val_lh, val_rh) >= 0; - break; + return can_check_inequality(val_lh, val_rh) && compare(val_lh, val_rh) >= 0; case AST_RGXP: - ret = compare_rgxp(val_lh, val_rh); - break; + return compare_rgxp(val_lh, val_rh); default: assert(0); - break; + return false; } - - free_primary_zvals(lh_operand, val_lh); - free_primary_zvals(rh_operand, val_rh); - - return ret; } /* Determine if two zvals can be checked for inequality (>, <, >=, <=). Specifically forbid comparing strings with diff --git a/src/jsonpath/parser.c b/src/jsonpath/parser.c index 62f8114..32f95aa 100644 --- a/src/jsonpath/parser.c +++ b/src/jsonpath/parser.c @@ -9,15 +9,20 @@ #define CONSUME_TOKEN() (*lex_idx)++ #define CUR_POS() *lex_idx -#define CUR_TOKEN_LITERAL() lex_tok[*lex_idx].val #define CUR_TOKEN_LEN() lex_tok[*lex_idx].len +#define CUR_TOKEN_LITERAL() lex_tok[*lex_idx].val #define CUR_TOKEN() lex_tok[*lex_idx].type +#define GET_NODE(type) get_node_from_pool(pool, type) +#define GET_BINARY_NODE(type, left, right) get_binary_node_from_pool(pool, type, left, right) #define HAS_TOKEN() (*lex_idx < lex_tok_count) -#define PARSER_ARGS lex_tok, lex_idx, lex_tok_count -#define PARSER_PARAMS struct jpath_token lex_tok[], int *lex_idx, int lex_tok_count +#define PARSER_ARGS lex_tok, lex_idx, lex_tok_count, pool +#define PARSER_PARAMS struct jpath_token lex_tok[], int *lex_idx, int lex_tok_count, struct node_pool *pool +#define RETURN_IF_NULL(param) \ + if (param == NULL) return NULL -static struct ast_node* ast_alloc_binary(enum ast_type type, struct ast_node* left, struct ast_node* right); -static struct ast_node* ast_alloc_node(struct ast_node* prev, enum ast_type type); +static struct ast_node* get_binary_node_from_pool(struct node_pool* pool, enum ast_type type, struct ast_node* left, + struct ast_node* right); +static struct ast_node* get_node_from_pool(struct node_pool* pool, enum ast_type type); static struct ast_node* parse_and(PARSER_PARAMS); static struct ast_node* parse_childpath(PARSER_PARAMS); @@ -66,24 +71,26 @@ const char* AST_STR[] = { "AST_WILD_CARD", /**/ }; -static struct ast_node* ast_alloc_binary(enum ast_type type, struct ast_node* left, struct ast_node* right) { - struct ast_node* node = ast_alloc_node(NULL, type); - +static struct ast_node* get_binary_node_from_pool(struct node_pool* pool, enum ast_type type, struct ast_node* left, + struct ast_node* right) { + struct ast_node* node = get_node_from_pool(pool, type); + if (node == NULL) { + return NULL; + } node->data.d_binary.left = left; node->data.d_binary.right = right; - return node; } -static struct ast_node* ast_alloc_node(struct ast_node* prev, enum ast_type type) { - struct ast_node* node = emalloc(sizeof(struct ast_node)); - memset(node, 0, sizeof(struct ast_node)); - - node->type = type; - if (prev != NULL) { - prev->next = node; +static struct ast_node* get_node_from_pool(struct node_pool* pool, enum ast_type type) { + if (pool->cur_index >= NODE_POOL_LEN) { + zend_throw_exception_ex( + spl_ce_RuntimeException, 0, + "Expression requires more parser node slots than are available (%d), try a shorter expression", NODE_POOL_LEN); + return NULL; } - + struct ast_node* node = &pool->nodes[pool->cur_index++]; + node->type = type; return node; } @@ -147,14 +154,13 @@ static bool parse_filter_list(PARSER_PARAMS, struct ast_node* tok) { zend_throw_exception(spl_ce_RuntimeException, "Array slice indexes must be integers", 0); return false; } - if (tok->data.d_nodes.count >= FILTER_ARR_LEN) { - zend_throw_exception_ex(spl_ce_RuntimeException, 0, "Union filter may contain no more than %d elements", - FILTER_ARR_LEN); - return false; - } tok->type = sep_found = AST_NODE_LIST; - tok->data.d_nodes.str[tok->data.d_nodes.count] = CUR_TOKEN_LITERAL(); - tok->data.d_nodes.len[tok->data.d_nodes.count] = CUR_TOKEN_LEN(); + if (tok->data.d_nodes.ht == NULL) { + tok->data.d_nodes.ht = zend_new_array(1); + } + zval tmp; + ZVAL_ARR(&tmp, tok->data.d_nodes.ht); + add_index_stringl(&tmp, tok->data.d_nodes.count, CUR_TOKEN_LITERAL(), CUR_TOKEN_LEN()); tok->data.d_nodes.count++; } else { zend_throw_exception_ex(spl_ce_RuntimeException, 0, "Unexpected token `%s` in filter", LEX_STR[CUR_TOKEN()]); @@ -181,7 +187,8 @@ struct ast_node* parse_jsonpath(PARSER_PARAMS) { } } - struct ast_node* ret = ast_alloc_node(NULL, AST_ROOT); + struct ast_node* ret = GET_NODE(AST_ROOT); + RETURN_IF_NULL(ret); ret->next = expr; return ret; @@ -196,7 +203,8 @@ static struct ast_node* parse_operator(PARSER_PARAMS) { switch (CUR_TOKEN()) { case LEX_NODE: - expr = ast_alloc_node(NULL, AST_SELECTOR); + expr = GET_NODE(AST_SELECTOR); + RETURN_IF_NULL(expr); expr->data.d_selector.val = CUR_TOKEN_LITERAL(); expr->data.d_selector.len = CUR_TOKEN_LEN(); CONSUME_TOKEN(); @@ -213,11 +221,13 @@ static struct ast_node* parse_operator(PARSER_PARAMS) { CONSUME_TOKEN(); break; case LEX_DEEP_SCAN: - expr = ast_alloc_node(NULL, AST_RECURSE); + expr = GET_NODE(AST_RECURSE); + RETURN_IF_NULL(expr); CONSUME_TOKEN(); break; case LEX_WILD_CARD: - expr = ast_alloc_node(NULL, AST_WILD_CARD); + expr = GET_NODE(AST_WILD_CARD); + RETURN_IF_NULL(expr); CONSUME_TOKEN(); break; case LEX_ROOT: @@ -238,7 +248,6 @@ static struct ast_node* parse_operator(PARSER_PARAMS) { * isn't an operator. */ expr->next = parse_operator(PARSER_ARGS); if (expr->next == NULL) { - free_ast_nodes(expr); return NULL; } } @@ -254,11 +263,11 @@ static struct ast_node* parse_expression(PARSER_PARAMS) { return NULL; } - struct ast_node* expr = ast_alloc_node(NULL, AST_EXPR); + struct ast_node* expr = GET_NODE(AST_EXPR); + RETURN_IF_NULL(expr); expr->data.d_expression.head = parse_or(PARSER_ARGS); if (!validate_expression_head(expr->data.d_expression.head)) { - free_ast_nodes(expr); return NULL; } @@ -280,14 +289,15 @@ static struct ast_node* parse_filter(PARSER_PARAMS) { case LEX_LITERAL_NUMERIC: case LEX_LITERAL: case LEX_SLICE: - expr = ast_alloc_node(NULL, AST_INDEX_LIST); + expr = GET_NODE(AST_INDEX_LIST); + RETURN_IF_NULL(expr); if (!parse_filter_list(PARSER_ARGS, expr)) { - free_ast_nodes(expr); return NULL; } break; case LEX_WILD_CARD: - expr = ast_alloc_node(NULL, AST_WILD_CARD); + expr = GET_NODE(AST_WILD_CARD); + RETURN_IF_NULL(expr); break; case LEX_EXPR_END: zend_throw_exception_ex(spl_ce_RuntimeException, 0, "Filter must not be empty"); @@ -300,7 +310,6 @@ static struct ast_node* parse_filter(PARSER_PARAMS) { CONSUME_TOKEN(); if (!HAS_TOKEN() || CUR_TOKEN() != LEX_EXPR_END) { - free_ast_nodes(expr); zend_throw_exception_ex(spl_ce_RuntimeException, 0, "Missing filter end `]`"); return NULL; } @@ -316,7 +325,8 @@ static struct ast_node* parse_or(PARSER_PARAMS) { struct ast_node* right = parse_and(PARSER_ARGS); - expr = ast_alloc_binary(AST_OR, expr, right); + expr = GET_BINARY_NODE(AST_OR, expr, right); + RETURN_IF_NULL(expr); } return expr; @@ -330,7 +340,8 @@ static struct ast_node* parse_and(PARSER_PARAMS) { struct ast_node* right = parse_equality(PARSER_ARGS); - expr = ast_alloc_binary(AST_AND, expr, right); + expr = GET_BINARY_NODE(AST_AND, expr, right); + RETURN_IF_NULL(expr); } return expr; @@ -355,12 +366,11 @@ static struct ast_node* parse_equality(PARSER_PARAMS) { struct ast_node* right = parse_comparison(PARSER_ARGS); if (right == NULL) { - free_ast_nodes(expr); - free_ast_nodes(right); return NULL; } - expr = ast_alloc_binary(type, expr, right); + expr = GET_BINARY_NODE(type, expr, right); + RETURN_IF_NULL(expr); } return expr; @@ -391,12 +401,11 @@ static struct ast_node* parse_comparison(PARSER_PARAMS) { struct ast_node* right = parse_unary(PARSER_ARGS); if (right == NULL) { - free_ast_nodes(expr); - free_ast_nodes(right); return NULL; } - expr = ast_alloc_binary(type, expr, right); + expr = GET_BINARY_NODE(type, expr, right); + RETURN_IF_NULL(expr); } return expr; @@ -405,7 +414,8 @@ static struct ast_node* parse_comparison(PARSER_PARAMS) { static struct ast_node* parse_unary(PARSER_PARAMS) { if (CUR_TOKEN() == LEX_NEGATION) { CONSUME_TOKEN(); - struct ast_node* expr = ast_alloc_node(NULL, AST_NEGATION); + struct ast_node* expr = GET_NODE(AST_NEGATION); + RETURN_IF_NULL(expr); expr->data.d_unary.right = parse_unary(PARSER_ARGS); return expr; } @@ -415,19 +425,19 @@ static struct ast_node* parse_unary(PARSER_PARAMS) { static struct ast_node* parse_primary(PARSER_PARAMS) { if (CUR_TOKEN() == LEX_LITERAL) { - struct ast_node* ret = ast_alloc_node(NULL, AST_LITERAL); + struct ast_node* ret = GET_NODE(AST_LITERAL); + RETURN_IF_NULL(ret); - ret->data.d_literal.val = CUR_TOKEN_LITERAL(); - ret->data.d_literal.len = CUR_TOKEN_LEN(); + ret->data.d_literal.str = zend_string_init(CUR_TOKEN_LITERAL(), CUR_TOKEN_LEN(), 0); CONSUME_TOKEN(); return ret; } if (CUR_TOKEN() == LEX_LITERAL_NUMERIC) { - struct ast_node* ret = ast_alloc_node(NULL, AST_DOUBLE); + struct ast_node* ret = GET_NODE(AST_DOUBLE); + RETURN_IF_NULL(ret); if (!make_numeric_node(ret, CUR_TOKEN_LITERAL(), CUR_TOKEN_LEN())) { - free_ast_nodes(ret); zend_throw_exception_ex(spl_ce_RuntimeException, 0, "Unable to parse numeric"); return NULL; } @@ -436,14 +446,14 @@ static struct ast_node* parse_primary(PARSER_PARAMS) { } if (CUR_TOKEN() == LEX_LITERAL_BOOL) { - struct ast_node* ret = ast_alloc_node(NULL, AST_BOOL); + struct ast_node* ret = GET_NODE(AST_BOOL); + RETURN_IF_NULL(ret); if (strncasecmp("true", CUR_TOKEN_LITERAL(), 4) == 0) { ret->data.d_literal.value_bool = true; } else if (strncasecmp("false", CUR_TOKEN_LITERAL(), 5) == 0) { ret->data.d_literal.value_bool = false; } else { - free_ast_nodes(ret); zend_throw_exception_ex(spl_ce_RuntimeException, 0, "Expected `true` or `false` for boolean token"); return NULL; } @@ -452,7 +462,8 @@ static struct ast_node* parse_primary(PARSER_PARAMS) { } if (CUR_TOKEN() == LEX_LITERAL_NULL) { - struct ast_node* ret = ast_alloc_node(NULL, AST_NULL); + struct ast_node* ret = GET_NODE(AST_NULL); + RETURN_IF_NULL(ret); CONSUME_TOKEN(); return ret; } @@ -470,7 +481,6 @@ static struct ast_node* parse_primary(PARSER_PARAMS) { CONSUME_TOKEN(); return expr; } else { - free_ast_nodes(expr); zend_throw_exception_ex(spl_ce_RuntimeException, 0, "Missing closing paren `)`"); return NULL; } @@ -492,16 +502,16 @@ static struct ast_node* parse_primary(PARSER_PARAMS) { } /* Run parse_jsonpath on a subset of the lex stream, until the boundary of the sub-JSONPath */ - return parse_jsonpath(lex_tok, &start, stop); + return parse_jsonpath(lex_tok, &start, stop, pool); } if (CUR_TOKEN() == LEX_CUR_NODE) { - struct ast_node* expr = ast_alloc_node(NULL, AST_CUR_NODE); + struct ast_node* expr = GET_NODE(AST_CUR_NODE); + RETURN_IF_NULL(expr); CONSUME_TOKEN(); if (HAS_TOKEN() && !is_logical_operator(CUR_TOKEN()) && CUR_TOKEN() != LEX_PAREN_CLOSE) { expr->next = parse_operator(PARSER_ARGS); if (expr->next == NULL) { - free_ast_nodes(expr); return NULL; } } @@ -633,38 +643,16 @@ static bool is_logical_operator(lex_token type) { } } -void free_ast_nodes(struct ast_node* head) { - if (head == NULL) { - return; - } +void free_php_objects(struct node_pool* pool) { + int i; - switch (head->type) { - case AST_AND: - case AST_EQ: - case AST_GT: - case AST_GTE: - case AST_LT: - case AST_LTE: - case AST_NE: - case AST_OR: - case AST_RGXP: - free_ast_nodes(head->data.d_binary.left); - free_ast_nodes(head->data.d_binary.right); - break; - case AST_EXPR: - free_ast_nodes(head->data.d_expression.head); - break; - case AST_NEGATION: - free_ast_nodes(head->data.d_unary.right); - break; - default: - /* noop */ - break; + for (i = 0; i < pool->cur_index; i++) { + if (pool->nodes[i].type == AST_LITERAL) { + zend_string_release(pool->nodes[i].data.d_literal.str); + } else if (pool->nodes[i].type == AST_NODE_LIST) { + zend_array_destroy(pool->nodes[i].data.d_nodes.ht); + } } - - free_ast_nodes(head->next); - - efree((void*)head); } #ifdef JSONPATH_DEBUG @@ -725,7 +713,7 @@ void print_ast(struct ast_node* head, const char* m, int level) { printf("\n"); break; case AST_LITERAL: - printf(" [val=%.*s]\n", head->data.d_literal.len, head->data.d_literal.val); + printf(" [val=%.*s]\n", (int)head->data.d_literal.str->len, head->data.d_literal.str->val); break; case AST_INDEX_SLICE: printf(" [start=%d end=%d step=%d]\n", head->data.d_list.indexes[0], head->data.d_list.indexes[1], diff --git a/src/jsonpath/parser.h b/src/jsonpath/parser.h index 746a1ed..c24e1a3 100644 --- a/src/jsonpath/parser.h +++ b/src/jsonpath/parser.h @@ -4,10 +4,12 @@ #include #include #include +#include #include "lexer.h" #define FILTER_ARR_LEN 8 +#define NODE_POOL_LEN 64 typedef enum { TYPE_OPERAND, @@ -60,13 +62,11 @@ union ast_node_data { } d_list; struct { int count; - int len[FILTER_ARR_LEN]; - char* str[FILTER_ARR_LEN]; + HashTable* ht; } d_nodes; struct { - char* val; - int len; bool value_bool; + zend_string* str; } d_literal; struct { char* val; @@ -92,10 +92,15 @@ struct ast_node { union ast_node_data data; }; -void free_ast_nodes(struct ast_node* head); +struct node_pool { + struct ast_node nodes[NODE_POOL_LEN]; + int cur_index; +}; + bool is_binary(enum ast_type type); bool is_unary(enum ast_type type); -struct ast_node* parse_jsonpath(struct jpath_token lex_tok[], int* lex_idx, int lex_tok_count); +struct ast_node* parse_jsonpath(struct jpath_token lex_tok[], int* lex_idx, int lex_tok_count, struct node_pool* pool); +void free_php_objects(struct node_pool* pool); #ifdef JSONPATH_DEBUG void print_ast(struct ast_node* head, const char* m, int level); diff --git a/tests/bounds_checks/001.phpt b/tests/bounds_checks/001.phpt deleted file mode 100644 index f338500..0000000 --- a/tests/bounds_checks/001.phpt +++ /dev/null @@ -1,22 +0,0 @@ ---TEST-- -Test union with more than FILTER_ARR_LEN elements ---SKIPIF-- - ---FILE-- - "value", - "another" => "entry", -]; - -$jsonPath = new JsonPath(); -$result = $jsonPath->find($data, "$['one','two','three','four','five','six','seven','eight','nine']"); - -var_dump($result); -?> ---EXPECTF-- -Fatal error: Uncaught RuntimeException: Union filter may contain no more than 8 elements in %s -Stack trace: -%s -%s -%s \ No newline at end of file From fd756e7560d4815ed78fcc03ad2a382c03751dd0 Mon Sep 17 00:00:00 2001 From: Mike Kaminski Date: Wed, 23 Jun 2021 22:13:32 -0400 Subject: [PATCH 2/2] Replace fixed array with hastable for index filter This change reduces the size of the stack-allocated memory pool to < 2Kib and removes the limit imposed on the number of elements in the index filter. With a bit of more work, we could support integer array operands. (See tests/022.phpt) --- src/jsonpath/interpreter.c | 18 ++++++++------ src/jsonpath/parser.c | 48 +++++++++++++++++++----------------- src/jsonpath/parser.h | 8 +----- tests/bounds_checks/005.phpt | 22 ----------------- 4 files changed, 38 insertions(+), 58 deletions(-) delete mode 100644 tests/bounds_checks/005.phpt diff --git a/src/jsonpath/interpreter.c b/src/jsonpath/interpreter.c index 77854a2..3e853c7 100644 --- a/src/jsonpath/interpreter.c +++ b/src/jsonpath/interpreter.c @@ -129,7 +129,7 @@ void exec_node_filter(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* zend_ulong idx; zval* data; - ZEND_HASH_FOREACH_VAL(tok->data.d_nodes.ht, data) { + ZEND_HASH_FOREACH_VAL(tok->data.d_list.ht, data) { if (ZEND_HANDLE_NUMERIC_STR(Z_STRVAL_P(data), Z_STRLEN_P(data), idx)) { /* look up numeric index */ data = zend_hash_index_find(HASH_OF(arr_cur), idx); @@ -150,8 +150,8 @@ void exec_node_filter(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* void exec_index_filter(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* return_value) { int i; - for (i = 0; i < tok->data.d_list.count; i++) { - int index = tok->data.d_list.indexes[i]; + for (i = 0; i < zend_hash_num_elements(tok->data.d_list.ht); i++) { + int index = Z_LVAL_P(zend_hash_index_find(tok->data.d_list.ht, i)); if (index < 0) { index = zend_hash_num_elements(HASH_OF(arr_cur)) - abs(index); } @@ -175,9 +175,13 @@ void exec_slice(zval* arr_head, zval* arr_cur, struct ast_node* tok, zval* retur int data_length = zend_hash_num_elements(HASH_OF(arr_cur)); - int range_start = tok->data.d_list.indexes[0]; - int range_end = tok->data.d_list.count > 1 ? tok->data.d_list.indexes[1] : INT_MAX; - int range_step = tok->data.d_list.count > 2 ? tok->data.d_list.indexes[2] : 1; + int range_start = Z_LVAL_P(zend_hash_index_find(tok->data.d_list.ht, 0)); + int range_end = zend_hash_num_elements(tok->data.d_list.ht) > 1 + ? (int)Z_LVAL_P(zend_hash_index_find(tok->data.d_list.ht, 1)) + : INT_MAX; + int range_step = zend_hash_num_elements(tok->data.d_list.ht) > 2 + ? (int)Z_LVAL_P(zend_hash_index_find(tok->data.d_list.ht, 2)) + : 1; /* Zero-steps are not allowed, abort */ if (range_step == 0) { @@ -332,7 +336,7 @@ zval* evaluate_primary(struct ast_node* src, zval* tmp_dest, zval* arr_head, zva } return Z_INDIRECT_P(tmp_dest); case AST_NODE_LIST: - ZVAL_ARR(tmp_dest, src->data.d_nodes.ht); + ZVAL_ARR(tmp_dest, src->data.d_list.ht); return tmp_dest; default: zend_throw_exception(spl_ce_RuntimeException, "Unsupported expression operand", 0); diff --git a/src/jsonpath/parser.c b/src/jsonpath/parser.c index 32f95aa..a0404ff 100644 --- a/src/jsonpath/parser.c +++ b/src/jsonpath/parser.c @@ -23,6 +23,8 @@ static struct ast_node* get_binary_node_from_pool(struct node_pool* pool, enum ast_type type, struct ast_node* left, struct ast_node* right); static struct ast_node* get_node_from_pool(struct node_pool* pool, enum ast_type type); +static void ht_append_long(HashTable* ht, long val); +static void ht_append_string(HashTable* ht, char* str, int len); static struct ast_node* parse_and(PARSER_PARAMS); static struct ast_node* parse_childpath(PARSER_PARAMS); @@ -94,12 +96,25 @@ static struct ast_node* get_node_from_pool(struct node_pool* pool, enum ast_type return node; } +static void ht_append_long(HashTable* ht, long val) { + zval tmp; + ZVAL_ARR(&tmp, ht); + add_index_long(&tmp, zend_hash_num_elements(ht), val); +} + +static void ht_append_string(HashTable* ht, char* str, int len) { + zval tmp; + ZVAL_ARR(&tmp, ht); + add_index_stringl(&tmp, zend_hash_num_elements(ht), str, len); +} + static bool parse_filter_list(PARSER_PARAMS, struct ast_node* tok) { int slice_count = 0; /* assume filter type is an index list by default. this resolves type ambiguity of a filter containing no separators. * example: treat level4[0] as an index filter, not a slice. */ tok->type = AST_INDEX_LIST; + tok->data.d_list.ht = zend_new_array(1); /* used to determine if different separator types are present, default value is arbitrary */ enum ast_type sep_found = AST_AND; @@ -127,13 +142,12 @@ static bool parse_filter_list(PARSER_PARAMS, struct ast_node* tok) { slice_count++; /* [:a] => [0:a] */ /* [a::] => [a:0:] */ - if (slice_count > tok->data.d_list.count) { + if (slice_count > zend_hash_num_elements(tok->data.d_list.ht)) { if (slice_count == 1) { - tok->data.d_list.indexes[tok->data.d_list.count] = INT_MAX; + ht_append_long(tok->data.d_list.ht, INT_MAX); } else if (slice_count == 2) { - tok->data.d_list.indexes[tok->data.d_list.count] = INT_MAX; + ht_append_long(tok->data.d_list.ht, INT_MAX); } - tok->data.d_list.count++; } } else if (CUR_TOKEN() == LEX_LITERAL_NUMERIC) { long idx = 0; @@ -142,26 +156,14 @@ static bool parse_filter_list(PARSER_PARAMS, struct ast_node* tok) { zend_throw_exception(spl_ce_RuntimeException, "Unable to parse filter index value", 0); return false; } - if (tok->data.d_list.count >= FILTER_ARR_LEN) { - zend_throw_exception_ex(spl_ce_RuntimeException, 0, "Index filter may contain no more than %d elements", - FILTER_ARR_LEN); - return false; - } - tok->data.d_list.indexes[tok->data.d_list.count] = idx; - tok->data.d_list.count++; + ht_append_long(tok->data.d_list.ht, idx); } else if (CUR_TOKEN() == LEX_LITERAL) { if (sep_found == AST_INDEX_SLICE) { zend_throw_exception(spl_ce_RuntimeException, "Array slice indexes must be integers", 0); return false; } tok->type = sep_found = AST_NODE_LIST; - if (tok->data.d_nodes.ht == NULL) { - tok->data.d_nodes.ht = zend_new_array(1); - } - zval tmp; - ZVAL_ARR(&tmp, tok->data.d_nodes.ht); - add_index_stringl(&tmp, tok->data.d_nodes.count, CUR_TOKEN_LITERAL(), CUR_TOKEN_LEN()); - tok->data.d_nodes.count++; + ht_append_string(tok->data.d_list.ht, CUR_TOKEN_LITERAL(), CUR_TOKEN_LEN()); } else { zend_throw_exception_ex(spl_ce_RuntimeException, 0, "Unexpected token `%s` in filter", LEX_STR[CUR_TOKEN()]); return false; @@ -649,8 +651,9 @@ void free_php_objects(struct node_pool* pool) { for (i = 0; i < pool->cur_index; i++) { if (pool->nodes[i].type == AST_LITERAL) { zend_string_release(pool->nodes[i].data.d_literal.str); - } else if (pool->nodes[i].type == AST_NODE_LIST) { - zend_array_destroy(pool->nodes[i].data.d_nodes.ht); + } else if (pool->nodes[i].type == AST_NODE_LIST || pool->nodes[i].type == AST_INDEX_LIST || + pool->nodes[i].type == AST_INDEX_SLICE) { + zend_array_destroy(pool->nodes[i].data.d_list.ht); } } } @@ -716,8 +719,9 @@ void print_ast(struct ast_node* head, const char* m, int level) { printf(" [val=%.*s]\n", (int)head->data.d_literal.str->len, head->data.d_literal.str->val); break; case AST_INDEX_SLICE: - printf(" [start=%d end=%d step=%d]\n", head->data.d_list.indexes[0], head->data.d_list.indexes[1], - head->data.d_list.indexes[2]); + printf(" [start=%d end=%d step=%d]\n", (int)Z_LVAL_P(zend_hash_index_find(head->data.d_list.ht, 0)), + (int)Z_LVAL_P(zend_hash_index_find(head->data.d_list.ht, 1)), + (int)Z_LVAL_P(zend_hash_index_find(head->data.d_list.ht, 2))); break; case AST_NEGATION: printf("\n"); diff --git a/src/jsonpath/parser.h b/src/jsonpath/parser.h index c24e1a3..74ff379 100644 --- a/src/jsonpath/parser.h +++ b/src/jsonpath/parser.h @@ -8,7 +8,6 @@ #include "lexer.h" -#define FILTER_ARR_LEN 8 #define NODE_POOL_LEN 64 typedef enum { @@ -57,13 +56,8 @@ union ast_node_data { struct ast_node* head; } d_expression; struct { - int count; - int indexes[FILTER_ARR_LEN]; - } d_list; - struct { - int count; HashTable* ht; - } d_nodes; + } d_list; struct { bool value_bool; zend_string* str; diff --git a/tests/bounds_checks/005.phpt b/tests/bounds_checks/005.phpt deleted file mode 100644 index a20c17b..0000000 --- a/tests/bounds_checks/005.phpt +++ /dev/null @@ -1,22 +0,0 @@ ---TEST-- -Test index filter with more than FILTER_ARR_LEN elements ---SKIPIF-- - ---FILE-- - "value", - "another" => "entry", -]; - -$jsonPath = new JsonPath(); -$result = $jsonPath->find($data, "$[0,1,2,3,4,5,6,7,8]"); - -var_dump($result); -?> ---EXPECTF-- -Fatal error: Uncaught RuntimeException: Index filter may contain no more than 8 elements in %s -Stack trace: -%s -%s -%s \ No newline at end of file