diff --git a/src/networking.c b/src/networking.c index bb7bab02c..a0d0a5f21 100644 --- a/src/networking.c +++ b/src/networking.c @@ -545,19 +545,26 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { server.stat_total_error_replies++; /* Increment the error stats * If the string already starts with "-..." then the error prefix - * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */ - if (s[0] != '-') { - incrementErrorCount("ERR", 3); - } else { + * is provided by the caller (we limit the search to 32 chars). Otherwise we use "-ERR". */ + char *err_prefix = "ERR"; + size_t prefix_len = 3; + if (s[0] == '-') { char *spaceloc = memchr(s, ' ', len < 32 ? len : 32); + /* If we cannot retrieve the error prefix, use the default: "ERR". */ if (spaceloc) { const size_t errEndPos = (size_t)(spaceloc - s); - incrementErrorCount(s + 1, errEndPos - 1); - } else { - /* Fallback to ERR if we can't retrieve the error prefix */ - incrementErrorCount("ERR", 3); + err_prefix = (char *)s + 1; + prefix_len = errEndPos - 1; } } + /* After the errors RAX reaches its limit, instead of tracking + * custom errors (e.g. LUA), we track the error under `errorstat_ERRORSTATS_OVERFLOW` */ + if (flags & ERR_REPLY_FLAG_CUSTOM && raxSize(server.errors) >= ERRORSTATS_LIMIT && + !raxFind(server.errors, (unsigned char *)err_prefix, prefix_len, NULL)) { + err_prefix = ERRORSTATS_OVERFLOW_ERR; + prefix_len = strlen(ERRORSTATS_OVERFLOW_ERR); + } + incrementErrorCount(err_prefix, prefix_len); } else { /* stat_total_error_replies will not be updated, which means that * the cmd stats will not be updated as well, we still want this command diff --git a/src/script_lua.c b/src/script_lua.c index a13c4cbf1..19f712fa3 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -617,8 +617,9 @@ static void luaReplyToRedisReply(client *c, client *script_client, lua_State *lu 1); /* pop the error message, we will use luaExtractErrorInformation to get error information */ errorInfo err_info = {0}; luaExtractErrorInformation(lua, &err_info); - addReplyErrorFormatEx(c, err_info.ignore_err_stats_update ? ERR_REPLY_FLAG_NO_STATS_UPDATE : 0, "-%s", - err_info.msg); + addReplyErrorFormatEx( + c, ERR_REPLY_FLAG_CUSTOM | (err_info.ignore_err_stats_update ? ERR_REPLY_FLAG_NO_STATS_UPDATE : 0), + "-%s", err_info.msg); luaErrorInformationDiscard(&err_info); lua_pop(lua, 1); /* pop the result table */ return; diff --git a/src/server.c b/src/server.c index 228307e3c..df16b09e3 100644 --- a/src/server.c +++ b/src/server.c @@ -2531,7 +2531,6 @@ void initServer(void) { server.main_thread_id = pthread_self(); server.current_client = NULL; server.errors = raxNew(); - server.errors_enabled = 1; server.execution_nesting = 0; server.clients = listCreate(); server.clients_index = raxNew(); @@ -3011,7 +3010,6 @@ void resetCommandTableStats(dict *commands) { void resetErrorTableStats(void) { freeErrorsRadixTreeAsync(server.errors); server.errors = raxNew(); - server.errors_enabled = 1; } /* ========================== OP Array API ============================ */ @@ -4048,48 +4046,9 @@ int processCommand(client *c) { /* ====================== Error lookup and execution ===================== */ -/* Users who abuse lua error_reply will generate a new error object on each - * error call, which can make server.errors get bigger and bigger. This will - * cause the server to block when calling INFO (we also return errorstats by - * default). To prevent the damage it can cause, when a misuse is detected, - * we will print the warning log and disable the errorstats to avoid adding - * more new errors. It can be re-enabled via CONFIG RESETSTAT. */ -#define ERROR_STATS_NUMBER 128 void incrementErrorCount(const char *fullerr, size_t namelen) { - /* errorstats is disabled, return ASAP. */ - if (!server.errors_enabled) return; - void *result; if (!raxFind(server.errors, (unsigned char *)fullerr, namelen, &result)) { - if (server.errors->numele >= ERROR_STATS_NUMBER) { - sds errors = sdsempty(); - raxIterator ri; - raxStart(&ri, server.errors); - raxSeek(&ri, "^", NULL, 0); - while (raxNext(&ri)) { - char *tmpsafe; - errors = sdscatlen(errors, getSafeInfoString((char *)ri.key, ri.key_len, &tmpsafe), ri.key_len); - errors = sdscatlen(errors, ", ", 2); - if (tmpsafe != NULL) zfree(tmpsafe); - } - sdsrange(errors, 0, -3); /* Remove final ", ". */ - raxStop(&ri); - - /* Print the warning log and the contents of server.errors to the log. */ - serverLog(LL_WARNING, "Errorstats stopped adding new errors because the number of " - "errors reached the limit, may be misuse of lua error_reply, " - "please check INFO ERRORSTATS, this can be re-enabled via " - "CONFIG RESETSTAT."); - serverLog(LL_WARNING, "Current errors code list: %s", errors); - sdsfree(errors); - - /* Reset the errors and add a single element to indicate that it is disabled. */ - resetErrorTableStats(); - incrementErrorCount("ERRORSTATS_DISABLED", 19); - server.errors_enabled = 0; - return; - } - struct serverError *error = zmalloc(sizeof(*error)); error->count = 1; raxInsert(server.errors, (unsigned char *)fullerr, namelen, error, NULL); diff --git a/src/server.h b/src/server.h index d56dfdcee..69c69d93a 100644 --- a/src/server.h +++ b/src/server.h @@ -1580,7 +1580,6 @@ struct valkeyServer { dict *orig_commands; /* Command table before command renaming. */ aeEventLoop *el; rax *errors; /* Errors table */ - int errors_enabled; /* If true, errorstats is enabled, and we will add new errors. */ unsigned int lruclock; /* Clock for LRU eviction */ volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */ mstime_t shutdown_mstime; /* Timestamp to limit graceful shutdown. */ @@ -2595,10 +2594,20 @@ int serverCommunicateSystemd(const char *sd_notify_msg); void serverSetCpuAffinity(const char *cpulist); void dictVanillaFree(dict *d, void *val); +/* ERROR STATS constants */ + +/* Once the errors RAX reaches this limit, instead of tracking custom + * errors (e.g. LUA), we track the error under the prefix below. */ +#define ERRORSTATS_LIMIT 128 +#define ERRORSTATS_OVERFLOW_ERR "ERRORSTATS_OVERFLOW" + /* afterErrorReply flags */ -#define ERR_REPLY_FLAG_NO_STATS_UPDATE \ - (1ULL << 0) /* Indicating that we should not update \ - error stats after sending error reply */ + +/* Indicating that we should not update error stats after sending error reply. */ +#define ERR_REPLY_FLAG_NO_STATS_UPDATE (1ULL << 0) +/* Indicates the error message is custom (e.g. from LUA). */ +#define ERR_REPLY_FLAG_CUSTOM (1ULL << 1) + /* networking.c -- Networking and Client related operations */ client *createClient(connection *conn); void freeClient(client *c); diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index 17dc6a186..d2ca6edc3 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -278,21 +278,34 @@ start_server {tags {"info" "external:skip"}} { r config resetstat for {set j 1} {$j <= 1100} {incr j} { assert_error "$j my error message" { - r eval {return redis.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j + r eval {return server.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j } } - - assert_equal [count_log_message 0 "Errorstats stopped adding new errors"] 1 - assert_equal [count_log_message 0 "Current errors code list"] 1 - assert_equal "count=1" [errorstat ERRORSTATS_DISABLED] - - # Since we currently have no metrics exposed for server.errors, we use lazyfree - # to verify that we only have 128 errors. - wait_for_condition 50 100 { - [s lazyfreed_objects] eq 128 - } else { - fail "errorstats resetstat lazyfree error" + # Validate that custom LUA errors are tracked in `ERRORSTATS_OVERFLOW` when errors + # has 128 entries. + assert_equal "count=972" [errorstat ERRORSTATS_OVERFLOW] + # Validate that non LUA errors continue to be tracked even when we have >=128 entries. + assert_error {ERR syntax error} {r set a b c d e f g} + assert_equal "count=1" [errorstat ERR] + # Validate that custom errors that were already tracked continue to increment when past 128 entries. + assert_equal "count=1" [errorstat 1] + assert_error "1 my error message" { + r eval {return server.error_reply(string.format('1 my error message', ARGV[1]))} 0 } + assert_equal "count=2" [errorstat 1] + # Test LUA error variants. + assert_error "My error message" {r eval {return server.error_reply('My error message')} 0} + assert_error "My error message" {r eval {return {err = 'My error message'}} 0} + assert_equal "count=974" [errorstat ERRORSTATS_OVERFLOW] + # Function calls that contain custom error messages should call be included in overflow counter + r FUNCTION LOAD replace [format "#!lua name=mylib\nserver.register_function('customerrorfn', function() return server.error_reply('My error message') end)"] + assert_error "My error message" {r fcall customerrorfn 0} + assert_equal "count=975" [errorstat ERRORSTATS_OVERFLOW] + # Function calls that contain non lua errors should continue to be tracked normally (in a separate counter). + r FUNCTION LOAD replace [format "#!lua name=mylib\nserver.register_function('invalidgetcmd', function() return server.call('get', 'x', 'x', 'x') end)"] + assert_error "ERR Wrong number of args*" {r fcall invalidgetcmd 0} + assert_equal "count=975" [errorstat ERRORSTATS_OVERFLOW] + assert_equal "count=2" [errorstat ERR] } test {stats: eventloop metrics} {