valkey-io · madolson · Jul 15, 2024 · May 15, 2024 · May 15, 2024 · May 16, 2024
diff --git a/src/networking.c b/src/networking.c
@@ -545,19 +545,26 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) {
         server.stat_total_error_replies++;
         /* Increment the error stats
          * If the string already starts with "-..." then the error prefix
-         * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */
-        if (s[0] != '-') {
-            incrementErrorCount("ERR", 3);
-        } else {
+         * is provided by the caller (we limit the search to 32 chars). Otherwise we use "-ERR". */
+        char *err_prefix = "ERR";
+        size_t prefix_len = 3;
+        if (s[0] == '-') {
             char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
+            /* If we cannot retrieve the error prefix, use the default: "ERR". */
             if (spaceloc) {
                 const size_t errEndPos = (size_t)(spaceloc - s);
-                incrementErrorCount(s + 1, errEndPos - 1);
-            } else {
-                /* Fallback to ERR if we can't retrieve the error prefix */
-                incrementErrorCount("ERR", 3);
+                err_prefix = (char *)s + 1;
+                prefix_len = errEndPos - 1;
             }
         }
+        /* After the errors RAX reaches its limit, instead of tracking
+         * custom errors (e.g. LUA), we track the error under `errorstat_ERRORSTATS_OVERFLOW` */
+        if (flags & ERR_REPLY_FLAG_CUSTOM && !raxFind(server.errors, (unsigned char *)err_prefix, prefix_len, NULL) &&
+            raxSize(server.errors) >= ERROR_STATS_LIMIT) {
+            err_prefix = ERRORSTATS_OVERFLOW_ERR;
+            prefix_len = strlen(ERRORSTATS_OVERFLOW_ERR);
+        }
+        incrementErrorCount(err_prefix, prefix_len);
     } else {
         /* stat_total_error_replies will not be updated, which means that
          * the cmd stats will not be updated as well, we still want this command

diff --git a/src/script_lua.c b/src/script_lua.c
@@ -617,8 +617,9 @@ static void luaReplyToRedisReply(client *c, client *script_client, lua_State *lu
                     1); /* pop the error message, we will use luaExtractErrorInformation to get error information */
             errorInfo err_info = {0};
             luaExtractErrorInformation(lua, &err_info);
-            addReplyErrorFormatEx(c, err_info.ignore_err_stats_update ? ERR_REPLY_FLAG_NO_STATS_UPDATE : 0, "-%s",
-                                  err_info.msg);
+            addReplyErrorFormatEx(
+                c, ERR_REPLY_FLAG_CUSTOM | (err_info.ignore_err_stats_update ? ERR_REPLY_FLAG_NO_STATS_UPDATE : 0),
+                "-%s", err_info.msg);
             luaErrorInformationDiscard(&err_info);
             lua_pop(lua, 1); /* pop the result table */
             return;

diff --git a/src/server.c b/src/server.c
@@ -2531,7 +2531,6 @@ void initServer(void) {
     server.main_thread_id = pthread_self();
     server.current_client = NULL;
     server.errors = raxNew();
-    server.errors_enabled = 1;
     server.execution_nesting = 0;
     server.clients = listCreate();
     server.clients_index = raxNew();
@@ -3011,7 +3010,6 @@ void resetCommandTableStats(dict *commands) {
 void resetErrorTableStats(void) {
     freeErrorsRadixTreeAsync(server.errors);
     server.errors = raxNew();
-    server.errors_enabled = 1;
 }
 
 /* ========================== OP Array API ============================ */
@@ -4048,48 +4046,9 @@ int processCommand(client *c) {
 
 /* ====================== Error lookup and execution ===================== */
 
-/* Users who abuse lua error_reply will generate a new error object on each
- * error call, which can make server.errors get bigger and bigger. This will
- * cause the server to block when calling INFO (we also return errorstats by
- * default). To prevent the damage it can cause, when a misuse is detected,
- * we will print the warning log and disable the errorstats to avoid adding
- * more new errors. It can be re-enabled via CONFIG RESETSTAT. */
-#define ERROR_STATS_NUMBER 128
 void incrementErrorCount(const char *fullerr, size_t namelen) {
-    /* errorstats is disabled, return ASAP. */
-    if (!server.errors_enabled) return;
-
     void *result;
     if (!raxFind(server.errors, (unsigned char *)fullerr, namelen, &result)) {
-        if (server.errors->numele >= ERROR_STATS_NUMBER) {
-            sds errors = sdsempty();
-            raxIterator ri;
-            raxStart(&ri, server.errors);
-            raxSeek(&ri, "^", NULL, 0);
-            while (raxNext(&ri)) {
-                char *tmpsafe;
-                errors = sdscatlen(errors, getSafeInfoString((char *)ri.key, ri.key_len, &tmpsafe), ri.key_len);
-                errors = sdscatlen(errors, ", ", 2);
-                if (tmpsafe != NULL) zfree(tmpsafe);
-            }
-            sdsrange(errors, 0, -3); /* Remove final ", ". */
-            raxStop(&ri);
-
-            /* Print the warning log and the contents of server.errors to the log. */
-            serverLog(LL_WARNING, "Errorstats stopped adding new errors because the number of "
-                                  "errors reached the limit, may be misuse of lua error_reply, "
-                                  "please check INFO ERRORSTATS, this can be re-enabled via "
-                                  "CONFIG RESETSTAT.");
-            serverLog(LL_WARNING, "Current errors code list: %s", errors);
-            sdsfree(errors);
-
-            /* Reset the errors and add a single element to indicate that it is disabled. */
-            resetErrorTableStats();
-            incrementErrorCount("ERRORSTATS_DISABLED", 19);
-            server.errors_enabled = 0;
-            return;
-        }
-
         struct serverError *error = zmalloc(sizeof(*error));
         error->count = 1;
         raxInsert(server.errors, (unsigned char *)fullerr, namelen, error, NULL);

diff --git a/src/server.h b/src/server.h
@@ -1580,7 +1580,6 @@ struct valkeyServer {
     dict *orig_commands; /* Command table before command renaming. */
     aeEventLoop *el;
     rax *errors;                         /* Errors table */
-    int errors_enabled;                  /* If true, errorstats is enabled, and we will add new errors. */
     unsigned int lruclock;               /* Clock for LRU eviction */
     volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */
     mstime_t shutdown_mstime;            /* Timestamp to limit graceful shutdown. */
@@ -2595,10 +2594,16 @@ int serverCommunicateSystemd(const char *sd_notify_msg);
 void serverSetCpuAffinity(const char *cpulist);
 void dictVanillaFree(dict *d, void *val);
 
+/* ERROR STATS constants */
+#define ERROR_STATS_LIMIT 128 /* Once the errors RAX reaches this limit, instead of tracking custom \
+                                    errors (e.g. LUA), we track the error under the prefix below. */
+#define ERRORSTATS_OVERFLOW_ERR "ERRORSTATS_OVERFLOW"
+
 /* afterErrorReply flags */
-#define ERR_REPLY_FLAG_NO_STATS_UPDATE                                                                                 \
-    (1ULL << 0) /* Indicating that we should not update                                                                \
-                   error stats after sending error reply */
+#define ERR_REPLY_FLAG_NO_STATS_UPDATE (1ULL<<0) /* Indicating that we should not update            \
+                                                    error stats after sending error reply */
+#define ERR_REPLY_FLAG_CUSTOM (1ULL<<1) /* Indicates the error message is custom (e.g. from LUA). */
+
 /* networking.c -- Networking and Client related operations */
 client *createClient(connection *conn);
 void freeClient(client *c);

diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
@@ -278,21 +278,34 @@ start_server {tags {"info" "external:skip"}} {
             r config resetstat
             for {set j 1} {$j <= 1100} {incr j} {
                 assert_error "$j my error message" {
-                    r eval {return redis.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
+                    r eval {return server.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
                 }
             }
-
-            assert_equal [count_log_message 0 "Errorstats stopped adding new errors"] 1
-            assert_equal [count_log_message 0 "Current errors code list"] 1
-            assert_equal "count=1" [errorstat ERRORSTATS_DISABLED]
-
-            # Since we currently have no metrics exposed for server.errors, we use lazyfree
-            # to verify that we only have 128 errors.
-            wait_for_condition 50 100 {
-                [s lazyfreed_objects] eq 128
-            } else {
-                fail "errorstats resetstat lazyfree error"
+            # Validate that custom LUA errors are tracked in `ERRORSTATS_OVERFLOW` when errors
+            # has 128 entries.
+            assert_equal "count=972" [errorstat ERRORSTATS_OVERFLOW]
+            # Validate that non LUA errors continue to be tracked even when we have >=128 entries.
+            assert_error {ERR syntax error} {r set a b c d e f g}
+            assert_equal "count=1" [errorstat ERR]
+            # Validate that custom errors that were already tracked continue to increment when past 128 entries.
+            assert_equal "count=1" [errorstat 1]
+            assert_error "1 my error message" {
+                r eval {return server.error_reply(string.format('1 my error message', ARGV[1]))} 0
             }
+            assert_equal "count=2" [errorstat 1]
+            # Test LUA error variants.
+            assert_error "My error message" {r eval {return server.error_reply('My error message')} 0}
+            assert_error "My error message" {r eval {return {err = 'My error message'}} 0}
+            assert_equal "count=974" [errorstat ERRORSTATS_OVERFLOW]
+            # Function calls that contain custom error messages should call be included in overflow counter
+            r FUNCTION LOAD replace [format "#!lua name=mylib\nserver.register_function('customerrorfn', function() return server.error_reply('My error message') end)"]
+            assert_error "My error message" {r fcall customerrorfn 0}
+            assert_equal "count=975" [errorstat ERRORSTATS_OVERFLOW]
+            # Function calls that contain non lua errors should continue to be tracked normally (in a separate counter).
+            r FUNCTION LOAD replace [format "#!lua name=mylib\nserver.register_function('invalidgetcmd', function() return server.call('get', 'x', 'x', 'x') end)"]
+            assert_error "ERR Wrong number of args*" {r fcall invalidgetcmd 0}
+            assert_equal "count=975" [errorstat ERRORSTATS_OVERFLOW]
+            assert_equal "count=2" [errorstat ERR]
         }
 
         test {stats: eventloop metrics} {