Skip to content

Commit

Permalink
Add support for limiting custom LUA errors when over 128 error messag…
Browse files Browse the repository at this point in the history
…es while allowing non LUA errors to function as usual

Signed-off-by: Karthik Subbarao <karthikrs2021@gmail.com>
  • Loading branch information
KarthikSubbarao committed May 15, 2024
1 parent 6e4a610 commit 802570b
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 67 deletions.
28 changes: 17 additions & 11 deletions src/networking.c
Original file line number Diff line number Diff line change
Expand Up @@ -513,19 +513,25 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) {
if (!(flags & ERR_REPLY_FLAG_NO_STATS_UPDATE)) {
/* Increment the global error counter */
server.stat_total_error_replies++;
/* Increment the error stats
* If the string already starts with "-..." then the error prefix
* is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */
if (s[0] != '-') {
incrementErrorCount("ERR", 3);
/* After the errors RAX reaches this limit, instead of tracking
* custom LUA errors, we track the error under `error_LUA` */
if (flags & ERR_REPLY_FLAG_LUA && raxSize(server.errors) >= ERROR_STATS_LUA_LIMIT) {
incrementErrorCount("LUA_ERRORSTATS_DISABLED", 23);
} else {
char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
if (spaceloc) {
const size_t errEndPos = (size_t)(spaceloc - s);
incrementErrorCount(s+1, errEndPos-1);
} else {
/* Fallback to ERR if we can't retrieve the error prefix */
/* Increment the error stats
* If the string already starts with "-..." then the error prefix
* is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */
if (s[0] != '-') {
incrementErrorCount("ERR", 3);
} else {
char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
if (spaceloc) {
const size_t errEndPos = (size_t)(spaceloc - s);
incrementErrorCount(s+1, errEndPos-1);
} else {
/* Fallback to ERR if we can't retrieve the error prefix */
incrementErrorCount("ERR", 3);
}
}
}
} else {
Expand Down
2 changes: 1 addition & 1 deletion src/script_lua.c
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,7 @@ static void luaReplyToRedisReply(client *c, client* script_client, lua_State *lu
errorInfo err_info = {0};
luaExtractErrorInformation(lua, &err_info);
addReplyErrorFormatEx(c,
err_info.ignore_err_stats_update? ERR_REPLY_FLAG_NO_STATS_UPDATE: 0,
err_info.ignore_err_stats_update? ERR_REPLY_FLAG_NO_STATS_UPDATE | ERR_REPLY_FLAG_LUA: ERR_REPLY_FLAG_LUA,
"-%s",
err_info.msg);
luaErrorInformationDiscard(&err_info);
Expand Down
42 changes: 0 additions & 42 deletions src/server.c
Original file line number Diff line number Diff line change
Expand Up @@ -2629,7 +2629,6 @@ void initServer(void) {
server.main_thread_id = pthread_self();
server.current_client = NULL;
server.errors = raxNew();
server.errors_enabled = 1;
server.execution_nesting = 0;
server.clients = listCreate();
server.clients_index = raxNew();
Expand Down Expand Up @@ -3136,7 +3135,6 @@ void resetCommandTableStats(dict* commands) {
void resetErrorTableStats(void) {
freeErrorsRadixTreeAsync(server.errors);
server.errors = raxNew();
server.errors_enabled = 1;
}

/* ========================== OP Array API ============================ */
Expand Down Expand Up @@ -4233,49 +4231,9 @@ int processCommand(client *c) {

/* ====================== Error lookup and execution ===================== */

/* Users who abuse lua error_reply will generate a new error object on each
* error call, which can make server.errors get bigger and bigger. This will
* cause the server to block when calling INFO (we also return errorstats by
* default). To prevent the damage it can cause, when a misuse is detected,
* we will print the warning log and disable the errorstats to avoid adding
* more new errors. It can be re-enabled via CONFIG RESETSTAT. */
#define ERROR_STATS_NUMBER 128
void incrementErrorCount(const char *fullerr, size_t namelen) {
/* errorstats is disabled, return ASAP. */
if (!server.errors_enabled) return;

void *result;
if (!raxFind(server.errors,(unsigned char*)fullerr,namelen,&result)) {
if (server.errors->numele >= ERROR_STATS_NUMBER) {
sds errors = sdsempty();
raxIterator ri;
raxStart(&ri, server.errors);
raxSeek(&ri, "^", NULL, 0);
while (raxNext(&ri)) {
char *tmpsafe;
errors = sdscatlen(errors, getSafeInfoString((char *)ri.key, ri.key_len, &tmpsafe), ri.key_len);
errors = sdscatlen(errors, ", ", 2);
if (tmpsafe != NULL) zfree(tmpsafe);
}
sdsrange(errors, 0, -3); /* Remove final ", ". */
raxStop(&ri);

/* Print the warning log and the contents of server.errors to the log. */
serverLog(LL_WARNING,
"Errorstats stopped adding new errors because the number of "
"errors reached the limit, may be misuse of lua error_reply, "
"please check INFO ERRORSTATS, this can be re-enabled via "
"CONFIG RESETSTAT.");
serverLog(LL_WARNING, "Current errors code list: %s", errors);
sdsfree(errors);

/* Reset the errors and add a single element to indicate that it is disabled. */
resetErrorTableStats();
incrementErrorCount("ERRORSTATS_DISABLED", 19);
server.errors_enabled = 0;
return;
}

struct serverError *error = zmalloc(sizeof(*error));
error->count = 1;
raxInsert(server.errors,(unsigned char*)fullerr,namelen,error,NULL);
Expand Down
7 changes: 6 additions & 1 deletion src/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -1568,7 +1568,6 @@ struct valkeyServer {
dict *orig_commands; /* Command table before command renaming. */
aeEventLoop *el;
rax *errors; /* Errors table */
int errors_enabled; /* If true, errorstats is enabled, and we will add new errors. */
unsigned int lruclock; /* Clock for LRU eviction */
volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */
mstime_t shutdown_mstime; /* Timestamp to limit graceful shutdown. */
Expand Down Expand Up @@ -2563,9 +2562,15 @@ int validateProcTitleTemplate(const char *template);
int serverCommunicateSystemd(const char *sd_notify_msg);
void serverSetCpuAffinity(const char *cpulist);

/* ERROR STATS constants */
#define ERROR_STATS_LUA_LIMIT 128 /* After the errors RAX reaches this limit, instead of tracking
custom LUA errors, we track the error under `error_LUA`. */

/* afterErrorReply flags */
#define ERR_REPLY_FLAG_NO_STATS_UPDATE (1ULL<<0) /* Indicating that we should not update
error stats after sending error reply */
#define ERR_REPLY_FLAG_LUA (1ULL<<1) /* Indicating that we are coming in from ScriptCall */

/* networking.c -- Networking and Client related operations */
client *createClient(connection *conn);
void freeClient(client *c);
Expand Down
18 changes: 6 additions & 12 deletions tests/unit/info.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -281,18 +281,12 @@ start_server {tags {"info" "external:skip"}} {
r eval {return redis.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
}
}

assert_equal [count_log_message 0 "Errorstats stopped adding new errors"] 1
assert_equal [count_log_message 0 "Current errors code list"] 1
assert_equal "count=1" [errorstat ERRORSTATS_DISABLED]

# Since we currently have no metrics exposed for server.errors, we use lazyfree
# to verify that we only have 128 errors.
wait_for_condition 50 100 {
[s lazyfreed_objects] eq 128
} else {
fail "errorstats resetstat lazyfree error"
}
# Validate that custom LUA errors are tracked in `LUA_ERRORSTATS_DISABLED` when errors
# has 128 entries.
assert_equal "count=972" [errorstat LUA_ERRORSTATS_DISABLED]
# Validate that non LUA errors continue to be tracked even when we have >=128 entries.
assert_error {ERR syntax error} {r set a b c d e f g}
assert_equal "count=1" [errorstat ERR]
}

test {stats: eventloop metrics} {
Expand Down

0 comments on commit 802570b

Please sign in to comment.