Skip to content

Commit

Permalink
Define meaning of 0 retries for jobs as no retries
Browse files Browse the repository at this point in the history
So far, we have not differentiated between 0 and -1 retries of a job. -1 was defined as an infinite number of retries. The behavior of 0 was not well defined and also handled as an infinite number of retries. This commit defines 0 as no retries.

Fixes: #6691
  • Loading branch information
jnidzwetzki committed Feb 26, 2024
1 parent e1676ce commit 3ea1417
Show file tree
Hide file tree
Showing 10 changed files with 153 additions and 36 deletions.
2 changes: 2 additions & 0 deletions .unreleased/pr_6698
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fixes: #6698 Define meaning of 0 retries for jobs as no retries
Thanks: @bvanelli for reporting an issue with the jobs retry count
4 changes: 4 additions & 0 deletions sql/updates/latest-dev.sql
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,7 @@ UPDATE _timescaledb_catalog.continuous_aggs_bucket_function
-- attributes. This is now replaced by proper NULL values. We use TRIM() to ensure we handle empty string well.
UPDATE _timescaledb_catalog.continuous_aggs_bucket_function SET bucket_origin = NULL WHERE TRIM(bucket_origin) = '';
UPDATE _timescaledb_catalog.continuous_aggs_bucket_function SET bucket_timezone = NULL WHERE TRIM(bucket_timezone) = '';

-- So far, there were no difference between 0 and -1 retries. Since now on, 0 means no retries. Updating the retry
-- count of existing jobs to -1 to keep the current semantics.
UPDATE _timescaledb_config.bgw_job SET max_retries = -1 WHERE max_retries = 0;
2 changes: 1 addition & 1 deletion src/bgw/job.c
Original file line number Diff line number Diff line change
Expand Up @@ -985,7 +985,7 @@ ts_bgw_job_check_max_retries(BgwJob *job)
job_stat = ts_bgw_job_stat_find(job->fd.id);

/* stop to execute failing jobs after reached the "max_retries" option */
if (job->fd.max_retries > 0 && job_stat->fd.consecutive_failures >= job->fd.max_retries)
if (job->fd.max_retries >= 0 && job_stat->fd.consecutive_failures >= job->fd.max_retries)
{
ereport(WARNING,
(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
Expand Down
24 changes: 12 additions & 12 deletions tsl/src/bgw_policy/compression_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,26 @@
#include <utils/builtins.h>

#include "compression_api.h"
#include "errors.h"
#include "hypertable.h"
#include "hypertable_cache.h"
#include "policy_utils.h"
#include "utils.h"
#include "guc.h"
#include "jsonb_utils.h"
#include "bgw/job.h"
#include "bgw_policy/job.h"

#include "bgw_policy/continuous_aggregate_api.h"
#include "bgw_policy/job_api.h"
#include "bgw_policy/job.h"
#include "bgw_policy/policies_v2.h"
#include "bgw/job_stat.h"
#include "bgw/job.h"
#include "bgw/timer.h"
#include "errors.h"
#include "guc.h"
#include "hypertable_cache.h"
#include "hypertable.h"
#include "jsonb_utils.h"
#include "policy_utils.h"
#include "utils.h"

/* Default max runtime is unlimited for compress chunks */
#define DEFAULT_MAX_RUNTIME \
DatumGetIntervalP(DirectFunctionCall3(interval_in, CStringGetDatum("0"), InvalidOid, -1))

/* Right now, there is an infinite number of retries for compress_chunks jobs */
#define DEFAULT_MAX_RETRIES (-1)
/* Default retry period for reorder_jobs is currently 1 hour */
#define DEFAULT_RETRY_PERIOD \
DatumGetIntervalP(DirectFunctionCall3(interval_in, CStringGetDatum("1 hour"), InvalidOid, -1))
Expand Down Expand Up @@ -350,7 +350,7 @@ policy_compression_add_internal(Oid user_rel_oid, Datum compress_after_datum,
job_id = ts_bgw_job_insert_relation(&application_name,
default_schedule_interval,
DEFAULT_MAX_RUNTIME,
DEFAULT_MAX_RETRIES,
JOB_RETRY_UNLIMITED,
DEFAULT_RETRY_PERIOD,
&proc_schema,
&proc_name,
Expand Down
19 changes: 9 additions & 10 deletions tsl/src/bgw_policy/continuous_aggregate_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,27 @@

#include <jsonb_utils.h>
#include <utils/builtins.h>

#include "bgw_policy/continuous_aggregate_api.h"
#include "bgw_policy/job_api.h"
#include "bgw_policy/job.h"
#include "bgw_policy/policies_v2.h"
#include "bgw_policy/policy_utils.h"
#include "bgw/job_stat.h"
#include "bgw/job.h"
#include "ts_catalog/continuous_agg.h"
#include "bgw/timer.h"
#include "continuous_aggs/materialize.h"
#include "dimension.h"
#include "guc.h"
#include "hypertable_cache.h"
#include "time_utils.h"
#include "policy_utils.h"
#include "guc.h"
#include "bgw_policy/policies_v2.h"
#include "bgw/job_stat.h"
#include "bgw/timer.h"
#include "time_utils.h"
#include "ts_catalog/continuous_agg.h"

/* Default max runtime for a continuous aggregate jobs is unlimited for now */
#define DEFAULT_MAX_RUNTIME \
DatumGetIntervalP(DirectFunctionCall3(interval_in, CStringGetDatum("0"), InvalidOid, -1))

/* infinite number of retries for continuous aggregate jobs */
#define DEFAULT_MAX_RETRIES (-1)

int32
policy_continuous_aggregate_get_mat_hypertable_id(const Jsonb *config)
{
Expand Down Expand Up @@ -630,7 +629,7 @@ policy_refresh_cagg_add_internal(Oid cagg_oid, Oid start_offset_type, NullableDa
job_id = ts_bgw_job_insert_relation(&application_name,
&refresh_interval,
DEFAULT_MAX_RUNTIME,
DEFAULT_MAX_RETRIES,
JOB_RETRY_UNLIMITED,
&refresh_interval,
&proc_schema,
&proc_name,
Expand Down
4 changes: 1 addition & 3 deletions tsl/src/bgw_policy/job_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
/* Default max runtime for a custom job is unlimited for now */
#define DEFAULT_MAX_RUNTIME 0

/* Right now, there is an infinite number of retries for custom jobs */
#define DEFAULT_MAX_RETRIES (-1)
/* Default retry period for reorder_jobs is currently 5 minutes */
#define DEFAULT_RETRY_PERIOD (5 * USECS_PER_MINUTE)

Expand Down Expand Up @@ -176,7 +174,7 @@ job_add(PG_FUNCTION_ARGS)
job_id = ts_bgw_job_insert_relation(&application_name,
schedule_interval,
&max_runtime,
DEFAULT_MAX_RETRIES,
JOB_RETRY_UNLIMITED,
&retry_period,
&proc_schema,
&proc_name,
Expand Down
4 changes: 4 additions & 0 deletions tsl/src/bgw_policy/job_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@

#include <postgres.h>

/* Special values for the number of retries of a failed job */
#define JOB_RETRY_UNLIMITED (-1)
#define JOB_RETRY_NONE 0

extern Datum job_add(PG_FUNCTION_ARGS);
extern Datum job_alter(PG_FUNCTION_ARGS);
extern Datum job_delete(PG_FUNCTION_ARGS);
Expand Down
15 changes: 7 additions & 8 deletions tsl/src/bgw_policy/reorder_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@
#include <hypertable_cache.h>
#include <jsonb_utils.h>

#include "bgw/job.h"
#include "bgw_policy/job_api.h"
#include "bgw_policy/job.h"
#include "bgw_policy/reorder_api.h"
#include "bgw/job_stat.h"
#include "bgw/job.h"
#include "bgw/timer.h"
#include "errors.h"
#include "guc.h"
#include "hypertable.h"
#include "reorder.h"
#include "utils.h"
#include "guc.h"
#include "bgw/job_stat.h"
#include "bgw/timer.h"

/*
* Default scheduled interval for reorder jobs should be 1/2 of the default chunk length.
* If no such length is specified for the hypertable, then
Expand All @@ -43,8 +43,7 @@
/* Default max runtime for a reorder job is unlimited for now */
#define DEFAULT_MAX_RUNTIME \
DatumGetIntervalP(DirectFunctionCall3(interval_in, CStringGetDatum("0"), InvalidOid, -1))
/* Right now, there is an infinite number of retries for reorder jobs */
#define DEFAULT_MAX_RETRIES (-1)

/* Default retry period for reorder_jobs is currently 5 minutes */
#define DEFAULT_RETRY_PERIOD \
DatumGetIntervalP(DirectFunctionCall3(interval_in, CStringGetDatum("5 min"), InvalidOid, -1))
Expand Down Expand Up @@ -272,7 +271,7 @@ policy_reorder_add(PG_FUNCTION_ARGS)
job_id = ts_bgw_job_insert_relation(&application_name,
&schedule_interval,
DEFAULT_MAX_RUNTIME,
DEFAULT_MAX_RETRIES,
JOB_RETRY_UNLIMITED,
DEFAULT_RETRY_PERIOD,
&proc_schema,
&proc_name,
Expand Down
88 changes: 88 additions & 0 deletions tsl/test/expected/bgw_db_scheduler.out
Original file line number Diff line number Diff line change
Expand Up @@ -1646,7 +1646,95 @@ DETAIL: owner of job 1026
DELETE FROM _timescaledb_config.bgw_job WHERE id = :job_id;
-- This should succeed
DROP USER renamed_user;
--
-- Test without retry
--
\c :TEST_DBNAME :ROLE_SUPERUSER
TRUNCATE bgw_log;
TRUNCATE _timescaledb_internal.bgw_job_stat;
SELECT ts_bgw_params_reset_time();
ts_bgw_params_reset_time
--------------------------

(1 row)

SELECT ts_bgw_params_mock_wait_returns_immediately(:WAIT_ON_JOB);
ts_bgw_params_mock_wait_returns_immediately
---------------------------------------------

(1 row)

DELETE FROM _timescaledb_config.bgw_job;
INSERT INTO _timescaledb_config.bgw_job(application_name, schedule_interval, max_runtime, max_retries, retry_period, proc_schema, proc_name) VALUES('bgw_test_job_2_error', INTERVAL '5000ms', INTERVAL '20ms', 0, INTERVAL '20ms', 'public', 'bgw_test_job_2_error') RETURNING id;
id
------
1027
(1 row)

\c :TEST_DBNAME :ROLE_DEFAULT_PERM_USER
-- Run the first time
SELECT ts_bgw_db_scheduler_test_run_and_wait_for_scheduler_finish(25);
ts_bgw_db_scheduler_test_run_and_wait_for_scheduler_finish
------------------------------------------------------------

(1 row)

SELECT job_id, last_run_success, total_runs, total_successes, total_failures, total_crashes FROM _timescaledb_internal.bgw_job_stat;
job_id | last_run_success | total_runs | total_successes | total_failures | total_crashes
--------+------------------+------------+-----------------+----------------+---------------
1027 | f | 1 | 0 | 1 | 0
(1 row)

SELECT * FROM sorted_bgw_log;
msg_no | application_name | msg
--------+----------------------+-----------------------------------------------------------
0 | DB Scheduler | [TESTING] Registered new background worker
1 | DB Scheduler | [TESTING] Wait until (RANDOM), started at (RANDOM)
1 | bgw_test_job_2_error | job 1027 reached max_retries after 1 consecutive failures
2 | bgw_test_job_2_error | job 1027 threw an error
3 | bgw_test_job_2_error | Error job 2
2 | DB Scheduler | [TESTING] Wait until (RANDOM), started at (RANDOM)
(6 rows)

SELECT last_finish, last_successful_finish, last_run_success FROM _timescaledb_internal.bgw_job_stat;
last_finish | last_successful_finish | last_run_success
------------------------------+------------------------+------------------
Fri Dec 31 16:00:00 1999 PST | -infinity | f
(1 row)

-- Run the second time
SELECT ts_bgw_db_scheduler_test_run_and_wait_for_scheduler_finish(100, 50);
ts_bgw_db_scheduler_test_run_and_wait_for_scheduler_finish
------------------------------------------------------------

(1 row)

SELECT job_id, last_run_success, total_runs, total_successes, total_failures, total_crashes FROM _timescaledb_internal.bgw_job_stat;
job_id | last_run_success | total_runs | total_successes | total_failures | total_crashes
--------+------------------+------------+-----------------+----------------+---------------
1027 | f | 1 | 0 | 1 | 0
(1 row)

SELECT * FROM sorted_bgw_log;
msg_no | application_name | msg
--------+----------------------+-----------------------------------------------------------
0 | DB Scheduler | [TESTING] Registered new background worker
1 | DB Scheduler | [TESTING] Wait until (RANDOM), started at (RANDOM)
1 | bgw_test_job_2_error | job 1027 reached max_retries after 1 consecutive failures
2 | bgw_test_job_2_error | job 1027 threw an error
3 | bgw_test_job_2_error | Error job 2
2 | DB Scheduler | [TESTING] Wait until (RANDOM), started at (RANDOM)
0 | DB Scheduler | [TESTING] Wait until (RANDOM), started at (RANDOM)
(7 rows)

SELECT last_finish, last_successful_finish, last_run_success FROM _timescaledb_internal.bgw_job_stat;
last_finish | last_successful_finish | last_run_success
------------------------------+------------------------+------------------
Fri Dec 31 16:00:00 1999 PST | -infinity | f
(1 row)

-- clean up jobs
\c :TEST_DBNAME :ROLE_SUPERUSER
SELECT _timescaledb_functions.stop_background_workers();
stop_background_workers
-------------------------
Expand Down
27 changes: 25 additions & 2 deletions tsl/test/sql/bgw_db_scheduler.sql
Original file line number Diff line number Diff line change
Expand Up @@ -697,8 +697,31 @@ DELETE FROM _timescaledb_config.bgw_job WHERE id = :job_id;
-- This should succeed
DROP USER renamed_user;

-- clean up jobs
SELECT _timescaledb_functions.stop_background_workers();

--
-- Test without retry
--
\c :TEST_DBNAME :ROLE_SUPERUSER
TRUNCATE bgw_log;
TRUNCATE _timescaledb_internal.bgw_job_stat;
SELECT ts_bgw_params_reset_time();
SELECT ts_bgw_params_mock_wait_returns_immediately(:WAIT_ON_JOB);
DELETE FROM _timescaledb_config.bgw_job;
INSERT INTO _timescaledb_config.bgw_job(application_name, schedule_interval, max_runtime, max_retries, retry_period, proc_schema, proc_name) VALUES('bgw_test_job_2_error', INTERVAL '5000ms', INTERVAL '20ms', 0, INTERVAL '20ms', 'public', 'bgw_test_job_2_error') RETURNING id;
\c :TEST_DBNAME :ROLE_DEFAULT_PERM_USER

-- Run the first time
SELECT ts_bgw_db_scheduler_test_run_and_wait_for_scheduler_finish(25);
SELECT job_id, last_run_success, total_runs, total_successes, total_failures, total_crashes FROM _timescaledb_internal.bgw_job_stat;
SELECT * FROM sorted_bgw_log;
SELECT last_finish, last_successful_finish, last_run_success FROM _timescaledb_internal.bgw_job_stat;

-- Run the second time
SELECT ts_bgw_db_scheduler_test_run_and_wait_for_scheduler_finish(100, 50);
SELECT job_id, last_run_success, total_runs, total_successes, total_failures, total_crashes FROM _timescaledb_internal.bgw_job_stat;
SELECT * FROM sorted_bgw_log;
SELECT last_finish, last_successful_finish, last_run_success FROM _timescaledb_internal.bgw_job_stat;

-- clean up jobs
\c :TEST_DBNAME :ROLE_SUPERUSER
SELECT _timescaledb_functions.stop_background_workers();

0 comments on commit 3ea1417

Please sign in to comment.