diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b36462b..5b58f17 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -270,6 +270,21 @@ jobs: echo "============================================" [ $FAIL -eq 0 ] || exit 1 + - name: "Test: Snapshot-keeper resilience (issue #9)" + env: + PGHOST: localhost + PGPORT: 5434 + SOURCE_HOST: localhost + SOURCE_PORT: 5433 + SOURCE_DB: source_db + SOURCE_USER: postgres + SOURCE_PW: testpass + run: | + # The test script defaults its target connection to + # `psql -U postgres -d target_db`, which matches the CI + # target DB created on port 5434. + bash test/test_snapshot_keeper.sh + - name: Show logs on failure if: failure() run: | diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 1e865a3..6073e0a 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -116,6 +116,16 @@ Failure handling: if any pool worker fails to import, it sets `pool.snapshot_fai Opt-out: `'{"consistent": false}'` in any options-JSON skips all of the above and runs with v4.2.x semantics. The keeper connection is then closed immediately after the table-list read like before, and the pool coordinator is not launched. +### Snapshot-keeper resilience (v4.3.1, issue #9) + +The keeper transaction sits idle-in-transaction for the bulk of a long clone. Three independent paths can silently terminate it and invalidate the exported snapshot: + +1. **Firewall / NAT idle TCP drop** on remote source connections. pgclone injects `keepalives=1 keepalives_idle=30 keepalives_interval=10 keepalives_count=6` into every source conninfo (unless the user already set them), giving ~90s drop detection and keeping perimeter NAT entries warm. Done at the libpq layer via `PQconninfoParse` + `PQconnectdbParams`, so both URI (`postgresql://...`) and keyword-form conninfo strings are handled. +2. **`idle_in_transaction_session_timeout`** on the source server. The keeper's BEGIN issues `SET LOCAL idle_in_transaction_session_timeout = 0` and `SET LOCAL statement_timeout = 0`; both GUCs are `PGC_USERSET` (no privilege required) and `SET LOCAL` reverts at COMMIT, so the settings never leak into pooled connections. +3. **Silent post-drop continuation.** Between major sub-phases of `pgclone.schema()` (per-table loop, FK retry, views, functions, triggers) and `pgclone.database()` (per-schema loop) pgclone runs a cheap `SELECT 1` ping on the keeper. A failure produces a clear error naming the keeper rather than the misleading `invalid snapshot identifier` that PostgreSQL emits when the exported-snapshot file has been reaped. + +Operators who explicitly want to bypass snapshot sharing — for example, on an unreliable link where one of the above is guaranteed to fire — can pass `'{"consistent": false}'` in the options JSON. Each importer then begins its own REPEATABLE READ transaction independently. Cross-table referential consistency is no longer guaranteed, but every per-table copy is still internally consistent. + ### COPY Protocol Data Transfer Data is transferred using PostgreSQL's COPY protocol, which is significantly faster than row-by-row INSERT: diff --git a/pgclone.control b/pgclone.control index ba826bb..48ace0e 100644 --- a/pgclone.control +++ b/pgclone.control @@ -1,6 +1,6 @@ # pgclone extension comment = 'Clone PostgreSQL databases, schemas, tables, roles and permissions with selective columns, data filtering, data masking/anonymization, async, parallel cloning, materialized views, resume, and conflict resolution' -default_version = '4.3.0' +default_version = '4.3.1' module_pathname = '$libdir/pgclone' relocatable = false superuser = true diff --git a/sql/pgclone--4.3.0--4.3.1.sql b/sql/pgclone--4.3.0--4.3.1.sql new file mode 100644 index 0000000..be0389e --- /dev/null +++ b/sql/pgclone--4.3.0--4.3.1.sql @@ -0,0 +1,17 @@ +/* pgclone--4.3.0--4.3.1.sql */ +\echo Use "ALTER EXTENSION pgclone UPDATE" to load this file. \quit + +-- v4.3.1: Snapshot-keeper resilience (issue #9) +-- +-- No SQL signature changes. The fix is entirely in the C module: +-- * TCP keepalives injected into every source connection so the +-- idle keeper survives firewall/NAT idle timeouts. +-- * SET LOCAL idle_in_transaction_session_timeout = 0 and +-- statement_timeout = 0 inside the keeper transaction so a +-- non-zero source-side setting can't kill it. +-- * Specific errhint when SET TRANSACTION SNAPSHOT returns +-- "invalid snapshot identifier" pointing at the most common +-- cause and the {"consistent": false} opt-out. +-- * Keeper liveness ping between sub-phases of pgclone.schema() +-- and pgclone.database() so a silent connection drop produces +-- a clear error before — not during — the next importer. diff --git a/sql/pgclone--4.3.1.sql b/sql/pgclone--4.3.1.sql new file mode 100644 index 0000000..4a6f005 --- /dev/null +++ b/sql/pgclone--4.3.1.sql @@ -0,0 +1,172 @@ +/* pgclone--4.3.0.sql */ +\echo Use "CREATE EXTENSION pgclone" to load this file. \quit + +-- v4.3.0: Consistent-snapshot clones (REPEATABLE READ READ ONLY) +-- No new SQL surface — the behaviour is enabled by default for every +-- table/schema/database clone (sync and async, including parallel +-- pool mode) and can be disabled per-call with the options JSON: +-- SELECT pgclone.schema('...', 'public', true, '{"consistent": false}'); +-- See CHANGELOG.md / docs/USAGE.md for details and tradeoffs. + +-- v4.0.0: All functions now live under the 'pgclone' schema. +-- Usage: SELECT pgclone.table(...), pgclone.schema(...), etc. +CREATE SCHEMA IF NOT EXISTS pgclone; + +-- SYNCHRONOUS +CREATE FUNCTION pgclone.table(source_conninfo TEXT, schema_name TEXT, table_name TEXT, include_data BOOLEAN DEFAULT true) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_table' LANGUAGE C VOLATILE STRICT; +CREATE FUNCTION pgclone.table(source_conninfo TEXT, schema_name TEXT, table_name TEXT, include_data BOOLEAN, target_table_name TEXT) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_table' LANGUAGE C VOLATILE; +CREATE FUNCTION pgclone.table(source_conninfo TEXT, schema_name TEXT, table_name TEXT, include_data BOOLEAN, target_table_name TEXT, options TEXT) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_table' LANGUAGE C VOLATILE; +COMMENT ON FUNCTION pgclone.table(TEXT, TEXT, TEXT, BOOLEAN, TEXT, TEXT) IS 'Clone table with JSON options: {"columns":["col1","col2"], "where":"status=''active''", "indexes":false, "constraints":false, "triggers":false, "mask":{"email":"email","name":"name","phone":"phone","col":{"type":"partial","prefix":2,"suffix":3},"col2":"hash","col3":"null","col4":{"type":"random_int","min":0,"max":100},"col5":{"type":"constant","value":"REDACTED"}}}'; +CREATE FUNCTION pgclone.table_ex(source_conninfo TEXT, schema_name TEXT, table_name TEXT, include_data BOOLEAN, target_table_name TEXT, include_indexes BOOLEAN DEFAULT true, include_constraints BOOLEAN DEFAULT true, include_triggers BOOLEAN DEFAULT true) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_table_ex' LANGUAGE C VOLATILE; +CREATE FUNCTION pgclone.schema(source_conninfo TEXT, schema_name TEXT, include_data BOOLEAN DEFAULT true) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_schema' LANGUAGE C VOLATILE STRICT; +CREATE FUNCTION pgclone.schema(source_conninfo TEXT, schema_name TEXT, include_data BOOLEAN, options TEXT) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_schema' LANGUAGE C VOLATILE; +CREATE FUNCTION pgclone.schema_ex(source_conninfo TEXT, schema_name TEXT, include_data BOOLEAN, include_indexes BOOLEAN DEFAULT true, include_constraints BOOLEAN DEFAULT true, include_triggers BOOLEAN DEFAULT true) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_schema_ex' LANGUAGE C VOLATILE; +CREATE FUNCTION pgclone.functions(source_conninfo TEXT, schema_name TEXT) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_functions' LANGUAGE C VOLATILE STRICT; +CREATE FUNCTION pgclone.database(source_conninfo TEXT, include_data BOOLEAN DEFAULT true) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_database' LANGUAGE C VOLATILE STRICT; +CREATE FUNCTION pgclone.database(source_conninfo TEXT, include_data BOOLEAN, options TEXT) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_database' LANGUAGE C VOLATILE; + +-- v2.0.1: Create target database and clone into it +CREATE FUNCTION pgclone.database_create(source_conninfo TEXT, target_dbname TEXT, include_data BOOLEAN DEFAULT true) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_database_create' LANGUAGE C VOLATILE; +CREATE FUNCTION pgclone.database_create(source_conninfo TEXT, target_dbname TEXT, include_data BOOLEAN, options TEXT) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_database_create' LANGUAGE C VOLATILE; +COMMENT ON FUNCTION pgclone.database_create(TEXT, TEXT, BOOLEAN) IS 'Create target database if not exists, then clone all schemas/tables/functions from source. Run from postgres DB.'; + +-- ASYNC (require shared_preload_libraries = 'pgclone') +CREATE FUNCTION pgclone.table_async(source_conninfo TEXT, schema_name TEXT, table_name TEXT, include_data BOOLEAN DEFAULT true, target_table_name TEXT DEFAULT NULL, options TEXT DEFAULT NULL) RETURNS INTEGER AS 'MODULE_PATHNAME', 'pgclone_table_async' LANGUAGE C VOLATILE; +CREATE FUNCTION pgclone.schema_async(source_conninfo TEXT, schema_name TEXT, include_data BOOLEAN DEFAULT true, options TEXT DEFAULT NULL) RETURNS INTEGER AS 'MODULE_PATHNAME', 'pgclone_schema_async' LANGUAGE C VOLATILE; + +-- PROGRESS & JOB MANAGEMENT +CREATE FUNCTION pgclone.progress(job_id INTEGER) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_progress' LANGUAGE C VOLATILE STRICT; +CREATE FUNCTION pgclone.cancel(job_id INTEGER) RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_cancel' LANGUAGE C VOLATILE STRICT; +CREATE FUNCTION pgclone.resume(job_id INTEGER) RETURNS INTEGER AS 'MODULE_PATHNAME', 'pgclone_resume' LANGUAGE C VOLATILE STRICT; +CREATE FUNCTION pgclone.jobs() RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_jobs' LANGUAGE C VOLATILE STRICT; +CREATE FUNCTION pgclone.clear_jobs() RETURNS INTEGER AS 'MODULE_PATHNAME', 'pgclone_clear_jobs' LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.clear_jobs() IS 'Clear completed/failed/cancelled job slots from shared memory'; + +-- v2.1.0+v2.1.1+v2.1.2: Progress Tracking View with progress bar, elapsed time, ETA +CREATE FUNCTION pgclone.progress_detail() +RETURNS TABLE ( + job_id INTEGER, + status TEXT, + op_type TEXT, + schema_name TEXT, + table_name TEXT, + current_phase TEXT, + current_table TEXT, + tables_total BIGINT, + tables_completed BIGINT, + rows_copied BIGINT, + bytes_copied BIGINT, + elapsed_ms BIGINT, + start_time TIMESTAMPTZ, + end_time TIMESTAMPTZ, + error_message TEXT, + pct_complete DOUBLE PRECISION, + progress_bar TEXT, + elapsed_time TEXT +) AS 'MODULE_PATHNAME', 'pgclone_progress_view' +LANGUAGE C VOLATILE STRICT; + +COMMENT ON FUNCTION pgclone.progress_detail() IS 'Returns tabular progress with visual progress bar and elapsed time for all clone jobs'; + +-- VIEW: convenient wrapper +CREATE VIEW pgclone.jobs_view AS + SELECT * FROM pgclone.progress_detail(); + +COMMENT ON VIEW pgclone.jobs_view IS 'Live progress tracking view with progress bar and elapsed time for all pgclone async clone jobs'; + +-- v3.1.0: Auto-discovery of sensitive data +CREATE FUNCTION pgclone.discover_sensitive(source_conninfo TEXT, schema_name TEXT) +RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_discover_sensitive' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.discover_sensitive(TEXT, TEXT) IS 'Scan source schema for columns matching sensitive data patterns (email, name, phone, ssn, salary, etc.) and return suggested mask rules as JSON'; + +-- v3.2.0: Static data masking on local tables +CREATE FUNCTION pgclone.mask_in_place(schema_name TEXT, table_name TEXT, mask_json TEXT) +RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_mask_in_place' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.mask_in_place(TEXT, TEXT, TEXT) IS 'Apply data masking to an existing local table via UPDATE. mask_json uses same format as clone mask option: {"email": "email", "name": "name", "ssn": "null"}'; + +-- v3.3.0: Dynamic data masking via views and role-based access +CREATE FUNCTION pgclone.create_masking_policy(schema_name TEXT, table_name TEXT, mask_json TEXT, privileged_role TEXT) +RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_create_masking_policy' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.create_masking_policy(TEXT, TEXT, TEXT, TEXT) IS 'Create a dynamic masking policy: creates a masked view, revokes base table access from PUBLIC, grants view to PUBLIC, grants base table to privileged role'; + +CREATE FUNCTION pgclone.drop_masking_policy(schema_name TEXT, table_name TEXT) +RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_drop_masking_policy' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.drop_masking_policy(TEXT, TEXT) IS 'Remove a dynamic masking policy: drops the masked view and restores base table access to PUBLIC'; + +-- v3.4.0: Clone roles with permissions and passwords +CREATE FUNCTION pgclone.clone_roles(source_conninfo TEXT) +RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_clone_roles' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.clone_roles(TEXT) IS 'Clone all non-system roles from source with encrypted passwords, attributes, memberships, and all permissions. Requires superuser on both source and target.'; + +CREATE FUNCTION pgclone.clone_roles(source_conninfo TEXT, role_names TEXT) +RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_clone_roles' +LANGUAGE C VOLATILE; +COMMENT ON FUNCTION pgclone.clone_roles(TEXT, TEXT) IS 'Clone specific roles (comma-separated) from source with encrypted passwords, attributes, memberships, and permissions. If role exists on target, syncs password and attributes without dropping.'; + +-- v3.5.0: Clone verification — compare row counts +CREATE FUNCTION pgclone.verify(source_conninfo TEXT, schema_name TEXT) +RETURNS TABLE ( + schema_name TEXT, + table_name TEXT, + source_rows BIGINT, + target_rows BIGINT, + match TEXT +) AS 'MODULE_PATHNAME', 'pgclone_verify' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.verify(TEXT, TEXT) IS 'Compare row counts between source and local target for all tables in a schema. Returns side-by-side comparison with match status.'; + +CREATE FUNCTION pgclone.verify(source_conninfo TEXT) +RETURNS TABLE ( + schema_name TEXT, + table_name TEXT, + source_rows BIGINT, + target_rows BIGINT, + match TEXT +) AS 'MODULE_PATHNAME', 'pgclone_verify' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.verify(TEXT) IS 'Compare row counts between source and local target for all user tables across all schemas. Returns side-by-side comparison with match status.'; + +-- v3.6.0: GDPR/Compliance masking report +CREATE FUNCTION pgclone.masking_report(schema_name TEXT) +RETURNS TABLE ( + schema_name TEXT, + table_name TEXT, + column_name TEXT, + sensitivity TEXT, + mask_status TEXT, + recommendation TEXT +) AS 'MODULE_PATHNAME', 'pgclone_masking_report' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.masking_report(TEXT) IS 'Generate GDPR/compliance audit report: lists sensitive columns, their masking status, and recommendations. Checks for masked views.'; + +-- v4.1.0: Schema diff — DDL drift detection between source and local target +CREATE FUNCTION pgclone.diff(source_conninfo TEXT, schema_name TEXT) +RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_diff' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.diff(TEXT, TEXT) IS + 'Compare DDL of a schema between source and the local target. ' + 'Returns JSON drift report listing objects only_in_source / only_in_target / modified ' + 'across tables (with per-column type/nullability/default drift), indexes, ' + 'constraints, triggers, views, and sequences. Read-only on both sides.'; + +-- v4.2.0: Pre-flight validator — connection, permissions, version, capacity, +-- name-conflict, role and tablespace checks before a clone. +CREATE FUNCTION pgclone.preflight(source_conninfo TEXT, schema_name TEXT) +RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_preflight' +LANGUAGE C VOLATILE STRICT; +COMMENT ON FUNCTION pgclone.preflight(TEXT, TEXT) IS + 'Validate that a clone of the given schema from source into the local ' + 'target is likely to succeed. Returns JSON with errors / warnings / info ' + 'arrays plus a per-check breakdown covering: source/target connection, ' + 'PostgreSQL versions, schema existence, USAGE/SELECT/CREATE permissions, ' + 'estimated source size, target database size, object counts, name ' + 'conflicts on the target schema, missing extensions, missing roles, and ' + 'missing non-default tablespaces. Read-only on both sides.'; + +-- VERSION +CREATE FUNCTION pgclone.version() RETURNS TEXT AS 'MODULE_PATHNAME', 'pgclone_version' LANGUAGE C IMMUTABLE STRICT; diff --git a/src/pgclone.c b/src/pgclone.c index 3be5dbb..d3f8329 100644 --- a/src/pgclone.c +++ b/src/pgclone.c @@ -679,14 +679,122 @@ pgclone_normalize_session(PGconn *conn) } /* --------------------------------------------------------------- - * Internal helper: connect to a remote PostgreSQL host + * Internal helper: open a libpq connection with TCP keepalives + * forced on, unless the caller already set them. + * + * Long-running pgclone operations leave the "snapshot keeper" + * connection idle-in-transaction for the bulk of the clone + * (often hours). Without TCP keepalives, perimeter firewalls and + * NAT gateways silently drop the idle TCP session, the exporting + * transaction dies on the server, and the snapshot file is + * removed — causing every subsequent SET TRANSACTION SNAPSHOT + * importer to fail with the misleading message + * "invalid snapshot identifier" (issue #9). + * + * Defaults injected (only when not already present): + * keepalives=1 — enable + * keepalives_idle=30 — seconds idle before first probe + * keepalives_interval=10 — seconds between probes + * keepalives_count=6 — probes before declaring dead + * + * We parse the user's conninfo with PQconninfoParse (so URI and + * keyword forms both work), copy each set option into a new + * keyword/value array, append our defaults for any keepalive + * keyword the user did NOT specify, and connect via + * PQconnectdbParams. This preserves any explicit keepalive choice + * the user made (including keepalives=0 to disable). + * --------------------------------------------------------------- */ +static PGconn * +pgclone_connect_with_keepalives(const char *conninfo) +{ + PQconninfoOption *parsed; + PQconninfoOption *opt; + char *parse_err = NULL; + const char **keywords; + const char **values; + int nopts = 0; + int i; + bool have_keepalives = false; + bool have_keepalives_idle = false; + bool have_keepalives_interval = false; + bool have_keepalives_count = false; + PGconn *conn; + + parsed = PQconninfoParse(conninfo, &parse_err); + if (parsed == NULL) + { + char *err_copy = parse_err ? pstrdup(parse_err) : pstrdup("(unknown)"); + if (parse_err) + PQfreemem(parse_err); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("pgclone: could not parse conninfo: %s", err_copy))); + } + + /* Count set options and detect existing keepalive settings. */ + for (opt = parsed; opt->keyword != NULL; opt++) + { + if (opt->val != NULL && opt->val[0] != '\0') + { + nopts++; + if (strcmp(opt->keyword, "keepalives") == 0) + have_keepalives = true; + else if (strcmp(opt->keyword, "keepalives_idle") == 0) + have_keepalives_idle = true; + else if (strcmp(opt->keyword, "keepalives_interval") == 0) + have_keepalives_interval = true; + else if (strcmp(opt->keyword, "keepalives_count") == 0) + have_keepalives_count = true; + } + } + + /* +4 slots for injected defaults, +1 for the NULL terminator. */ + keywords = (const char **) palloc0(sizeof(char *) * (nopts + 5)); + values = (const char **) palloc0(sizeof(char *) * (nopts + 5)); + + i = 0; + for (opt = parsed; opt->keyword != NULL; opt++) + { + if (opt->val != NULL && opt->val[0] != '\0') + { + keywords[i] = pstrdup(opt->keyword); + values[i] = pstrdup(opt->val); + i++; + } + } + + if (!have_keepalives) { keywords[i] = "keepalives"; values[i++] = "1"; } + if (!have_keepalives_idle) { keywords[i] = "keepalives_idle"; values[i++] = "30"; } + if (!have_keepalives_interval) { keywords[i] = "keepalives_interval"; values[i++] = "10"; } + if (!have_keepalives_count) { keywords[i] = "keepalives_count"; values[i++] = "6"; } + + keywords[i] = NULL; + values[i] = NULL; + + PQconninfoFree(parsed); + + /* expand_dbname=0 — we have already parsed and expanded. */ + conn = PQconnectdbParams(keywords, values, 0); + + pfree(keywords); + pfree(values); + + return conn; +} + +/* --------------------------------------------------------------- + * Internal helper: connect to a remote PostgreSQL host. + * + * v4.3.1: routed through pgclone_connect_with_keepalives() so the + * snapshot keeper (and every other source connection) survives + * idle firewall/NAT timeouts (issue #9). * --------------------------------------------------------------- */ static PGconn * pgclone_connect(const char *conninfo) { PGconn *conn; - conn = PQconnectdb(conninfo); + conn = pgclone_connect_with_keepalives(conninfo); if (PQstatus(conn) != CONNECTION_OK) { @@ -722,7 +830,18 @@ pgclone_connect(const char *conninfo) /* Open BEGIN ISOLATION LEVEL REPEATABLE READ READ ONLY on the source * connection. No-op if a transaction is already open on this conn - * (e.g. caller already imported a snapshot). */ + * (e.g. caller already imported a snapshot). + * + * v4.3.1: also disables idle_in_transaction_session_timeout and + * statement_timeout for the lifetime of this transaction via + * SET LOCAL. The snapshot keeper sits idle-in-transaction for the + * bulk of a long clone; if the source has a non-zero + * idle_in_transaction_session_timeout configured (a common + * production safeguard) the keeper would be killed and the + * exported snapshot reaped, breaking every subsequent importer + * (issue #9). SET LOCAL scopes to the transaction, so the values + * revert automatically at COMMIT and never leak into pooled + * connections. Both GUCs are PGC_USERSET — no privilege required. */ static void pgclone_begin_repeatable_read(PGconn *conn) { @@ -742,6 +861,17 @@ pgclone_begin_repeatable_read(PGconn *conn) errmsg_copy))); } PQclear(res); + + /* Defeat server-side timeouts for the keeper's idle window. + * Failures here are non-fatal — TCP keepalives still protect + * us against the firewall path. */ + res = PQexec(conn, + "SET LOCAL idle_in_transaction_session_timeout = 0; " + "SET LOCAL statement_timeout = 0"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + elog(DEBUG1, "pgclone: could not disable source-side timeouts: %s", + PQerrorMessage(conn)); + PQclear(res); } /* COMMIT the source transaction. Safe to call when no transaction is @@ -802,7 +932,13 @@ pgclone_export_snapshot(PGconn *conn, char *out_id, size_t out_id_len) /* Open BEGIN ISOLATION LEVEL REPEATABLE READ READ ONLY then SET * TRANSACTION SNAPSHOT ''. The keeper that exported this snapshot - * must still be alive (idle in transaction) at this point. */ + * must still be alive (idle in transaction) at this point. + * + * v4.3.1: when SET TRANSACTION SNAPSHOT fails with PostgreSQL's + * "invalid snapshot identifier" message — which the server emits + * both for malformed IDs AND for IDs whose backing file has been + * removed (keeper transaction terminated) — emit a hint pointing + * at the most common cause. See issue #9. */ static void pgclone_begin_with_imported_snapshot(PGconn *conn, const char *snapshot_id) { @@ -820,11 +956,76 @@ pgclone_begin_with_imported_snapshot(PGconn *conn, const char *snapshot_id) if (PQresultStatus(res) != PGRES_COMMAND_OK) { char *errmsg_copy = pstrdup(PQerrorMessage(conn)); + bool looks_like_gone_snapshot = + (strstr(errmsg_copy, "invalid snapshot identifier") != NULL); + PQclear(res); + + if (looks_like_gone_snapshot) + ereport(ERROR, + (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), + errmsg("pgclone: could not import snapshot %s on source: %s", + snapshot_id, errmsg_copy), + errhint("The exporting (keeper) transaction was likely " + "terminated mid-clone. Common causes: a firewall " + "or NAT gateway dropped the idle TCP session, or " + "the source has a non-zero " + "idle_in_transaction_session_timeout. pgclone " + "4.3.1 injects TCP keepalives and clears those " + "timeouts on the keeper transaction; verify the " + "extension was reloaded after upgrade. As an " + "emergency workaround, pass " + "'{\"consistent\": false}' in the options " + "argument to disable cross-table snapshot " + "sharing."))); + else + ereport(ERROR, + (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), + errmsg("pgclone: could not import snapshot %s on source: %s", + snapshot_id, errmsg_copy))); + } + PQclear(res); +} + +/* Lightweight liveness check on the snapshot keeper. Issues a + * cheap round-trip so libpq detects a silently-dropped TCP + * session BEFORE the next importer tries to bind to a snapshot + * the server has already reaped. On failure emits a clear ERROR + * that names the root cause rather than letting the next SET + * TRANSACTION SNAPSHOT fail with the misleading "invalid snapshot + * identifier" message. No-op when conn is NULL or no transaction + * is open (consistent mode disabled). v4.3.1 (issue #9). */ +static void +pgclone_keeper_ping(PGconn *conn) +{ + PGresult *res; + + if (conn == NULL) + return; + if (PQtransactionStatus(conn) != PQTRANS_INTRANS) + return; + + if (PQstatus(conn) != CONNECTION_OK) ereport(ERROR, - (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), - errmsg("pgclone: could not import snapshot %s on source: %s", - snapshot_id, errmsg_copy))); + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("pgclone: snapshot keeper connection is no longer alive: %s", + PQerrorMessage(conn)), + errhint("The exported snapshot has been invalidated. " + "Re-run the clone; if the failure repeats, the " + "source's idle_in_transaction_session_timeout, a " + "firewall idle timeout, or wal_sender_timeout is " + "killing the keeper transaction."))); + + res = PQexec(conn, "SELECT 1"); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + char *err_copy = pstrdup(PQerrorMessage(conn)); + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("pgclone: snapshot keeper ping failed: %s", err_copy), + errhint("The keeper transaction was terminated mid-clone. " + "See pgclone issue #9."))); } PQclear(res); } @@ -2253,6 +2454,14 @@ pgclone_schema(PG_FUNCTION_ARGS) { Datum result; + /* v4.3.1: validate the keeper before each importer + * opens its own SET TRANSACTION SNAPSHOT — fail fast + * with a clear error instead of letting the importer + * hit the misleading "invalid snapshot identifier" + * (issue #9). */ + if (opts.consistent && opts.snapshot_id[0] != '\0') + pgclone_keeper_ping(source_conn); + /* Call 6-arg version: conninfo, schema, table, include_data, target_name, options */ result = DirectFunctionCall6(pgclone_table, CStringGetTextDatum(source_conninfo), @@ -2273,10 +2482,18 @@ pgclone_schema(PG_FUNCTION_ARGS) /* ---- Step 4: Retry FK constraints if constraints enabled ---- */ if (opts.include_constraints) { - PGconn *src_retry = pgclone_connect(source_conninfo); - PGconn *lcl_retry = pgclone_connect_local(); + PGconn *src_retry; + PGconn *lcl_retry; int fk_created = 0; + /* v4.3.1: keeper liveness gate before opening another importer + * (issue #9). */ + if (opts.consistent && opts.snapshot_id[0] != '\0') + pgclone_keeper_ping(source_conn); + + src_retry = pgclone_connect(source_conninfo); + lcl_retry = pgclone_connect_local(); + pgclone_setup_source_txn(src_retry, &opts); for (i = 0; i < ntables; i++) @@ -2331,8 +2548,16 @@ pgclone_schema(PG_FUNCTION_ARGS) /* ---- Step 5: Clone views ---- */ { - PGconn *src_views = pgclone_connect(source_conninfo); - PGconn *lcl_views = pgclone_connect_local(); + PGconn *src_views; + PGconn *lcl_views; + + /* v4.3.1: keeper liveness gate before opening another importer + * (issue #9). */ + if (opts.consistent && opts.snapshot_id[0] != '\0') + pgclone_keeper_ping(source_conn); + + src_views = pgclone_connect(source_conninfo); + lcl_views = pgclone_connect_local(); pgclone_setup_source_txn(src_views, &opts); @@ -2443,10 +2668,18 @@ pgclone_schema(PG_FUNCTION_ARGS) * references inside the body is whatever the original author wrote. */ { - PGconn *src_funcs = pgclone_connect(source_conninfo); - PGconn *lcl_funcs = pgclone_connect_local(); + PGconn *src_funcs; + PGconn *lcl_funcs; int fcount; + /* v4.3.1: keeper liveness gate before opening another importer + * (issue #9). */ + if (opts.consistent && opts.snapshot_id[0] != '\0') + pgclone_keeper_ping(source_conn); + + src_funcs = pgclone_connect(source_conninfo); + lcl_funcs = pgclone_connect_local(); + pgclone_setup_source_txn(src_funcs, &opts); resetStringInfo(&buf); @@ -2481,10 +2714,18 @@ pgclone_schema(PG_FUNCTION_ARGS) */ if (opts.include_triggers && ntables > 0) { - PGconn *src_trig = pgclone_connect(source_conninfo); - PGconn *lcl_trig = pgclone_connect_local(); + PGconn *src_trig; + PGconn *lcl_trig; int trig_total = 0; + /* v4.3.1: keeper liveness gate before opening another importer + * (issue #9). */ + if (opts.consistent && opts.snapshot_id[0] != '\0') + pgclone_keeper_ping(source_conn); + + src_trig = pgclone_connect(source_conninfo); + lcl_trig = pgclone_connect_local(); + pgclone_setup_source_txn(src_trig, &opts); for (i = 0; i < ntables; i++) @@ -2672,6 +2913,10 @@ pgclone_database(PG_FUNCTION_ARGS) { Datum result; + /* v4.3.1: keeper liveness check between schemas (issue #9). */ + if (opts.consistent && opts.snapshot_id[0] != '\0') + pgclone_keeper_ping(source_conn); + elog(DEBUG1, "pgclone: cloning schema %s (%d/%d)", schema_names[i], i + 1, nschemas); @@ -4408,7 +4653,7 @@ PG_FUNCTION_INFO_V1(pgclone_version); Datum pgclone_version(PG_FUNCTION_ARGS) { - PG_RETURN_TEXT_P(cstring_to_text("pgclone 4.3.0")); + PG_RETURN_TEXT_P(cstring_to_text("pgclone 4.3.1")); } /* =============================================================== diff --git a/test/run_tests.sh b/test/run_tests.sh index 5ae71c3..e021ce9 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -103,6 +103,16 @@ if echo "$SPL" | grep -q "pgclone"; then if [ $CONSISTENT_EXIT -ne 0 ]; then TEST_EXIT=1 fi + + echo "" + echo "============================================" + echo "Running pgclone v4.3.1 snapshot-keeper tests (issue #9)..." + echo "============================================" + bash /build/pgclone/test/test_snapshot_keeper.sh 2>&1 + KEEPER_EXIT=$? + if [ $KEEPER_EXIT -ne 0 ]; then + TEST_EXIT=1 + fi else echo "WARNING: pgclone not in shared_preload_libraries, skipping async tests" echo "To run async tests, add to postgresql.conf:" diff --git a/test/test_snapshot_keeper.sh b/test/test_snapshot_keeper.sh new file mode 100755 index 0000000..cdced90 --- /dev/null +++ b/test/test_snapshot_keeper.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# ============================================================ +# pgclone v4.3.1 snapshot-keeper resilience test (issue #9) +# +# Reproduces the failure mode where the snapshot keeper transaction +# is killed by a non-zero idle_in_transaction_session_timeout on +# the source, leading to "ERROR: pgclone: could not import +# snapshot ... invalid snapshot identifier" on the next per-table +# importer. +# +# Strategy: +# 1. Build a small multi-table schema on the source. +# 2. Set idle_in_transaction_session_timeout = 1s on the role +# used by the test. +# 3. Run pgclone.schema(). With the v4.3.1 fix the keeper's +# BEGIN issues SET LOCAL idle_in_transaction_session_timeout=0 +# and the clone completes. Without the fix, the keeper is +# killed during the per-table phase and a later importer +# fails with "invalid snapshot identifier". +# ============================================================ + +set -euo pipefail + +PASS=0 +FAIL=0 + +SOURCE_HOST="${SOURCE_HOST:-source-db}" +SOURCE_PORT="${SOURCE_PORT:-5432}" +SOURCE_DB="${SOURCE_DB:-source_db}" +SOURCE_USER="${SOURCE_USER:-postgres}" +SOURCE_PW="${SOURCE_PW:-testpass}" + +src() { + PGPASSWORD="$SOURCE_PW" psql -h "$SOURCE_HOST" -p "$SOURCE_PORT" \ + -U "$SOURCE_USER" -d "$SOURCE_DB" \ + -X -q -v ON_ERROR_STOP=1 "$@" +} + +tgt() { + psql -U postgres -d target_db -tAc "$1" +} + +run_test() { + local desc="$1" + local cmd="$2" + if eval "$cmd"; then + echo " PASS: $desc"; PASS=$((PASS + 1)) + else + echo " FAIL: $desc" + echo " cmd: $cmd" + FAIL=$((FAIL + 1)) + fi +} + +echo "============================================" +echo "Testing pgclone v4.3.1 snapshot keeper (#9)" +echo "============================================" + +# ---- Build a small multi-table schema on the source ---- +echo "" +echo "---- Building source keeper_test schema ----" +src <<'SQL' +DROP SCHEMA IF EXISTS keeper_test CASCADE; +CREATE SCHEMA keeper_test; + +CREATE TABLE keeper_test.t1 (id int PRIMARY KEY, payload text); +CREATE TABLE keeper_test.t2 (id int PRIMARY KEY, payload text); +CREATE TABLE keeper_test.t3 (id int PRIMARY KEY, payload text); +CREATE TABLE keeper_test.t4 (id int PRIMARY KEY, payload text); +CREATE TABLE keeper_test.t5 (id int PRIMARY KEY, payload text); + +-- Enough rows per table that the per-table COPY plus loopback +-- DDL adds up to several seconds — easily exceeding the +-- 1s idle_in_transaction_session_timeout we set below. +INSERT INTO keeper_test.t1 SELECT g, repeat('x', 200) FROM generate_series(1, 5000) g; +INSERT INTO keeper_test.t2 SELECT g, repeat('y', 200) FROM generate_series(1, 5000) g; +INSERT INTO keeper_test.t3 SELECT g, repeat('z', 200) FROM generate_series(1, 5000) g; +INSERT INTO keeper_test.t4 SELECT g, repeat('w', 200) FROM generate_series(1, 5000) g; +INSERT INTO keeper_test.t5 SELECT g, repeat('v', 200) FROM generate_series(1, 5000) g; +SQL + +# ---- Set a tight idle_in_transaction_session_timeout on the +# ---- source role used by pgclone. Without the v4.3.1 fix the +# ---- keeper transaction will be terminated during the +# ---- per-table loop. ALTER ROLE persists; we revert in cleanup. +echo "" +echo "---- Setting idle_in_transaction_session_timeout = 1s on source role ----" +src </dev/null 2>&1 || true +ALTER ROLE $SOURCE_USER RESET idle_in_transaction_session_timeout; +DROP SCHEMA IF EXISTS keeper_test CASCADE; +SQL + tgt "DROP SCHEMA IF EXISTS keeper_test CASCADE" >/dev/null 2>&1 || true +} +trap cleanup EXIT + +# ---- Clean target schema if a previous run left it ---- +tgt "DROP SCHEMA IF EXISTS keeper_test CASCADE" >/dev/null + +# ---- Run the schema clone ---- +echo "" +echo "---- Running pgclone.schema(...) ----" +CONNINFO="host=$SOURCE_HOST port=$SOURCE_PORT dbname=$SOURCE_DB user=$SOURCE_USER password=$SOURCE_PW" + +CLONE_RC=0 +CLONE_OUT=$(tgt "SELECT pgclone.schema('$CONNINFO', 'keeper_test', true)" 2>&1) || CLONE_RC=$? + +echo " exit code: $CLONE_RC" + +run_test "pgclone.schema returns OK under tight idle_in_transaction_session_timeout" \ + "[ '$CLONE_RC' = '0' ] && [ '$CLONE_OUT' = 'OK' ]" + +run_test "no 'invalid snapshot identifier' error in clone output" \ + "! echo '$CLONE_OUT' | grep -qi 'invalid snapshot identifier'" + +# ---- Verify all 5 tables made it and row counts match ---- +for tbl in t1 t2 t3 t4 t5; do + n=$(tgt "SELECT count(*) FROM keeper_test.$tbl" 2>/dev/null || echo "missing") + run_test "keeper_test.$tbl was cloned with 5000 rows (actual: $n)" \ + "[ '$n' = '5000' ]" +done + +echo "" +echo "============================================" +echo "Results: $PASS passed, $FAIL failed" +echo "============================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi +exit 0