Run postgres_exporter as the dedicated supabase_metrics role.

The exporter's DATA_SOURCE_USER changes from supabase_admin to supabase_metrics.
Each custom query touched here now lists master: true and cache_seconds: 60
ahead of its query text. A new supabase_usage_metrics collector reports
user_queries_total from extensions.pg_stat_statements; the sum is coalesced
to 0 because sum() is NULL when every row is filtered out. supabase_metrics is
also added to supautils.reserved_roles and to the fail2ban failregex list.

diff --git a/ansible/files/postgres_exporter.service.j2 b/ansible/files/postgres_exporter.service.j2
index bc63d7e34..035d8e49d 100644
--- a/ansible/files/postgres_exporter.service.j2
+++ b/ansible/files/postgres_exporter.service.j2
@@ -10,7 +10,7 @@ StandardError=append:/var/log/postgres_exporter.error
 Restart=always
 RestartSec=3
 Environment="DATA_SOURCE_URI=localhost/postgres?sslmode=disable"
-Environment="DATA_SOURCE_USER=supabase_admin"
+Environment="DATA_SOURCE_USER=supabase_metrics"
 
 [Install]
 WantedBy=multi-user.target
diff --git a/ansible/files/queries.yml.j2 b/ansible/files/queries.yml.j2
index 0de4ca2a1..319f9e40d 100644
--- a/ansible/files/queries.yml.j2
+++ b/ansible/files/queries.yml.j2
@@ -1,13 +1,15 @@
 pg_database:
-  query: "SELECT SUM(pg_database_size(pg_database.datname)) / (1024 * 1024) as size_mb FROM pg_database"
   master: true
-  cache_seconds: 30
+  cache_seconds: 60
+  query: "SELECT SUM(pg_database_size(pg_database.datname)) / (1024 * 1024) as size_mb FROM pg_database"
   metrics:
     - size_mb:
         usage: "GAUGE"
         description: "Disk space used by the database"
 
 pg_stat_bgwriter:
+  master: true
+  cache_seconds: 60
   query: |
     select checkpoints_timed as checkpoints_timed_total,
       checkpoints_req as checkpoints_req_total,
@@ -21,8 +23,6 @@ pg_stat_bgwriter:
       buffers_alloc as buffers_alloc_total,
       stats_reset
     from pg_stat_bgwriter
-  cache_seconds: 30
-  master: true
   metrics:
     - checkpoints_timed_total:
         usage: "COUNTER"
@@ -58,9 +58,9 @@ pg_stat_bgwriter:
         usage: "COUNTER"
         description: "Most recent stat reset time"
 
-
 pg_stat_database:
-  cache_seconds: 30
+  master: true
+  cache_seconds: 60
   query: |
     SELECT sum(numbackends) as num_backends,
       sum(xact_commit) as xact_commit_total,
@@ -78,7 +78,6 @@ pg_stat_database:
       sum(deadlocks) as deadlocks_total,
       max(stats_reset) as most_recent_reset
     FROM pg_stat_database
-  master: true
   metrics:
     - num_backends:
         usage: "GAUGE"
@@ -127,6 +126,8 @@ pg_stat_database:
         description: "The most recent time one of the databases had its statistics reset"
 
 pg_stat_database_conflicts:
+  master: true
+  cache_seconds: 60
   query: |
     SELECT sum(confl_tablespace) as confl_tablespace_total,
       sum(confl_lock) as confl_lock_total,
@@ -134,8 +135,6 @@ pg_stat_database_conflicts:
       sum(confl_bufferpin) as confl_bufferpin_total,
       sum(confl_deadlock) as confl_deadlock_total
     from pg_stat_database_conflicts
-  cache_seconds: 30
-  master: true
   metrics:
     - confl_tablespace_total:
         usage: "COUNTER"
@@ -154,8 +153,9 @@ pg_stat_database_conflicts:
         description: "Queries cancelled due to deadlocks"
 
 pg_stat_statements:
-  query: "SELECT sum(calls) as total_queries, sum(total_exec_time / 1000) as total_time_seconds FROM extensions.pg_stat_statements t1 JOIN pg_database t3 ON (t1.dbid=t3.oid)"
   master: true
+  cache_seconds: 60
+  query: "SELECT sum(calls) as total_queries, sum(total_exec_time / 1000) as total_time_seconds FROM extensions.pg_stat_statements t1 JOIN pg_database t3 ON (t1.dbid=t3.oid)"
   metrics:
     - total_queries:
         usage: "COUNTER"
@@ -165,17 +165,18 @@ pg_stat_statements:
         description: "Total time spent, in seconds"
 
 auth_users:
-  query: "select count(id) as user_count from auth.users"
   master: true
-  cache_seconds: 30
+  cache_seconds: 60
+  query: "select count(id) as user_count from auth.users"
   metrics:
     - user_count:
         usage: "GAUGE"
         description: "Number of users in the project db"
 
 replication:
-  query: "SELECT pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS realtime_lag_bytes, active AS realtime_slot_status FROM pg_replication_slots where slot_name = 'realtime'"
   master: true
+  cache_seconds: 60
+  query: "SELECT pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS realtime_lag_bytes, active AS realtime_slot_status FROM pg_replication_slots where slot_name = 'realtime'"
   metrics:
     - realtime_lag_bytes:
         usage: "GAUGE"
@@ -185,10 +186,35 @@ replication:
         description: "Replication Slot active status"
 
 storage:
-  query: "select sum(size) / (1024 * 1024) as storage_size_mb from storage.get_size_by_bucket()"
   master: true
-  cache_seconds: 30
+  cache_seconds: 60
+  query: "select sum(size) / (1024 * 1024) as storage_size_mb from storage.get_size_by_bucket()"
   metrics:
     - storage_size_mb:
         usage: "GAUGE"
         description: "The total size used for all storage buckets, in mb"
+
+supabase_usage_metrics:
+  # pg_stat_statements collects metrics from all databases on the cluster, so querying just the master db should be sufficient
+  master: true
+  cache_seconds: 60
+  query: |
+    -- sum() is NULL when no row survives the filters below; report 0 so the counter is always numeric
+    select coalesce(sum(calls), 0) as user_queries_total
+    from extensions.pg_stat_statements
+    join pg_roles on pg_roles.oid = pg_stat_statements.userid
+    where rolname <> 'supabase_metrics'
+      and query not like 'SELECT%FROM net.http_request_queue%'
+      and query not like 'DELETE FROM net.http_request_queue%'
+      and query <> 'SELECT version()'
+      -- the rest of these would get removed once we implement a more minimal healthcheck endpoint for postgrest
+      and query not like 'select set_config(%'
+      and query not like '%left join pg_catalog.pg_description d on d.objoid = n.oid%'
+      and query <> 'COMMIT'
+      and query <> 'BEGIN ISOLATION LEVEL READ COMMITTED READ ONLY'
+      and query <> 'SET client_encoding = ''UTF8'''
+      and query <> 'SET client_min_messages TO WARNING';
+  metrics:
+    - user_queries_total:
+        usage: "COUNTER"
+        description: "The total number of user queries executed"
diff --git a/ansible/tasks/internal/supautils.yml b/ansible/tasks/internal/supautils.yml
index d6c340971..5f5c29d98 100644
--- a/ansible/tasks/internal/supautils.yml
+++ b/ansible/tasks/internal/supautils.yml
@@ -28,7 +28,7 @@
   lineinfile:
     path: /etc/postgresql/postgresql.conf
     state: present
-    line: supautils.reserved_roles = 'supabase_admin, supabase_auth_admin, supabase_storage_admin, dashboard_user, pgbouncer, service_role, authenticator, authenticated, anon'
+    line: supautils.reserved_roles = 'supabase_admin, supabase_metrics, supabase_auth_admin, supabase_storage_admin, dashboard_user, pgbouncer, service_role, authenticator, authenticated, anon'
 
 - name: supautils - set supautils.reserved_memberships
   become: yes
diff --git a/ansible/tasks/setup-fail2ban.yml b/ansible/tasks/setup-fail2ban.yml
index 7b0666b07..1949c5e3c 100644
--- a/ansible/tasks/setup-fail2ban.yml
+++ b/ansible/tasks/setup-fail2ban.yml
@@ -26,6 +26,7 @@
     line: "{{ item.line }}"
   loop:
     - { line: ' ^.*,.*,.*,.*,":.*password authentication failed for user ""supabase_admin".*$' }
+    - { line: ' ^.*,.*,.*,.*,":.*password authentication failed for user ""supabase_metrics".*$' }
     - { line: ' ^.*,.*,.*,.*,":.*password authentication failed for user ""supabase_auth_admin".*$' }
     - { line: ' ^.*,.*,.*,.*,":.*password authentication failed for user ""supabase_storage_admin".*$' }
     - { line: ' ^.*,.*,.*,.*,":.*password authentication failed for user ""authenticator".*$' }
@@ -38,4 +39,4 @@
 - name: fail2ban - restart
   systemd:
     name: fail2ban
-    state: restarted
\ No newline at end of file
+    state: restarted