This repository has been archived by the owner on Jun 2, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 23
/
check_postgres_replication
executable file
·229 lines (198 loc) · 5.89 KB
/
check_postgres_replication
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/bin/bash
# ========================================================================================
# Postgres replication lag nagios check using psql and bash.
#
# 2013 Wanelo Inc, Apache License.
# This script expects psql to be in the PATH.
#
# Usage: ./check_postgres_replication [ -h <host> ] [ -m <master> ] [ -U user ] [ -x <units> ]
# [-w <warn_bytes>] [-c <critical_bytes>]
# -h --host replica host (default 127.0.0.1)
# -m --master master fqdn or ip (required)
# -U --user database user (default postgres)
# -x --units units of measurement to display (KB or MB, default MB)
# -w --warning warning threshold in bytes (default 10485760, i.e. 10MB)
# -c --critical critical threshold in bytes (default 15728640, i.e. 15MB)
# ========================================================================================
# Nagios return codes
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Command line flattened into one space-separated string; main re-splits it
# on whitespace before handing it to parse_arguments.
readonly ARGS="$*"

# Default thresholds, in bytes (10MB / 15MB)
readonly DEFAULT_WARNING_THRESHOLD=10485760
readonly DEFAULT_CRITICAL_THRESHOLD=15728640
readonly DEFAULT_HOST="127.0.0.1"
readonly DEFAULT_USER=postgres
readonly DEFAULT_UNITS=MB

readonly PATH=/opt/local/bin:${PATH}

# Node name shown in the status line. /etc/nodename is used when readable;
# otherwise fall back to uname -n so the name is not left empty.
readonly NODENAME=$(cat /etc/nodename 2>/dev/null || uname -n)

# SQL run against the master / replica
readonly MASTER_SQL="SELECT pg_current_xlog_location()"
readonly REPLICA_SQL="SELECT pg_last_xlog_replay_location()"
readonly REPLICA_TIME_LAG="select now() - pg_last_xact_replay_timestamp()"

# File that collects psql's stderr. Created with mktemp so the path is not
# predictable, and removed on every exit path by the EXIT trap.
ERR=$(mktemp "${TMPDIR:-/tmp}/repl_chec.XXXXXX") || {
  echo "REPLICATION UNKNOWN : unable to create temporary file"
  exit "$STATE_UNKNOWN"
}
readonly ERR
trap 'rm -f -- "$ERR"' EXIT
# Print the command-line help to stdout.
# The thresholds are described as byte counts because that is the unit they
# are compared in; -x/--units only affects how the lag is displayed.
usage() {
  cat <<'EOF'
Usage: ./check_postgres_replication [ -h <host> ] [ -m <master> ] [ -U user ] [ -x <units> ]
[-w <warn_bytes>] [-c <critical_bytes>]
-h --host replica host (default 127.0.0.1)
-m --master master fqdn or ip (required)
-U --user database user (default postgres)
-x --units units of measurement to display (KB or MB, default MB)
-w --warning warning threshold in bytes (default 10485760, i.e. 10MB)
-c --critical critical threshold in bytes (default 15728640, i.e. 15MB)
--help show this message
--verbose trace executed commands (set -x)
EOF
}
# Parse parameters.
# Long options are translated to their short equivalents and the result is
# parsed with getopts. Values are kept in an array and never passed through
# eval, so shell metacharacters in a value are treated as plain text.
# Arguments: the command-line words
# Globals written (readonly): MASTER, USER, HOST, UNITS, WARNING_THRESHOLD,
#                             CRITICAL_THRESHOLD
# Side effects: -v/--verbose enables `set -x`; -H/--help prints usage and exits.
parse_arguments() {
  local -a translated=()
  local arg
  for arg; do
    case "$arg" in
      --host)     translated+=("-h") ;;
      --master)   translated+=("-m") ;;
      --user)     translated+=("-U") ;;
      --units)    translated+=("-x") ;;
      --warning)  translated+=("-w") ;;
      --critical) translated+=("-c") ;;
      --help)     translated+=("-H") ;;
      --verbose)  translated+=("-v") ;;
      *)          translated+=("$arg") ;;
    esac
  done
  set -- "${translated[@]}"

  # Local OPTIND so parsing always starts at the first word.
  local OPTIND=1 OPTARG option
  local host master user units warning_threshold critical_threshold
  while getopts "h:m:U:x:w:c:Hv" option; do
    case "$option" in
      v) set -x ;;
      H)
        usage
        exit
        ;;
      h) host=$OPTARG ;;
      m) master=$OPTARG ;;
      U) user=$OPTARG ;;
      x) units=$OPTARG ;;
      w) warning_threshold=$OPTARG ;;
      c) critical_threshold=$OPTARG ;;
    esac
  done

  # A MASTER already present in the environment is still honoured when -m is
  # not given, as before.
  readonly MASTER=${master:-${MASTER:-}}
  readonly USER=${user:-$DEFAULT_USER}
  readonly HOST=${host:-$DEFAULT_HOST}
  readonly UNITS=${units:-$DEFAULT_UNITS}
  readonly WARNING_THRESHOLD=${warning_threshold:-$DEFAULT_WARNING_THRESHOLD}
  readonly CRITICAL_THRESHOLD=${critical_threshold:-$DEFAULT_CRITICAL_THRESHOLD}
}
# Abort when the mandatory master host was not supplied.
# Globals read: MASTER, STATE_UNKNOWN
# Exits with the Nagios UNKNOWN status: a missing argument is a usage error,
# not a WARNING about replication.
check_required_arguments() {
  if [ -z "${MASTER:-}" ]; then
    echo "pass master host in parameters via -m flag"
    exit "${STATE_UNKNOWN:-3}"
  fi
}
# Validate UNITS and set the readonly DIVISOR used to convert bytes for display.
# Globals read: UNITS, STATE_UNKNOWN
# Globals written (readonly): DIVISOR
# An unsupported unit prints an error plus the usage text and exits with the
# Nagios UNKNOWN status, since it is a usage error.
normalize_units() {
  case "$UNITS" in
    KB) readonly DIVISOR=1024 ;;
    MB) readonly DIVISOR=1048576 ;;
    *)
      echo "Incorrect unit of measurement"
      usage
      exit "${STATE_UNKNOWN:-3}"
      ;;
  esac
}
# Print the Nagios status line, remove the error file and exit.
# Arguments:
#   $1 - status label (e.g. OK, WARNING, CRITICAL)
#   $2 - exit status
#   $3 - replication lag in bytes (may be empty on the error path)
#   $4 - replica time lag as reported by the database (may be empty)
# Globals read: ERR, STATE_CRITICAL, UNITS, NODENAME, WARNING_THRESHOLD,
#               CRITICAL_THRESHOLD
# When the status is CRITICAL and the error file has content, that content is
# reported instead of the lag.
result() {
  local description=$1
  local status=$2
  local diff=$3
  local time_lag=$4
  local error message diff_units

  error=$(cat "$ERR" 2>/dev/null)
  # Captured stderr can span several lines; fold it onto one line so the
  # status text and the perfdata stay on a single output line.
  error=${error//$'\n'/ }

  if [[ "${status}" -eq "${STATE_CRITICAL}" && -n "${error}" ]]; then
    message="replication check error ${error}"
  else
    diff_units=$(bytes_to_units "$diff")
    message="replication lag is ${diff_units}${UNITS} : time lag is ${time_lag}"
  fi

  echo "REPLICATION $description : ${NODENAME} $message|repl=${diff},time_lag=${time_lag};${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"
  rm -f -- "$ERR"
  exit "${status:-0}"
}
# Run REPLICA_SQL on the replica (HOST) and print the result to stdout.
# psql's stderr is captured in ERR; psql's exit status is returned unchanged
# so callers can detect a failed query.
get_replica_current_xlog() {
  psql -U "$USER" -Atc "$REPLICA_SQL" -h "$HOST" 2>"$ERR"
}
# Run MASTER_SQL on the master (MASTER) and print the result to stdout.
# psql's stderr is captured in ERR; psql's exit status is returned unchanged
# so callers can detect a failed query.
get_master_current_xlog() {
  psql -U "$USER" -Atc "$MASTER_SQL" -h "$MASTER" 2>"$ERR"
}
# Run REPLICA_TIME_LAG on the replica (HOST) and print the result to stdout.
# psql's stderr is captured in ERR; psql's exit status is returned unchanged.
check_replica_time_lag() {
  psql -U "${USER}" -Atc "${REPLICA_TIME_LAG}" -h "${HOST}" 2>"${ERR}"
}
# Report CRITICAL through result (which exits) when the given exit status is
# non-zero; otherwise return quietly.
# Arguments: $1 - exit status of the command that just ran
check_errors() {
  local exit_code=$1
  [ "$exit_code" -ne 0 ] || return 0
  result "CRITICAL" $STATE_CRITICAL
}
# Convert an xlog location of the form "<hex>/<hex>" to a byte position:
# the part before the first slash is multiplied by 0xFF000000 and the part
# after the last slash is added as an offset.
# Background: http://eulerto.blogspot.com/2011/11/understanding-wal-nomenclature.html
# Arguments: $1 - xlog location
# Outputs:   the byte position on stdout
xlog_to_bytes() {
  local -r location=$1
  local -r high_hex=${location%%/*}
  local -r low_hex=${location##*/}
  printf '%s\n' "$((0xFF000000 * 0x${high_hex} + 0x${low_hex}))"
}
# Scale a byte count down by DIVISOR (integer division) for display.
# Arguments: $1 - byte count; may be empty
# Outputs:   the scaled value, or a fixed error text when $1 is empty
bytes_to_units() {
  local bytes=$1
  if [ -n "$bytes" ]; then
    printf '%s\n' "$(( $bytes / $DIVISOR ))"
    return
  fi
  echo "ERROR: NO DATA AVAILABLE"
}
# Run the replication check: parse options, query replica and master for
# their xlog locations, compute the byte difference and report it against the
# warning/critical thresholds. Always terminates through result().
main() {
  # ARGS is the command line flattened into one string; it is deliberately
  # left unquoted so it is split back into words.
  parse_arguments $ARGS
  check_required_arguments
  normalize_units

  # Declared separately from the assignments below: `local v=$(cmd)` would
  # replace cmd's exit status with that of `local`, hiding query failures.
  local replica_xlog master_xlog replica_bytes master_bytes diff time_lag

  replica_xlog=$(get_replica_current_xlog)
  check_errors $?
  if [ -z "${replica_xlog}" ]; then
    # Keep any error text already captured from psql; otherwise explain.
    [ -s "$ERR" ] || echo -n "Unable to find replica XLOG replay location" > "$ERR"
    result "CRITICAL" "$STATE_CRITICAL"
  fi
  replica_bytes=$(xlog_to_bytes "${replica_xlog}")

  master_xlog=$(get_master_current_xlog)
  check_errors $?
  if [ -z "${master_xlog}" ]; then
    [ -s "$ERR" ] || echo -n "Unable to find master XLOG location" > "$ERR"
    result "CRITICAL" "$STATE_CRITICAL"
  fi
  master_bytes=$(xlog_to_bytes "${master_xlog}")

  # Calculate xlog diff in bytes
  diff=$(( master_bytes - replica_bytes ))
  time_lag=$(check_replica_time_lag)

  # Output response. time_lag is quoted because it can contain spaces
  # (e.g. "1 day 00:00:05") and must reach result as a single argument.
  if [ "$diff" -ge "$CRITICAL_THRESHOLD" ]; then
    result "CRITICAL" "$STATE_CRITICAL" "$diff" "$time_lag"
  elif [ "$diff" -ge "$WARNING_THRESHOLD" ]; then
    result "WARNING" "$STATE_WARNING" "$diff" "$time_lag"
  else
    result "OK" "$STATE_OK" "$diff" "$time_lag"
  fi
}
# Entry point. No arguments are passed: main takes the command line from the
# ARGS constant captured at file scope.
main