Skip to content
This repository has been archived by the owner before Nov 9, 2022. It is now read-only.
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 229 lines (198 sloc) 5.89 KB
#!/bin/bash
# ========================================================================================
# Postgres replication lag nagios check using psql and bash.
#
# 2013 Wanelo Inc, Apache License.
# This script expects psql to be in the PATH.
#
# Usage: ./check_postgres_replication [ -h <host> ] -m <master> [ -U <user> ] [ -x <units> ]
# [ -w <warn_bytes> ] [ -c <critical_bytes> ]
# -h --host replica host (default 127.0.0.1)
# -m --master master fqdn or ip (required)
# -U --user database user (default postgres)
# -x --units units of measurement used to display the lag (KB or MB, default MB)
# -w --warning warning threshold in bytes (default 10485760, i.e. 10MB)
# -c --critical critical threshold in bytes (default 15728640, i.e. 15MB)
# ========================================================================================
# Nagios return codes
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Command-line arguments flattened into a single space-separated string;
# main hands this string (re-split on whitespace) to parse_arguments.
readonly ARGS="$*"

# Default thresholds, in bytes (10MB and 15MB)
readonly DEFAULT_WARNING_THRESHOLD=10485760
readonly DEFAULT_CRITICAL_THRESHOLD=15728640
readonly DEFAULT_HOST="127.0.0.1"
readonly DEFAULT_USER=postgres
readonly DEFAULT_UNITS=MB

# Search /opt/local/bin first when looking up psql
readonly PATH=/opt/local/bin:${PATH}

# Node name printed in the check output. /etc/nodename is not present on every
# platform, so fall back to the host name instead of printing an error and
# reporting an empty name.
NODENAME=$(cat /etc/nodename 2>/dev/null)
if [[ -z "${NODENAME}" ]]; then
  NODENAME=${HOSTNAME:-$(uname -n)}
fi
readonly NODENAME

# Queries run through psql
readonly MASTER_SQL="SELECT pg_current_xlog_location()"
readonly REPLICA_SQL="SELECT pg_last_xlog_replay_location()"
readonly REPLICA_TIME_LAG="select now() - pg_last_xact_replay_timestamp()"

# File that collects psql's stderr. Created with mktemp so the name cannot be
# predicted; falls back to the historical PID-based name if mktemp fails.
ERR=$(mktemp /tmp/repl_chec.XXXXXX 2>/dev/null) || ERR=/tmp/repl_chec.$$
readonly ERR
# Remove the error file on every exit path (help, argument errors, results).
trap 'rm -f "${ERR}"' EXIT
# Print the command-line help text on stdout.
#
# The thresholds are documented in bytes because that is how they are compared
# against the measured lag; -m is shown without brackets because it is required.
usage() {
  cat <<-EOF
Usage: ./check_postgres_replication [ -h <host> ] -m <master> [ -U <user> ] [ -x <units> ]
[ -w <warn_bytes> ] [ -c <critical_bytes> ]
-h --host replica host (default 127.0.0.1)
-m --master master fqdn or ip (required)
-U --user database user (default postgres)
-x --units units of measurement to display (KB or MB, default MB)
-w --warning warning threshold in bytes (default 10485760 = 10MB)
-c --critical critical threshold in bytes (default 15728640 = 15MB)
--help show this message
--verbose trace script execution (set -x)
EOF
}
# Parse parameters
#
# Long options are mapped to their short equivalents and the resulting list is
# processed with getopts. The list is kept in an array and never passed through
# eval, so shell syntax inside an argument value is treated as plain text.
#
# Arguments: the command-line words.
# Globals written (all readonly):
#   MASTER (only when -m/--master is given), USER, HOST, UNITS,
#   WARNING_THRESHOLD, CRITICAL_THRESHOLD
# Globals read: DEFAULT_USER, DEFAULT_HOST, DEFAULT_UNITS,
#   DEFAULT_WARNING_THRESHOLD, DEFAULT_CRITICAL_THRESHOLD
# Exits after printing usage when --help (-H) is given; --verbose (-v) turns
# on command tracing.
parse_arguments() {
  local arg
  local -a translated=()

  for arg in "$@"; do
    case "${arg}" in
      --host)     translated+=(-h) ;;
      --master)   translated+=(-m) ;;
      --user)     translated+=(-U) ;;
      --units)    translated+=(-x) ;;
      --warning)  translated+=(-w) ;;
      --critical) translated+=(-c) ;;
      --help)     translated+=(-H) ;;
      --verbose)  translated+=(-v) ;;
      *)          translated+=("${arg}") ;;
    esac
  done
  set -- "${translated[@]}"

  # Declared up front so that same-named variables from the environment cannot
  # leak in when an option is omitted.
  local host user units warning_threshold critical_threshold master
  # Keep getopts' state local to this function.
  local OPTION OPTARG
  local OPTIND=1

  while getopts "h:m:U:x:w:c:Hv" OPTION; do
    case "${OPTION}" in
      v)
        set -x
        ;;
      H)
        usage
        exit
        ;;
      h)
        host=${OPTARG}
        ;;
      m)
        master=${OPTARG}
        ;;
      U)
        user=${OPTARG}
        ;;
      x)
        units=${OPTARG}
        ;;
      w)
        warning_threshold=${OPTARG}
        ;;
      c)
        critical_threshold=${OPTARG}
        ;;
    esac
  done

  # MASTER has no default; it is only defined when -m was supplied so that
  # check_required_arguments can detect its absence.
  if [[ -n "${master}" ]]; then
    readonly MASTER=${master}
  fi
  readonly USER=${user:-$DEFAULT_USER}
  readonly HOST=${host:-$DEFAULT_HOST}
  readonly UNITS=${units:-$DEFAULT_UNITS}
  readonly WARNING_THRESHOLD=${warning_threshold:-$DEFAULT_WARNING_THRESHOLD}
  readonly CRITICAL_THRESHOLD=${critical_threshold:-$DEFAULT_CRITICAL_THRESHOLD}
}
# Abort when the mandatory master host was not supplied.
#
# Globals read: MASTER, STATE_UNKNOWN
# Outputs: a one-line explanation on stdout (where Nagios reads plugin output)
# Exits: with STATE_UNKNOWN when MASTER is empty, because a usage error is not
#        a WARNING about replication itself; returns 0 otherwise.
check_required_arguments() {
  if [[ -n "${MASTER}" ]]; then
    return 0
  fi
  echo "pass master host in parameters via -m flag"
  exit "${STATE_UNKNOWN}"
}
# Translate the display unit into the divisor applied to byte counts.
#
# Globals read: UNITS, STATE_UNKNOWN
# Globals written: DIVISOR (readonly) - 1024 for KB, 1048576 for MB
# Exits: with STATE_UNKNOWN after printing an error and the usage text when
#        UNITS is neither KB nor MB (a usage error, not a replication WARNING).
normalize_units() {
  # Error checking of arguments
  case "${UNITS}" in
    KB)
      readonly DIVISOR=1024
      ;;
    MB)
      readonly DIVISOR=1048576
      ;;
    *)
      echo "Incorrect unit of measurement"
      usage
      exit "${STATE_UNKNOWN}"
      ;;
  esac
}
# Print the Nagios status line (message plus perfdata) and terminate.
#
# Arguments:
#   $1 - status label printed after "REPLICATION"
#   $2 - exit code for the script
#   $3 - replication lag in bytes (may be empty)
#   $4 - replica time lag as text (may be empty)
# Globals read: ERR, STATE_CRITICAL, UNITS, NODENAME, WARNING_THRESHOLD,
#   CRITICAL_THRESHOLD
# Side effects: deletes the ERR file, then exits with $2.
result() {
  local label=$1
  local exit_code=$2
  local lag_bytes=$3
  local lag_time=$4
  local captured_error
  local text

  # A missing error file simply means there is nothing to report.
  captured_error=$(cat "${ERR}" 2>/dev/null)

  # The captured error replaces the lag message only for CRITICAL results.
  if [[ "${exit_code}" -eq "${STATE_CRITICAL}" && -n "${captured_error}" ]]; then
    text="replication check error ${captured_error}"
  else
    text="replication lag is $(bytes_to_units "${lag_bytes}")${UNITS} : time lag is ${lag_time}"
  fi

  printf '%s\n' "REPLICATION ${label} : ${NODENAME} ${text}|repl=${lag_bytes},time_lag=${lag_time};${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"
  rm -f "${ERR}"
  exit ${exit_code}
}
# Run REPLICA_SQL against the replica host.
#
# Globals read: USER, HOST, REPLICA_SQL, ERR
# Outputs: psql's stdout (unaligned, tuples only)
# Side effects: psql's stderr is written to the ERR file
# Returns: psql's exit status. psql is run directly rather than inside
#          `echo $(...)`, which would always return 0.
get_replica_current_xlog() {
  psql -U "${USER}" -Atc "${REPLICA_SQL}" -h "${HOST}" 2>"${ERR}"
}
# Run MASTER_SQL against the master host.
#
# Globals read: USER, MASTER, MASTER_SQL, ERR
# Outputs: psql's stdout (unaligned, tuples only)
# Side effects: psql's stderr is written to the ERR file
# Returns: psql's exit status. psql is run directly rather than inside
#          `echo $(...)`, which would always return 0.
get_master_current_xlog() {
  psql -U "${USER}" -Atc "${MASTER_SQL}" -h "${MASTER}" 2>"${ERR}"
}
# Run REPLICA_TIME_LAG against the replica host.
#
# Globals read: USER, HOST, REPLICA_TIME_LAG, ERR
# Outputs: psql's stdout (unaligned, tuples only)
# Side effects: psql's stderr is written to the ERR file
# Returns: psql's exit status. psql is run directly rather than inside
#          `echo $(...)`, which would always return 0.
check_replica_time_lag() {
  psql -U "${USER}" -Atc "${REPLICA_TIME_LAG}" -h "${HOST}" 2>"${ERR}"
}
# Report CRITICAL through result (which exits) when the given status is
# non-zero; do nothing for a zero status.
#
# Arguments: $1 - exit status of the command that was just run
check_errors() {
  local rc=$1
  if [ "${rc}" -eq 0 ]; then
    return 0
  fi
  result "CRITICAL" "${STATE_CRITICAL}"
}
# Convert an xlog location of the form "<hex>/<hex>" into a byte count.
#
# Arguments: $1 - location string; the text before the first "/" and the text
#                 after the last "/" are both read as hexadecimal numbers
# Outputs: (first number * 0xFF000000) + second number, in decimal
# Background: http://eulerto.blogspot.com/2011/11/understanding-wal-nomenclature.html
# NOTE(review): the 0xFF000000 multiplier presumably follows the WAL layout
# described in that article - confirm it applies to the server version in use.
xlog_to_bytes() {
  local location=$1
  local xlog_id=${location%%/*}
  local byte_offset=${location##*/}
  local -r bytes_per_xlog_id=$((0xFF000000))

  echo $(( bytes_per_xlog_id * 0x${xlog_id} + 0x${byte_offset} ))
}
# Convert a byte count to the display unit using integer division by DIVISOR.
#
# Arguments: $1 - byte count (may be empty)
# Globals read: DIVISOR
# Outputs: the converted value, or "ERROR: NO DATA AVAILABLE" when $1 is empty
bytes_to_units() {
  local byte_count=$1

  if [ -n "${byte_count}" ]; then
    echo $(( ${byte_count} / ${DIVISOR} ))
  else
    echo "ERROR: NO DATA AVAILABLE"
  fi
}
# Run the whole check: parse arguments, query replica and master, compare the
# xlog positions and report through result (which prints and exits).
#
# Globals read: ARGS, ERR, WARNING_THRESHOLD, CRITICAL_THRESHOLD, STATE_*
# Exits: STATE_OK / STATE_WARNING / STATE_CRITICAL via result.
main() {
  # ARGS is a flat string; it is left unquoted on purpose so that it is split
  # back into individual words.
  parse_arguments $ARGS
  check_required_arguments
  normalize_units

  # Declaration and assignment are separate: `local v=$(cmd)` always succeeds
  # and would hide cmd's exit status from check_errors.
  local replica_xlog
  replica_xlog=$(get_replica_current_xlog)
  check_errors $?
  if [ -z "${replica_xlog}" ]; then
    # Keep whatever psql wrote to the error file; only supply a generic
    # message when there is none.
    [ -s "${ERR}" ] || printf '%s' "Unable to find replica XLOG replay location" > "${ERR}"
    result "CRITICAL" "${STATE_CRITICAL}"
  fi

  # Query master for latest xlog
  local master_xlog
  master_xlog=$(get_master_current_xlog)
  check_errors $?
  if [ -z "${master_xlog}" ]; then
    # Without this check an unreachable master produces a negative lag,
    # which would be reported as OK.
    [ -s "${ERR}" ] || printf '%s' "Unable to find master XLOG location" > "${ERR}"
    result "CRITICAL" "${STATE_CRITICAL}"
  fi

  local replica_bytes
  local master_bytes
  replica_bytes=$(xlog_to_bytes "${replica_xlog}")
  master_bytes=$(xlog_to_bytes "${master_xlog}")

  # Calculate xlog diff in bytes
  local diff=$(($master_bytes - $replica_bytes))

  local time_lag
  time_lag=$(check_replica_time_lag)

  # Output response. time_lag is quoted because an interval such as
  # "1 day 00:00:01" contains spaces and must reach result as one argument.
  # Every branch ends in result, which removes the error file and exits.
  if [ "${diff}" -ge "${CRITICAL_THRESHOLD}" ]; then
    result "CRITICAL" "${STATE_CRITICAL}" "${diff}" "${time_lag}"
  elif [ "${diff}" -ge "${WARNING_THRESHOLD}" ]; then
    result "WARNING" "${STATE_WARNING}" "${diff}" "${time_lag}"
  else
    result "OK" "${STATE_OK}" "${diff}" "${time_lag}"
  fi
}
# Entry point. main reads the command-line arguments from the ARGS global,
# so nothing is passed to it here.
main