Permalink
Browse files

Update to lpreserver backend:

Added a new "replicate init <dataset>" option, which will re-init the remote side of the replication server, cleaning up any corrupt datasets and preparing to sync fresh again.

Also added a check during replication: we won't try to kick off a second replication task if a previous one is still running, to avoid collisions. In addition, we will not delete any snapshots until the initial replication is finished, in order to prevent undercutting the active replication.
  • Loading branch information...
kmoore134 committed Sep 12, 2013
1 parent 651bd28 commit c0e6494681b181fe94bb9390168be2491697fd7b
Showing with 107 additions and 24 deletions.
  1. +50 −0 src-sh/lpreserver/backend/functions.sh
  2. +44 −23 src-sh/lpreserver/backend/runsnap.sh
  3. +13 −1 src-sh/lpreserver/lpreserver
@@ -256,6 +256,21 @@ check_rep_task() {
# If we are checking for a sync task, and the rep isn't marked as sync we can return
if [ "$2" = "sync" -a "$REPTIME" != "sync" ] ; then return 0; fi
+ # Doing a replication task, check if one is in progress
+ export pidFile="${DBDIR}/.reptask-`echo ${LDATA} | sed 's|/|-|g'`"
+ if [ -e "${pidFile}" ] ; then
+ pgrep -F ${pidFile} >/dev/null 2>/dev/null
+ if [ $? -eq 0 ] ; then
+ echo_log "Skipped replication on $LDATA, previous replication is still running."
+ return 0
+ else
+ rm ${pidFile}
+ fi
+ fi
+
+ # Save this PID
+ echo "$$" > ${pidFile}
+
# Is this a sync-task we do at the time of a snapshot?
if [ "$2" = "sync" -a "$REPTIME" = "sync" ] ; then
export DIDREP=1
@@ -285,6 +300,7 @@ start_rep_task() {
if [ "$lastSEND" = "$lastSNAP" ] ; then
queue_msg "`date`: Last snapshot $lastSNAP is already marked as replicated!"
+ rm ${pidFile}
return 1
fi
@@ -331,6 +347,7 @@ start_rep_task() {
echo_log "FAILED replication task on ${DATASET}: LOGFILE: $FLOG"
fi
+ rm ${pidFile}
return $zStatus
}
@@ -551,3 +568,36 @@ online_zpool_disk() {
zpool online $pool $disk
exit $?
}
+
+# init_rep_task: reset the remote side of a replication task so the next
+# run performs a full re-sync from scratch.
+#   $1 - local dataset/zpool that has an entry in ${REPCONF}
+# Destroys the remote copy of the dataset (if present) and clears the local
+# "last replicated" snapshot marker. Returns 0 if no replication task exists.
+init_rep_task() {
+
+ LDATA="$1"
+
+ # Look up this dataset's replication entry; nothing to do if none exists.
+ # NOTE(review): ${REPCONF} and ${LDATA} are unquoted in grep — fine for
+ # typical dataset names, but breaks on whitespace; confirm inputs are sane.
+ repLine=`cat ${REPCONF} | grep "^${LDATA}:"`
+ if [ -z "$repLine" ] ; then return 0; fi
+
+ # We have a replication task for this set, get some vars
+ # (colon-separated fields: host, user, port, remote dataset — parsing
+ # assumes none of the values themselves contain a colon)
+ hName=`hostname`
+ REPHOST=`echo $repLine | cut -d ':' -f 3`
+ REPUSER=`echo $repLine | cut -d ':' -f 4`
+ REPPORT=`echo $repLine | cut -d ':' -f 5`
+ REPRDATA=`echo $repLine | cut -d ':' -f 6`
+
+ # First check if we even have a dataset on the remote
+ # (the per-host replica lives at <remotedataset>/<localhostname>)
+ ssh -p ${REPPORT} ${REPUSER}@${REPHOST} zfs list ${REPRDATA}/${hName} 2>/dev/null >/dev/null
+ if [ $? -eq 0 ] ; then
+ # Lets cleanup the remote side
+ echo "Removing remote dataset: ${REPRDATA}/${hName}"
+ ssh -p ${REPPORT} ${REPUSER}@${REPHOST} zfs destroy -r ${REPRDATA}/${hName}
+ if [ $? -ne 0 ] ; then
+ # Warn but continue — the local marker is still cleared below so the
+ # next replication attempts a fresh full send regardless.
+ echo "Warning: Could not delete remote dataset ${REPRDATA}/${hName}"
+ fi
+ fi
+
+ # Now lets mark none of our datasets as replicated
+ # Find the snapshot currently tagged LATEST in the backup:lpreserver user
+ # property (tail -1 keeps only the last match across child datasets).
+ lastSEND=`zfs get -r backup:lpreserver ${LDATA} | grep LATEST | awk '{$1=$1}1' OFS=" " | tail -1 | cut -d '@' -f 2 | cut -d ' ' -f 1`
+ if [ -n "$lastSEND" ] ; then
+ # Blank the property (a single space) so no snapshot is considered
+ # "already sent" — presumably forcing a full re-sync; verify against
+ # the consumer of backup:lpreserver in start_rep_task.
+ zfs set backup:lpreserver=' ' ${LDATA}@$lastSEND
+ fi
+
+}
@@ -17,12 +17,18 @@ if [ -z "${DATASET}" ]; then
exit_err "No dataset specified!"
fi
+# Make sure this is a valid DATASET
+zfs list ${DATASET} >/dev/null 2>/dev/null
+if [ $? -ne 0 ] ; then
+ exit_err "Invalid dataset specified ${DATASET}"
+fi
+
# Create the snapshot now with the "auto-" tag
echo_log "Creating snapshot on ${DATASET}"
mkZFSSnap "${DATASET}" "auto-"
if [ $? -ne 0 ] ; then
echo_log "ERROR: Failed creating snapshot on ${DATASET}"
- queue_msg "Snapshot ERROR" "ERROR: Failed creating snapshot on ${DATASET} @ `date`\n\r`cat $CMDLOG`"
+ queue_msg "ERROR: Failed creating snapshot on ${DATASET} @ `date`\n\r`cat $CMDLOG`"
snapStat=1
else
queue_msg "Success creating snapshot on ${DATASET} @ `date`\n\r`cat $CMDLOG`"
@@ -37,30 +43,45 @@ do
rSnaps="$tmp $rSnaps"
done
-# Do any pruning
-num=0
-for snap in $rSnaps
-do
- # Only remove snapshots which are auto-created, so we don't delete one the user
- # made specifically
- cur="`echo $snap | cut -d '-' -f 1`"
- if [ "$cur" != "auto" ] ; then
- continue;
- fi
+# Before we start pruning, check if any replication is running
+skipPrune=0
+export pidFile="${DBDIR}/.reptask-`echo ${DATASET} | sed 's|/|-|g'`"
+if [ -e "${pidFile}" ] ; then
+ pgrep -F ${pidFile} >/dev/null 2>/dev/null
+ if [ $? -eq 0 ] ; then skipPrune=1; fi
+fi
- num=`expr $num + 1`
- if [ $num -gt $KEEP ] ; then
- echo_log "Pruning old snapshot: $snap"
- rmZFSSnap "${DATASET}" "$snap"
- if [ $? -ne 0 ] ; then
- echo_log "ERROR: Failed pruning snapshot $snap on ${DATASET}"
- queue_msg "Snapshot ERROR" "ERROR: Failed pruning snapshot $snap on ${DATASET} @ `date`\n\r`cat $CMDLOG`"
- snapStat=1
- else
- queue_msg "Success pruning snapshot $snap on ${DATASET} @ `date`\n\r`cat $CMDLOG`"
+if [ $skipPrune -eq 1 ] ; then
+ # No pruning since replication is currently running
+ echo_log "WARNING: Skipped pruning snapshots on ${DATASET} while replication is running."
+ queue_msg "WARNING: Skipped pruning snapshots on ${DATASET} while replication is running."
+
+else
+ # Do any pruning
+ num=0
+ for snap in $rSnaps
+ do
+ # Only remove snapshots which are auto-created, so we don't delete one the user
+ # made specifically
+ cur="`echo $snap | cut -d '-' -f 1`"
+ if [ "$cur" != "auto" ] ; then
+ continue;
+ fi
+
+ num=`expr $num + 1`
+ if [ $num -gt $KEEP ] ; then
+ echo_log "Pruning old snapshot: $snap"
+ rmZFSSnap "${DATASET}" "$snap"
+ if [ $? -ne 0 ] ; then
+ echo_log "ERROR: Failed pruning snapshot $snap on ${DATASET}"
+ queue_msg "ERROR: Failed pruning snapshot $snap on ${DATASET} @ `date`\n\r`cat $CMDLOG`"
+ snapStat=1
+ else
+ queue_msg "Success pruning snapshot $snap on ${DATASET} @ `date`\n\r`cat $CMDLOG`"
+ fi
fi
- fi
-done
+ done
+fi
# If we failed at any point, sent out a notice
if [ $snapStat -ne 0 ] ; then
@@ -145,8 +145,10 @@ on the remote dataset:
# zfs allow -u <user> create,receive,mount,userprop,destroy,send,hold <remotedataset>
Available Flags:
- list - List replication targets
+
add - Add a new replication target
+ init - Initialize the remote side again
+ list - List replication targets
remove - Remove a replication target
Add Options:
@@ -165,6 +167,15 @@ Add Options:
Will schedule replication of tank1 to tankbackup/backups at 10PM, notated in 24hour time
+
+Init Options:
+
+ init <localdataset/zpool>
+
+ Will re-init the remote side of the replication. This can be useful
+ when your replication gets stuck. Doing this option will remove
+ all the data on the remote side, and require a full re-sync again.
+
Remove Options:
remove <dataset>
@@ -485,6 +496,7 @@ revertsnap) require_root
shift
case ${1} in
add) add_rep_task "$2" "$3" "$4" "$5" "$6" "$7" ;;
+ init) init_rep_task "$2" ;;
list) list_rep_task ;;
remove) cat ${REPCONF} | grep -q "^${2}:"
if [ $? -eq 0 ] ; then

0 comments on commit c0e6494

Please sign in to comment.