From a807e6a76a584af4ac7ea498c3016289e117cf73 Mon Sep 17 00:00:00 2001 From: Mark Syms Date: Tue, 17 Mar 2020 15:04:15 +0000 Subject: [PATCH 1/2] CA-331454: scan all SRs on master startup Signed-off-by: Mark Syms --- drivers/02-vhdcleanup | 21 ++++++++++++++++----- drivers/SR.py | 7 +++++++ drivers/SRCommand.py | 10 ++++++++-- drivers/cleanup.py | 18 +++++++++++++++++- drivers/util.py | 10 +++++++++- tests/test_cleanup.py | 32 ++++++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 9 deletions(-) diff --git a/drivers/02-vhdcleanup b/drivers/02-vhdcleanup index 17f395f18..76ce2aef1 100644 --- a/drivers/02-vhdcleanup +++ b/drivers/02-vhdcleanup @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # Copyright (C) Citrix Systems Inc. # @@ -23,17 +23,27 @@ CLEANUP_SCRIPT="/opt/xensource/sm/cleanup.py" LVHD_UTIL_SCRIPT="/opt/xensource/sm/lvhdutil.py" +declare -A LVM_TYPES + +for f in lvhdoiscsi lvhdohba lvhdofcoe lvmoiscsi lvmohba lvmofcoe; do + LVM_TYPES[$f]=y +done + start() { echo -n $"Fixing refcounts on new master: " - for type in lvhdoiscsi lvhdohba lvmoiscsi lvmohba; do + logger -p local2.info "Fixing refcounts on new master: " + + TYPES=`xe sm-list required-api-version=1.0 params=type --minimal | tr ',' '\n'` + for type in $TYPES; do srUuids=`xe sr-list type=$type params=uuid --minimal | sed "s/,/ /g"` for uuid in $srUuids; do - python $LVHD_UTIL_SCRIPT fixrefcounts $uuid + if [ -n "${LVM_TYPES[$type]+check}" ]; then + echo -n "Fixing $type" + python $LVHD_UTIL_SCRIPT fixrefcounts $uuid + fi # do a scan to handle any journaled interrupted operations: # VDI inflate, VHD ops, VDI.clone xe sr-scan uuid=$uuid - # kick off GC in case there is already work to do - $CLEANUP_SCRIPT -u $uuid -g done done echo -n $"OK" @@ -44,6 +54,7 @@ start() { stop() { echo -n $"Aborting running cleaners: " + logger -p local2.info "Aborting running cleaners: " TYPES=`xe sm-list required-api-version=1.0 params=type --minimal | tr ',' '\n'` for type in $TYPES; do srUuids=`xe sr-list type=$type params=uuid --minimal | sed "s/,/ /g"` diff --git a/drivers/SR.py b/drivers/SR.py index e7cabac7e..b8dda2661 100755 --- a/drivers/SR.py +++ b/drivers/SR.py @@ -277,6 +277,13 @@ def attach(self, uuid): """ raise xs_errors.XenError('Unimplemented') + def after_master_attach(self, uuid): + """Perform actions required after attaching on the pool master + Return: + None + """ + self.scan(uuid) + def detach(self, uuid): """Remove local access to the SR. Destroys any device state initiated by the sr_attach() operation. diff --git a/drivers/SRCommand.py b/drivers/SRCommand.py index 5ebaeefac..a7e49c630 100755 --- a/drivers/SRCommand.py +++ b/drivers/SRCommand.py @@ -339,14 +339,20 @@ def _run(self, sr, target): if sr.dconf.get("SRmaster") == "true": is_master = True + sr_uuid = self.params['sr_uuid'] + resetvdis.reset_sr(sr.session, util.get_this_host(), - self.params['sr_uuid'], is_master) + sr_uuid, is_master) if is_master: # Schedule a scan only when attaching on the SRmaster util.set_dirty(sr.session, self.params["sr_ref"]) - return sr.attach(self.params['sr_uuid']) + try: + return sr.attach(sr_uuid) + finally: + if is_master: + sr.after_master_attach(sr_uuid) elif self.cmd == 'sr_detach': return sr.detach(self.params['sr_uuid']) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 55922287c..220a6daae 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -80,6 +80,8 @@ N_RUNNING_AVERAGE = 10 +NON_PERSISTENT_DIR = '/run/nonpersistent/sm' + class AbortException(util.SMException): pass @@ -2761,6 +2763,17 @@ def normalizeType(type): GCPAUSE_DEFAULT_SLEEP = 5 * 60 +def _gc_init_file(sr_uuid): + return os.path.join(NON_PERSISTENT_DIR, str(sr_uuid), 'gc_init') + + +def _create_init_file(sr_uuid): + util.makedirs(os.path.join(NON_PERSISTENT_DIR, str(sr_uuid))) + with open(os.path.join( + NON_PERSISTENT_DIR, str(sr_uuid), 'gc_init'), 'w+') as f: + f.write('1') + + def _gcLoopPause(sr, dryRun): # Check to see if the GCPAUSE_FISTPOINT is present. If so the fist @@ -2770,7 +2783,7 @@ def _gcLoopPause(sr, dryRun): util.fistpoint.activate_custom_fn(util.GCPAUSE_FISTPOINT, lambda *args: None) - else: + elif os.path.exists(_gc_init_file(sr.uuid)): def abortTest(): return IPCFlag(sr.uuid).test(FLAG_TYPE_ABORT) @@ -2810,6 +2823,8 @@ def _gcLoop(sr, dryRun): if not sr.gcEnabled(): break sr.cleanupCoalesceJournals() + # Create the init file here in case startup is waiting on it + _create_init_file(sr.uuid) sr.scanLocked() sr.updateBlockInfo() @@ -2841,6 +2856,7 @@ def _gcLoop(sr, dryRun): lockRunning.release() finally: Util.log("GC process exiting, no work left") + _create_init_file(sr.uuid) lockActive.release() def _gc(session, srUuid, dryRun): diff --git a/drivers/util.py b/drivers/util.py index 4b874388d..337add2ba 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -551,7 +551,15 @@ def makedirs(name, mode=0777): makedirs(head, mode) if tail == os.curdir: return - os.mkdir(name, mode) + try: + os.mkdir(name, mode) + except OSError as exc: + if exc.errno == errno.EEXIST and os.path.isdir(name): + if mode: + os.chmod(name, mode) + pass + else: + raise def zeroOut(path, fromByte, bytes): """write 'bytes' zeros to 'path' starting from fromByte (inclusive)""" diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py index 753733d08..a98416e1f 100644 --- a/tests/test_cleanup.py +++ b/tests/test_cleanup.py @@ -220,8 +220,10 @@ def test_gcPause_calls_fist_point( @mock.patch('util.fistpoint', autospec=True) @mock.patch('cleanup.SR', autospec=True) @mock.patch('cleanup.Util.runAbortable') + @mock.patch('os.path.exists', autospec=True) def test_gcPause_calls_abortable_sleep( self, + mock_exists, mock_abortable, mock_sr, mock_fist_point): @@ -232,6 +234,8 @@ def test_gcPause_calls_abortable_sleep( # Fake that the fist point is not active. mock_fist_point.is_active.return_value = False + # The GC init file does exist + mock_exists.return_value = True cleanup._gcLoopPause(mock_sr, False) @@ -243,6 +247,34 @@ def test_gcPause_calls_abortable_sleep( mock.ANY, cleanup.VDI.POLL_INTERVAL, cleanup.GCPAUSE_DEFAULT_SLEEP * 1.1) + @mock.patch('util.fistpoint', autospec=True) + @mock.patch('cleanup.SR', autospec=True) + @mock.patch('cleanup.Util.runAbortable') + @mock.patch('os.path.exists', autospec=True) + def test_gcPause_skipped_on_first_run( + self, + mock_exists, + mock_abortable, + mock_sr, + mock_fist_point): + """ + Don't sleep the GC on the first run after host boot. + """ + self.setup_mock_sr(mock_sr) + + # Fake that the fist point is not active. + mock_fist_point.is_active.return_value = False + # The GC init file doesn't exist + mock_exists.return_value = False + + cleanup._gcLoopPause(mock_sr, False) + + # Make sure we check for the active fist point. + mock_fist_point.is_active.assert_called_with(util.GCPAUSE_FISTPOINT) + + # Fist point is not active so call abortable sleep. + self.assertEqual(0, mock_abortable.call_count) + @mock.patch('cleanup.SR', autospec=True) @mock.patch('cleanup._abort') def test_lock_released_by_abort_when_held( From efd47af6e259e44a3090ba5a204d7027003a5681 Mon Sep 17 00:00:00 2001 From: Mark Syms Date: Fri, 20 Mar 2020 16:38:59 +0000 Subject: [PATCH 2/2] CA-331454: Remove leaf coalesce journal immediately before deleting parent otherwise it's inconsistent Signed-off-by: Mark Syms --- drivers/cleanup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 220a6daae..19bce3690 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -2112,12 +2112,14 @@ def _doCoalesceLeaf(self, vdi): util.fistpoint.activate("LVHDRT_coaleaf_before_delete", self.uuid) self.deleteVDI(vdi) util.fistpoint.activate("LVHDRT_coaleaf_after_delete", self.uuid) + + util.fistpoint.activate("LVHDRT_coaleaf_before_remove_j", self.uuid) + self.journaler.remove(VDI.JRN_LEAF, vdiUuid) + self.forgetVDI(origParentUuid) self._finishCoalesceLeaf(parent) self._updateSlavesOnResize(parent) - util.fistpoint.activate("LVHDRT_coaleaf_before_remove_j", self.uuid) - self.journaler.remove(VDI.JRN_LEAF, vdiUuid) def _calcExtraSpaceNeeded(self, child, parent): assert(not parent.raw) # raw parents not supported