Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CA-331454: Handle interrupted GC #491

Merged
merged 2 commits into from
Mar 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions drivers/02-vhdcleanup
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/sh
#!/bin/bash
#
# Copyright (C) Citrix Systems Inc.
#
Expand All @@ -23,17 +23,27 @@
CLEANUP_SCRIPT="/opt/xensource/sm/cleanup.py"
LVHD_UTIL_SCRIPT="/opt/xensource/sm/lvhdutil.py"

declare -A LVM_TYPES

for f in lvhdoiscsi lvhdohba lvhdofcoe lvmoiscsi lvmohba lvmofcoe; do
LVM_TYPES[$f]=y
done

start() {
echo -n $"Fixing refcounts on new master: "
for type in lvhdoiscsi lvhdohba lvmoiscsi lvmohba; do
logger -p local2.info "Fixing refcounts on new master: "

TYPES=`xe sm-list required-api-version=1.0 params=type --minimal | tr ',' '\n'`
for type in $TYPES; do
srUuids=`xe sr-list type=$type params=uuid --minimal | sed "s/,/ /g"`
for uuid in $srUuids; do
python $LVHD_UTIL_SCRIPT fixrefcounts $uuid
if [ -n "${LVM_TYPES[$type]+check}" ]; then
echo -n "Fixing $type"
python $LVHD_UTIL_SCRIPT fixrefcounts $uuid
fi
# do a scan to handle any journaled interrupted operations:
# VDI inflate, VHD ops, VDI.clone
xe sr-scan uuid=$uuid
# kick off GC in case there is already work to do
$CLEANUP_SCRIPT -u $uuid -g
done
done
echo -n $"OK"
Expand All @@ -44,6 +54,7 @@ start() {

stop() {
echo -n $"Aborting running cleaners: "
logger -p local2.info "Aborting running cleaners: "
TYPES=`xe sm-list required-api-version=1.0 params=type --minimal | tr ',' '\n'`
for type in $TYPES; do
srUuids=`xe sr-list type=$type params=uuid --minimal | sed "s/,/ /g"`
Expand Down
7 changes: 7 additions & 0 deletions drivers/SR.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,13 @@ def attach(self, uuid):
"""
raise xs_errors.XenError('Unimplemented')

def after_master_attach(self, uuid):
"""Perform actions required after attaching on the pool master
Return:
None
"""
self.scan(uuid)

def detach(self, uuid):
"""Remove local access to the SR. Destroys any device
state initiated by the sr_attach() operation.
Expand Down
10 changes: 8 additions & 2 deletions drivers/SRCommand.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,14 +339,20 @@ def _run(self, sr, target):
if sr.dconf.get("SRmaster") == "true":
is_master = True

sr_uuid = self.params['sr_uuid']

resetvdis.reset_sr(sr.session, util.get_this_host(),
self.params['sr_uuid'], is_master)
sr_uuid, is_master)

if is_master:
# Schedule a scan only when attaching on the SRmaster
util.set_dirty(sr.session, self.params["sr_ref"])

return sr.attach(self.params['sr_uuid'])
try:
return sr.attach(sr_uuid)
finally:
if is_master:
sr.after_master_attach(sr_uuid)

elif self.cmd == 'sr_detach':
return sr.detach(self.params['sr_uuid'])
Expand Down
24 changes: 21 additions & 3 deletions drivers/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@

N_RUNNING_AVERAGE = 10

NON_PERSISTENT_DIR = '/run/nonpersistent/sm'

class AbortException(util.SMException):
pass

Expand Down Expand Up @@ -2110,12 +2112,14 @@ def _doCoalesceLeaf(self, vdi):
util.fistpoint.activate("LVHDRT_coaleaf_before_delete", self.uuid)
self.deleteVDI(vdi)
util.fistpoint.activate("LVHDRT_coaleaf_after_delete", self.uuid)

util.fistpoint.activate("LVHDRT_coaleaf_before_remove_j", self.uuid)
self.journaler.remove(VDI.JRN_LEAF, vdiUuid)

self.forgetVDI(origParentUuid)
self._finishCoalesceLeaf(parent)
self._updateSlavesOnResize(parent)

util.fistpoint.activate("LVHDRT_coaleaf_before_remove_j", self.uuid)
self.journaler.remove(VDI.JRN_LEAF, vdiUuid)

def _calcExtraSpaceNeeded(self, child, parent):
assert(not parent.raw) # raw parents not supported
Expand Down Expand Up @@ -2761,6 +2765,17 @@ def normalizeType(type):
GCPAUSE_DEFAULT_SLEEP = 5 * 60


def _gc_init_file(sr_uuid):
return os.path.join(NON_PERSISTENT_DIR, str(sr_uuid), 'gc_init')


def _create_init_file(sr_uuid):
util.makedirs(os.path.join(NON_PERSISTENT_DIR, str(sr_uuid)))
with open(os.path.join(
NON_PERSISTENT_DIR, str(sr_uuid), 'gc_init'), 'w+') as f:
f.write('1')


def _gcLoopPause(sr, dryRun):

# Check to see if the GCPAUSE_FISTPOINT is present. If so the fist
Expand All @@ -2770,7 +2785,7 @@ def _gcLoopPause(sr, dryRun):

util.fistpoint.activate_custom_fn(util.GCPAUSE_FISTPOINT,
lambda *args: None)
else:
elif os.path.exists(_gc_init_file(sr.uuid)):
def abortTest():
return IPCFlag(sr.uuid).test(FLAG_TYPE_ABORT)

Expand Down Expand Up @@ -2810,6 +2825,8 @@ def _gcLoop(sr, dryRun):
if not sr.gcEnabled():
break
sr.cleanupCoalesceJournals()
# Create the init file here in case startup is waiting on it
_create_init_file(sr.uuid)
sr.scanLocked()
sr.updateBlockInfo()

Expand Down Expand Up @@ -2841,6 +2858,7 @@ def _gcLoop(sr, dryRun):
lockRunning.release()
finally:
Util.log("GC process exiting, no work left")
_create_init_file(sr.uuid)
lockActive.release()

def _gc(session, srUuid, dryRun):
Expand Down
10 changes: 9 additions & 1 deletion drivers/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,15 @@ def makedirs(name, mode=0777):
makedirs(head, mode)
if tail == os.curdir:
return
os.mkdir(name, mode)
try:
os.mkdir(name, mode)
except OSError as exc:
if exc.errno == errno.EEXIST and os.path.isdir(name):
if mode:
os.chmod(name, mode)
pass
else:
raise

def zeroOut(path, fromByte, bytes):
"""write 'bytes' zeros to 'path' starting from fromByte (inclusive)"""
Expand Down
32 changes: 32 additions & 0 deletions tests/test_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,10 @@ def test_gcPause_calls_fist_point(
@mock.patch('util.fistpoint', autospec=True)
@mock.patch('cleanup.SR', autospec=True)
@mock.patch('cleanup.Util.runAbortable')
@mock.patch('os.path.exists', autospec=True)
def test_gcPause_calls_abortable_sleep(
self,
mock_exists,
mock_abortable,
mock_sr,
mock_fist_point):
Expand All @@ -232,6 +234,8 @@ def test_gcPause_calls_abortable_sleep(

# Fake that the fist point is not active.
mock_fist_point.is_active.return_value = False
# The GC init file does exist
mock_exists.return_value = True

cleanup._gcLoopPause(mock_sr, False)

Expand All @@ -243,6 +247,34 @@ def test_gcPause_calls_abortable_sleep(
mock.ANY, cleanup.VDI.POLL_INTERVAL,
cleanup.GCPAUSE_DEFAULT_SLEEP * 1.1)

@mock.patch('util.fistpoint', autospec=True)
@mock.patch('cleanup.SR', autospec=True)
@mock.patch('cleanup.Util.runAbortable')
@mock.patch('os.path.exists', autospec=True)
def test_gcPause_skipped_on_first_run(
self,
mock_exists,
mock_abortable,
mock_sr,
mock_fist_point):
"""
Don't sleep the GC on the first run after host boot.
"""
self.setup_mock_sr(mock_sr)

# Fake that the fist point is not active.
mock_fist_point.is_active.return_value = False
# The GC init file doesn't exist
mock_exists.return_value = False

cleanup._gcLoopPause(mock_sr, False)

# Make sure we check for the active fist point.
mock_fist_point.is_active.assert_called_with(util.GCPAUSE_FISTPOINT)

# Fist point is not active so call abortable sleep.
self.assertEqual(0, mock_abortable.call_count)

@mock.patch('cleanup.SR', autospec=True)
@mock.patch('cleanup._abort')
def test_lock_released_by_abort_when_held(
Expand Down