From f511716f1095a576c8a6ca8bdf05ab88c6bb2cdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Tue, 23 Feb 2016 07:37:26 +0100 Subject: [PATCH] Fix race in membership and ring consistency Without this change, membership and ring could be not in sync. Scenario: A - current node. B - alive (according to C). B - suspect (according to D). C - thinks B is faulty. D - thinks B is alive. At the same time: 1. C gossips A that B is faulty. A removes B to the membership list. 2. D gossips A that B is alive. A adds B from the membership list. Changes in the ring: 3. The lock on m.members is released, so updates on Ring due to (1) and (2) can execute in any order. In our example, the operations are reversed: 3.1. A adds B to the ring list due to (2). 3.2. A removes B to the ring list due to (1). Now B is in the membership list, but not in the ring. This is the second iteration of the change. First iteration did not introduce a new lock, but a new invariant: a goroutine needs to lock both memberlist and disseminator, lock memberlist first, then disseminator. However, the invariant caused requestAdminStats to fail on travis-ci, and I couldn't find out why. --- swim/memberlist.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/swim/memberlist.go b/swim/memberlist.go index 4a3650f9..39059e37 100644 --- a/swim/memberlist.go +++ b/swim/memberlist.go @@ -45,6 +45,11 @@ type memberlist struct { checksum uint32 sync.RWMutex } + + // Required for Update() operation. We need to keep membership list and + // hash ring atomic, so lock with this mutex in that code. + // TODO: rework locking. This mutex is the last straw. See #113 + sync.Mutex } // newMemberlist returns a new member list @@ -230,6 +235,7 @@ func (m *memberlist) Update(changes []Change) (applied []Change) { m.node.emit(MemberlistChangesReceivedEvent{changes}) + m.Lock() m.members.Lock() for _, change := range changes { @@ -281,6 +287,7 @@ func (m *memberlist) Update(changes []Change) (applied []Change) { m.node.rollup.TrackUpdates(applied) } + m.Unlock() return applied }