From dcb3457d7fc9707ebca847245791aadf74c9e6ab Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Mon, 23 Mar 2026 16:40:45 -0500 Subject: [PATCH] libtailscale: fix multiTUN.Write deadlock during wireguard reconfig Fixes tailscale/tailscale#18679 Signed-off-by: Nick Khyl --- libtailscale/backend.go | 12 ++++- libtailscale/multitun.go | 95 +++++++++++++++++++++++++++++++++++++--- libtailscale/net.go | 24 +++++++++- 3 files changed, 122 insertions(+), 9 deletions(-) diff --git a/libtailscale/backend.go b/libtailscale/backend.go index d0daaea27a..9ace7352d9 100644 --- a/libtailscale/backend.go +++ b/libtailscale/backend.go @@ -145,7 +145,10 @@ func (a *App) runBackend(ctx context.Context, hardwareAttestation bool) error { if hardwareAttestation { a.backend.SetHardwareAttested() } - defer b.CloseTUNs() + defer func() { + b.devices.Down() + b.CloseTUNs() + }() hc := localapi.HandlerConfig{ Actor: ipnauth.Self, @@ -253,6 +256,9 @@ func (a *App) runBackend(ctx context.Context, hardwareAttestation bool) error { } case s := <-onDisconnect: if vpnService.service != nil && vpnService.service.ID() == s.ID() { + if b.devices.Down() { + log.Printf("tunnel brought down on disconnect") + } b.CloseTUNs() netns.SetAndroidProtectFunc(nil) netns.SetAndroidBindToNetworkFunc(nil) @@ -399,7 +405,9 @@ func (a *App) closeVpnService(err error, b *backend) { log.Printf("localapi edit prefs error %v", localApiErr) } - b.lastCfg = nil + if b.devices.Down() { + log.Printf("tunnel brought down on VPN service error: %v", err) + } b.CloseTUNs() vpnService.service.DisconnectVPN() diff --git a/libtailscale/multitun.go b/libtailscale/multitun.go index 3fa89efa74..2626f54b91 100644 --- a/libtailscale/multitun.go +++ b/libtailscale/multitun.go @@ -7,8 +7,10 @@ import ( "log" "os" "runtime/debug" + "sync" "github.com/tailscale/wireguard-go/tun" + "tailscale.com/syncs" ) // multiTUN implements a tun.Device that supports multiple @@ -30,6 +32,19 @@ type multiTUN struct { names chan chan nameReply shutdowns chan struct{} shutdownDone chan struct{} + + downMu sync.Mutex + // downCh is closed when the multiTUN is brought down, + // such as when Tailscale transitions to the Stopped state. + // This indicates that all outgoing packets should be dropped, + // and [multiTUN.Write] should return immediately without blocking + // until [multiTUN.Up] is called. See [multiTUN.Down] + // and tailscale/tailscale#18679 for more details. + // + // It can be read without holding downMu, but the mutex + // must be held when writing. + downCh syncs.AtomicValue[chan struct{}] + down bool // whether the downCh is closed } // tunDevice wraps and drives a single run.Device. @@ -76,7 +91,11 @@ func newTUNDevices() *multiTUN { names: make(chan chan nameReply), shutdowns: make(chan struct{}), shutdownDone: make(chan struct{}), + down: true, // The device is initially down. } + downCh := make(chan struct{}) + d.downCh.Store(downCh) + close(downCh) go d.run() return d } @@ -256,6 +275,47 @@ func (d *multiTUN) add(dev tun.Device) { d.devices <- dev } +// Up brings the multiTUN up, allowing it to write packets +// to the underlying tunnel device. If there is no underlying +// device yet, write operations are pended until a new device +// is added with [multiTUN.add]. +// +// It reports whether this call brought the device up. +func (d *multiTUN) Up() bool { + d.downMu.Lock() + defer d.downMu.Unlock() + if !d.down { + return false + } + d.downCh.Store(make(chan struct{})) + d.down = false + return true +} + +// Down brings the multiTUN down, causing all outgoing packets +// to be dropped without waiting for the underlying tunnel device, +// and makes all [multiTUN.Write] calls return immediately +// until [multiTUN.Up] is called. +// +// It mainly exists to distinguish between cases where the underlying +// device is temporarily unavailable due to VPN reconfiguration, +// in which case write requests should be pended, and cases where +// Tailscale is stopped, where any pending and new requests +// should complete immediately to prevent deadlocks. +// See tailscale/tailscale#18679. +// +// It reports whether this call brought the device down. +func (d *multiTUN) Down() bool { + d.downMu.Lock() + defer d.downMu.Unlock() + if d.down { + return false + } + close(d.downCh.Load()) + d.down = true + return true +} + func (d *multiTUN) File() *os.File { // The underlying file descriptor is not constant on Android. // Let's hope no-one uses it. @@ -264,16 +324,39 @@ func (d *multiTUN) File() *os.File { func (d *multiTUN) Read(data [][]byte, sizes []int, offset int) (int, error) { r := make(chan ioReply) - d.reads <- ioRequest{data, sizes, offset, r} - rep := <-r - return rep.count, rep.err + select { + // We don't care about d.downCh here, as it's fine + // to continue waiting until the tunnel is up again + // or the multiTUN device is permanently closed. + // This does not block WireGuard reconfiguration. + case d.reads <- ioRequest{data, sizes, offset, r}: + rep := <-r + return rep.count, rep.err + case <-d.close: + // Return immediately if the multiTUN device is closed. + return 0, os.ErrClosed + } } func (d *multiTUN) Write(data [][]byte, offset int) (int, error) { r := make(chan ioReply) - d.writes <- ioRequest{data, nil, offset, r} - rep := <-r - return rep.count, rep.err + select { + case d.writes <- ioRequest{data, nil, offset, r}: + rep := <-r + return rep.count, rep.err + case <-d.downCh.Load(): + // Drop the packet silently if the tunnel is down. + // Otherwise, a race may occur during wireguard reconfig + // and result in a deadlock, since a wireguard-go/device.Peer + // cannot be removed until its RoutineSequentialReceiver + // returns, and it will not return if it is blocked in + // (*multiTUN).Write while sending to d.writes without + // a receiver on the other side of the pipe. + return 0, nil + case <-d.close: + // Return immediately if the multiTUN device is closed. + return 0, os.ErrClosed + } } func (d *multiTUN) MTU() (int, error) { diff --git a/libtailscale/net.go b/libtailscale/net.go index 29242d8544..b005c98821 100644 --- a/libtailscale/net.go +++ b/libtailscale/net.go @@ -75,7 +75,7 @@ var googleDNSServers = []netip.Addr{ netip.MustParseAddr("2001:4860:4860::8844"), } -func (b *backend) updateTUN(rcfg *router.Config, dcfg *dns.OSConfig) error { +func (b *backend) updateTUN(rcfg *router.Config, dcfg *dns.OSConfig) (err error) { b.logger.Logf("updateTUN: changed") defer b.logger.Logf("updateTUN: finished") @@ -89,6 +89,23 @@ func (b *backend) updateTUN(rcfg *router.Config, dcfg *dns.OSConfig) error { b.CloseTUNs() b.logger.Logf("updateTUN: closed old TUNs") + // Since the previous tunnel(s) are closed, the [multiTUN] device is + // not operational until a new underlying tunnel is created and added, + // which may never happen in case of an error or an empty [router.Config]. + // + // Therefore, to prevent deadlocks where a [multiTUN.Write] would + // block waiting for a new tunnel to be added, we bring the multiTUN + // device down on exit unless a new [tun.Device] is created and added + // successfully. See tailscale/tailscale#18679. + // + // TODO(nickkhyl): revisit and simplify the [multiTUN] implementation? + hasTunnel := false + defer func() { + if !hasTunnel && b.devices.Down() { + b.logger.Logf("updateTUN: tunnel brought down: %v", err) + } + }() + if len(rcfg.LocalAddrs) == 0 { return nil } @@ -182,8 +199,13 @@ func (b *backend) updateTUN(rcfg *router.Config, dcfg *dns.OSConfig) error { b.logger.Logf("updateTUN: created TUN device") b.devices.add(tunDev) + hasTunnel = true b.logger.Logf("updateTUN: added TUN device") + if b.devices.Up() { + b.logger.Logf("tunnel brought up") + } + b.lastCfg = rcfg b.lastDNSCfg = dcfg return nil