3 changes: 2 additions & 1 deletion cmd/viam-agent/main.go
@@ -211,7 +211,8 @@ func setupExitSignalHandling() (context.Context, context.CancelFunc) {
case syscall.SIGABRT:
fallthrough
case syscall.SIGTERM:
globalLogger.Info("exiting")
globalLogger.Infof("Signal %s was received. %s will now exit to be restarted by service manager",
@benjirewis (Member, Author) on Sep 29, 2025:
This change, and the addition and usage of reason for the Exit method in manager.go are for RSDK-11266. Hopefully these messages will be useful for debugging.
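Concretely, the two exit paths now produce messages shaped roughly like the following (illustrative renderings of the format strings in this diff, assuming SubsystemName is "viam-agent"; not captured output):

Signal terminated was received. viam-agent will now exit to be restarted by service manager
A new version of viam-agent has been installed. viam-agent will now exit to be restarted by service manager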

sig, agent.SubsystemName)
signal.Ignore(os.Interrupt, syscall.SIGTERM, syscall.SIGABRT) // keeping SIGQUIT for stack trace debugging
return

137 changes: 106 additions & 31 deletions manager.go
@@ -5,6 +5,7 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"net/url"
"os"
"regexp"
@@ -21,13 +22,18 @@ import (
"github.com/viamrobotics/agent/subsystems/viamserver"
"github.com/viamrobotics/agent/utils"
pb "go.viam.com/api/app/agent/v1"
apppb "go.viam.com/api/app/v1"
"go.viam.com/rdk/logging"
goutils "go.viam.com/utils"
"go.viam.com/utils/rpc"
)

const (
minimalCheckInterval = time.Second * 5
// The minimal (and default) interval for checking for config updates via DeviceAgentConfig.
minimalDeviceAgentConfigCheckInterval = time.Second * 5
// The minimal (and default) interval for checking whether agent needs to be restarted.
minimalNeedsRestartCheckInterval = time.Second * 1
@benjirewis (Member, Author):
We now have two background check goroutines: one checks for a new config every 5s (existing), and another checks for a restart every 1s (new). Each check can receive a new, different interval from the app call, so they need to run at different cadences in different goroutines. You'll also notice that I renamed some interval variables in this file to be more specific about which check each interval belongs to.
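A condensed sketch of that shape (illustrative only; runCheck is a hypothetical helper, not code from this PR, and the real loops are spelled out in StartBackgroundChecks below):

// Each check loops on its own timer; every cloud call may hand back a new
// cadence, which is clamped to that check's minimum before the timer resets.
runCheck := func(ctx context.Context, minInterval time.Duration, check func(context.Context) time.Duration) {
	timer := time.NewTimer(minInterval)
	defer timer.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			interval := check(ctx)
			if interval < minInterval {
				interval = minInterval
			}
			timer.Reset(interval)
		}
	}
}
go runCheck(ctx, minimalDeviceAgentConfigCheckInterval, m.CheckUpdates)
go runCheck(ctx, minimalNeedsRestartCheckInterval, func(ctx context.Context) time.Duration {
	interval, _ := m.CheckIfNeedsRestart(ctx)
	return interval
})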


defaultNetworkTimeout = time.Second * 15
// stopAllTimeout must be lower than systemd subsystems/viamagent/viam-agent.service timeout of 4mins
// and higher than subsystems/viamserver/viamserver.go timeout of 2mins.
@@ -42,7 +48,6 @@ type Manager struct {

connMu sync.RWMutex
conn rpc.ClientConn
client pb.AgentDeviceServiceClient
@benjirewis (Member, Author):
IMO this was a pointless field to store on the manager; creating a gRPC client on top of conn above through which to call DeviceAgentConfig requires no actual, blocking work. It was confusing to store this variable on the struct and check its existence to see if we had dialed already.
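For reference, building the client per call is just a lightweight wrapper around the existing connection, so nothing blocks until the RPC itself (these two lines are from the GetConfig change further down):

agentDeviceServiceClient := pb.NewAgentDeviceServiceClient(m.conn)
resp, err := agentDeviceServiceClient.DeviceAgentConfig(timeoutCtx, req)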

cloudConfig *logging.CloudConfig

logger logging.Logger
@@ -209,7 +214,7 @@ func (m *Manager) SubsystemUpdates(ctx context.Context) {
m.logger.Warn(err)
}
if m.viamAgentNeedsRestart {
m.Exit()
m.Exit(fmt.Sprintf("A new version of %s has been installed", SubsystemName))
return
}
} else {
@@ -221,17 +226,19 @@ func (m *Manager) SubsystemUpdates(ctx context.Context) {
needRestartConfigChange := m.viamServer.Update(ctx, m.cfg)

if needRestart || needRestartConfigChange || m.viamServerNeedsRestart || m.viamAgentNeedsRestart {
if m.viamServer.(viamserver.RestartCheck).SafeToRestart(ctx) {
if m.viamServer.Property(ctx, viamserver.RestartPropertyRestartAllowed) {
@benjirewis (Member, Author):
I removed the RestartCheck interface, and instead added a Property method to the Subsystem interface. I think it's a bit easier to read. I also moved logging about the result of querying restart_allowed to this file (line below this).

m.logger.Infof("%s has allowed a restart; will restart", viamserver.SubsysName)
if err := m.viamServer.Stop(ctx); err != nil {
m.logger.Warn(err)
} else {
m.viamServerNeedsRestart = false
}
if m.viamAgentNeedsRestart {
m.Exit()
m.Exit(fmt.Sprintf("A new version of %s has been installed", SubsystemName))
return
}
} else {
m.logger.Warnf("%s has NOT allowed a restart; will NOT restart", viamserver.SubsysName)
m.viamServerNeedsRestart = true
}
}
@@ -280,26 +287,26 @@ func (m *Manager) SubsystemUpdates(ctx context.Context) {
// CheckUpdates retrieves an updated config from the cloud, and then passes it to SubsystemUpdates().
func (m *Manager) CheckUpdates(ctx context.Context) time.Duration {
defer utils.Recover(m.logger, nil)
m.logger.Debug("Checking cloud for update")
interval, err := m.GetConfig(ctx)
m.logger.Debug("Checking cloud for device agent config updates")
deviceAgentConfigCheckInterval, err := m.GetConfig(ctx)

if interval < minimalCheckInterval {
interval = minimalCheckInterval
if deviceAgentConfigCheckInterval < minimalDeviceAgentConfigCheckInterval {
deviceAgentConfigCheckInterval = minimalDeviceAgentConfigCheckInterval
}

// randomly fuzz the interval by +/- 5%
interval = utils.FuzzTime(interval, 0.05)
deviceAgentConfigCheckInterval = utils.FuzzTime(deviceAgentConfigCheckInterval, 0.05)

// we already log in all error cases inside GetConfig, so
// no need to log again.
if err != nil {
return interval
return deviceAgentConfigCheckInterval
}

// update and (re)start subsystems
m.SubsystemUpdates(ctx)

return interval
return deviceAgentConfigCheckInterval
}

func (m *Manager) setDebug(debug bool) {
@@ -380,13 +387,51 @@ func (m *Manager) SubsystemHealthChecks(ctx context.Context) {
}
}

// CheckIfNeedsRestart returns the check restart interval and whether the agent (and
// therefore all its subsystems) has been forcibly restarted by app.
func (m *Manager) CheckIfNeedsRestart(ctx context.Context) (time.Duration, bool) {
m.logger.Debug("Checking cloud for forced restarts")
if m.cloudConfig == nil {
m.logger.Warn("can't CheckIfNeedsRestart until successful config load")
return minimalNeedsRestartCheckInterval, false
}

// Only continue this check if viam-server does not handle restart checking itself
// (return early if viamserver _does_ handle restart checking).
if !m.viamServer.Property(ctx, viamserver.RestartPropertyDoesNotHandleNeedsRestart) {
return minimalNeedsRestartCheckInterval, false
}

timeoutCtx, cancelFunc := context.WithTimeout(ctx, defaultNetworkTimeout)
defer cancelFunc()

if err := m.dial(timeoutCtx); err != nil {
m.logger.Warn(errw.Wrapf(err, "dialing to check if restart needed"))
return minimalNeedsRestartCheckInterval, false
}

robotServiceClient := apppb.NewRobotServiceClient(m.conn)
req := &apppb.NeedsRestartRequest{Id: m.cloudConfig.ID}
res, err := robotServiceClient.NeedsRestart(timeoutCtx, req)
if err != nil {
m.logger.Warn(errw.Wrapf(err, "checking if restart needed"))
return minimalNeedsRestartCheckInterval, false
}

return res.GetRestartCheckInterval().AsDuration(), res.GetMustRestart()
}

// CloseAll stops all subsystems and closes the cloud connection.
func (m *Manager) CloseAll() {
ctx, cancel := context.WithCancel(context.Background())

// Use a slow goroutine watcher to log and continue if shutdown is taking too long.
slowWatcher, slowWatcherCancel := goutils.SlowGoroutineWatcher(
stopAllTimeout, "Agent is taking a while to shut down,", m.logger)
stopAllTimeout,
fmt.Sprintf("Viam agent subsystems and/or background workers failed to shut down within %v", stopAllTimeout),
@benjirewis (Member, Author):
[drive-by] This log was getting output after agent shutdown timed out, so the message was slightly inaccurate.

m.logger,
)

slowTicker := time.NewTicker(10 * time.Second)
defer slowTicker.Stop()
@@ -430,7 +475,6 @@ func (m *Manager) CloseAll() {
}
}

m.client = nil
m.conn = nil
})

@@ -479,7 +523,8 @@ func (m *Manager) CloseAll() {
}
}

// StartBackgroundChecks kicks off a go routine that loops on a timer to check for updates and health checks.
// StartBackgroundChecks kicks off goroutines that loop on timers to check for updates,
// health checks, and restarts.
func (m *Manager) StartBackgroundChecks(ctx context.Context) {
if ctx.Err() != nil {
return
@@ -495,18 +540,18 @@ func (m *Manager) StartBackgroundChecks(ctx context.Context) {
})
defer m.activeBackgroundWorkers.Done()

checkInterval := minimalCheckInterval
deviceAgentConfigCheckInterval := minimalDeviceAgentConfigCheckInterval
m.cfgMu.RLock()
wait := m.cfg.AdvancedSettings.WaitForUpdateCheck.Get()
m.cfgMu.RUnlock()
if wait {
checkInterval = m.CheckUpdates(ctx)
deviceAgentConfigCheckInterval = m.CheckUpdates(ctx)
} else {
// preemptively start things before we go into the regular update/check/restart
m.SubsystemHealthChecks(ctx)
}

timer := time.NewTimer(checkInterval)
timer := time.NewTimer(deviceAgentConfigCheckInterval)
defer timer.Stop()
for {
if ctx.Err() != nil {
@@ -516,9 +561,39 @@ func (m *Manager) StartBackgroundChecks(ctx context.Context) {
case <-ctx.Done():
return
case <-timer.C:
checkInterval = m.CheckUpdates(ctx)
deviceAgentConfigCheckInterval = m.CheckUpdates(ctx)
m.SubsystemHealthChecks(ctx)
timer.Reset(checkInterval)
timer.Reset(deviceAgentConfigCheckInterval)
}
}
}()

m.activeBackgroundWorkers.Add(1)
go func() {
defer m.activeBackgroundWorkers.Done()

timer := time.NewTimer(minimalNeedsRestartCheckInterval)
defer timer.Stop()
for {
if ctx.Err() != nil {
return
}
select {
case <-ctx.Done():
return
case <-timer.C:
needsRestartCheckInterval, needsRestart := m.CheckIfNeedsRestart(ctx)
if needsRestartCheckInterval < minimalNeedsRestartCheckInterval {
needsRestartCheckInterval = minimalNeedsRestartCheckInterval
}
if needsRestart {
// Do not mark m.viamAgentNeedsRestart and instead Exit immediately; we do not want
// to wait for viam-server to allow a restart as it may be in a bad state.
m.Exit(fmt.Sprintf("A restart of %s was requested from app", SubsystemName))
}
// As with the device agent config check interval, randomly fuzz the interval by
// +/- 5%.
timer.Reset(utils.FuzzTime(needsRestartCheckInterval, 0.05))
@benjirewis (Member, Author):
We were doing this for the config check interval, too... I'm not sure why; anyone know?
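(The usual motivation for jitter like this is to keep a fleet of agents that started polling at the same moment from hitting app in lockstep.) utils.FuzzTime's implementation is not part of this diff, but the call sites assume a +/- percentage helper roughly like this sketch (hypothetical, assumes math/rand and time are imported):

// fuzzTime scales d by a random factor in [1-pct, 1+pct], so pct=0.05
// yields +/- 5% jitter around the requested interval.
func fuzzTime(d time.Duration, pct float64) time.Duration {
	factor := 1 + pct*(2*rand.Float64()-1)
	return time.Duration(float64(d) * factor)
}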

}
}
}()
@@ -531,11 +606,11 @@ func (m *Manager) dial(ctx context.Context) error {
return ctx.Err()
}
if m.cloudConfig == nil {
return errors.New("cannot dial() until successful LoadConfig")
return errors.New("cannot dial() until successful config load")
@benjirewis (Member, Author):
[drive-by] LoadConfig is now called LoadConfigFromCache (changed a while ago), so I changed these errors to be more generic.

}
m.connMu.Lock()
defer m.connMu.Unlock()
if m.client != nil {
if m.conn != nil {
return nil
}

@@ -564,7 +639,6 @@ func (m *Manager) dial(ctx context.Context) error {
return err
}
m.conn = conn
m.client = pb.NewAgentDeviceServiceClient(m.conn)

if m.netAppender != nil {
m.netAppender.SetConn(conn, true)
@@ -577,27 +651,28 @@ func (m *Manager) GetConfig(ctx context.Context) (time.Duration, error) {
// GetConfig retrieves the configuration from the cloud.
func (m *Manager) GetConfig(ctx context.Context) (time.Duration, error) {
if m.cloudConfig == nil {
err := errors.New("can't GetConfig until successful LoadConfig")
err := errors.New("can't GetConfig until successful config load")
m.logger.Warn(err)
return minimalCheckInterval, err
return minimalDeviceAgentConfigCheckInterval, err
}
timeoutCtx, cancelFunc := context.WithTimeout(ctx, defaultNetworkTimeout)
defer cancelFunc()

if err := m.dial(timeoutCtx); err != nil {
m.logger.Warn(errw.Wrapf(err, "fetching %s config", SubsystemName))
return minimalCheckInterval, err
m.logger.Warn(errw.Wrapf(err, "dialing to fetch %s config", SubsystemName))
return minimalDeviceAgentConfigCheckInterval, err
}

agentDeviceServiceClient := pb.NewAgentDeviceServiceClient(m.conn)
req := &pb.DeviceAgentConfigRequest{
Id: m.cloudConfig.ID,
HostInfo: m.getHostInfo(),
VersionInfo: m.getVersions(),
}
resp, err := m.client.DeviceAgentConfig(timeoutCtx, req)
resp, err := agentDeviceServiceClient.DeviceAgentConfig(timeoutCtx, req)
if err != nil {
m.logger.Warn(errw.Wrapf(err, "fetching %s config", SubsystemName))
return minimalCheckInterval, err
return minimalDeviceAgentConfigCheckInterval, err
}
fixWindowsPaths(resp)

@@ -699,7 +774,7 @@ func (m *Manager) getVersions() *pb.VersionInfo {
return vers
}

func (m *Manager) Exit() {
m.logger.Info("A new viam-agent has been installed. Will now exit to be restarted by service manager.")
func (m *Manager) Exit(reason string) {
m.logger.Infof("%s. %s will now exit to be restarted by service manager", reason, SubsystemName)
m.globalCancel()
}
5 changes: 5 additions & 0 deletions subsystems/networking/networking_linux.go
@@ -425,3 +425,8 @@ func (n *Networking) writeWifiPowerSave(ctx context.Context) error {

return nil
}

// Property is a noop for the networking subsystem.
func (n *Networking) Property(_ context.Context, _ string) bool {
return false
}
7 changes: 7 additions & 0 deletions subsystems/subsystems.go
@@ -19,6 +19,9 @@ type Subsystem interface {

// HealthCheck reports if a subsystem is running correctly (it is restarted if not)
HealthCheck(ctx context.Context) error

// Property gets an arbitrary property about the running subsystem.
Property(ctx context.Context, property string) bool
@benjirewis (Member, Author) on Sep 26, 2025:
Here's the new method I added. I realize the interface here is meant to reveal a limited API from manager -> subsystems, but I found that the manager truly needed to know a couple "properties" of the running viamserver subsystem (whether restart was currently allowed and whether viamserver was already handling restart checking logic), so I thought this was worth adding despite it opening a pretty generic API to subsystems.
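The viamserver side of this is not shown in this section, but given the two constants referenced from manager.go, its Property implementation presumably looks something like the sketch below (the safeToRestart and handlesOwnRestartChecks helpers are placeholders, not the PR's actual code):

// Property answers manager queries about the running viam-server process.
func (s *viamServer) Property(ctx context.Context, property string) bool {
	switch property {
	case RestartPropertyRestartAllowed:
		// viam-server has said it is currently safe to restart it
		return s.safeToRestart(ctx)
	case RestartPropertyDoesNotHandleNeedsRestart:
		// this viam-server build expects the agent to poll NeedsRestart for it
		return !s.handlesOwnRestartChecks()
	default:
		return false
	}
}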

}

// Dummy is a fake subsystem for when a particular OS doesn't (yet) have support.
Expand All @@ -39,3 +42,7 @@ func (d *Dummy) Update(_ context.Context, _ utils.AgentConfig) bool {
func (d *Dummy) HealthCheck(_ context.Context) error {
return nil
}

func (d *Dummy) Property(_ context.Context, _ string) bool {
return false
}
5 changes: 5 additions & 0 deletions subsystems/syscfg/syscfg_linux.go
@@ -114,3 +114,8 @@ func (s *syscfg) HealthCheck(ctx context.Context) error {
}
return errors.New("healthcheck failed")
}

// Property is a noop for the syscfg subsystem.
func (s *syscfg) Property(_ context.Context, _ string) bool {
return false
}