From b17e4c877b8669e4dcf22c284723b7fe51cbc5b1 Mon Sep 17 00:00:00 2001 From: Boris Rybalkin Date: Thu, 21 May 2026 11:41:25 +0100 Subject: [PATCH 01/12] health: stability events log + live system metrics page backend/stability: EventLog persists structured events to $SNAP_COMMON/stability-events.jsonl (one JSON line per zram setup / file-swap disable / pressure detection / SIGTERM / SIGKILL). Watcher and Zram both accept an optional *EventLog and append events alongside their existing zap logging. backend/health: New package. Collector reads /proc/stat, /proc/meminfo, /proc/diskstats, /proc/net/dev for live system metrics; statfs of each non-snap mountpoint for capacity. Health{} bundles the EventLog reader + Collector for use by REST. backend/rest: Two new admin-secured endpoints: GET /rest/settings/health/events?limit=N -> recent stability events GET /rest/settings/health/metrics -> single Snapshot web/platform: New Health.vue. Polls metrics every 2 s (computes CPU% / disk-IO KB/s / net rate from snapshot deltas), events every 10 s. Shows per-mount usage bars, swap usage, and the stability event history. Listed under Settings with a 'favorite' material icon. --- backend/cmd/stability/main.go | 9 +- backend/health/health.go | 22 +++ backend/health/metrics.go | 234 ++++++++++++++++++++++++++++ backend/health/metrics_test.go | 71 +++++++++ backend/health/mounts.go | 45 ++++++ backend/ioc/common.go | 21 +++ backend/ioc/public_api.go | 4 +- backend/rest/backend.go | 21 +++ backend/stability/events.go | 89 +++++++++++ backend/stability/events_test.go | 45 ++++++ backend/stability/oom.go | 13 +- backend/stability/oom_test.go | 4 +- backend/stability/zram.go | 10 +- web/platform/src/locales/en.json | 28 +++- web/platform/src/router/index.js | 1 + web/platform/src/views/Health.vue | 218 ++++++++++++++++++++++++++ web/platform/src/views/Settings.vue | 7 + 17 files changed, 834 insertions(+), 8 deletions(-) create mode 100644 backend/health/health.go create mode 100644 backend/health/metrics.go create mode 100644 backend/health/metrics_test.go create mode 100644 backend/health/mounts.go create mode 100644 backend/stability/events.go create mode 100644 backend/stability/events_test.go create mode 100644 web/platform/src/views/Health.vue diff --git a/backend/cmd/stability/main.go b/backend/cmd/stability/main.go index 08ce1160..c955ad54 100644 --- a/backend/cmd/stability/main.go +++ b/backend/cmd/stability/main.go @@ -16,7 +16,12 @@ func main() { defer cancel() mem := stability.NewMemInfo("/proc") - z := stability.NewZram(mem, stability.SwaponSyscall, stability.SwapoffSyscall, logger) + commonDir := os.Getenv("SNAP_COMMON") + if commonDir == "" { + commonDir = "/var/snap/platform/common" + } + events := stability.NewEventLog(commonDir + "/stability-events.jsonl") + z := stability.NewZram(mem, stability.SwaponSyscall, stability.SwapoffSyscall, events, logger) if err := z.EnsureConfigured(); err != nil { logger.Sugar().Warnf("stability: zram setup failed (continuing): %v", err) } @@ -24,7 +29,7 @@ func main() { scan := stability.NewProcScanner("/proc") w := stability.NewWatcher(mem, scan, func(pid int, sig syscall.Signal) error { return syscall.Kill(pid, sig) - }, logger) + }, events, logger) if err := w.Run(ctx); err != nil && err != context.Canceled { logger.Sugar().Errorf("stability: watcher exited: %v", err) diff --git a/backend/health/health.go b/backend/health/health.go new file mode 100644 index 00000000..af5efbf9 --- /dev/null +++ b/backend/health/health.go @@ -0,0 +1,22 @@ +package health + +import ( + "github.com/syncloud/platform/stability" +) + +type Health struct { + events *stability.EventLog + collector *Collector +} + +func NewHealth(events *stability.EventLog, collector *Collector) *Health { + return &Health{events: events, collector: collector} +} + +func (h *Health) Events(limit int) ([]stability.Event, error) { + return h.events.Recent(limit) +} + +func (h *Health) Metrics() (Snapshot, error) { + return h.collector.Snapshot() +} diff --git a/backend/health/metrics.go b/backend/health/metrics.go new file mode 100644 index 00000000..c10603bf --- /dev/null +++ b/backend/health/metrics.go @@ -0,0 +1,234 @@ +package health + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" +) + +type CPU struct { + User uint64 `json:"user"` + Nice uint64 `json:"nice"` + System uint64 `json:"system"` + Idle uint64 `json:"idle"` + IOWait uint64 `json:"iowait"` + IRQ uint64 `json:"irq"` + SoftIRQ uint64 `json:"softirq"` + Steal uint64 `json:"steal"` +} + +func (c CPU) Total() uint64 { + return c.User + c.Nice + c.System + c.Idle + c.IOWait + c.IRQ + c.SoftIRQ + c.Steal +} + +func (c CPU) Busy() uint64 { + return c.Total() - c.Idle - c.IOWait +} + +type Memory struct { + TotalKB uint64 `json:"total_kb"` + AvailableKB uint64 `json:"available_kb"` + FreeKB uint64 `json:"free_kb"` + BuffersKB uint64 `json:"buffers_kb"` + CachedKB uint64 `json:"cached_kb"` + SwapTotalKB uint64 `json:"swap_total_kb"` + SwapFreeKB uint64 `json:"swap_free_kb"` +} + +type Disk struct { + Name string `json:"name"` + ReadsTotal uint64 `json:"reads_total"` + WritesTotal uint64 `json:"writes_total"` + SectorsRead uint64 `json:"sectors_read"` + SectorsWrt uint64 `json:"sectors_written"` +} + +type Mount struct { + Path string `json:"path"` + TotalKB uint64 `json:"total_kb"` + UsedKB uint64 `json:"used_kb"` +} + +type Net struct { + Name string `json:"name"` + RxBytes uint64 `json:"rx_bytes"` + TxBytes uint64 `json:"tx_bytes"` +} + +type Snapshot struct { + CPU CPU `json:"cpu"` + Memory Memory `json:"memory"` + Disks []Disk `json:"disks"` + Mounts []Mount `json:"mounts"` + Net []Net `json:"net"` +} + +type Collector struct { + procDir string +} + +func NewCollector(procDir string) *Collector { + return &Collector{procDir: procDir} +} + +func (c *Collector) Snapshot() (Snapshot, error) { + var s Snapshot + cpu, err := readCPU(filepath.Join(c.procDir, "stat")) + if err != nil { + return s, err + } + s.CPU = cpu + mem, err := readMemory(filepath.Join(c.procDir, "meminfo")) + if err != nil { + return s, err + } + s.Memory = mem + s.Disks, _ = readDisks(filepath.Join(c.procDir, "diskstats")) + s.Net, _ = readNet(filepath.Join(c.procDir, "net/dev")) + s.Mounts = c.Mounts() + return s, nil +} + +func readCPU(path string) (CPU, error) { + f, err := os.Open(path) + if err != nil { + return CPU{}, err + } + defer f.Close() + sc := bufio.NewScanner(f) + for sc.Scan() { + line := sc.Text() + if !strings.HasPrefix(line, "cpu ") { + continue + } + fields := strings.Fields(line) + nums := make([]uint64, 0, 8) + for _, fld := range fields[1:] { + n, _ := strconv.ParseUint(fld, 10, 64) + nums = append(nums, n) + } + for len(nums) < 8 { + nums = append(nums, 0) + } + return CPU{nums[0], nums[1], nums[2], nums[3], nums[4], nums[5], nums[6], nums[7]}, nil + } + return CPU{}, fmt.Errorf("cpu: 'cpu ' line missing") +} + +func readMemory(path string) (Memory, error) { + f, err := os.Open(path) + if err != nil { + return Memory{}, err + } + defer f.Close() + m := Memory{} + sc := bufio.NewScanner(f) + for sc.Scan() { + line := sc.Text() + idx := strings.IndexByte(line, ':') + if idx < 0 { + continue + } + key := line[:idx] + rest := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(line[idx+1:]), " kB")) + v, _ := strconv.ParseUint(rest, 10, 64) + switch key { + case "MemTotal": + m.TotalKB = v + case "MemAvailable": + m.AvailableKB = v + case "MemFree": + m.FreeKB = v + case "Buffers": + m.BuffersKB = v + case "Cached": + m.CachedKB = v + case "SwapTotal": + m.SwapTotalKB = v + case "SwapFree": + m.SwapFreeKB = v + } + } + return m, sc.Err() +} + +func readDisks(path string) ([]Disk, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + var out []Disk + sc := bufio.NewScanner(f) + for sc.Scan() { + fields := strings.Fields(sc.Text()) + if len(fields) < 14 { + continue + } + name := fields[2] + if isPartition(name) { + continue + } + reads, _ := strconv.ParseUint(fields[3], 10, 64) + sectorsRead, _ := strconv.ParseUint(fields[5], 10, 64) + writes, _ := strconv.ParseUint(fields[7], 10, 64) + sectorsWrt, _ := strconv.ParseUint(fields[9], 10, 64) + out = append(out, Disk{ + Name: name, + ReadsTotal: reads, + SectorsRead: sectorsRead, + WritesTotal: writes, + SectorsWrt: sectorsWrt, + }) + } + return out, sc.Err() +} + +func isPartition(name string) bool { + if strings.HasPrefix(name, "loop") || strings.HasPrefix(name, "ram") || strings.HasPrefix(name, "dm-") { + return true + } + if len(name) == 0 { + return true + } + last := name[len(name)-1] + if last < '0' || last > '9' { + return false + } + if strings.HasPrefix(name, "mmcblk") || strings.HasPrefix(name, "nvme") { + return strings.Contains(name, "p") + } + return true +} + +func readNet(path string) ([]Net, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + var out []Net + sc := bufio.NewScanner(f) + for sc.Scan() { + line := sc.Text() + idx := strings.IndexByte(line, ':') + if idx < 0 { + continue + } + name := strings.TrimSpace(line[:idx]) + if name == "lo" { + continue + } + fields := strings.Fields(line[idx+1:]) + if len(fields) < 9 { + continue + } + rx, _ := strconv.ParseUint(fields[0], 10, 64) + tx, _ := strconv.ParseUint(fields[8], 10, 64) + out = append(out, Net{Name: name, RxBytes: rx, TxBytes: tx}) + } + return out, sc.Err() +} diff --git a/backend/health/metrics_test.go b/backend/health/metrics_test.go new file mode 100644 index 00000000..e1656fb0 --- /dev/null +++ b/backend/health/metrics_test.go @@ -0,0 +1,71 @@ +package health + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func writeProc(t *testing.T, dir, rel, contents string) { + t.Helper() + p := filepath.Join(dir, rel) + require.NoError(t, os.MkdirAll(filepath.Dir(p), 0755)) + require.NoError(t, os.WriteFile(p, []byte(contents), 0644)) +} + +func TestSnapshotEndToEnd(t *testing.T) { + dir := t.TempDir() + writeProc(t, dir, "stat", "cpu 1000 50 200 5000 30 0 10 0\ncpu0 ...\n") + writeProc(t, dir, "meminfo", "MemTotal: 3700000 kB\nMemAvailable: 1500000 kB\nMemFree: 200000 kB\nBuffers: 50000 kB\nCached: 900000 kB\nSwapTotal: 2000000 kB\nSwapFree: 1500000 kB\n") + writeProc(t, dir, "diskstats", " 8 0 sda 100 0 200 0 10 0 20 0 0 0 0 0 0 0\n"+ + " 8 1 sda1 50 0 100 0 5 0 10 0 0 0 0 0 0 0\n"+ + " 179 0 mmcblk0 1000 0 2000 0 100 0 200 0 0 0 0 0 0 0\n"+ + " 179 1 mmcblk0p1 500 0 1000 0 50 0 100 0 0 0 0 0 0 0\n"+ + " 7 0 loop0 1 0 2 0 0 0 0 0 0 0 0 0 0 0\n") + writeProc(t, dir, "net/dev", `Inter-| Receive | Transmit + face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed + lo: 1000 10 0 0 0 0 0 0 1000 10 0 0 0 0 0 0 + eth0: 5000 20 0 0 0 0 0 0 8000 30 0 0 0 0 0 0 +`) + s, err := NewCollector(dir).Snapshot() + require.NoError(t, err) + assert.Equal(t, uint64(1000), s.CPU.User) + assert.Equal(t, uint64(5000), s.CPU.Idle) + assert.Equal(t, uint64(3700000), s.Memory.TotalKB) + assert.Equal(t, uint64(1500000), s.Memory.AvailableKB) + assert.Equal(t, uint64(2000000), s.Memory.SwapTotalKB) + + names := []string{} + for _, d := range s.Disks { + names = append(names, d.Name) + } + assert.ElementsMatch(t, []string{"sda", "mmcblk0"}, names) + + require.Len(t, s.Net, 1) + assert.Equal(t, "eth0", s.Net[0].Name) + assert.Equal(t, uint64(5000), s.Net[0].RxBytes) + assert.Equal(t, uint64(8000), s.Net[0].TxBytes) +} + +func TestIsPartition(t *testing.T) { + cases := []struct { + name string + want bool + }{ + {"sda", false}, + {"sda1", true}, + {"mmcblk0", false}, + {"mmcblk0p1", true}, + {"nvme0n1", false}, + {"nvme0n1p1", true}, + {"loop0", true}, + {"dm-0", true}, + {"ram0", true}, + } + for _, c := range cases { + assert.Equal(t, c.want, isPartition(c.name), c.name) + } +} diff --git a/backend/health/mounts.go b/backend/health/mounts.go new file mode 100644 index 00000000..fb74904f --- /dev/null +++ b/backend/health/mounts.go @@ -0,0 +1,45 @@ +package health + +import ( + "bufio" + "os" + "path/filepath" + "strings" + "syscall" +) + +func (c *Collector) Mounts() []Mount { + f, err := os.Open(filepath.Join(c.procDir, "mounts")) + if err != nil { + return nil + } + defer f.Close() + var out []Mount + seen := map[string]bool{} + sc := bufio.NewScanner(f) + for sc.Scan() { + fields := strings.Fields(sc.Text()) + if len(fields) < 3 { + continue + } + dev, mount, fs := fields[0], fields[1], fields[2] + if !strings.HasPrefix(dev, "/dev/") { + continue + } + if fs == "squashfs" || fs == "tmpfs" || fs == "devtmpfs" { + continue + } + if seen[mount] { + continue + } + seen[mount] = true + var st syscall.Statfs_t + if err := syscall.Statfs(mount, &st); err != nil { + continue + } + total := st.Blocks * uint64(st.Bsize) / 1024 + free := st.Bavail * uint64(st.Bsize) / 1024 + out = append(out, Mount{Path: mount, TotalKB: total, UsedKB: total - free}) + } + return out +} diff --git a/backend/ioc/common.go b/backend/ioc/common.go index 80257dbf..274aa376 100644 --- a/backend/ioc/common.go +++ b/backend/ioc/common.go @@ -15,6 +15,7 @@ import ( "github.com/syncloud/platform/date" "github.com/syncloud/platform/du" "github.com/syncloud/platform/event" + "github.com/syncloud/platform/health" "github.com/syncloud/platform/hook" "github.com/syncloud/platform/identification" "github.com/syncloud/platform/installer" @@ -26,6 +27,7 @@ import ( "github.com/syncloud/platform/rest" "github.com/syncloud/platform/session" "github.com/syncloud/platform/snap" + "github.com/syncloud/platform/stability" "github.com/syncloud/platform/storage" "github.com/syncloud/platform/storage/btrfs" "github.com/syncloud/platform/support" @@ -570,5 +572,24 @@ func Init(userConfig string, systemConfig string, backupDir string, varDir strin return nil, err } + err = c.Singleton(func() *stability.EventLog { + return stability.NewEventLog("/var/snap/platform/common/stability-events.jsonl") + }) + if err != nil { + return nil, err + } + err = c.Singleton(func() *health.Collector { + return health.NewCollector("/proc") + }) + if err != nil { + return nil, err + } + err = c.Singleton(func(events *stability.EventLog, collector *health.Collector) *health.Health { + return health.NewHealth(events, collector) + }) + if err != nil { + return nil, err + } + return c, nil } diff --git a/backend/ioc/public_api.go b/backend/ioc/public_api.go index 17445b43..1ee08aa8 100644 --- a/backend/ioc/public_api.go +++ b/backend/ioc/public_api.go @@ -9,6 +9,7 @@ import ( "github.com/syncloud/platform/config" "github.com/syncloud/platform/cron" "github.com/syncloud/platform/event" + "github.com/syncloud/platform/health" "github.com/syncloud/platform/identification" "github.com/syncloud/platform/installer" "github.com/syncloud/platform/job" @@ -39,12 +40,13 @@ func InitPublicApi(userConfig string, systemConfig string, backupDir string, var changesClient *snap.ChangesClient, oidcService *auth.OIDCService, authelia *auth.Authelia, totp *auth.TOTP, tz *timezone.Applier, + healthService *health.Health, ) *rest.Backend { return rest.NewBackend(master, backupService, eventTrigger, worker, redirectService, installerService, storageService, id, activate, userConfig, redirectConfig, cert, externalAddress, snapd, disks, journalCtl, executor, iface, sender, proxy, customProxy, ldapService, middleware, cookies, net, address, changesClient, - oidcService, authelia, totp, tz, logger) + oidcService, authelia, totp, tz, healthService, logger) }) if err != nil { return nil, err diff --git a/backend/rest/backend.go b/backend/rest/backend.go index c6b9f99f..ed551cb4 100644 --- a/backend/rest/backend.go +++ b/backend/rest/backend.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "os" + "strconv" "strings" "time" @@ -15,6 +16,7 @@ import ( "github.com/syncloud/platform/cli" "github.com/syncloud/platform/config" "github.com/syncloud/platform/event" + "github.com/syncloud/platform/health" "github.com/syncloud/platform/identification" "github.com/syncloud/platform/installer" "github.com/syncloud/platform/job" @@ -62,6 +64,7 @@ type Backend struct { authelia *auth.Authelia totp *auth.TOTP timezone *timezone.Applier + health *health.Health network string address string logger *zap.Logger @@ -79,6 +82,7 @@ func NewBackend( changesClient *snap.ChangesClient, oidcService *auth.OIDCService, authelia *auth.Authelia, totp *auth.TOTP, timezone *timezone.Applier, + healthService *health.Health, logger *zap.Logger) *Backend { return &Backend{ @@ -110,6 +114,7 @@ func NewBackend( authelia: authelia, totp: totp, timezone: timezone, + health: healthService, network: network, address: address, changesClient: changesClient, @@ -152,6 +157,8 @@ func (b *Backend) Start() error { r.HandleFunc("/rest/settings/timezone", b.mw.FailIfNotActivated(b.mw.SecuredHandle(b.GetTimezone))).Methods("GET") r.HandleFunc("/rest/settings/timezone", b.mw.FailIfNotActivated(b.mw.AdminSecuredHandle(b.SetTimezone))).Methods("POST") r.HandleFunc("/rest/settings/time", b.mw.FailIfNotActivated(b.mw.SecuredHandle(b.GetTime))).Methods("GET") + r.HandleFunc("/rest/settings/health/events", b.mw.FailIfNotActivated(b.mw.AdminSecuredHandle(b.HealthEvents))).Methods("GET") + r.HandleFunc("/rest/settings/health/metrics", b.mw.FailIfNotActivated(b.mw.AdminSecuredHandle(b.HealthMetrics))).Methods("GET") // /rest/totp/setup is handled by the login service, not the backend r.HandleFunc("/rest/job/status", b.mw.FailIfNotActivated(b.mw.AdminSecuredHandle(b.JobStatus))).Methods("GET") r.HandleFunc("/rest/backup/list", b.mw.FailIfNotActivated(b.mw.AdminSecuredHandle(b.BackupList))).Methods("GET") @@ -611,6 +618,20 @@ func (b *Backend) UserLogout(w http.ResponseWriter, req *http.Request) { http.Redirect(w, req, autheliaLogout, http.StatusFound) } +func (b *Backend) HealthEvents(req *http.Request) (interface{}, error) { + limit := 100 + if v := req.URL.Query().Get("limit"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 && n <= 1000 { + limit = n + } + } + return b.health.Events(limit) +} + +func (b *Backend) HealthMetrics(_ *http.Request) (interface{}, error) { + return b.health.Metrics() +} + func (b *Backend) GetTwoFactorSettings(_ *http.Request) (interface{}, error) { return map[string]interface{}{ "enabled": b.userConfig.IsTwoFactorEnabled(), diff --git a/backend/stability/events.go b/backend/stability/events.go new file mode 100644 index 00000000..e7a983fe --- /dev/null +++ b/backend/stability/events.go @@ -0,0 +1,89 @@ +package stability + +import ( + "encoding/json" + "errors" + "os" + "sync" + "time" +) + +type EventKind string + +const ( + EventKindZramEnabled EventKind = "zram_enabled" + EventKindSwapoffFile EventKind = "swapoff_file" + EventKindPressure EventKind = "pressure_detected" + EventKindVictimSigterm EventKind = "victim_sigterm" + EventKindVictimSigkill EventKind = "victim_sigkill" +) + +type Event struct { + Time time.Time `json:"time"` + Kind EventKind `json:"kind"` + Message string `json:"message,omitempty"` + PID int `json:"pid,omitempty"` + Comm string `json:"comm,omitempty"` + RSSkb uint64 `json:"rss_kb,omitempty"` + Cgroup string `json:"cgroup,omitempty"` + AvailRatio float64 `json:"avail_ratio,omitempty"` + PSIavg10 float64 `json:"psi_avg10,omitempty"` + Path string `json:"path,omitempty"` + SizeBytes uint64 `json:"size_bytes,omitempty"` +} + +type EventLog struct { + path string + mu sync.Mutex +} + +func NewEventLog(path string) *EventLog { + return &EventLog{path: path} +} + +func (l *EventLog) Append(e Event) error { + if e.Time.IsZero() { + e.Time = time.Now().UTC() + } + l.mu.Lock() + defer l.mu.Unlock() + f, err := os.OpenFile(l.path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return err + } + defer f.Close() + enc := json.NewEncoder(f) + return enc.Encode(e) +} + +func (l *EventLog) Recent(limit int) ([]Event, error) { + if limit <= 0 { + limit = 100 + } + l.mu.Lock() + defer l.mu.Unlock() + f, err := os.Open(l.path) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return []Event{}, nil + } + return nil, err + } + defer f.Close() + dec := json.NewDecoder(f) + var all []Event + for { + var e Event + if err := dec.Decode(&e); err != nil { + break + } + all = append(all, e) + } + if len(all) > limit { + all = all[len(all)-limit:] + } + for i, j := 0, len(all)-1; i < j; i, j = i+1, j-1 { + all[i], all[j] = all[j], all[i] + } + return all, nil +} diff --git a/backend/stability/events_test.go b/backend/stability/events_test.go new file mode 100644 index 00000000..6c03a9cc --- /dev/null +++ b/backend/stability/events_test.go @@ -0,0 +1,45 @@ +package stability + +import ( + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestAppendAndRecentReverseOrder(t *testing.T) { + dir := t.TempDir() + log := NewEventLog(filepath.Join(dir, "events.jsonl")) + require.NoError(t, log.Append(Event{Kind: EventKindZramEnabled, SizeBytes: 1 << 30})) + require.NoError(t, log.Append(Event{Kind: EventKindPressure, AvailRatio: 0.05})) + require.NoError(t, log.Append(Event{Kind: EventKindVictimSigterm, PID: 1234, Comm: "python3", RSSkb: 2000000})) + + evs, err := log.Recent(10) + require.NoError(t, err) + require.Len(t, evs, 3) + assert.Equal(t, EventKindVictimSigterm, evs[0].Kind) + assert.Equal(t, "python3", evs[0].Comm) + assert.Equal(t, EventKindPressure, evs[1].Kind) + assert.Equal(t, EventKindZramEnabled, evs[2].Kind) +} + +func TestRecentMissingFileReturnsEmpty(t *testing.T) { + dir := t.TempDir() + evs, err := NewEventLog(filepath.Join(dir, "nope.jsonl")).Recent(10) + require.NoError(t, err) + assert.Empty(t, evs) +} + +func TestRecentCapsLimit(t *testing.T) { + dir := t.TempDir() + log := NewEventLog(filepath.Join(dir, "events.jsonl")) + for i := 0; i < 20; i++ { + require.NoError(t, log.Append(Event{Kind: EventKindPressure, PID: i})) + } + evs, err := log.Recent(5) + require.NoError(t, err) + require.Len(t, evs, 5) + assert.Equal(t, 19, evs[0].PID) + assert.Equal(t, 15, evs[4].PID) +} diff --git a/backend/stability/oom.go b/backend/stability/oom.go index ff5cf8d9..be38079c 100644 --- a/backend/stability/oom.go +++ b/backend/stability/oom.go @@ -17,6 +17,7 @@ type Watcher struct { scan *ProcScanner protect Protect kill KillFn + events *EventLog log *zap.Logger interval time.Duration availMin float64 @@ -25,12 +26,13 @@ type Watcher struct { selfPID int } -func NewWatcher(mem *MemInfo, scan *ProcScanner, kill KillFn, log *zap.Logger) *Watcher { +func NewWatcher(mem *MemInfo, scan *ProcScanner, kill KillFn, events *EventLog, log *zap.Logger) *Watcher { return &Watcher{ mem: mem, scan: scan, protect: DefaultProtect(), kill: kill, + events: events, log: log, interval: 2 * time.Second, availMin: 0.08, @@ -82,6 +84,9 @@ func (w *Watcher) tick() error { zap.Float64("psi_avg10", psi), zap.Bool("psi_ok", psiOK), ) + if w.events != nil { + _ = w.events.Append(Event{Kind: EventKindPressure, AvailRatio: avail, PSIavg10: psi}) + } return w.killWorst() } @@ -110,6 +115,9 @@ func (w *Watcher) killWorst() error { zap.Uint64("rss_kb", v.RSSkB), zap.String("cgroup", v.Cgroup), ) + if w.events != nil { + _ = w.events.Append(Event{Kind: EventKindVictimSigterm, PID: v.PID, Comm: v.Comm, RSSkb: v.RSSkB, Cgroup: v.Cgroup}) + } if err := w.kill(v.PID, syscall.SIGTERM); err != nil { if errors.Is(err, syscall.ESRCH) { return nil @@ -124,6 +132,9 @@ func (w *Watcher) killWorst() error { time.Sleep(200 * time.Millisecond) } w.log.Warn("oom-watcher: SIGKILL victim", zap.Int("pid", v.PID), zap.String("comm", v.Comm)) + if w.events != nil { + _ = w.events.Append(Event{Kind: EventKindVictimSigkill, PID: v.PID, Comm: v.Comm, RSSkb: v.RSSkB, Cgroup: v.Cgroup}) + } if err := w.kill(v.PID, syscall.SIGKILL); err != nil && !errors.Is(err, syscall.ESRCH) { return err } diff --git a/backend/stability/oom_test.go b/backend/stability/oom_test.go index b9f73c2b..012226b7 100644 --- a/backend/stability/oom_test.go +++ b/backend/stability/oom_test.go @@ -35,7 +35,7 @@ func newWatcherWithProc(t *testing.T, memTotal, memAvail uint64, procDir string) root := t.TempDir() procRoot := root writeProcFile(t, procRoot, "meminfo", "MemTotal: "+strconvUint(memTotal)+" kB\nMemAvailable: "+strconvUint(memAvail)+" kB\n") - return NewWatcher(NewMemInfo(procRoot), NewProcScanner(procDir), nil, zap.NewNop()) + return NewWatcher(NewMemInfo(procRoot), NewProcScanner(procDir), nil, nil, zap.NewNop()) } func TestTickNoActionWhenHealthy(t *testing.T) { @@ -85,7 +85,7 @@ func TestKillWorstReturnsErrNoVictim(t *testing.T) { } func TestPressureExceededByAvailOrPSI(t *testing.T) { - w := NewWatcher(NewMemInfo(t.TempDir()), nil, nil, zap.NewNop()) + w := NewWatcher(NewMemInfo(t.TempDir()), nil, nil, nil, zap.NewNop()) assert.True(t, w.pressureExceeded(0.05, 0, false)) assert.False(t, w.pressureExceeded(0.30, 0, false)) assert.True(t, w.pressureExceeded(0.30, 50, true)) diff --git a/backend/stability/zram.go b/backend/stability/zram.go index 26281630..b57097a8 100644 --- a/backend/stability/zram.go +++ b/backend/stability/zram.go @@ -33,10 +33,11 @@ type Zram struct { mem *MemInfo swapon SwaponFn swapoff SwapoffFn + events *EventLog log *zap.Logger } -func NewZram(mem *MemInfo, swapon SwaponFn, swapoff SwapoffFn, log *zap.Logger) *Zram { +func NewZram(mem *MemInfo, swapon SwaponFn, swapoff SwapoffFn, events *EventLog, log *zap.Logger) *Zram { return &Zram{ sysBlock: zramSysBlockDefault, hotAdd: zramHotAddDefault, @@ -45,6 +46,7 @@ func NewZram(mem *MemInfo, swapon SwaponFn, swapoff SwapoffFn, log *zap.Logger) mem: mem, swapon: swapon, swapoff: swapoff, + events: events, log: log, } } @@ -83,6 +85,9 @@ func (z *Zram) EnsureConfigured() error { return fmt.Errorf("zram: swapon: %w", err) } z.log.Info("zram: enabled", zap.Uint64("size_bytes", size), zap.Int("priority", zramPriority)) + if z.events != nil { + _ = z.events.Append(Event{Kind: EventKindZramEnabled, SizeBytes: size}) + } if err := z.disableFileSwaps(); err != nil { z.log.Warn("zram: file-swap disable failed", zap.Error(err)) } @@ -107,6 +112,9 @@ func (z *Zram) disableFileSwaps() error { continue } z.log.Info("zram: swapoff file swap", zap.String("path", fields[0])) + if z.events != nil { + _ = z.events.Append(Event{Kind: EventKindSwapoffFile, Path: fields[0]}) + } } return nil } diff --git a/web/platform/src/locales/en.json b/web/platform/src/locales/en.json index 3b23d4db..dc6f313f 100644 --- a/web/platform/src/locales/en.json +++ b/web/platform/src/locales/en.json @@ -53,7 +53,33 @@ "logs": "Logs", "customProxy": "Custom Proxy", "system": "System", - "locale": "Locale" + "locale": "Locale", + "health": "Health" + }, + "health": { + "title": "Health", + "cpu": "CPU", + "memory": "Memory", + "swap": "Swap", + "disks": "Disks", + "network": "Network", + "events": "Stability events", + "noEvents": "No events recorded yet.", + "colTime": "Time", + "colKind": "Kind", + "colDetails": "Details", + "used": "used", + "total": "total", + "available": "available", + "ioRead": "read", + "ioWrite": "write", + "netRx": "down", + "netTx": "up", + "kindZramEnabled": "zram enabled", + "kindSwapoffFile": "file swap disabled", + "kindPressure": "memory pressure detected", + "kindVictimSigterm": "process terminated (SIGTERM)", + "kindVictimSigkill": "process killed (SIGKILL)" }, "activation": { "title": "Activation", diff --git a/web/platform/src/router/index.js b/web/platform/src/router/index.js index cc6a245e..da7f2cc5 100644 --- a/web/platform/src/router/index.js +++ b/web/platform/src/router/index.js @@ -22,6 +22,7 @@ const routes = [ { path: '/customproxy', name: 'CustomProxy', component: () => import('../views/CustomProxy.vue') }, { path: '/system', name: 'System', component: () => import('../views/System.vue') }, { path: '/locale', name: 'Locale', component: () => import('../views/Locale.vue') }, + { path: '/health', name: 'Health', component: () => import('../views/Health.vue') }, { path: '/:catchAll(.*)', redirect: '/' } ] diff --git a/web/platform/src/views/Health.vue b/web/platform/src/views/Health.vue new file mode 100644 index 00000000..0c022bd4 --- /dev/null +++ b/web/platform/src/views/Health.vue @@ -0,0 +1,218 @@ + + + + + diff --git a/web/platform/src/views/Settings.vue b/web/platform/src/views/Settings.vue index d39a88c2..59353775 100644 --- a/web/platform/src/views/Settings.vue +++ b/web/platform/src/views/Settings.vue @@ -103,6 +103,13 @@ +
+ + favorite +
{{ $t('settings.health') }}
+
+
+ From 9021b688885427d10ad29a2d9777dc76f57c6c8e Mon Sep 17 00:00:00 2001 From: Boris Rybalkin Date: Thu, 21 May 2026 12:38:16 +0100 Subject: [PATCH 02/12] health: drop unused catch binding (eslint no-unused-vars) --- web/platform/src/views/Health.vue | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/platform/src/views/Health.vue b/web/platform/src/views/Health.vue index 0c022bd4..59122146 100644 --- a/web/platform/src/views/Health.vue +++ b/web/platform/src/views/Health.vue @@ -118,7 +118,7 @@ export default { return kind.split('_').map(s => s.charAt(0).toUpperCase() + s.slice(1)).join('') }, fmtTime (iso) { - try { return new Date(iso).toLocaleString() } catch (e) { return iso } + try { return new Date(iso).toLocaleString() } catch { return iso } }, fmtDetails (e) { const parts = [] From 5e80787f2a83af4c3146173504d60ff2979c794d Mon Sep 17 00:00:00 2001 From: Boris Rybalkin Date: Thu, 21 May 2026 13:00:56 +0100 Subject: [PATCH 03/12] stub: realistic data for Health page CPU ticks advance with sine-shaped busy/idle, mem/swap usage oscillates so the live bars actually move, net/disk byte counters grow with random deltas to produce realistic KB/s rates. Events list includes a recent SIGTERM + earlier SIGKILL chain plus a zram_enabled + swapoff_file pair matching what the borisarm64 OOM stress test produced. --- web/platform/src/stub/api.js | 62 ++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/web/platform/src/stub/api.js b/web/platform/src/stub/api.js index 32ee35e4..bb5cce85 100644 --- a/web/platform/src/stub/api.js +++ b/web/platform/src/stub/api.js @@ -631,6 +631,68 @@ export function mock () { ] return new Response(200, {}, { success: true, data: logs }) }) + const stubHealth = { + tick: 0, + cpu: { user: 1000000, nice: 100, system: 200000, idle: 5000000, iowait: 30000, irq: 0, softirq: 10000, steal: 0 }, + net: [ + { name: 'eth0', rx_bytes: 50000000, tx_bytes: 20000000 }, + { name: 'wlan0', rx_bytes: 1000000, tx_bytes: 500000 } + ], + disks: [ + { name: 'sda', reads_total: 100000, sectors_read: 800000000, writes_total: 50000, sectors_written: 200000000 }, + { name: 'mmcblk0', reads_total: 500000, sectors_read: 4000000000, writes_total: 200000, sectors_written: 1500000000 } + ] + } + this.get('/rest/settings/health/metrics', function (_schema, _request) { + stubHealth.tick++ + // simulate variable CPU activity (sine-ish pattern) + const busyMs = Math.round(800 + 600 * Math.sin(stubHealth.tick / 8)) + stubHealth.cpu.user += busyMs * 0.6 + stubHealth.cpu.system += busyMs * 0.3 + stubHealth.cpu.iowait += busyMs * 0.1 + stubHealth.cpu.idle += 2000 - busyMs + stubHealth.net.forEach((n, i) => { + n.rx_bytes += Math.round(50000 + 200000 * Math.random()) * (i === 0 ? 1 : 0.05) + n.tx_bytes += Math.round(20000 + 100000 * Math.random()) * (i === 0 ? 1 : 0.05) + }) + stubHealth.disks.forEach(d => { + d.sectors_read += Math.round(500 + 8000 * Math.random()) + d.sectors_written += Math.round(200 + 4000 * Math.random()) + }) + const memUsed = 1500000 + Math.round(800000 * Math.sin(stubHealth.tick / 12)) + const data = { + cpu: { ...stubHealth.cpu }, + memory: { + total_kb: 3789348, + available_kb: 3789348 - memUsed, + free_kb: 200000, + buffers_kb: 50000, + cached_kb: 900000, + swap_total_kb: 2097148, + swap_free_kb: 2097148 - 600000 - Math.round(300000 * Math.sin(stubHealth.tick / 7)) + }, + disks: stubHealth.disks.map(d => ({ ...d })), + mounts: [ + { path: '/', total_kb: 30 * 1024 * 1024, used_kb: 24 * 1024 * 1024 }, + { path: '/opt/disk/external', total_kb: 1.9 * 1024 * 1024 * 1024, used_kb: 0.235 * 1024 * 1024 * 1024 } + ], + net: stubHealth.net.map(n => ({ ...n })) + } + return new Response(200, {}, { success: true, data }) + }) + this.get('/rest/settings/health/events', function (_schema, _request) { + const now = Date.now() + const events = [ + { time: new Date(now - 30 * 1000).toISOString(), kind: 'victim_sigterm', pid: 3325956, comm: 'python3', rss_kb: 1943228, cgroup: '0::/user.slice/user-0.slice/session-61617.scope' }, + { time: new Date(now - 30 * 1000 - 1500).toISOString(), kind: 'pressure_detected', avail_ratio: 0.058 }, + { time: new Date(now - 17 * 60 * 1000).toISOString(), kind: 'victim_sigkill', pid: 4421, comm: 'photoprism', rss_kb: 1102456, cgroup: '0::/system.slice/snap.photoprism.web.service' }, + { time: new Date(now - 17 * 60 * 1000 - 4500).toISOString(), kind: 'victim_sigterm', pid: 4421, comm: 'photoprism', rss_kb: 1102456, cgroup: '0::/system.slice/snap.photoprism.web.service' }, + { time: new Date(now - 17 * 60 * 1000 - 6500).toISOString(), kind: 'pressure_detected', avail_ratio: 0.041, psi_avg10: 62.4 }, + { time: new Date(now - 3 * 3600 * 1000).toISOString(), kind: 'swapoff_file', path: '/swapfile' }, + { time: new Date(now - 3 * 3600 * 1000 - 200).toISOString(), kind: 'zram_enabled', size_bytes: 1939916800 } + ] + return new Response(200, {}, { success: true, data: events }) + }) } }) } From 87dabb6d1d444f1abb2caeb99844f9cc6b8be43b Mon Sep 17 00:00:00 2001 From: Boris Rybalkin Date: Thu, 21 May 2026 13:22:50 +0100 Subject: [PATCH 04/12] health: replace el-table with responsive event list el-table forces a fixed wide layout that overflowed on mobile. Switch to a vertical list of cards (max-width 720px) with a colored left border per event kind (red for kills, orange for pressure, green for zram/swap actions). Time wraps below kind on narrow screens via flex-wrap. --- web/platform/src/views/Health.vue | 74 ++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/web/platform/src/views/Health.vue b/web/platform/src/views/Health.vue index 59122146..bb630520 100644 --- a/web/platform/src/views/Health.vue +++ b/web/platform/src/views/Health.vue @@ -49,17 +49,15 @@

{{ $t('health.events') }}

{{ $t('health.noEvents') }}
- - - - - - - - - - - +
    +
  • +
    + {{ $t('health.kind' + kindCamel(ev.kind)) }} + +
    +
    {{ fmtDetails(ev) }}
    +
  • +
@@ -215,4 +213,58 @@ export default { h2 { margin-top: 24px; } +.event-list { + list-style: none; + margin: 0; + padding: 0; + max-width: 720px; +} +.event-item { + border-left: 3px solid #d0d0d0; + background: #fafafa; + padding: 8px 12px; + margin: 6px 0; + border-radius: 0 4px 4px 0; +} +.event-victim_sigterm, +.event-victim_sigkill { + border-left-color: #e74c3c; +} +.event-pressure_detected { + border-left-color: #f39c12; +} +.event-zram_enabled, +.event-swapoff_file { + border-left-color: #36ad40; +} +.event-head { + display: flex; + justify-content: space-between; + align-items: baseline; + gap: 8px; + flex-wrap: wrap; +} +.event-kind { + font-weight: 600; +} +.event-time { + color: #888; + font-size: 0.85em; + white-space: nowrap; +} +.event-details { + color: #555; + font-size: 0.9em; + margin-top: 4px; + word-break: break-word; + font-family: monospace; +} +@media (max-width: 600px) { + .event-item { + padding: 6px 10px; + } + .event-details { + font-size: 0.85em; + } +} From e409b5b4cd45c843f7476e1a852fe59128c86409 Mon Sep 17 00:00:00 2001 From: Boris Rybalkin Date: Thu, 21 May 2026 13:24:56 +0100 Subject: [PATCH 05/12] health: card-style events with icon + relative time Each event now has a material icon (warning/cancel for kills, priority for pressure, memory/swap for zram) tinted to match the left-border accent color, a white rounded card with soft shadow, and relative time (e.g. '2m ago') with absolute timestamp on hover. Tighter padding + smaller fonts under 600px. --- web/platform/src/views/Health.vue | 112 ++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 21 deletions(-) diff --git a/web/platform/src/views/Health.vue b/web/platform/src/views/Health.vue index bb630520..0224a5dd 100644 --- a/web/platform/src/views/Health.vue +++ b/web/platform/src/views/Health.vue @@ -50,12 +50,15 @@

{{ $t('health.events') }}

{{ $t('health.noEvents') }}
    -
  • -
    - {{ $t('health.kind' + kindCamel(ev.kind)) }} - +
  • + {{ kindIcon(ev.kind) }} +
    +
    + {{ $t('health.kind' + kindCamel(ev.kind)) }} + +
    +
    {{ fmtDetails(ev) }}
    -
    {{ fmtDetails(ev) }}
@@ -118,6 +121,30 @@ export default { fmtTime (iso) { try { return new Date(iso).toLocaleString() } catch { return iso } }, + fmtRel (iso) { + const t = new Date(iso).getTime() + if (Number.isNaN(t)) return iso + const diff = Math.max(0, Date.now() - t) + const s = Math.floor(diff / 1000) + if (s < 60) return s + 's ago' + const m = Math.floor(s / 60) + if (m < 60) return m + 'm ago' + const h = Math.floor(m / 60) + if (h < 24) return h + 'h ago' + const d = Math.floor(h / 24) + if (d < 7) return d + 'd ago' + return new Date(iso).toLocaleDateString() + }, + kindIcon (kind) { + switch (kind) { + case 'victim_sigkill': return 'cancel' + case 'victim_sigterm': return 'warning' + case 'pressure_detected': return 'priority_high' + case 'zram_enabled': return 'memory' + case 'swapoff_file': return 'swap_horiz' + default: return 'info' + } + }, fmtDetails (e) { const parts = [] if (e.comm) parts.push(e.comm) @@ -218,25 +245,53 @@ h2 { margin: 0; padding: 0; max-width: 720px; + display: flex; + flex-direction: column; + gap: 10px; } -.event-item { - border-left: 3px solid #d0d0d0; - background: #fafafa; - padding: 8px 12px; - margin: 6px 0; - border-radius: 0 4px 4px 0; +.event-card { + display: flex; + gap: 12px; + align-items: flex-start; + background: #ffffff; + border-radius: 10px; + padding: 14px 16px; + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05), 0 1px 3px rgba(0, 0, 0, 0.06); + border-left: 4px solid #d0d0d0; + transition: box-shadow 120ms ease; +} +.event-card:hover { + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.06), 0 4px 8px rgba(0, 0, 0, 0.08); +} +.event-icon { + font-size: 22px !important; + line-height: 1; + flex-shrink: 0; + margin-top: 2px; + color: #9e9e9e; } -.event-victim_sigterm, .event-victim_sigkill { border-left-color: #e74c3c; } +.event-victim_sigkill .event-icon { color: #e74c3c; } +.event-victim_sigterm { + border-left-color: #ec7063; +} +.event-victim_sigterm .event-icon { color: #ec7063; } .event-pressure_detected { border-left-color: #f39c12; } +.event-pressure_detected .event-icon { color: #f39c12; } .event-zram_enabled, .event-swapoff_file { border-left-color: #36ad40; } +.event-zram_enabled .event-icon, +.event-swapoff_file .event-icon { color: #36ad40; } +.event-body { + flex: 1; + min-width: 0; +} .event-head { display: flex; justify-content: space-between; @@ -246,25 +301,40 @@ h2 { } .event-kind { font-weight: 600; + font-size: 0.95em; + color: #2c3e50; } .event-time { - color: #888; - font-size: 0.85em; + color: #95a5a6; + font-size: 0.8em; white-space: nowrap; + font-variant-numeric: tabular-nums; } .event-details { - color: #555; - font-size: 0.9em; - margin-top: 4px; + color: #5d6d7e; + font-size: 0.85em; + margin-top: 6px; word-break: break-word; - font-family: monospace; + font-family: 'SF Mono', Menlo, Consolas, monospace; + line-height: 1.4; } @media (max-width: 600px) { - .event-item { - padding: 6px 10px; + .event-list { + gap: 8px; + } + .event-card { + padding: 12px 14px; + border-radius: 8px; + } + .event-icon { + font-size: 20px !important; + } + .event-kind { + font-size: 0.9em; } .event-details { - font-size: 0.85em; + font-size: 0.8em; + margin-top: 4px; } } From a5413c3a765db3da876d1ef26f7845f61df82f29 Mon Sep 17 00:00:00 2001 From: Boris Rybalkin Date: Thu, 21 May 2026 13:27:30 +0100 Subject: [PATCH 06/12] health: align events block to same settingsblock width as cpu/disk Wrap events H2 + list in .settingsblock so they get the same 1024px-capped centered container as the CPU/memory/disk sections. Drop the now-redundant 720px event-list cap. --- web/platform/src/views/Health.vue | 33 ++++++++++++++++++------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/web/platform/src/views/Health.vue b/web/platform/src/views/Health.vue index 0224a5dd..602602e2 100644 --- a/web/platform/src/views/Health.vue +++ b/web/platform/src/views/Health.vue @@ -47,20 +47,22 @@ -

{{ $t('health.events') }}

-
{{ $t('health.noEvents') }}
-
    -
  • - {{ kindIcon(ev.kind) }} -
    -
    - {{ $t('health.kind' + kindCamel(ev.kind)) }} - +
    +

    {{ $t('health.events') }}

    +
    {{ $t('health.noEvents') }}
    +
      +
    • + {{ kindIcon(ev.kind) }} +
      +
      + {{ $t('health.kind' + kindCamel(ev.kind)) }} + +
      +
      {{ fmtDetails(ev) }}
      -
      {{ fmtDetails(ev) }}
      -
    -
  • -
+ + + @@ -240,11 +242,14 @@ export default { h2 { margin-top: 24px; } +.events-block { + margin-top: 24px; + text-align: left; +} .event-list { list-style: none; margin: 0; padding: 0; - max-width: 720px; display: flex; flex-direction: column; gap: 10px; From 92a7a7d8bdbf24c2db2d876428f96cc92d59b1b0 Mon Sep 17 00:00:00 2001 From: Boris Rybalkin Date: Thu, 21 May 2026 13:30:01 +0100 Subject: [PATCH 07/12] health: import site.css + material-icons in view scope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Other views import these in their