brain/kvbrain: start of badger/bbolt version of brain

zephyrtronium · Mar 9, 2024 · c6925f3 · c6925f3
1 parent f0596f8
commit c6925f3
Show file tree

Hide file tree

Showing 5 changed files with 389 additions and 0 deletions.
diff --git a/brain/kvbrain/kvbrain.go b/brain/kvbrain/kvbrain.go
@@ -0,0 +1,85 @@
+package kvbrain
+
+import (
+	"context"
+	"time"
+
+	"github.com/dgraph-io/badger/v4"
+	"github.com/google/uuid"
+
+	"github.com/zephyrtronium/robot/brain"
+	"github.com/zephyrtronium/robot/userhash"
+)
+
+/*
+Message key structure:
+Tag × Tuples × UUID
+- Tag is a tagBytes (currently 8) byte string padded with \x00.
+- Tuple terms are separated by \xff sentinels. Terms are recorded in reverse order.
+- The final tuple term is the empty string, so a non-empty tuple portion ends
+	with \xff\xff. An entirely empty prefix is recorded as a single \xff.
+- UUID is the raw uuid.
+
+As with the SQL approach, we record every prefix with its suffix, including the
+final empty prefix.
+
+Operations:
+- Find a start tuple: Search for a prefix of tag × \xff.
+- Find a continuation:
+	+ With full context, just search for it, again in reverse order.
+	+ When we reduce context, record by how much and only search for that much.
+	+ In both cases, and with start tuple, check message UUID and tags we
+		select against the deletions db.
+- Learn: Construct the key according to above. The suffix is the entire value.
+	Record a mapping of tag, UUID, timestamp, and userhash to keys.
+- Forget tuples: thinking…
+- ForgetMessage, ForgetDuring, ForgetUserSince: Look up the actual keys to
+	delete in the recording taken during learning.
+*/
+
+// Brain is a brain implementation that keeps what it learns in a Badger
+// key-value database.
+type Brain struct {
+	// knowledge is the database in which Learn records tuples.
+	knowledge *badger.DB
+}
+
+// Compile-time check that *Brain satisfies brain.Learner.
+var _ brain.Learner = (*Brain)(nil)
+
+// New creates a Brain that records its knowledge in the given Badger
+// database.
+func New(knowledge *badger.DB) *Brain {
+	br := new(Brain)
+	br.knowledge = knowledge
+	return br
+}
+
+// tagBytes is the number of bytes used to record tags in the KV database.
+// Learn copies the tag into a buffer of this size, so shorter tags are padded
+// with zero bytes and longer tags are truncated.
+const tagBytes = 8 // TODO(zeph): we should just use a hash instead
+
+// Order returns the number of elements in the prefix of a chain. It is
+// called once at the beginning of learning. The returned value must always
+// be at least 1.
+//
+// This implementation always returns the constant 250.
+func (br *Brain) Order() int {
+	// TODO(zeph): this can go away one day
+	return 250
+}
+
+// Forget removes a set of recorded tuples. The tuples provided are as for
+// Learn. If a tuple has been recorded multiple times, only the first
+// should be deleted. If a tuple has not been recorded, it should be
+// ignored.
+//
+// Forget is not implemented yet; calling it panics.
+func (br *Brain) Forget(ctx context.Context, tag string, tuples []brain.Tuple) error {
+	panic("not implemented") // TODO: Implement
+}
+
+// ForgetMessage forgets everything learned from a single given message.
+// If nothing has been learned from the message, it should be ignored.
+//
+// ForgetMessage is not implemented yet; calling it panics.
+func (br *Brain) ForgetMessage(ctx context.Context, tag string, msg uuid.UUID) error {
+	panic("not implemented") // TODO: Implement
+}
+
+// ForgetDuring forgets all messages learned in the given time span.
+//
+// ForgetDuring is not implemented yet; calling it panics.
+func (br *Brain) ForgetDuring(ctx context.Context, tag string, since, before time.Time) error {
+	panic("not implemented") // TODO: Implement
+}
+
+// ForgetUserSince forgets all messages learned from a user since a given
+// time.
+//
+// ForgetUserSince is not implemented yet; calling it panics.
+func (br *Brain) ForgetUserSince(ctx context.Context, user *userhash.Hash, since time.Time) error {
+	panic("not implemented") // TODO: Implement
+}
diff --git a/brain/kvbrain/learn.go b/brain/kvbrain/learn.go
@@ -0,0 +1,73 @@
+package kvbrain
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"slices"
+
+	"github.com/zephyrtronium/robot/brain"
+)
+
+// Learn records a set of tuples. Each tuple prefix has length equal to the
+// result of Order. The tuples begin with empty strings in the prefix to
+// denote the start of the message and end with one empty suffix to denote
+// the end; all other tokens are non-empty. Each tuple's prefix has entropy
+// reduction transformations applied.
+//
+// Each tuple becomes one database entry. The key is the message tag, padded
+// with \x00 or truncated to tagBytes bytes, followed by the prefix terms from
+// the first non-empty one onward in reverse order, each terminated by \xff,
+// then one more \xff, then the raw message ID. The value is the tuple's
+// suffix. All entries are written through a single write batch.
+//
+// Learn returns an error if tuples is empty or if the batch cannot be
+// written. ctx is currently unused.
+func (br *Brain) Learn(ctx context.Context, meta *brain.MessageMeta, tuples []brain.Tuple) error {
+	if len(tuples) == 0 {
+		return errors.New("no tuples to learn")
+	}
+	// The tag portion is identical for every key, so build it once.
+	// copy leaves short tags zero-padded and truncates long ones.
+	tag := make([]byte, tagBytes)
+	copy(tag, meta.Tag)
+	// Construct the keys and values we will use.
+	// There are probably things we could do to control allocations since we're
+	// using many overlapping tuples for keys, but it's tremendously easier to
+	// just fill up a buffer for each.
+	type entry struct {
+		key []byte
+		val []byte
+	}
+	entries := make([]entry, len(tuples))
+	var b bytes.Buffer
+	for i, t := range tuples {
+		b.Reset()
+		// Write the tag.
+		b.Write(tag)
+		// Write prefixes, skipping the leading empty terms that mark the
+		// start of the message.
+		k := slices.IndexFunc(t.Prefix, func(s string) bool { return s != "" })
+		if k < 0 {
+			// First prefix of the message. We want to write only the separator.
+			k = len(t.Prefix)
+		}
+		// Terms are recorded in reverse order, so walk the prefix backward.
+		for j := len(t.Prefix) - 1; j >= k; j-- {
+			b.WriteString(t.Prefix[j])
+			b.WriteByte('\xff')
+		}
+		b.WriteByte('\xff')
+		// Write message ID.
+		b.Write(meta.ID[:])
+		entries[i] = entry{
+			// The buffer is reused on the next iteration, so the key must be
+			// copied out of it.
+			key: bytes.Clone(b.Bytes()),
+			val: []byte(t.Suffix),
+		}
+	}
+	// TODO(zeph): record mapping of metadata to key
+	batch := br.knowledge.NewWriteBatch()
+	defer batch.Cancel()
+	for _, e := range entries {
+		if err := batch.Set(e.key, e.val); err != nil {
+			return fmt.Errorf("couldn't add learned knowledge to batch: %w", err)
+		}
+	}
+	if err := batch.Flush(); err != nil {
+		return fmt.Errorf("couldn't commit learned knowledge: %w", err)
+	}
+	return nil
+}
diff --git a/brain/kvbrain/learn_test.go b/brain/kvbrain/learn_test.go
@@ -0,0 +1,132 @@
+package kvbrain
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/dgraph-io/badger/v4"
+	"github.com/google/uuid"
+
+	"github.com/zephyrtronium/robot/brain"
+	"github.com/zephyrtronium/robot/userhash"
+)
+
+// TestLearn checks that Learn writes exactly the expected key/value pairs
+// into a fresh in-memory Badger database for each case.
+func TestLearn(t *testing.T) {
+	// mkey builds an expected key: the tag padded to tagBytes, then the
+	// already-encoded tuple portion, then the raw message ID.
+	mkey := func(tag, toks string, id uuid.UUID) string {
+		b := make([]byte, tagBytes, tagBytes+len(toks)+len(id))
+		copy(b, tag)
+		b = append(b, toks...)
+		b = append(b, id[:]...)
+		return string(b)
+	}
+	uu := uuid.UUID{':', ')', ':', ')', ':', ')', ':', ')', ':', ')', ':', ')', ':', ')', ':', ')'}
+	h := userhash.Hash{2}
+	cases := []struct {
+		name string
+		msg  brain.MessageMeta
+		tups []brain.Tuple
+		want map[string]string
+	}{
+		{
+			name: "single",
+			msg: brain.MessageMeta{
+				ID:   uu,
+				User: h,
+				Tag:  "kessoku",
+				Time: time.Unix(0, 0),
+			},
+			tups: []brain.Tuple{
+				{
+					Prefix: []string{""},
+					Suffix: "bocchi",
+				},
+			},
+			want: map[string]string{
+				mkey("kessoku", "\xff", uu): "bocchi",
+			},
+		},
+		{
+			name: "full",
+			msg: brain.MessageMeta{
+				ID:   uu,
+				User: h,
+				Tag:  "kessoku",
+				Time: time.Unix(0, 0),
+			},
+			tups: []brain.Tuple{
+				{
+					Prefix: []string{"", "", "", ""},
+					Suffix: "bocchi",
+				},
+				{
+					Prefix: []string{"", "", "", "bocchi"},
+					Suffix: "ryou",
+				},
+				{
+					Prefix: []string{"", "", "bocchi", "ryou"},
+					Suffix: "nijika",
+				},
+				{
+					Prefix: []string{"", "bocchi", "ryou", "nijika"},
+					Suffix: "kita",
+				},
+				{
+					Prefix: []string{"bocchi", "ryou", "nijika", "kita"},
+					Suffix: "seika",
+				},
+				{
+					Prefix: []string{"ryou", "nijika", "kita", "seika"},
+					Suffix: "",
+				},
+			},
+			want: map[string]string{
+				mkey("kessoku", "\xff", uu):                                     "bocchi",
+				mkey("kessoku", "bocchi\xff\xff", uu):                           "ryou",
+				mkey("kessoku", "ryou\xffbocchi\xff\xff", uu):                   "nijika",
+				mkey("kessoku", "nijika\xffryou\xffbocchi\xff\xff", uu):         "kita",
+				mkey("kessoku", "kita\xffnijika\xffryou\xffbocchi\xff\xff", uu): "seika",
+				mkey("kessoku", "seika\xffkita\xffnijika\xffryou\xff\xff", uu):  "",
+			},
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			t.Parallel()
+			ctx := context.Background()
+			db, err := badger.Open(badger.DefaultOptions("").WithInMemory(true).WithLogger(nil))
+			if err != nil {
+				t.Fatal(err)
+			}
+			// Release the database's resources once the subtest finishes.
+			t.Cleanup(func() {
+				if err := db.Close(); err != nil {
+					t.Errorf("couldn't close db: %v", err)
+				}
+			})
+			br := New(db)
+			if err := br.Learn(ctx, &c.msg, c.tups); err != nil {
+				t.Errorf("failed to learn: %v", err)
+			}
+			seen := 0
+			err = db.View(func(txn *badger.Txn) error {
+				opts := badger.IteratorOptions{}
+				it := txn.NewIterator(opts)
+				defer it.Close()
+				for it.Rewind(); it.Valid(); it.Next() {
+					item := it.Item()
+					k := string(item.Key())
+					seen++
+					v, err := item.ValueCopy(nil)
+					if err != nil {
+						t.Errorf("couldn't get value for key %q: %v", k, err)
+						continue
+					}
+					// Look the key up explicitly: a missing key yields "",
+					// which would otherwise match a stored empty value.
+					want, ok := c.want[k]
+					if !ok {
+						t.Errorf("unexpected key %q with value %q", k, v)
+						continue
+					}
+					if got := string(v); want != got {
+						t.Errorf("wrong value for key %q: want %q, got %q", k, want, got)
+					}
+				}
+				return nil
+			})
+			if err != nil {
+				t.Errorf("view failed: %v", err)
+			}
+			if seen != len(c.want) {
+				t.Errorf("saw wrong number of items: want %d, got %d", len(c.want), seen)
+			}
+		})
+	}
+}
diff --git a/go.mod b/go.mod
@@ -4,6 +4,7 @@ go 1.22.0
 
 require (
 	github.com/BurntSushi/toml v1.3.2
+	github.com/dgraph-io/badger/v4 v4.2.0
 	github.com/google/go-cmp v0.6.0
 	github.com/google/uuid v1.6.0
 	github.com/mattn/go-sqlite3 v1.14.22
@@ -18,8 +19,20 @@ require (
 )
 
 require (
+	github.com/cespare/xxhash/v2 v2.2.0 // indirect
+	github.com/dgraph-io/ristretto v0.1.1 // indirect
+	github.com/dustin/go-humanize v1.0.0 // indirect
+	github.com/gogo/protobuf v1.3.2 // indirect
+	github.com/golang/glog v1.0.0 // indirect
+	github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6 // indirect
 	github.com/golang/protobuf v1.5.3 // indirect
+	github.com/golang/snappy v0.0.3 // indirect
+	github.com/google/flatbuffers v1.12.1 // indirect
+	github.com/klauspost/compress v1.12.3 // indirect
+	github.com/pkg/errors v0.9.1 // indirect
 	github.com/xrash/smetrics v0.0.0-20231213231151-1d8dd44e695e // indirect
+	go.opencensus.io v0.22.5 // indirect
+	golang.org/x/net v0.21.0 // indirect
 	golang.org/x/sys v0.17.0 // indirect
 	google.golang.org/appengine v1.6.8 // indirect
 	google.golang.org/protobuf v1.32.0 // indirect