/
builder.go
400 lines (328 loc) · 10.9 KB
/
builder.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: hello@weaviate.io
//
package sempath
import (
"context"
"fmt"
"math"
"sort"
"time"
"github.com/danaugrs/go-tsne/tsne"
"github.com/pkg/errors"
"github.com/tailor-inc/graphql/language/ast"
"github.com/weaviate/weaviate/entities/models"
"github.com/weaviate/weaviate/entities/moduletools"
"github.com/weaviate/weaviate/entities/search"
txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models"
"gonum.org/v1/gonum/mat"
)
func New(c11y Remote) *PathBuilder {
return &PathBuilder{
fixedSeed: time.Now().UnixNano(),
c11y: c11y,
}
}
type PathBuilder struct {
fixedSeed int64
c11y Remote
}
type Remote interface {
MultiNearestWordsByVector(ctx context.Context, vectors [][]float32, k, n int) ([]*txt2vecmodels.NearestNeighbors, error)
}
func (pb *PathBuilder) AdditionalPropertyDefaultValue() interface{} {
return &Params{}
}
func (pb *PathBuilder) AdditionalPropertyFn(ctx context.Context,
in []search.Result, params interface{}, limit *int,
argumentModuleParams map[string]interface{}, cfg moduletools.ClassConfig,
) ([]search.Result, error) {
if parameters, ok := params.(*Params); ok {
return pb.CalculatePath(in, parameters)
}
return nil, errors.New("unknown params")
}
func (pb *PathBuilder) ExtractAdditionalFn(param []*ast.Argument) interface{} {
return &Params{}
}
func (pb *PathBuilder) CalculatePath(in []search.Result, params *Params) ([]search.Result, error) {
if len(in) == 0 {
return nil, nil
}
if params == nil {
return nil, fmt.Errorf("no params provided")
}
dims := len(in[0].Vector)
if err := params.SetDefaultsAndValidate(len(in), dims); err != nil {
return nil, errors.Wrap(err, "invalid params")
}
searchNeighbors, err := pb.addSearchNeighbors(params)
if err != nil {
return nil, err
}
for i, obj := range in {
path, err := pb.calculatePathPerObject(obj, in, params, searchNeighbors)
if err != nil {
return nil, fmt.Errorf("object %d: %v", i, err)
}
if in[i].AdditionalProperties == nil {
in[i].AdditionalProperties = models.AdditionalProperties{}
}
in[i].AdditionalProperties["semanticPath"] = path
}
return in, nil
}
func (pb *PathBuilder) calculatePathPerObject(obj search.Result, allObjects []search.Result, params *Params,
searchNeighbors []*txt2vecmodels.NearestNeighbor,
) (*txt2vecmodels.SemanticPath, error) {
dims := len(obj.Vector)
matrix, neighbors, err := pb.vectorsToMatrix(obj, allObjects, dims, params, searchNeighbors)
if err != nil {
return nil, err
}
inputRows := matrix.RawMatrix().Rows
t := tsne.NewTSNE(2, float64(inputRows/2), 100, 100, false)
res := t.EmbedData(matrix, nil)
rows, cols := res.Dims()
if rows != inputRows {
return nil, fmt.Errorf("have different output results than input %d != %d", inputRows, rows)
}
// create an explicit copy of the neighbors, so we don't mutate them.
// Otherwise the 2nd round will have been influenced by the first
projectedNeighbors := copyNeighbors(neighbors)
var projectedSearchVector []float32
var projectedTargetVector []float32
for i := 0; i < rows; i++ {
vector := make([]float32, cols)
for j := range vector {
vector[j] = float32(res.At(i, j))
}
if i == 0 { // the input object
projectedTargetVector = vector
} else if i < 1+len(neighbors) {
// these must be neighbor props
projectedNeighbors[i-1].Vector = vector
} else {
// is now the very last element which is the search vector
projectedSearchVector = vector
}
}
path := pb.buildPath(projectedNeighbors, projectedSearchVector, projectedTargetVector)
return pb.addDistancesToPath(path, neighbors, params.SearchVector, obj.Vector)
}
func (pb *PathBuilder) addSearchNeighbors(params *Params) ([]*txt2vecmodels.NearestNeighbor, error) {
nn, err := pb.c11y.MultiNearestWordsByVector(context.TODO(), [][]float32{params.SearchVector}, 36, 50)
if err != nil {
return nil, err
}
return nn[0].Neighbors, nil
}
// TODO: document behavior if it actually stays like this
func (pb *PathBuilder) vectorsToMatrix(obj search.Result, allObjects []search.Result, dims int,
params *Params, searchNeighbors []*txt2vecmodels.NearestNeighbor,
) (*mat.Dense, []*txt2vecmodels.NearestNeighbor, error) {
items := 1 // the initial object
var neighbors []*txt2vecmodels.NearestNeighbor
neighbors = pb.extractNeighbors(allObjects)
neighbors = append(neighbors, searchNeighbors...)
neighbors = pb.removeDuplicateNeighborsAndDollarNeighbors(neighbors)
items += len(neighbors) + 1 // The +1 is for the search vector which we append last
// concat all vectors to build gonum dense matrix
mergedVectors := make([]float64, items*dims)
if l := len(obj.Vector); l != dims {
return nil, nil, fmt.Errorf("object: inconsistent vector lengths found: dimensions=%d and object=%d", dims, l)
}
for j, dim := range obj.Vector {
mergedVectors[j] = float64(dim)
}
withoutNeighbors := 1 * dims
for i, neighbor := range neighbors {
neighborVector := neighbor.Vector
if l := len(neighborVector); l != dims {
return nil, nil, fmt.Errorf("neighbor: inconsistent vector lengths found: dimensions=%d and object=%d", dims, l)
}
for j, dim := range neighborVector {
mergedVectors[withoutNeighbors+i*dims+j] = float64(dim)
}
}
for i, dim := range params.SearchVector {
mergedVectors[len(mergedVectors)-dims+i] = float64(dim)
}
return mat.NewDense(items, dims, mergedVectors), neighbors, nil
}
func (pb *PathBuilder) extractNeighbors(in []search.Result) []*txt2vecmodels.NearestNeighbor {
var out []*txt2vecmodels.NearestNeighbor
for _, obj := range in {
if obj.AdditionalProperties == nil || obj.AdditionalProperties["nearestNeighbors"] == nil {
continue
}
if neighbors, ok := obj.AdditionalProperties["nearestNeighbors"]; ok {
if nearestNeighbors, ok := neighbors.(*txt2vecmodels.NearestNeighbors); ok {
out = append(out, nearestNeighbors.Neighbors...)
}
}
}
return out
}
func (pb *PathBuilder) removeDuplicateNeighborsAndDollarNeighbors(in []*txt2vecmodels.NearestNeighbor) []*txt2vecmodels.NearestNeighbor {
seen := map[string]struct{}{}
out := make([]*txt2vecmodels.NearestNeighbor, len(in))
i := 0
for _, candidate := range in {
if _, ok := seen[candidate.Concept]; ok {
continue
}
if candidate.Concept[0] == '$' {
continue
}
out[i] = candidate
i++
seen[candidate.Concept] = struct{}{}
}
return out[:i]
}
func (pb *PathBuilder) buildPath(neighbors []*txt2vecmodels.NearestNeighbor, searchVector []float32,
target []float32,
) *txt2vecmodels.SemanticPath {
var path []*txt2vecmodels.SemanticPathElement
minDist := float32(math.MaxFloat32)
current := searchVector // initial search point
for {
nn := pb.nearestNeighbors(current, neighbors, 10)
nn = pb.discardFurtherThan(nn, minDist, target)
if len(nn) == 0 {
break
}
nn = pb.nearestNeighbors(current, nn, 1)
current = nn[0].Vector
minDist = pb.distance(current, target)
path = append(path, &txt2vecmodels.SemanticPathElement{
Concept: nn[0].Concept,
})
}
return &txt2vecmodels.SemanticPath{
Path: path,
}
}
func (pb *PathBuilder) nearestNeighbors(search []float32, candidates []*txt2vecmodels.NearestNeighbor, length int) []*txt2vecmodels.NearestNeighbor {
sort.Slice(candidates, func(a, b int) bool {
return pb.distance(candidates[a].Vector, search) < pb.distance(candidates[b].Vector, search)
})
return candidates[:length]
}
func (pb *PathBuilder) distance(a, b []float32) float32 {
var sums float32
for i := range a {
sums += (a[i] - b[i]) * (a[i] - b[i])
}
return float32(math.Sqrt(float64(sums)))
}
func (pb *PathBuilder) discardFurtherThan(candidates []*txt2vecmodels.NearestNeighbor, threshold float32, target []float32) []*txt2vecmodels.NearestNeighbor {
out := make([]*txt2vecmodels.NearestNeighbor, len(candidates))
i := 0
for _, c := range candidates {
if pb.distance(c.Vector, target) >= threshold {
continue
}
out[i] = c
i++
}
return out[:i]
}
// create an explicit deep copy that does not keep any references
func copyNeighbors(in []*txt2vecmodels.NearestNeighbor) []*txt2vecmodels.NearestNeighbor {
out := make([]*txt2vecmodels.NearestNeighbor, len(in))
for i, n := range in {
out[i] = &txt2vecmodels.NearestNeighbor{
Concept: n.Concept,
Distance: n.Distance,
Vector: n.Vector,
}
}
return out
}
func (pb *PathBuilder) addDistancesToPath(path *txt2vecmodels.SemanticPath, neighbors []*txt2vecmodels.NearestNeighbor,
searchVector, targetVector []float32,
) (*txt2vecmodels.SemanticPath, error) {
for i, elem := range path.Path {
vec, ok := neighborVecByConcept(neighbors, elem.Concept)
if !ok {
return nil, fmt.Errorf("no vector present for concept: %s", elem.Concept)
}
if i != 0 {
// include previous
previousVec, ok := neighborVecByConcept(neighbors, path.Path[i-1].Concept)
if !ok {
return nil, fmt.Errorf("no vector present for previous concept: %s", path.Path[i-1].Concept)
}
d, err := cosineDist(vec, previousVec)
if err != nil {
return nil, errors.Wrap(err, "calculate distance between current path and previous element")
}
path.Path[i].DistanceToPrevious = &d
}
// target
d, err := cosineDist(vec, targetVector)
if err != nil {
return nil, errors.Wrap(err, "calculate distance between current path and result element")
}
path.Path[i].DistanceToResult = d
// query
d, err = cosineDist(vec, searchVector)
if err != nil {
return nil, errors.Wrap(err, "calculate distance between current path and query element")
}
path.Path[i].DistanceToQuery = d
if i != len(path.Path)-1 {
// include next
nextVec, ok := neighborVecByConcept(neighbors, path.Path[i+1].Concept)
if !ok {
return nil, fmt.Errorf("no vector present for next concept: %s", path.Path[i+1].Concept)
}
d, err := cosineDist(vec, nextVec)
if err != nil {
return nil, errors.Wrap(err, "calculate distance between current path and next element")
}
path.Path[i].DistanceToNext = &d
}
}
return path, nil
}
func neighborVecByConcept(neighbors []*txt2vecmodels.NearestNeighbor, concept string) ([]float32, bool) {
for _, n := range neighbors {
if n.Concept == concept {
return n.Vector, true
}
}
return nil, false
}
func cosineSim(a, b []float32) (float32, error) {
if len(a) != len(b) {
return 0, fmt.Errorf("vectors have different dimensions")
}
var (
sumProduct float64
sumASquare float64
sumBSquare float64
)
for i := range a {
sumProduct += float64(a[i] * b[i])
sumASquare += float64(a[i] * a[i])
sumBSquare += float64(b[i] * b[i])
}
return float32(sumProduct / (math.Sqrt(sumASquare) * math.Sqrt(sumBSquare))), nil
}
func cosineDist(a, b []float32) (float32, error) {
sim, err := cosineSim(a, b)
if err != nil {
return 0, err
}
return 1 - sim, nil
}