-
Notifications
You must be signed in to change notification settings - Fork 2.1k
/
weights.go
319 lines (292 loc) · 9.75 KB
/
weights.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
/*
Copyright 2023 The Vitess Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package evalengine
import (
"encoding/binary"
"math"
"vitess.io/vitess/go/hack"
"vitess.io/vitess/go/mysql/collations"
"vitess.io/vitess/go/mysql/collations/charset"
"vitess.io/vitess/go/mysql/collations/colldata"
"vitess.io/vitess/go/mysql/decimal"
"vitess.io/vitess/go/mysql/json"
"vitess.io/vitess/go/sqltypes"
querypb "vitess.io/vitess/go/vt/proto/query"
vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
"vitess.io/vitess/go/vt/vterrors"
)
// WeightString returns the weight string for a value.
// It appends to dst if an existing slice is given, otherwise it
// returns a new one.
// The returned boolean indicates whether the weight string is a
// fixed-width weight string, such as for fixed size integer values.
// Our WeightString implementation supports more types than MySQL
// externally communicates with the `WEIGHT_STRING` function, so that we
// can also use this to order / sort other types like Float and Decimal
// as well.
//
// When v does not already have the type coerceTo, or coerceTo has no
// dedicated fast path below, the value is converted through
// fallbackWeightString. A NULL coerceTo type yields a nil slice (dst is
// not returned in that case). On error, dst is returned unchanged.
//
// For binary values a non-zero length truncates the value to length bytes,
// or right-pads it with zero bytes up to length bytes.
func WeightString(dst []byte, v sqltypes.Value, coerceTo sqltypes.Type, col collations.ID, length, precision int, values *EnumSetValues, sqlmode SQLMode) ([]byte, bool, error) {
	// We optimize here for the case where we already have the desired type.
	// Otherwise, we fall back to the general evalengine conversion logic.
	if v.Type() != coerceTo {
		return fallbackWeightString(dst, v, coerceTo, col, length, precision, values, sqlmode)
	}

	switch {
	case sqltypes.IsNull(coerceTo):
		return nil, true, nil
	case sqltypes.IsSigned(coerceTo):
		i, err := v.ToInt64()
		if err != nil {
			return dst, false, err
		}
		// Flipping the sign bit maps the two's complement ordering onto the
		// unsigned big-endian byte ordering.
		raw := uint64(i)
		raw = raw ^ (1 << 63)
		return binary.BigEndian.AppendUint64(dst, raw), true, nil
	case sqltypes.IsUnsigned(coerceTo):
		u, err := v.ToUint64()
		if err != nil {
			return dst, false, err
		}
		return binary.BigEndian.AppendUint64(dst, u), true, nil
	case sqltypes.IsFloat(coerceTo):
		f, err := v.ToFloat64()
		if err != nil {
			return dst, false, err
		}
		// Negative floats get every bit inverted, all other floats only get
		// their sign bit flipped, so the resulting bits compare as unsigned
		// integers in the same order as the floats themselves.
		raw := math.Float64bits(f)
		if math.Signbit(f) {
			raw = ^raw
		} else {
			raw = raw ^ (1 << 63)
		}
		return binary.BigEndian.AppendUint64(dst, raw), true, nil
	case sqltypes.IsBinary(coerceTo):
		b := v.Raw()
		if length != 0 {
			if length > len(b) {
				// Pad with explicit zero bytes. Re-slicing b up to its capacity
				// would instead expose whatever bytes follow the value in its
				// backing array.
				dst = append(dst, b...)
				return append(dst, make([]byte, length-len(b))...), false, nil
			}
			b = b[:length]
		}
		return append(dst, b...), false, nil
	case sqltypes.IsText(coerceTo):
		coll := colldata.Lookup(col)
		if coll == nil {
			return dst, false, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "cannot hash unsupported collation")
		}
		b := v.Raw()
		if length != 0 {
			b = charset.Slice(coll.Charset(), b, 0, length)
		}
		return coll.WeightString(dst, b, length), false, nil
	case sqltypes.IsDecimal(coerceTo):
		dec, err := decimal.NewFromMySQL(v.Raw())
		if err != nil {
			return dst, false, err
		}
		return dec.WeightString(dst, int32(length), int32(precision)), true, nil
	case coerceTo == sqltypes.TypeJSON:
		j, err := json.NewFromSQL(v)
		if err != nil {
			return dst, false, err
		}
		return j.WeightString(dst), false, nil
	case coerceTo == sqltypes.Enum:
		return evalWeightString(dst, newEvalEnum(v.Raw(), values), length, precision)
	case coerceTo == sqltypes.Set:
		return evalWeightString(dst, newEvalSet(v.Raw(), values), length, precision)
	default:
		return fallbackWeightString(dst, v, coerceTo, col, length, precision, values, sqlmode)
	}
}
// fallbackWeightString is the slow path of WeightString: it first casts v to
// coerceTo through valueToEvalCast and then derives the weight string from the
// resulting eval. If the cast fails, dst is returned unchanged together with
// the cast error.
func fallbackWeightString(dst []byte, v sqltypes.Value, coerceTo sqltypes.Type, col collations.ID, length, precision int, values *EnumSetValues, sqlmode SQLMode) ([]byte, bool, error) {
	converted, castErr := valueToEvalCast(v, coerceTo, col, values, sqlmode)
	if castErr == nil {
		return evalWeightString(dst, converted, length, precision)
	}
	return dst, false, castErr
}
// evalWeightString appends the weight string of the already-evaluated value e
// to dst and returns the extended slice. The returned boolean reports whether
// the weight string is fixed-width. A nil eval yields a nil slice (dst is not
// returned in that case). An eval type without a case below results in an
// internal error, with dst returned unchanged.
//
// length and precision are forwarded to the decimal weight string; length is
// additionally used to truncate or pad byte values when it is non-zero.
func evalWeightString(dst []byte, e eval, length, precision int) ([]byte, bool, error) {
	switch e := e.(type) {
	case nil:
		return nil, true, nil
	case *evalInt64:
		// Flipping the sign bit maps the two's complement ordering onto the
		// unsigned big-endian byte ordering.
		raw := uint64(e.i)
		raw = raw ^ (1 << 63)
		return binary.BigEndian.AppendUint64(dst, raw), true, nil
	case *evalUint64:
		return binary.BigEndian.AppendUint64(dst, e.u), true, nil
	case *evalFloat:
		// Negative floats get every bit inverted, all other floats only get
		// their sign bit flipped, so the resulting bits compare as unsigned
		// integers in the same order as the floats themselves.
		raw := math.Float64bits(e.f)
		if math.Signbit(e.f) {
			raw = ^raw
		} else {
			raw = raw ^ (1 << 63)
		}
		return binary.BigEndian.AppendUint64(dst, raw), true, nil
	case *evalDecimal:
		return e.dec.WeightString(dst, int32(length), int32(precision)), true, nil
	case *evalBytes:
		if e.isBinary() {
			b := e.bytes
			if length != 0 {
				if length > len(b) {
					// Pad with explicit zero bytes. Re-slicing b up to its
					// capacity would instead expose whatever bytes follow the
					// value in its backing array.
					dst = append(dst, b...)
					return append(dst, make([]byte, length-len(b))...), false, nil
				}
				b = b[:length]
			}
			return append(dst, b...), false, nil
		}
		coll := colldata.Lookup(e.col.Collation)
		if coll == nil {
			return dst, false, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "cannot hash unsupported collation")
		}
		b := e.bytes
		if length != 0 {
			b = charset.Slice(coll.Charset(), b, 0, length)
		}
		return coll.WeightString(dst, b, length), false, nil
	case *evalTemporal:
		return e.dt.WeightString(dst), true, nil
	case *evalJSON:
		return e.WeightString(dst), false, nil
	case *evalEnum:
		// Enum values are encoded the same way as signed integers.
		raw := uint64(e.value)
		raw = raw ^ (1 << 63)
		return binary.BigEndian.AppendUint64(dst, raw), true, nil
	case *evalSet:
		// NOTE(review): e.set is an unsigned 64-bit value, so flipping the top
		// bit places sets that use bit 63 before all other sets. Confirm that
		// this matches the intended ordering before changing it.
		raw := e.set
		raw = raw ^ (1 << 63)
		return binary.BigEndian.AppendUint64(dst, raw), true, nil
	}

	return dst, false, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "unexpected type %v", e.SQLType())
}
// TinyWeighter builds the callback that attaches a Tiny Weight string to a sqltypes.Value
// belonging to the given field. It returns nil when no tiny weight can be produced for the
// field's type (or, for text, for its collation).
//
// A tiny weight string is a compressed 4-byte stand-in for the value's full weight string that
// sorts the same way as the full weight. Being shorter than the full weight string, it can
// collide. The contract for any 2 instances of sqltypes.Value is therefore: when both carry a
// Tiny Weight string and those weights are **different**, the values sort according to the
// 32-bit numerical order of their tiny weights. In every other case their relative order is
// unknown and a full comparison (e.g. NullsafeCompare) is required.
func TinyWeighter(f *querypb.Field, collation collations.ID) func(v *sqltypes.Value) {
	// XOR masks that turn float32 bits into an unsigned sortable key:
	// negative values get every bit flipped, all others only the sign bit.
	const (
		flipSign = uint32(1) << 31
		flipAll  = ^uint32(0)
	)

	typ := f.Type
	switch {
	case sqltypes.IsNull(typ):
		return nil

	case sqltypes.IsSigned(typ), typ == sqltypes.Enum, typ == sqltypes.Set:
		return func(v *sqltypes.Value) {
			n, err := v.ToInt64()
			if err != nil {
				return
			}
			// An integer's full weight string is its 64 bit representation with the MSB
			// flipped. With only 4 bytes available, keeping just the top 32 bits of that
			// representation would collide a lot, so the integer is converted to a float32
			// instead. Float bits are sortable once masked as below, and the precision lost
			// in the conversion is harmless: nearby numbers end up with the same tiny weight
			// and get resolved by a full comparison.
			mask := flipSign
			if n < 0 {
				mask = flipAll
			}
			v.SetTinyWeight(math.Float32bits(float32(n)) ^ mask)
		}

	case sqltypes.IsUnsigned(typ):
		return func(v *sqltypes.Value) {
			n, err := v.ToUint64()
			if err != nil {
				return
			}
			// Same float32 approach as for signed integers. The converted value is never
			// negative, so its bits are used without any flipping.
			v.SetTinyWeight(math.Float32bits(float32(n)))
		}

	case sqltypes.IsFloat(typ):
		return func(v *sqltypes.Value) {
			f64, err := v.ToFloat64()
			if err != nil {
				return
			}
			// As with integers, narrowing to a float32 collides less often than taking the
			// top 32 bits of the float64 representation.
			mask := flipSign
			if math.Signbit(f64) {
				mask = flipAll
			}
			v.SetTinyWeight(math.Float32bits(float32(f64)) ^ mask)
		}

	case sqltypes.IsBinary(typ):
		return func(v *sqltypes.Value) {
			if v.IsNull() {
				return
			}
			// The tiny weight is the first 4 bytes of the value, zero-filled when shorter.
			var prefix [4]byte
			copy(prefix[:], v.Raw())
			v.SetTinyWeight(binary.BigEndian.Uint32(prefix[:]))
		}

	case sqltypes.IsText(typ):
		coll := colldata.Lookup(collation)
		if coll == nil {
			return nil
		}
		tiny, ok := coll.(colldata.TinyWeightCollation)
		if !ok {
			return nil
		}
		return func(v *sqltypes.Value) {
			if v.IsNull() {
				return
			}
			v.SetTinyWeight(tiny.TinyWeightString(v.Raw()))
		}

	case sqltypes.IsDecimal(typ):
		return func(v *sqltypes.Value) {
			if v.IsNull() {
				return
			}
			// The 32-bit weight of a decimal comes from a fast 32bit atof parse of its text.
			// That parse can fail in plenty of corner cases, which is acceptable: the value
			// is then left without a tiny weight and a full decimal comparison is used.
			parsed, _, err := hack.Atof32(v.RawStr())
			if err != nil {
				return
			}
			bits := math.Float32bits(parsed)
			mask := flipSign
			if bits&flipSign != 0 {
				mask = flipAll
			}
			v.SetTinyWeight(bits ^ mask)
		}

	case typ == sqltypes.TypeJSON:
		return func(v *sqltypes.Value) {
			if v.IsNull() {
				return
			}
			doc, err := json.NewFromSQL(*v)
			if err != nil {
				return
			}
			// TODO: this can be done more efficiently without having to calculate the full
			// weight string and extracting its prefix.
			var prefix [4]byte
			copy(prefix[:], doc.WeightString(nil))
			v.SetTinyWeight(binary.BigEndian.Uint32(prefix[:]))
		}

	default:
		return nil
	}
}