-
Notifications
You must be signed in to change notification settings - Fork 2k
/
iter_fast_900.go
343 lines (301 loc) · 12.4 KB
/
iter_fast_900.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/*
Copyright 2021 The Vitess Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package uca
import (
"math/bits"
"unicode/utf8"
"unsafe"
)
type FastIterator900 struct {
iterator900
fastTable *[256]uint32
unicode int
}
func (it *FastIterator900) Done() {
it.original = nil
it.input = nil
it.iterpool.Put(it)
}
func (it *FastIterator900) reset(input []byte) {
it.fastTable = &fastweightTable_uca900_page000L0
it.unicode = 0
it.iterator900.reset(input)
}
func (it *FastIterator900) SkipLevel() int {
it.codepoint.ce = 0
it.level++
it.resetForNextLevel()
return it.level
}
const maxUnicodeBlocks = 3
// FastForward32 fast-forwards this iterator and the given it2 in parallel until
// there is a mismatch in their weights, and returns their difference.
// This function is similar to NextWeightBlock64 in that it only succeeds if the
// iterators are composed of (mostly) ASCII characters. See the docs for NextWeightBlock64
// for documentation on how these fast comparisons work.
func (it *FastIterator900) FastForward32(it2 *FastIterator900) int {
// We use a heuristic to detect when we should stop using the FastForward32
// iterator: every time we encounter a 4-byte block that is fully Unicode,
// (i.e. without any ASCII characters), we increase the `it.unicode` counter.
// Encountering a block that is fully ASCII decreases the counter. If the
// counter ever gets to 4, further calls to FastForward32 are disabled.
if it.unicode > maxUnicodeBlocks || it.codepoint.ce != 0 || it2.codepoint.ce != 0 {
return 0
}
p1 := it.input
p2 := it2.input
var w1, w2 uint16
for len(p1) >= 4 && len(p2) >= 4 {
dword1 := *(*uint32)(unsafe.Pointer(&p1[0]))
dword2 := *(*uint32)(unsafe.Pointer(&p2[0]))
nonascii := (dword1 | dword2) & 0x80808080
if nonascii == 0 {
if dword1 != dword2 {
// Use the weight string fast tables for quick weight comparisons;
// see (*FastIterator900).NextWeightBlock64 for a description of
// the table format
table := it.fastTable
if w1, w2 = uint16(table[p1[0]]), uint16(table[p2[0]]); w1 != w2 {
goto mismatch
}
if w1, w2 = uint16(table[p1[1]]), uint16(table[p2[1]]); w1 != w2 {
goto mismatch
}
if w1, w2 = uint16(table[p1[2]]), uint16(table[p2[2]]); w1 != w2 {
goto mismatch
}
if w1, w2 = uint16(table[p1[3]]), uint16(table[p2[3]]); w1 != w2 {
goto mismatch
}
}
p1 = p1[4:]
p2 = p2[4:]
it.unicode--
continue
} else if bits.OnesCount32(nonascii) == 4 {
it.unicode++
}
break
}
it.input = p1
it2.input = p2
return 0
mismatch:
// If either of the weights was an ignorable, this is not really a mismatch;
// return 0 so we fall back to the slow path and increase `it.unicode`. Although
// these are _not_ unicode codepoints, if we find too many ignorable ASCII in
// an iterator we want to skip further calls to FastForward32 because they
// won't be able to optimize the comparisons at all
if w1 == 0 || w2 == 0 {
it.input = p1
it2.input = p2
it.unicode++
return 0
}
// The weights must be byte-swapped before comparison because they're stored in big endian
return int(bits.ReverseBytes16(w1)) - int(bits.ReverseBytes16(w2))
}
// NextWeightBlock64 takes a byte slice of 16 bytes and fills it with the next
// chunk of weights from this iterator. If the input slice is smaller than
// 16 bytes, the function will panic.
//
// The function returns the weights in Big Endian ordering: this is the
// same ordering that MySQL uses when generating weight strings, so the return
// of this function can be inserted directly into a weight string and the
// result will be compatible with MySQL. Likewise, the resulting slice
// can be compared byte-wise (bytes.Compare) to obtain a proper collation
// ordering against another string.
//
// Returns the number of bytes written to `dstbytes`. If 0, this iterator
// has been fully consumed.
//
// Implementation notes:
// This is a fast-path algorithm that can only work for UCA900 collations
// that do not have reorderings, contractions or any weight patches. The idea
// is detecting runs of 8 ASCII characters in a row, which are very frequent
// in most UTF8 code, particularly in English, and generating the weights
// for these 8 characters directly from an optimized table, instead of going
// through the whole Unicode Collation Algorithm. This is feasible because
// in UCA900, all characters in the ASCII range have either 0 or 1 weight
// triplets, so their weight can be calculated with a single lookup in a 128-entry
// table for each level (0, 1, 2).
func (it *FastIterator900) NextWeightBlock64(dstbytes []byte) int {
// Ensure the destination slice has at least 16 bytes; this bounds check
// removes all the other bound checks for the rest of the function.
_ = dstbytes[15]
// Unsafe cast the destination byte slice into a slice of uint16, so the
// individual weights can be written directly to it.
dst := (*[8]uint16)(unsafe.Pointer(&dstbytes[0]))
p := it.input
// The fast path works on 8-byte chunks from the original input.
// If the underlying slow iterator is in the middle of processing the
// weights for a codepoint, we cannot go through the fast path.
if it.codepoint.ce == 0 && len(p) >= 8 {
// Read 8 bytes from the input string. This would ideally be implemented
// as a `binary.LittleEndian.Uint64` read, but the compiler doesn't seem to be
// optimizing it properly, and it generates the individual byte shifts :(
dword := *(*uint64)(unsafe.Pointer(&p[0]))
// Check if all any of the bytes in the next 8 bytes have their highest
// bit set. If they're all clear, these are 8 ASCII bytes which can go through
// the fast path
if dword&0x8080808080808080 == 0 {
// Load the fast table from the iterator. There are three fast tables hardcoded
// for this implementation, for ASCII levels 0, 1 and 2. The slow iterator replaces
// the table stored on `it.fastTable` every time we skip a level.
// Note that the table has 256 entries although only the first 128 are used. We
// want a 256 entry table because it forces the Go compiler to disable all bound
// checks when accessing the table IF our index variable is a byte (since a byte
// cannot be larger than 255).
table := it.fastTable
// All ASCII codepoints (0 >= cp >= 127) have either 0 or 1 weights to yield.
// This is a problem for our fast path, because 0-weights must NOT appear in the
// resulting weight string. The codepoints with 0 weights are, however, exceedingly
// rare (they're mostly non-print codepoints), so based on this we can choose to
// perform either an optimistic or pessimistic optimization, which is toggled at
// compile time with this flag. For now, we're going with optimistic because it
// provides better results in realistic benchmarks.
const optimisticFastWrites = true
if optimisticFastWrites {
// For the optimistic optimization, we're going to assume that none of the
// ASCII codepoints in this chunk have zero weights, and check only once at
// the end of the chunk if that was the case. If we found a zero-weight (a
// rare occurrence), we discard the whole chunk and fall back to the slow
// iterator, which handles zero weights just fine.
// To perform the check for zero weights efficiently, we've designed fast tables
// where every entry is 32 bits, even though the actual weights are in the
// bottom 16 bits. The upper 16 bits contain either 0x2, if this is a valid weight
// or are zeroed out for 0-weights.
// Because of this, we can lookup from the fast table and insert directly the lowest
// 16 bits as the weight, and then we `and` the whole weight against a 32-bit mask
// that starts as 0x20000. For any weights that are valid, that will leave the mask
// with the same value, because only the higher bits will match, while any 0-weight
// will fully clear the mask.
// At the end of this block, we check if the mask has been cleared by any of the
// writes, and if it has, we scrap this work (boo) and fall back to the slow iterator.
var mask uint32 = 0x20000
weight := table[p[0]]
mask &= weight
dst[0] = uint16(weight)
weight = table[p[1]]
mask &= weight
dst[1] = uint16(weight)
weight = table[p[2]]
mask &= weight
dst[2] = uint16(weight)
weight = table[p[3]]
mask &= weight
dst[3] = uint16(weight)
weight = table[p[4]]
mask &= weight
dst[4] = uint16(weight)
weight = table[p[5]]
mask &= weight
dst[5] = uint16(weight)
weight = table[p[6]]
mask &= weight
dst[6] = uint16(weight)
weight = table[p[7]]
mask &= weight
dst[7] = uint16(weight)
if mask != 0 {
it.input = it.input[8:]
return 16
}
} else {
// For the pessimistic optimization, we're going to assume that any 8-byte chunk
// can contain 0-weights (something rather rare in practice). We're writing
// the lower 16 bits of the weight into the target buffer (just like in the optimistic
// optimization), but then we're increasing our target buffer pointer by
// the high 16 bits of the weight (weight >> 16). For valid weights, the high
// bits will equal 0x2, which is exactly the offset we want to move our target
// pointer so we can write the next weight afterwards, and for 0-weights, the
// high bits will be 0x0, so the pointer will not advance and the next weight
// we write will replace the 0-weight we've just written. This ensures that the
// resulting byte output doesn't have any 0-weights in it, but it also causes
// small stalls in the CPU because the writes are not necessarily linear and they
// have an ordering dependency with the value we've loaded from the fast table.
// Regardless of the stalls, this algorithm is obviously branch-less and very
// efficient, it just happens that in real world scenarios, the optimistic
// approach is even faster because 0-weights are very rare in practice.
// For now, this algorithm is disabled.
dstptr := (*uint16)(unsafe.Pointer(&dstbytes[0]))
weight := table[p[0]]
*dstptr = uint16(weight)
dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))
weight = table[p[1]]
*dstptr = uint16(weight)
dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))
weight = table[p[2]]
*dstptr = uint16(weight)
dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))
weight = table[p[3]]
*dstptr = uint16(weight)
dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))
weight = table[p[4]]
*dstptr = uint16(weight)
dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))
weight = table[p[5]]
*dstptr = uint16(weight)
dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))
weight = table[p[6]]
*dstptr = uint16(weight)
dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))
weight = table[p[7]]
*dstptr = uint16(weight)
dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))
written := uintptr(unsafe.Pointer(dstptr)) - uintptr(unsafe.Pointer(&dstbytes[0]))
if written != 0 {
it.input = it.input[written>>1:]
return int(written)
}
}
}
}
// Slow path: just loop up to 8 times to fill the buffer and bail
// early if we exhaust the iterator.
for i := 0; i < 8; i++ {
w, ok := it.Next()
if !ok {
return i * 2
}
dst[i] = bits.ReverseBytes16(w)
}
return 16
}
func (it *FastIterator900) resetForNextLevel() {
it.input = it.original
switch it.level {
case 1:
it.fastTable = &fastweightTable_uca900_page000L1
case 2:
it.fastTable = &fastweightTable_uca900_page000L2
}
}
func (it *FastIterator900) Next() (uint16, bool) {
for {
if w, ok := it.codepoint.next(); ok {
return w, true
}
cp, width := utf8.DecodeRune(it.input)
if cp == utf8.RuneError && width < 3 {
it.level++
if it.level < it.maxLevel {
it.resetForNextLevel()
return 0, true
}
return 0, false
}
it.input = it.input[width:]
it.codepoint.init(&it.iterator900, cp)
}
}