Skip to content

Commit 3453c87

Browse files
committed
fix last token offset error
1 parent b638c52 commit 3453c87

File tree

2 files changed

+279
-1
lines changed

2 files changed

+279
-1
lines changed

suffixarray.js

Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
/**
2+
* An implementation of the linear time suffix array construction of
3+
* Karkkainen & Sanders:
4+
*
5+
* "Simple Linear Work Suffix Array Construction", Karkkainen and Sanders.
6+
*
7+
* Creating a suffix array is very simple; just call suffixArray(...) with
8+
* either a string or a function that returns integers and its length. For
9+
* example,
10+
*
11+
* var s = "Sort this!";
12+
* suffixArray(s); // Returns [4, 9, 0, 6, 7, 1, 2, 8, 3, 5].
13+
*
14+
* function reverse(i) { return s.charCodeAt(s.length - 1 - i) }
15+
* suffixArray(reverse, s.length); // Returns [5, 0, 9, 3, 2, 8, 7, 1, 4, 6].
16+
*
17+
* @author Thomas Switzer
18+
*/
19+
20+
// Shorthand for Math.floor.
var floor = Math.floor;

// Default key function: every value acts as its own key.
var identity = function(value) {
    return value;
};
22+
23+
24+
/**
 * Sorts the array a in place with a least-significant-digit radix sort that
 * consumes 4 bits (16 buckets) of the key per pass. Every element is handed
 * to the key function, which must return the (unsigned) integer to sort by;
 * without a key function the elements themselves are used as keys.
 *
 * The number of passes is derived from the largest key: 8, 16, 24 or 32 bits
 * are processed. Elements with equal keys keep their relative order.
 *
 * @param a An array of values to sort.
 * @param key A function that maps a value of a to its integer key (optional).
 * @return The array a.
 */
function bsort(a, key) {
    var count = a.length,
        largest = -1,
        width,
        shift,
        bins,
        slot,
        out,
        n;

    key = key || identity;

    // Find the largest key; it decides how many bits have to be looked at.
    for (n = count - 1; n >= 0; n--)
        largest = Math.max(key(a[n]), largest);

    if (largest >> 24)
        width = 32;
    else if (largest >> 16)
        width = 24;
    else if (largest >> 8)
        width = 16;
    else
        width = 8;

    for (shift = 0; shift < width; shift += 4) {
        bins = [];
        for (slot = 0; slot < 16; slot++)
            bins.push([]);

        // Distribute back to front, so each bin holds its elements reversed.
        for (n = count - 1; n >= 0; n--)
            bins[(key(a[n]) >> shift) & 15].push(a[n]);

        // Read each bin back to front again, which restores the original
        // relative order of equal digits and keeps the sort stable.
        out = 0;
        for (slot = 0; slot < 16; slot++) {
            for (n = bins[slot].length - 1; n >= 0; n--)
                a[out++] = bins[slot][n];
        }
    }

    return a;
}
54+
55+
56+
// True for number primitives and Number wrapper objects. Despite the name the
// value is not checked for being integral.
function isInt(n) {
    if (n instanceof Number)
        return true;
    return typeof n == "number";
}
59+
60+
// True for string primitives and String wrapper objects.
function isStr(s) {
    var tag = Object.prototype.toString.call(s);
    return tag == "[object String]";
}
63+
64+
65+
// Turns s into an accessor function index -> symbol. Functions are returned
// untouched, strings are read through charCodeAt, and anything else is
// indexed directly (e.g. an array of integers).
function wrap(s) {
    if (typeof s == "function")
        return s;

    if (isStr(s))
        return function(i) { return s.charCodeAt(i) };

    return function(i) { return s[i] };
}
70+
71+
72+
/**
 * Returns the suffix array of s: the start indexes of the suffixes of s,
 * listed in the lexicographical order of those suffixes.
 *
 * s can be a JavaScript String (compared by UTF-16 code unit), an array of
 * unsigned integers, or a function that takes an index (integer >= 0) and
 * returns an unsigned integer (a "symbol"). If a function is provided, its
 * length (integer >= 0) must be passed as the 2nd parameter.
 *
 * The optional 3rd parameter dictates how the end of the input is treated:
 *
 *   "wrap"  The end of the input wraps back around to the beginning, i.e.
 *           the rotations of s are sorted (see wrappedSuffixArray).
 *   "min"   Positions after the end are treated as 0's (the minimum), which
 *           gives the ordinary suffix array (see _suffixArray).
 *
 * Only the exact value "wrap" selects the wrapping behaviour. Any other
 * value, including leaving the parameter out, behaves like "min".
 *
 * For inputs that carry their own length (strings, arrays) the 2nd parameter
 * can be omitted while still giving the mode: suffixArray(str, "wrap").
 *
 * @param s A string, array, or function that maps ints in [0, len) to integers.
 * @param len The length of s (optional if s has a length, required otherwise).
 * @param end Either "min", "wrap" or leave out (behaves like "min").
 * @return An array of indexes into s.
 */
var suffixArray = function(s, len, end) {
    // In the two-argument form suffixArray(s, mode) the mode arrives in len.
    var mode = end || len,
        length = isInt(len) ? len : s.length;

    if (mode == "wrap")
        return wrappedSuffixArray(s, length);

    return _suffixArray(wrap(s), length);
};
107+
// Export the suffix array constructor, with the bucket sort attached to it as
// a property so that it can be reused on its own.
module.exports = suffixArray;
suffixArray.bsort = bsort;
111+
112+
113+
114+
/**
 * Sorts the rotations of s, i.e. builds the suffix array for an input whose
 * end wraps back around to its beginning. It takes either a string, an array,
 * or a function that takes an integer and returns an unsigned integer. It
 * also takes an optional 2nd parameter, the length. This is required if the
 * first parameter is a function.
 *
 * This works by prefix doubling: every position starts out ranked by its own
 * symbol. Each round then re-ranks all positions by the pair
 * (rank at i, rank at (i + span) % len), which doubles the number of symbols
 * the ranks account for. The rounds stop as soon as all ranks are distinct or
 * span has reached len, so there are at most about log2(len) rounds of two
 * bucket sorts each.
 *
 * @param s A string, array, or function.
 * @param len The length of s.
 * @return The start indexes of the rotations in sorted order.
 */
function wrappedSuffixArray(s, len) {
    len = isInt(len) ? len : s.length;
    s = wrap(s);

    var array = [],     // current rank (initially the symbol) of each index
        swap = [],      // ranks being computed for the next round
        order = [],     // indexes, sorted by the ranks known so far
        span,           // distance to the second half of each rank pair
        sym,            // number of distinct ranks found in this round
        tmp,            // scratch for exchanging array and swap
        i = len;

    while (i--)
        array[i] = s(order[i] = i);

    for (span = 1; sym != len && span < len; span *= 2) {

        // Two stable passes, least significant key first, order the indexes
        // by (array[i], array[(i + span) % len]).

        bsort(order, function(i) { return array[(i + span) % len] });
        bsort(order, function(i) { return array[i] });

        // Hand out new ranks; neighbours with an identical pair share one.

        sym = swap[order[0]] = 1;
        for (i = 1; i < len; i++) {
            if (array[order[i]] != array[order[i - 1]] ||
                    array[(order[i] + span) % len] != array[(order[i - 1] + span) % len])
                sym++;
            swap[order[i]] = sym;
        }

        // Every entry of swap was rewritten above, so the two buffers can
        // simply be exchanged and reused.

        tmp = array;
        array = swap;
        swap = tmp;
    }

    return order;
}
162+
163+
164+
/**
 * Constructs the suffix array of s with the skew (difference cover modulo 3)
 * scheme of Karkkainen & Sanders. In this case, s must be a function that
 * maps integers between 0 and len - 1 to "symbols" (unsigned integers). It
 * returns the suffixes in lexicographical order as an array of indexes where
 * those suffixes start. Positions at or after len are read as 0, so symbols
 * should be >= 1 if suffixes are to be ordered unambiguously against the end
 * of the input.
 *
 * Outline:
 *
 *   1. Sort the "sample" suffixes (start index i with i % 3 != 0) by their
 *      first 3 symbols and give every distinct triplet a lexicographical
 *      name.
 *   2. If some names occur more than once, recurse on the sequence of names
 *      to finish sorting the sample.
 *   3. Sort the remaining suffixes (i % 3 == 0) by their first symbol and
 *      the rank of the sample suffix that follows it.
 *   4. Merge the two sorted lists.
 *
 * If len % 3 == 1, the last triplet starting at an index i % 3 == 1 contains
 * no end-of-input padding. In the sequence of names it would be followed
 * directly by the names of the i % 3 == 2 triplets and be compared against
 * them (e.g. "aaaaaaa" would be ordered wrongly). As in the paper, such an
 * input is therefore extended by a single 0 symbol, which adds an all-zero
 * dummy triplet as separator, and the extra index is removed from the result.
 *
 * The description of the algorithm in the paper is very concise and is well
 * worth a read.
 *
 * @param _s A function that maps indexes in [0, len) to unsigned integers.
 * @param len The number of symbols.
 * @return The start indexes of the suffixes in lexicographical order.
 */
function _suffixArray(_s, len) {
    var a = [],         // Sample indexes (i % 3 != 0); sorted after step 2.
        b = [],         // The remaining indexes (i % 3 == 0).
        names = [],     // Lex. names of the sample triplets.
        lookup = [],    // lookup[i] = rank of the sample suffix i within a.
        result = [],
        alen,           // Number of indexes s.t. i % 3 != 0.
        r,              // Number of indexes s.t. i % 3 = 1.
        i, j, k,
        padded, sorted,
        cmp, s;

    if (!(len > 0))
        return [];

    if (len == 1)
        return [ 0 ];

    s = function(i) { return i >= len ? 0 : _s(i) };

    // Inputs with len % 3 == 1 need the dummy triplet (see above): sort the
    // input extended by one 0 symbol and drop the index of that symbol.

    if (len % 3 == 1) {
        padded = _suffixArray(s, len + 1);
        for (i = 0; i < padded.length; i++) {
            if (padded[i] != len)
                result.push(padded[i]);
        }
        return result;
    }

    alen = Math.floor(2 * len / 3);
    r = (alen + 1) >> 1;

    // Sort suffixes w/ indices % 3 != 0 by their first 3 symbols (triplets).
    // Three stable passes, starting with the last symbol of the triplet.

    for (i = alen; i--;)
        a[i] = ((i * 3) >> 1) + 1;  // a = [1, 2, 4, 5, 7, 8, 10, 11, 13, ...]

    for (i = 3; i--;)
        bsort(a, function(j) { return s(i + j) });

    // Assign lexicographical names (j) to the triplets of consecutive symbols,
    // s.t. the order of the lex. names match the lex. order of the triplets.
    // The names are stored in the order the triplets appear in s, with all
    // i % 3 == 1 triplets first, followed by all i % 3 == 2 triplets.

    j = names[Math.floor(a[0] / 3) + (a[0] % 3 == 1 ? 0 : r)] = 1;
    for (i = 1; i < alen; i++) {
        if (s(a[i]) != s(a[i - 1]) ||
                s(a[i] + 1) != s(a[i - 1] + 1) ||
                s(a[i] + 2) != s(a[i - 1] + 2))
            j++;
        names[Math.floor(a[i] / 3) + (a[i] % 3 == 1 ? 0 : r)] = j;
    }

    // If all lex. names are unique, then a is already completely sorted.
    // Otherwise, recursively sort the sequence of lex. names, then translate
    // the positions within that sequence back into indexes of s.

    if (j < alen) {
        sorted = _suffixArray(function(i) { return names[i] }, alen);

        for (i = alen; i--;)
            a[i] = sorted[i] < r ? sorted[i] * 3 + 1 : (sorted[i] - r) * 3 + 2;
    }

    // Create a reverse lookup table for the indexes i, s.t. i % 3 != 0.
    // This table can be used to simply determine the sorted order of 2
    // suffixes whose indexes are both not divisible by 3. The two entries
    // past the end rank the empty suffixes below every real one.

    for (i = alen; i--;)
        lookup[a[i]] = i;
    lookup[len] = -1;
    lookup[len + 1] = -2;

    /**
     * This is a comparison function for the suffixes at indices m & n that
     * uses the lookup table to shorten the searches. It assumes that
     * n % 3 == 0 and m % 3 != 0.
     */
    cmp = function(m, n) {
        return (s(m) - s(n)) || (m % 3 == 2
            ? (s(m + 1) - s(n + 1)) || (lookup[m + 2] - lookup[n + 2])
            : (lookup[m + 1] - lookup[n + 1]));
    };

    // Sort remaining suffixes (i % 3 == 0) using prev result (i % 3 != 0):
    // collect them in the order of the sample suffix that follows them, then
    // stable sort by their first symbol. Since len % 3 != 1 here, every such
    // index is followed by a sample index.

    for (i = 0; i < alen; i++) {
        if (a[i] % 3 == 1)
            b.push(a[i] - 1);
    }
    bsort(b, function(j) { return s(j) });

    // Merge a (i % 3 != 0) and b (i % 3 == 0) together. We only need to
    // compare, at most, 2 symbols before we end up comparing 2 suffixes whose
    // indices are both not divisible by 3. At this point, we can use the
    // reverse lookup array to order them.

    for (i = 0, j = 0, k = 0; i < alen && j < b.length;)
        result[k++] = cmp(a[i], b[j]) < 0 ? a[i++] : b[j++];
    while (i < alen)
        result[k++] = a[i++];
    while (j < b.length)
        result[k++] = b[j++];

    return result;
}
278+

tokenizers.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ var tibetan =function(s) {
1616
});
1717
if (last<str.length) {
1818
tokens.push(str.substring(last));
19-
offsets.push(last);
19+
offsets.push(offset+last);
2020
}
2121
if (i===arr.length-1) break;
2222
tokens.push('\n');

0 commit comments

Comments
 (0)