|
/**
 * An implementation of the linear time suffix array construction of
 * Karkkainen & Sanders:
 *
 * "Simple Linear Work Suffix Array Construction", Karkkainen and Sanders.
 *
 * Creating a suffix array is very simple; just call suffixArray(...) with
 * either a string, or a function that returns integers together with its
 * length. For example,
 *
 * var s = "Sort this!";
 * suffixArray(s); // Returns [4, 9, 0, 6, 7, 1, 2, 8, 3, 5].
 *
 * function reverse(i) { return s.charCodeAt(s.length - 1 - i) }
 * suffixArray(reverse, s.length); // Returns [5, 0, 9, 3, 2, 8, 7, 1, 4, 6].
 *
 * @author Thomas Switzer
 */
| 19 | + |
// Local alias for Math.floor.
var floor = Math.floor;

// Default key function for bsort: every element acts as its own sort key.
var identity = function (x) { return x; };
| 22 | + |
| 23 | + |
/**
 * Sorts an array in place by (unsigned) integer keys using a stable
 * least-significant-digit radix sort that consumes 4 bits per pass.
 *
 * Each element of the array (a) is passed to the key function, which must
 * return a non-negative integer that fits in 32 bits. Elements with equal
 * keys keep their relative order.
 *
 * @param a An array of elements to sort (modified in place).
 * @param key A function that maps elements to integer keys (optional;
 *            defaults to using each element as its own key).
 * @return The array a.
 */
function bsort(a, key) {
  var len = a.length,
      buckets = [],
      max = -1,
      bits, shift, i, b, j, n;

  key = key || identity;

  // Find the largest key so only as many 4-bit digits as needed are sorted.
  for (i = 0; i < len; i++)
    max = Math.max(key(a[i]), max);

  // Round the key width up to 8, 16, 24 or 32 bits.
  bits = max >> 24 ? 32 : max >> 16 ? 24 : max >> 8 ? 16 : 8;

  for (shift = 0; shift < bits; shift += 4) {
    for (b = 0; b < 16; b++)
      buckets[b] = [];

    // Distribute in input order, then collect in bucket order. Appending and
    // reading front-to-back keeps each pass stable.
    for (i = 0; i < len; i++)
      buckets[(key(a[i]) >> shift) & 15].push(a[i]);

    n = 0;
    for (b = 0; b < 16; b++)
      for (j = 0; j < buckets[b].length; j++)
        a[n++] = buckets[b][j];
  }

  return a;
}
| 54 | + |
| 55 | + |
/**
 * Returns true if n is a number primitive or a Number object. Despite the
 * name, no integrality check is made; it is used to tell a numeric length
 * argument apart from a mode string or an omitted argument.
 */
function isInt(n) {
  return typeof n === "number" || n instanceof Number;
}
| 59 | + |
/**
 * Returns true if s is a string primitive or a String object.
 */
function isStr(s) {
  return Object.prototype.toString.call(s) === "[object String]";
}
| 63 | + |
| 64 | + |
/**
 * Normalises the input sequence into an accessor function i -> symbol.
 *
 * Functions are returned unchanged, strings are read through charCodeAt,
 * and anything else is indexed with [] (e.g. an array of integers).
 */
function wrap(s) {
  if (typeof s === "function")
    return s;

  if (isStr(s))
    return function (i) { return s.charCodeAt(i); };

  return function (i) { return s[i]; };
}
| 70 | + |
| 71 | + |
/**
 * Returns the suffix array of the string s.
 *
 * The string s can either be a Unicode string (i.e. a JavaScript String), or
 * a function that takes an index (integer >= 0) and returns another integer
 * (a "symbol"). If a function is provided, then another argument specifying
 * its length (integer >= 0) must be provided.
 *
 * An optional 3rd parameter dictates how to treat the end of the string:
 *
 *  - "wrap": the end of the string wraps back around to the beginning, so
 *    the cyclic rotations of s are ordered (see wrappedSuffixArray).
 *  - anything else, including "min" or leaving it out: positions after the
 *    end of the string are treated as 0's (the minimum), which gives the
 *    ordinary suffix order (see _suffixArray).
 *
 * NOTE: only the exact value "wrap" selects wrapping; when the parameter is
 * omitted the "min" behaviour is used.
 *
 * For strings you can omit the 2nd parameter (length) and still provide the
 * mode, e.g. suffixArray(str, "wrap").
 *
 * The returned array contains the indexes of the string in the
 * lexicographical order of the suffixes that start at those indexes.
 *
 * @param s A string or function that maps ints between [0, len) to integers.
 * @param len The length of s (optional if s is a string, required otherwise).
 * @param end Either "min", "wrap" or leave out (behaves as "min").
 * @return An array of indexes into s.
 */
var suffixArray = function (s, len, end) {
  // With suffixArray(str, "wrap") the mode arrives in the 2nd argument.
  end = end || len;
  len = isInt(len) ? len : s.length;

  // String(...) keeps String objects working under a strict comparison.
  if (String(end) === "wrap")
    return wrappedSuffixArray(s, len);

  return _suffixArray(wrap(s), len);
};
// Export the bucket sort as a property of the main entry point.
suffixArray.bsort = bsort;
module.exports = suffixArray;
| 111 | + |
| 112 | + |
| 113 | + |
/**
 * Orders the positions of s as if s wrapped around at its end, i.e. it sorts
 * the cyclic rotations of s. It takes either a string, an array, or a
 * function that takes an integer and returns an unsigned integer. It also
 * takes an optional 2nd parameter, the length, which is required if the first
 * parameter is a function.
 *
 * This uses prefix doubling: every position is ranked by the pair
 * (rank at i, rank at i + span), the ranks are then renamed to consecutive
 * integers, and span is doubled. It stops as soon as every position has a
 * distinct rank, or once span reaches len. Each round costs two radix sorts,
 * so with at most about log2(len) rounds this runs in O(n log n) rather than
 * the linear time of the non-wrapping construction.
 *
 * @param s A string, array, or function.
 * @param len The length of s.
 * @return The start positions of the rotations in sorted order.
 */
function wrappedSuffixArray(s, len) {
  len = isInt(len) ? len : s.length;
  s = wrap(s);

  var array = [],   // Current rank (initially the symbol) of each position.
      swap = [],    // Scratch buffer that receives the next round's ranks.
      order = [],   // Positions, kept sorted by the current rank pairs.
      sym = 0,      // Number of distinct ranks assigned in the last round.
      span,
      tmp,
      i;

  for (i = 0; i < len; i++)
    array[i] = s(order[i] = i);

  // Key functions for the two radix passes; they read the current array/span.
  var byNextRank = function (i) { return array[(i + span) % len]; };
  var byRank = function (i) { return array[i]; };

  for (span = 1; sym < len && span < len; span *= 2) {
    // Stable sorts: secondary key first, then primary key.
    bsort(order, byNextRank);
    bsort(order, byRank);

    // Rename each distinct (rank, next rank) pair to a new rank.
    sym = swap[order[0]] = 1;
    for (i = 1; i < len; i++) {
      if (array[order[i]] !== array[order[i - 1]] ||
          array[(order[i] + span) % len] !== array[(order[i - 1] + span) % len])
        sym++;
      swap[order[i]] = sym;
    }

    // The new ranks become current; the old buffer is reused next round.
    tmp = array;
    array = swap;
    swap = tmp;
  }

  return order;
}
| 162 | + |
| 163 | + |
/**
 * Constructs the suffix array of s. In this case, _s must be a function that
 * maps integers between 0 and len - 1 to "symbols" (unsigned integers). It
 * returns the suffixes in lexicographical order as an array of the indexes
 * where those suffixes start.
 *
 * Positions at or past len read as symbol 0, so real symbols are expected to
 * be greater than 0 for the order to be well defined.
 *
 * The structure follows the Karkkainen & Sanders paper:
 *
 *  1. Sort the "sample" suffixes (start index i with i % 3 != 0) by their
 *     first three symbols and give each distinct triplet a lexical name.
 *  2. If some names repeat, recurse on the string of names to finish sorting
 *     the sample suffixes.
 *  3. Sort the remaining suffixes (i % 3 == 0) by their first symbol and the
 *     rank of the sample suffix that follows it.
 *  4. Merge the two sorted lists.
 *
 * When len % 3 == 1 an extra dummy sample suffix starting at index len (its
 * triplet is all padding) is added. Without it the last i % 3 == 1 triplet
 * has no padding, so in the recursive string of names a comparison could run
 * past it into the names of the i % 3 == 2 suffixes and mis-order suffixes
 * where one is a prefix of the other. The dummy is dropped before merging.
 *
 * @param _s A function mapping [0, len) to unsigned integers.
 * @param len The number of symbols.
 * @return An array with the start indexes of the suffixes in sorted order.
 */
function _suffixArray(_s, len) {
  if (len == 0)
    return [];
  if (len == 1)
    return [ 0 ];

  var pad = len % 3 == 1 ? 1 : 0,          // 1 if the dummy sample is needed.
      blen = len - floor(2 * len / 3),     // Number of indexes s.t. i % 3 = 0.
      alen = floor(2 * len / 3) + pad,     // Number of sample indexes (+ dummy).
      r = (alen + 1) >> 1,                 // Number of samples s.t. i % 3 = 1.
      a = [],                              // Sample indexes, eventually sorted.
      names = [],                          // Lex. names in recursive order.
      sample = [],                         // Sorted samples without the dummy.
      mod0 = [],                           // Sorted indexes s.t. i % 3 = 0.
      lookup = [],
      result = [],
      sorted, cmp, s,
      i, j, k;

  s = function (i) { return i >= len ? 0 : _s(i); };

  // Sort suffixes w/ indices % 3 != 0 by their first 3 symbols (triplets),
  // least significant symbol first.

  for (i = 0; i < alen; i++)
    a[i] = ((i * 3) >> 1) + 1; // a = [1, 2, 4, 5, 7, 8, 10, 11, 13, ...]

  for (i = 3; i--;)
    bsort(a, function (j) { return s(i + j); });

  // Assign lexicographical names (j) to the triplets of consecutive symbols,
  // s.t. the order of the lex. names match the lex. order of the triplets.
  // The names are stored with all i % 3 == 1 positions first, followed by
  // all i % 3 == 2 positions (offset by r).

  j = 0;
  for (i = 0; i < alen; i++) {
    if (i === 0 ||
        s(a[i]) != s(a[i - 1]) ||
        s(a[i] + 1) != s(a[i - 1] + 1) ||
        s(a[i] + 2) != s(a[i - 1] + 2))
      j++;
    names[floor(a[i] / 3) + (a[i] % 3 == 1 ? 0 : r)] = j;
  }

  // If all lex. names are unique, then a is already completely sorted.
  // Otherwise, recursively sort the string of names, then map the positions
  // in that string back to indexes into s.

  if (j < alen) {
    sorted = _suffixArray(function (i) { return names[i]; }, alen);

    for (i = alen; i--;)
      a[i] = sorted[i] < r ? sorted[i] * 3 + 1 : ((sorted[i] - r) * 3 + 2);
  }

  // Create a reverse lookup table (index -> rank) for the sample indexes.
  // Suffixes that start at or past the end rank below every real suffix.

  for (i = alen; i--;)
    lookup[a[i]] = i;
  lookup[len] = -1;
  lookup[len + 1] = -2;

  /**
   * This is a comparison function for the suffixes at indices m & n that
   * uses the lookup table to shorten the searches. It assumes that
   * n % 3 == 0 and m % 3 != 0.
   */
  cmp = function (m, n) {
    return (s(m) - s(n)) || (m % 3 == 2
      ? (s(m + 1) - s(n + 1)) || (lookup[m + 2] - lookup[n + 2])
      : (lookup[m + 1] - lookup[n + 1]));
  };

  // Sort remaining suffixes (i % 3 == 0) using prev result (i % 3 != 0).
  // Visiting the samples in rank order and then stably sorting on the first
  // symbol orders them by (first symbol, rank of the following suffix). When
  // len % 3 == 1 the last index is followed by the empty suffix, so it goes
  // in first. The dummy sample is skipped here and left out of the merge.

  if (pad)
    mod0.push(len - 1);
  for (i = 0; i < alen; i++) {
    if (a[i] >= len)
      continue;
    sample.push(a[i]);
    if (a[i] % 3 == 1)
      mod0.push(a[i] - 1);
  }
  bsort(mod0, function (j) { return s(j); });

  // Merge sample (i % 3 != 0) and mod0 (i % 3 == 0) together. We only need
  // to compare, at most, 2 symbols before we end up comparing 2 suffixes
  // whose indices are both not divisible by 3. At this point, we can use the
  // reverse lookup array to order them.

  for (i = 0, j = 0, k = 0; i < sample.length && j < blen;)
    result[k++] = cmp(sample[i], mod0[j]) < 0 ? sample[i++] : mod0[j++];
  while (i < sample.length)
    result[k++] = sample[i++];
  while (j < blen)
    result[k++] = mod0[j++];

  return result;
}
| 278 | + |
0 commit comments