Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Switched algorithms for suffix arrays where the string is wrapped (run with 'wrap' parameter).
  • Loading branch information...
commit 7dcd8d89637e7e13a1e713c29c8a4f795aadf604 1 parent 78ec006
Tom Switzer authored

Showing 1 changed file with 76 additions and 34 deletions. Show diff stats Hide diff stats

  1. +76 34 suffixarray.js
110 suffixarray.js
@@ -59,6 +59,17 @@ function isInt(n) {
59 59 return typeof n == "number" || n instanceof Number;
60 60 }
61 61
  62 +function isStr(s) {
  63 + return Object.prototype.toString.call(s) == "[object String]";
  64 +}
  65 +
  66 +
  67 +function wrap(s) {
  68 + return typeof s == "function" ? s : (isStr(s)
  69 + ? function(i) { return s.charCodeAt(i) }
  70 + : function(i) { return s[i] });
  71 +}
  72 +
62 73
63 74 /**
64 75 * Returns the suffix array of the string s. The suffix array is constructed
@@ -87,9 +98,13 @@ function isInt(n) {
87 98 * @return An array of indexes into s.
88 99 */
89 100 global.suffixArray = function(s, len, end) {
90   - return Object.prototype.toString.call(s) == "[object String]"
91   - ? suffixArray(function(i) { return s.charCodeAt(i) }, isInt(len) ? len : s.length, (!isInt(len) && !end) ? len : end)
92   - : _suffixArray(s, len, end);
  101 + end = end || len;
  102 + len = isInt(len) ? len : s.length;
  103 +
  104 + if (end == "wrap")
  105 + return wrappedSuffixArray(s, len);
  106 + else
  107 + return _suffixArray(wrap(s), len);
93 108 }
94 109
95 110
@@ -97,6 +112,56 @@ global.suffixArray = function(s, len, end) {
97 112 global.suffixArray.bsort = bsort;
98 113
99 114
  115 +/**
  116 + * Constructs the suffix array of s. It takes either a string, an array, or a
  117 + * function that takes an integer and returns an unsigned integer. It also takes
  118 + * an optional 2nd parameter, the length. This is required if the first
  119 + * parameter is a function.
  120 + *
  121 + * This uses the nice idea from Karkkainen & Sanders' paper of replacing each
  122 + * letter with the equivalent k-letter version (3 in their paper, 2 in this
  123 + * algorithm). This is repeated recursively until all the letters are
  124 + * different. This doesn't have the nice 1/3 pruning / merge step of their
  125 + * algorithm, but still performs relatively fast, running in O(n log n).
  126 + *
  127 + * @param s A string, array, or function.
  128 + * @param len The length of s.
  129 + * @return The order of the suffixes.
  130 + */
  131 +function wrappedSuffixArray(s, len) {
  132 + len = isInt(len) ? len : s.length;
  133 + s = wrap(s);
  134 +
  135 + var array = [],
  136 + swap = [],
  137 + order = [],
  138 + span,
  139 + sym,
  140 + i = len;
  141 +
  142 + while (i--)
  143 + array[i] = s(order[i] = i);
  144 +
  145 + for (span = 1; sym != len && span < len; span *= 2) {
  146 + bsort(order, function(i) { return array[(i + span) % len] });
  147 + bsort(order, function(i) { return array[i] });
  148 +
  149 + sym = swap[order[0]] = 1;
  150 + for (i = 1; i < len; i++) {
  151 + if (array[order[i]] != array[order[i - 1]] || array[(order[i] + span) % len] != array[(order[i - 1] + span) % len])
  152 + sym++;
  153 + swap[order[i]] = sym;
  154 + }
  155 +
  156 + tmp = array;
  157 + array = swap;
  158 + swap = tmp;
  159 + }
  160 +
  161 + return order;
  162 +}
  163 +
  164 +
100 165 /* Constructs the suffix array of s. In this case, s must be a function that
101 166 * maps integers between 0 and len - 1 to "symbols" (unsigned integers). It
102 167 * returns the suffixes in lexicographical order as an array of indexes where
@@ -116,7 +181,7 @@ global.suffixArray.bsort = bsort;
116 181 * objects; indexes into the string to represent suffixes, lexical names
117 182 * representing triplets of symbols, indexes of these lexical names, etc.
118 183 */
119   -function _suffixArray(_s, len, end) {
  184 +function _suffixArray(_s, len) {
120 185 var a = [],
121 186 b = [],
122 187 alen = floor(2 * len / 3), // Number of indexes s.t. i % 3 != 0.
@@ -133,10 +198,7 @@ function _suffixArray(_s, len, end) {
133 198 if (len == 1)
134 199 return [ 0 ];
135 200
136   - end = end || "wrap";
137   - s = end == "wrap"
138   - ? function(i) { return _s(i % len) }
139   - : function(i) { return i >= len ? 0 : _s(i) };
  201 + s = function(i) { return i >= len ? 0 : _s(i) };
140 202
141 203 // Sort suffixes w/ indices % 3 != 0 by their first 3 symbols (triplets).
142 204
@@ -151,7 +213,7 @@ function _suffixArray(_s, len, end) {
151 213
152 214 // Array b contains lex. names in the order they appear in s for i % 3 != 0
153 215
154   - j = b[floor(a[0] / 3) + (a[0] % 3 == 1 ? 0 : r)] = 0;
  216 + j = b[floor(a[0] / 3) + (a[0] % 3 == 1 ? 0 : r)] = 1;
155 217 for (i = 1; i < alen; i++) {
156 218 if (s(a[i]) != s(a[i-1]) || s(a[i] + 1) != s(a[i-1] + 1) || s(a[i] + 2) != s(a[i-1] + 2))
157 219 j++;
@@ -160,12 +222,12 @@ function _suffixArray(_s, len, end) {
160 222
161 223 // If all lex. names are unique, then a is already completely sorted.
162 224
163   - if (j < alen - 1) {
  225 + if (j < alen) {
164 226
165 227 // Otherwise, recursively sort lex. names in b, then reconstruct the
166 228 // indexes of the sorted array b so they are relative to a.
167 229
168   - b = _suffixArray(function(i) { return b[i] }, alen, end);
  230 + b = _suffixArray(function(i) { return b[i] }, alen);
169 231
170 232 for (i = alen; i--;)
171 233 a[i] = b[i] < r ? b[i] * 3 + 1 : ((b[i] - r) * 3 + 2);
@@ -178,18 +240,8 @@ function _suffixArray(_s, len, end) {
178 240
179 241 for (i = alen; i--;)
180 242 lookup[a[i]] = i;
181   - if (end == "wrap") {
182   - for (cmp = 1, i = alen - 1; i >= 0 && cmp > 0; i--) {
183   - for (cmp = 0, j = a[i], k = 0; !cmp && k < len; j = (j + 1) % len, k++)
184   - cmp = (j % 3 && k % 3) ? lookup[j] - lookup[k] : (s(j) - s(k));
185   - lookup[a[i]] += 1;
186   - }
187   - lookup[len] = i;
188   - lookup[len + 1] = lookup[1];
189   - } else {
190   - lookup[len] = -1;
191   - lookup[len + 1] = -2;
192   - }
  243 + lookup[len] = -1;
  244 + lookup[len + 1] = -2;
193 245
194 246 /**
195 247 * This is a comparison function for the suffixes at indices m & n that
@@ -203,21 +255,11 @@ function _suffixArray(_s, len, end) {
203 255 };
204 256
205 257 // Sort remaining suffixes (i % 3 == 0) using prev result (i % 3 != 0).
206   - // We handle the case where len % 3 == 1 specially, since we can't easily
207   - // determine the sorted order of s(len ..) if we "wrap" the string. In the
208   - // case where we treat the string as null terminated (end == "min"), then
209   - // s(len ..) is the least string.
210 258
211   - b = (len % 3 == 1 && end != "wrap") ? [ len - 1 ] : [];
  259 + b = len % 3 == 1 ? [ len - 1 ] : [];
212 260 for (i = 0; i < alen; i++)
213 261 if (a[i] % 3 == 1)
214 262 b.push(a[i] - 1);
215   - if (len % 3 == 1 && end == "wrap") {
216   - for (i = blen - 2; i > 0 && cmp(b[i] + 1, 0) > 0; i--)
217   - b[i + 1] = b[i];
218   - b[i + 1] = b[i];
219   - b[i] = len - 1;
220   - }
221 263 bsort(b, function(j) { return s(j) });
222 264
223 265 // Merge a (i % 3 != 0) and b (i % 3 == 0) together. We only need to

0 comments on commit 7dcd8d8

Please sign in to comment.
Something went wrong with that request. Please try again.