Added O(n log n) suffix array construction using counting sort

EbTech · EbTech · commit 339bab69cd9c · 2017-11-11T00:25:14.000-08:00
diff --git a/README.md b/README.md
@@ -14,15 +14,17 @@ In addition, the Rust language has outstanding pedagogical attributes. Its compi
 
 ## For Programming Contests
 
-The original intent of this project was to build a reference for use in programming contests such as [Codeforces](http://codeforces.com) and the [Google Code Jam](https://code.google.com/codejam). As a result, it contains algorithms that are frequently useful to have in one's toolkit, with an emphasis on making the code concise and easy to modify under time pressure.
+The original intent of this project was to build a reference for use in programming contests such as [Codeforces](http://codeforces.com), [HackerRank](https://www.hackerrank.com/), and the [Google Code Jam](https://code.google.com/codejam). As a result, it contains algorithms that are frequently useful to have in one's toolkit, with an emphasis on making the code concise and easy to modify under time pressure.
 
 Most competition programmers rely on C++ for its fast execution time. However, it's notoriously unsafe, diverting a considerable share of the contestant's time and attention on mistake prevention and debugging. Java is the next most popular choice, offering a little safety at some expense to speed of coding and execution.
 
-To my delight, I found that Rust provides even more safety without the visual clutter, and it's *fast*. A proficient Rust programmer stands to gain a competitive advantage as well as a more pleasant experience!
+To my delight, I found that Rust provides even more bug-safety without the visual clutter, and it's *fast*. A proficient Rust programmer might stand to gain a competitive advantage as well as a more pleasant experience!
+
+Note that the online judges [SPOJ](http://www.spoj.com/) and [Timus](http://acm.timus.ru/) also support submissions in Rust. As of this writing, they use older compilers which might reject certain features used in this cookbook.
 
 ## Programming Language Advocacy
 
-My other goal is to appeal to developers who feel, as I once did, trapped between the lesser of headaches among old mainstream languages (e.g., C++/Java), to raise awareness that *it doesn't have to be this way*.
+My other goal is to appeal to developers who feel limited by older mainstream languages, and to raise awareness that *it doesn't have to be this way*.
 
 Rather than try to persuade you with words, this repository aims to show by example while easing the learning curve. See [Jim Blandy's *Why Rust?*](http://www.oreilly.com/programming/free/files/why-rust.pdf) for a brief introduction, or just [dive in!](https://doc.rust-lang.org/book/second-edition/)
 
@@ -34,4 +36,4 @@ Rather than try to persuade you with words, this repository aims to show by exam
 - [Associative range query](src/arq_tree.rs): known colloquially as *segtrees*
 - [Math](src/math.rs): Euclid's GCD algorithm, Bezout's identity
 - [Scanner](src/scanner.rs): utility for reading input data
-- [String processing](src/string_proc.rs): Knuth-Morris-Pratt string matching, Manacher's palindrome search
+- [String processing](src/string_proc.rs): Knuth-Morris-Pratt string matching, suffix arrays, Manacher's palindrome search
diff --git a/src/string_proc.rs b/src/string_proc.rs
@@ -2,12 +2,15 @@
 
 /// Data structure for Knuth-Morris-Pratt string matching against a pattern.
 pub struct Matcher<'a> {
+    /// The string pattern to search for, borrowed as raw bytes for the
+    /// lifetime 'a (the matcher does not own it).
     pub pattern: &'a [u8],
+    /// KMP match failure automaton. fail[i] is the length of the longest
+    /// proper prefix-suffix of pattern[0...i], i.e. the longest string that
+    /// is both a prefix and a suffix of it without being all of it.
     pub fail: Vec<usize>,
 }
 
 impl<'a> Matcher<'a> {
-    /// Sets fail[i] = length of longest proper prefix-suffix of pattern[0...i].
+    /// Precomputes the automaton that allows linear-time string matching.
     ///
     /// # Panics
     ///
@@ -49,6 +52,87 @@ impl<'a> Matcher<'a> {
     }
 }
 
+/// Suffix array data structure, useful for a variety of string queries.
+///
+/// Built by prefix doubling: every round doubles the number of leading
+/// characters by which the suffixes are ordered, and the ranks produced by
+/// each round are kept to answer longest-common-prefix queries.
+pub struct SuffixArray {
+    /// The suffix array itself, holding suffix indices in sorted order.
+    pub sfx: Vec<usize>,
+    /// rank[i][j] = rank of the j'th suffix, considering only 2^i chars.
+    /// rank[0] holds the raw byte values of the text; every later level holds
+    /// ranks counted from 0, with equal values for suffixes that agree on
+    /// their first 2^i chars.
+    pub rank: Vec<Vec<usize>>,
+}
+
+impl SuffixArray {
+    /// O(n + max_key) stable sort on an input that is a permutation of (0..n).
+    ///
+    /// Returns the indices yielded by `p_gen`, ordered by `keys[p]`; indices
+    /// with equal keys keep the relative order in which `p_gen` yields them.
+    /// `p_gen` must yield every index of `keys` exactly once, and every key
+    /// must be strictly less than `max_key`.
+    fn counting_sort<I>(p_gen: I, keys: &[usize], max_key: usize) -> Vec<usize>
+    where
+        I: DoubleEndedIterator<Item = usize>,
+    {
+        let mut counts = vec![0; max_key];
+        for &k in keys {
+            counts[k] += 1;
+        }
+        // Turn the histogram into prefix sums: counts[k] becomes the index one
+        // past the last output slot reserved for key k.
+        let mut total = 0;
+        for c in counts.iter_mut() {
+            total += *c;
+            *c = total;
+        }
+        let mut result = vec![0; total];
+        // Walking the input backwards while filling each bucket from its end
+        // is what makes the sort stable.
+        for p in p_gen.rev() {
+            let c = &mut counts[keys[p]];
+            *c -= 1;
+            result[*c] = p;
+        }
+        result
+    }
+
+    /// Suffix array construction in O(n log n) time.
+    ///
+    /// Keeps one rank vector per doubling round, so the result occupies
+    /// O(n log n) memory. An empty text yields an empty suffix array.
+    pub fn new(text: &[u8]) -> Self {
+        let n = text.len();
+        let init_rank: Vec<usize> = text.iter().map(|&ch| usize::from(ch)).collect();
+        let mut sfx = Self::counting_sort(0..n, &init_rank, 256);
+        let mut rank = vec![init_rank];
+        // Invariant at the start of every loop iteration:
+        // suffixes are sorted according to the first skip characters.
+        for skip in (0..).map(|i| 1 << i).take_while(|&skip| skip < n) {
+            let cur_rank = {
+                let prev_rank = rank.last().unwrap();
+
+                // Order the suffixes by their second half of length skip: the
+                // ones too short to have a second half come first, the rest
+                // follow in the order of the suffix starting skip chars later.
+                // A stable sort on the first half then orders by 2 * skip chars.
+                let p_gen =
+                    (n - skip..n).chain(sfx.into_iter().filter_map(|p| p.checked_sub(skip)));
+                sfx = Self::counting_sort(p_gen, prev_rank, n.max(256));
+
+                // Every entry is overwritten below because sfx is a permutation
+                // of (0..n); the smallest suffix keeps rank 0.
+                let mut cur_rank = vec![0; n];
+                let mut prev = sfx[0];
+                for &p in sfx.iter().skip(1) {
+                    // Neighbours share a rank only if both halves exist and
+                    // agree; a suffix that ends early is always distinct.
+                    let same = prev.max(p) + skip < n
+                        && prev_rank[prev] == prev_rank[p]
+                        && prev_rank[prev + skip] == prev_rank[p + skip];
+                    cur_rank[p] = if same {
+                        cur_rank[prev]
+                    } else {
+                        cur_rank[prev] + 1
+                    };
+                    prev = p;
+                }
+                cur_rank
+            };
+            rank.push(cur_rank);
+        }
+        Self { sfx, rank }
+    }
+
+    /// Computes the length of longest common prefix of text[i..] and text[j..].
+    ///
+    /// Runs in O(log n) by greedily matching blocks of 2^k characters, from
+    /// the coarsest rank level down to single characters.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `i` or `j` is not a valid index into the text.
+    pub fn longest_common_prefix(&self, mut i: usize, mut j: usize) -> usize {
+        let n = self.sfx.len();
+        let mut len = 0;
+        for (k, rank) in self.rank.iter().enumerate().rev() {
+            if rank[i] == rank[j] {
+                i += 1 << k;
+                j += 1 << k;
+                len += 1 << k;
+                if i.max(j) >= n {
+                    break;
+                }
+            }
+        }
+        // Two different suffixes only share a rank at level k when both hold
+        // at least 2^k chars, so they never step past the end. A suffix
+        // compared with itself matches at the coarsest level, whose block may
+        // be longer than the suffix: trim that overshoot.
+        len - i.max(j).saturating_sub(n)
+    }
+}
+
 /// Manacher's algorithm for computing palindrome substrings in linear time.
 /// pal[2*i] = odd length of palindrome centred at text[i].
 /// pal[2*i+1] = even length of palindrome centred at text[i+0.5].
@@ -88,14 +172,44 @@ mod test {
     use super::*;
 
     #[test]
-    fn test_string() {
-        let text = "abcbc".as_bytes();
-        let pattern = "bc".as_bytes();
+    fn test_kmp() {
+        let text = "banana".as_bytes();
+        let pattern = "ana".as_bytes();
 
         let matches = Matcher::new(pattern).kmp_match(text);
-        assert_eq!(matches, vec![0, 1, 2, 1, 2]);
+
+        // NOTE(review): the expected values suggest matches[i] is the number
+        // of pattern bytes matched once text[i] is consumed, so the two 3s
+        // would be the overlapping hits of "ana" -- confirm against kmp_match.
+        assert_eq!(matches, vec![0, 1, 2, 3, 2, 3]);
+    }
+
+    #[test]
+    fn test_suffix_array() {
+        let bobocel = SuffixArray::new(b"bobocel");
+        let banana = SuffixArray::new(b"banana");
+
+        // Suffix start positions in lexicographic order of the suffixes.
+        assert_eq!(bobocel.sfx, vec![0, 2, 4, 5, 6, 1, 3]);
+        assert_eq!(banana.sfx, vec![5, 3, 1, 0, 4, 2]);
+
+        // "bobocel" / "bocel" share "bo"; "anana" / "ana" share "ana".
+        assert_eq!(bobocel.longest_common_prefix(0, 2), 2);
+        assert_eq!(banana.longest_common_prefix(1, 3), 3);
+
+        // Check that sfx and rank.last() are essentially inverses of each other.
+        for built in &[bobocel, banana] {
+            let final_rank = built.rank.last().unwrap();
+            for (start, &position) in final_rank.iter().enumerate() {
+                assert_eq!(built.sfx[position], start);
+            }
+        }
+    }
+
+    #[test]
+    fn test_palindrome() {
+        let text = "banana".as_bytes();
 
         let pal_len = palindromes(text);
-        assert_eq!(pal_len, vec![1, 0, 1, 0, 3, 0, 3, 0, 1]);
+
+        // Even indices are centred on a character, odd indices between two
+        // characters; the 5 at index 6 is "anana", centred on text[3].
+        assert_eq!(pal_len, vec![1, 0, 1, 0, 3, 0, 5, 0, 3, 0, 1]);
     }
 }