|
2 | 2 |
|
/// Data structure for Knuth-Morris-Pratt string matching against a pattern.
///
/// Borrows the pattern for the lifetime `'a` and owns the failure table
/// that accompanies it.
pub struct Matcher<'a> {
    /// The string pattern to search for, as raw bytes.
    pub pattern: &'a [u8],
    /// KMP match failure automaton. fail[i] is the length of the longest
    /// proper prefix-suffix of pattern[0...i], i.e. the longest string that
    /// is both a prefix and a suffix of that range without being all of it.
    // NOTE(review): `0...i` presumably denotes an inclusive range — confirm
    // against the code that fills this table.
    pub fail: Vec<usize>,
}
|
8 | 11 |
|
9 | 12 | impl<'a> Matcher<'a> {
|
10 |
| - /// Sets fail[i] = length of longest proper prefix-suffix of pattern[0...i]. |
| 13 | + /// Precomputes the automaton that allows linear-time string matching. |
11 | 14 | ///
|
12 | 15 | /// # Panics
|
13 | 16 | ///
|
@@ -49,6 +52,87 @@ impl<'a> Matcher<'a> {
|
49 | 52 | }
|
50 | 53 | }
|
51 | 54 |
|
/// Suffix array data structure, useful for a variety of string queries.
pub struct SuffixArray {
    /// The suffix array itself, holding suffix indices in sorted order.
    pub sfx: Vec<usize>,
    /// rank[i][j] = rank of the j'th suffix, considering only 2^i chars.
    /// rank[0] holds the raw byte values of the text; every later level
    /// holds dense ranks starting at 0.
    pub rank: Vec<Vec<usize>>,
}

impl SuffixArray {
    /// O(n + max_key) stable sort on an input that is a permutation of (0..n).
    ///
    /// Positions are produced by `p_gen` and ordered by `keys[p]`; positions
    /// with equal keys keep the relative order in which `p_gen` yields them.
    ///
    /// # Panics
    ///
    /// Panics if some key is `>= max_key`, or if `p_gen` yields a position
    /// that is not a valid index into `keys`.
    fn counting_sort<I>(p_gen: I, keys: &[usize], max_key: usize) -> Vec<usize>
    where
        I: DoubleEndedIterator<Item = usize>,
    {
        // Histogram of key values.
        let mut counts = vec![0; max_key];
        for &k in keys {
            counts[k] += 1;
        }
        // Turn the histogram into exclusive end offsets of each key's bucket.
        let mut total = 0;
        for c in counts.iter_mut() {
            total += *c;
            *c = total;
        }
        // Fill each bucket from its end while walking the input backwards,
        // which is what keeps the sort stable.
        let mut result = vec![0; total];
        for p in p_gen.rev() {
            let c = &mut counts[keys[p]];
            *c -= 1;
            result[*c] = p;
        }
        result
    }

    /// Suffix array construction in O(n log n) time by prefix doubling.
    ///
    /// Every round pushes one more level onto `rank`, so after construction
    /// `rank.last()` is the inverse permutation of `sfx` whenever the text has
    /// at least two bytes.
    pub fn new(text: &[u8]) -> Self {
        let n = text.len();
        let mut rank = vec![text.iter().map(|&ch| usize::from(ch)).collect::<Vec<_>>()];
        let mut sfx = Self::counting_sort(0..n, &rank[0], 256);
        // Invariant at the start of every loop iteration:
        // suffixes are sorted according to the first skip characters.
        for skip in (0..).map(|i| 1usize << i).take_while(|&skip| skip < n) {
            let prev_rank = &rank[rank.len() - 1];

            // Order by the second half first: suffixes shorter than `skip`
            // have an empty second half and therefore come first, followed by
            // the remaining suffixes in the order of their second halves.
            // The stable sort on the first-half rank then finishes the round.
            let p_gen = (n - skip..n).chain(sfx.into_iter().filter_map(|p| p.checked_sub(skip)));
            sfx = Self::counting_sort(p_gen, prev_rank, n.max(256));

            // Every entry is overwritten below because `sfx` is a permutation
            // of 0..n, so the initial contents do not matter.
            let mut cur_rank = vec![0; n];
            let mut prev = sfx[0];
            cur_rank[prev] = 0;
            for &p in sfx.iter().skip(1) {
                // Neighbours share a rank only if both halves exist and agree.
                let same = prev.max(p) + skip < n
                    && prev_rank[prev] == prev_rank[p]
                    && prev_rank[prev + skip] == prev_rank[p + skip];
                cur_rank[p] = if same {
                    cur_rank[prev]
                } else {
                    cur_rank[prev] + 1
                };
                prev = p;
            }
            rank.push(cur_rank);
        }
        Self { sfx, rank }
    }

    /// Computes the length of longest common prefix of text[i..] and text[j..].
    ///
    /// A start position equal to the text length denotes the empty suffix, so
    /// the result is 0 in that case. When `i == j` the result is the length of
    /// the suffix itself.
    ///
    /// # Panics
    ///
    /// Panics if `i` or `j` is greater than the length of the text.
    pub fn longest_common_prefix(&self, mut i: usize, mut j: usize) -> usize {
        let n = self.sfx.len();
        assert!(
            i <= n && j <= n,
            "suffix start out of bounds: i = {}, j = {}, text length = {}",
            i,
            j,
            n
        );
        if i == j {
            // A suffix shares all of itself; the rank walk below would
            // overshoot the end of the text in this case.
            return n - i;
        }
        if i.max(j) == n {
            return 0;
        }
        let mut len = 0;
        // Distinct suffixes with equal rank at level k agree on 2^k chars,
        // so greedily consume the largest matching power of two first.
        for (k, rank) in self.rank.iter().enumerate().rev() {
            if rank[i] == rank[j] {
                let step = 1usize << k;
                i += step;
                j += step;
                len += step;
                if i.max(j) >= n {
                    break;
                }
            }
        }
        len
    }
}
| 135 | + |
52 | 136 | /// Manacher's algorithm for computing palindrome substrings in linear time.
|
53 | 137 | /// pal[2*i] = odd length of palindrome centred at text[i].
|
54 | 138 | /// pal[2*i+1] = even length of palindrome centred at text[i+0.5].
|
@@ -88,14 +172,44 @@ mod test {
|
88 | 172 | use super::*;
|
89 | 173 |
|
#[test]
fn test_kmp() {
    // Match "ana" against "banana"; each output entry corresponds to one
    // byte of the searched text.
    let found = Matcher::new("ana".as_bytes()).kmp_match("banana".as_bytes());

    let expected = vec![0, 1, 2, 3, 2, 3];
    assert_eq!(found, expected);
}
| 183 | + |
#[test]
fn test_suffix_array() {
    let bobocel = SuffixArray::new("bobocel".as_bytes());
    let banana = SuffixArray::new("banana".as_bytes());

    // Suffix start positions in lexicographic order of the suffixes.
    assert_eq!(bobocel.sfx, vec![0, 2, 4, 5, 6, 1, 3]);
    assert_eq!(banana.sfx, vec![5, 3, 1, 0, 4, 2]);

    // "bobocel" vs "bocel" share "bo"; "anana" vs "ana" share "ana".
    assert_eq!(bobocel.longest_common_prefix(0, 2), 2);
    assert_eq!(banana.longest_common_prefix(1, 3), 3);

    // Check that sfx and rank.last() are essentially inverses of each other.
    let assert_inverse = |sa: &SuffixArray| {
        let final_rank = sa.rank.last().unwrap();
        for (pos, &r) in final_rank.iter().enumerate() {
            assert_eq!(sa.sfx[r], pos);
        }
    };
    assert_inverse(&bobocel);
    assert_inverse(&banana);
}
| 206 | + |
#[test]
fn test_palindrome() {
    // Entries alternate between centres on a byte and centres between two
    // neighbouring bytes of "banana".
    let radii = palindromes("banana".as_bytes());

    let expected = vec![1, 0, 1, 0, 3, 0, 5, 0, 3, 0, 1];
    assert_eq!(radii, expected);
}
|
101 | 215 | }
|
0 commit comments