-
Notifications
You must be signed in to change notification settings - Fork 0
/
boyermoore.go
154 lines (135 loc) · 4.64 KB
/
boyermoore.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package wizardry
import (
"log"
"strings"
"github.com/yizhibenpao/go-filetype/wizardry/wizutil"
)
// StringFinder efficiently finds strings in a source text. It's implemented
// using the Boyer-Moore string search algorithm:
// http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
// http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
// document uses 1-based indexing)
type StringFinder struct {
// pattern is the string that we are searching for in the text.
pattern string
// badCharSkip[b] contains the distance between the last byte of pattern
// and the rightmost occurrence of b in pattern. If b is not in pattern,
// badCharSkip[b] is len(pattern).
//
// Whenever a mismatch is found with byte b in the text, we can safely
// shift the matching frame at least badCharSkip[b] until the next time
// the matching char could be in alignment.
badCharSkip [256]int64
// goodSuffixSkip[i] defines how far we can shift the matching frame given
// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
// not. There are two cases to consider:
//
// 1. The matched suffix occurs elsewhere in pattern (with a different
// byte preceding it that we might possibly match). In this case, we can
// shift the matching frame to align with the next suffix chunk. For
// example, the pattern "mississi" has the suffix "issi" next occurring
// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
// shift+len(suffix) == 3+4 == 7.
//
// 2. If the matched suffix does not occur elsewhere in pattern, then the
// matching frame may share part of its prefix with the end of the
// matching suffix. In this case, goodSuffixSkip[i] will contain how far
// to shift the frame to align this portion of the prefix to the
// suffix. For example, in the pattern "abcxxxabc", when the first
// mismatch from the back is found to be in position 3, the matching
// suffix "xxabc" is not found elsewhere in the pattern. However, its
// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
goodSuffixSkip []int64
}
// MakeStringFinder prepares a finder for a given pattern
func MakeStringFinder(pattern string) *StringFinder {
f := &StringFinder{
pattern: pattern,
goodSuffixSkip: make([]int64, len(pattern)),
}
// last is the index of the last character in the pattern.
last := len(pattern) - 1
// Build bad character table.
// Bytes not in the pattern can skip one pattern's length.
for i := range f.badCharSkip {
f.badCharSkip[i] = int64(len(pattern))
}
// The loop condition is < instead of <= so that the last byte does not
// have a zero distance to itself. Finding this byte out of place implies
// that it is not in the last position.
for i := 0; i < last; i++ {
f.badCharSkip[pattern[i]] = int64(last - i)
}
// Build good suffix table.
// First pass: set each value to the next index which starts a prefix of
// pattern.
lastPrefix := last
for i := last; i >= 0; i-- {
if strings.HasPrefix(pattern, pattern[i+1:]) {
lastPrefix = i + 1
}
// lastPrefix is the shift, and (last-i) is len(suffix).
f.goodSuffixSkip[i] = int64(lastPrefix + last - i)
}
// Second pass: find repeats of pattern's suffix starting from the front.
for i := 0; i < last; i++ {
lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
// (last-i) is the shift, and lenSuffix is len(suffix).
f.goodSuffixSkip[last-lenSuffix] = int64(lenSuffix + last - i)
}
}
return f
}
func longestCommonSuffix(a, b string) (i int) {
for ; i < len(a) && i < len(b); i++ {
if a[len(a)-1-i] != b[len(b)-1-i] {
break
}
}
return
}
// next returns the index in text of the first occurrence of the pattern. If
// the pattern is not found, it returns -1.
func (f *StringFinder) next(sr *wizutil.SliceReader) int64 {
i := int64(len(f.pattern) - 1)
bv := &wizutil.ByteView{
Input: sr,
LookBack: int64(len(f.pattern)),
}
for i < sr.Size() {
// Compare backwards from the end until the first unmatching character.
j := len(f.pattern) - 1
var c int
for {
if j < 0 {
// match
return i + 1
}
c = bv.Get(i)
if c == -1 {
// relay errors
log.Printf("Read error at %d", i)
return -1
}
if byte(c) != f.pattern[j] {
// mismatch, must skip
break
}
i--
j--
}
i += max(f.badCharSkip[c], f.goodSuffixSkip[j])
}
return -1
}
func max(a, b int64) int64 {
if a > b {
return a
}
return b
}