/
search.go
152 lines (141 loc) · 6.83 KB
/
search.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package strings
// stringFinder efficiently finds strings in a source text. It's implemented
// using the Boyer-Moore string search algorithm:
// https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
// https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
// document uses 1-based indexing)
// stringFinder 查找字符串, BM 算法
type stringFinder struct {
// pattern is the string that we are searching for in the text.
pattern string
// badCharSkip[b] contains the distance between the last byte of pattern
// and the rightmost occurrence of b in pattern. If b is not in pattern,
// badCharSkip[b] is len(pattern).
// badCharSkip[b] 包含 pattern 的最后一个字节与 pattern 中最右的b出现之间的距离。 如果b不在模式中,则 badCharSkip[b] 为 len(pattern) 。
//
// Whenever a mismatch is found with byte b in the text, we can safely
// shift the matching frame at least badCharSkip[b] until the next time
// the matching char could be in alignment.
// 只要发现 text 中的字节 b 不匹配,我们就可以安全地将匹配帧至少移至 badCharSkip[b] ,直到下次匹配的 char 可以对齐为止。
badCharSkip [256]int // 坏字符表
// goodSuffixSkip[i] defines how far we can shift the matching frame given
// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
// not. There are two cases to consider:
// goodSuffixSkip[i] 定义了如果后缀模式 pattern[i+1:] 匹配,我们可以将匹配帧移动多远,而字节模式 pattern[i] 不匹配。 有两种情况需要考虑:
//
// 1. The matched suffix occurs elsewhere in pattern (with a different
// byte preceding it that we might possibly match). In this case, we can
// shift the matching frame to align with the next suffix chunk. For
// example, the pattern "mississi" has the suffix "issi" next occurring
// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
// shift+len(suffix) == 3+4 == 7.
// 1. 匹配的后缀出现在模式中的其他位置(我们可能会匹配一个不同的字节)。 在这种情况下,我们可以移动匹配的框架以使其与下一个后缀块对齐。
// 例如,pattern "mississi" 的下一个下标 "issi" 出现在索引 1 处(从右到左顺序),因此 goodSuffixSkip[3] == shift+len(suffix) == 3+4 == 7 。
//
//
// 2. If the matched suffix does not occur elsewhere in pattern, then the
// matching frame may share part of its prefix with the end of the
// matching suffix. In this case, goodSuffixSkip[i] will contain how far
// to shift the frame to align this portion of the prefix to the
// suffix. For example, in the pattern "abcxxxabc", when the first
// mismatch from the back is found to be in position 3, the matching
// suffix "xxabc" is not found elsewhere in the pattern. However, its
// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
// 2. 如果匹配的后缀未在 pattern 中的其他位置出现,则匹配的帧可能会与匹配的后缀的末尾共享部分前缀。 在这种情况下,goodSuffixSkip[i] 将
// 包含将帧移动多远以使前缀的此部分与后缀对齐。 例如,在 pattern "abcxxxabc" 中,当发现从后面开始的第一个不匹配项位于位置 3 时,在 pattern
// 的其他位置找不到匹配的后缀 "xxabc" 。 但是,其最右边的 "abc"(在位置6)是整个 pattern 的前缀,因此 goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11 。
goodSuffixSkip []int // 好后缀
}
func makeStringFinder(pattern string) *stringFinder {
f := &stringFinder{
pattern: pattern,
goodSuffixSkip: make([]int, len(pattern)),
}
// last is the index of the last character in the pattern.
// last 是 pattern 中最后一个字符的索引
last := len(pattern) - 1
// Build bad character table.
// Bytes not in the pattern can skip one pattern's length.
// 构建 badCharSkip ,初始化为 len(pattern)
// 构建坏字符表 badCharSkip 。不在模式中的字节可以跳过一个 pattern 的长度。
for i := range f.badCharSkip {
f.badCharSkip[i] = len(pattern)
}
// The loop condition is < instead of <= so that the last byte does not
// have a zero distance to itself. Finding this byte out of place implies
// that it is not in the last position.
// 循环条件是 < 而不是 <= ,因此最后一个字节与其自身之间的距离不为零。 如果发现此字节不正确,则表明它不在最后一个位置。
// 从前往后遍历,更新存在的字符的 badCharSkip
for i := 0; i < last; i++ {
f.badCharSkip[pattern[i]] = last - i
}
// Build good suffix table.
// First pass: set each value to the next index which starts a prefix of
// pattern.
// 构建好后缀表 goodSuffixSkip
// First :好后缀的前缀匹配。上面说的第 2 点。
lastPrefix := last
for i := last; i >= 0; i-- {
// 是否有好后缀的前缀匹配,比如 DDABDDD 中 D, DD 就是
if HasPrefix(pattern, pattern[i+1:]) {
lastPrefix = i + 1
}
// lastPrefix is the shift, and (last-i) is len(suffix).
// 比如上面的:(abcxxxabc, last = 8)
// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11
// i = 3 的时候 lastPrefix = 3 + 1 = 4, (last-i) = 8 - 3 = 5 , lastPrefix + last - i = 4 + 5 = 9
f.goodSuffixSkip[i] = lastPrefix + last - i
}
// Second pass: find repeats of pattern's suffix starting from the front.
// Second: 从前面开始查找 pattern 相同的后缀。上面说得第 1 点。
for i := 0; i < last; i++ {
// 获取最长后缀
lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
// (last-i) is the shift, and lenSuffix is len(suffix).
f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
}
}
return f
}
// 最长公共前缀
func longestCommonSuffix(a, b string) (i int) {
for ; i < len(a) && i < len(b); i++ {
if a[len(a)-1-i] != b[len(b)-1-i] {
break
}
}
return
}
// next returns the index in text of the first occurrence of the pattern. If
// the pattern is not found, it returns -1.
// next 返回第一次出现的 pattern 的索引。 如果找不到该模式,则返回-1。
func (f *stringFinder) next(text string) int {
i := len(f.pattern) - 1
for i < len(text) {
// Compare backwards from the end until the first unmatching character.
// 从后往前比较
j := len(f.pattern) - 1
for j >= 0 && text[i] == f.pattern[j] {
i--
j--
}
// 匹配上了
if j < 0 {
return i + 1 // match
}
// 没匹配上,去 badCharSkip 和 goodSuffixSkip 中较大者
i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
}
return -1
}
func max(a, b int) int {
if a > b {
return a
}
return b
}