This repository has been archived by the owner on Nov 9, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 57
/
LevenshteinTokenUtil.java
281 lines (247 loc) · 10.4 KB
/
LevenshteinTokenUtil.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
package org.zanata.search;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
/**
 * Word-based (token-based) Levenshtein similarity utilities, used to score how
 * alike two strings (or lists of strings) are, ignoring case and common
 * stop-words.
 */
public class LevenshteinTokenUtil {
    /**
     * Tokens are separated by whitespace, optionally preceded by a comma or
     * full stop. Compiled once because String.split(String) would recompile
     * the regex on every call.
     */
    private static final Pattern SPLIT_PATTERN = Pattern.compile("[,.]?[\\s]+");

    /**
     * Lower-case stop words loaded from the classpath resource
     * "stopwords.txt" (one word per line; lines starting with '#' are
     * comments). Empty if the resource is missing.
     */
    private static final Set<String> stopwords;
    static {
        Set<String> stopwordsSet = new HashSet<String>();
        InputStream stream =
                LevenshteinTokenUtil.class.getResourceAsStream("stopwords.txt");
        // A missing resource degrades to an empty stop-word set instead of
        // failing class initialisation with an opaque NullPointerException.
        if (stream != null) {
            try {
                // decode as UTF-8 explicitly; the platform default charset
                // varies between JVMs
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(stream, "UTF-8"));
                try {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        if (!line.startsWith("#")) {
                            stopwordsSet.add(line);
                        }
                    }
                } finally {
                    reader.close();
                }
            } catch (IOException e) {
                throw new RuntimeException("failed to read stopwords.txt", e);
            }
        }
        stopwords = Collections.unmodifiableSet(stopwordsSet);
    }

    /** Static utility class: not instantiable. */
    private LevenshteinTokenUtil() {
    }

    /**
     * Computes the Levenshtein distance between two token arrays, counting
     * whole-token insertions, deletions and substitutions. Adapted from
     * http://web.archive.org/web/20110720093554
     * /http://www.merriampark.com/ldjava.htm (public domain)
     *
     * @param s first token array
     * @param t second token array
     * @return the number of token edits needed to turn s into t
     * @throws IllegalArgumentException if either array is null
     */
    public static int getLevenshteinDistanceInWords(String[] s, String[] t) {
        if (s == null || t == null) {
            throw new IllegalArgumentException("Strings must not be null");
        }
        /*
         * Rather than creating and retaining a matrix of size (n+1) x (m+1),
         * we maintain two single-dimensional rows of length n+1 and swap them
         * on each outer iteration: p holds the previous row's cost counts, d
         * the row currently being computed (the minimum of the cost to the
         * left, above, and diagonally up-left is all the algorithm needs).
         * The rows are swapped by reference, not copied, so this does not
         * cause an out-of-memory condition for very large inputs.
         */
        int n = s.length; // length of s
        int m = t.length; // length of t
        if (n == 0) {
            return m;
        } else if (m == 0) {
            return n;
        }
        int[] p = new int[n + 1]; // 'previous' cost row
        int[] d = new int[n + 1]; // cost row being computed
        // distance from the empty prefix of t: j deletions
        for (int i = 0; i <= n; i++) {
            p[i] = i;
        }
        for (int j = 1; j <= m; j++) {
            String t_j = t[j - 1]; // jth token of t
            d[0] = j;
            for (int i = 1; i <= n; i++) {
                int cost = s[i - 1].equals(t_j) ? 0 : 1;
                // minimum of cell to the left+1, above+1, diagonally
                // up-left + cost
                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1),
                        p[i - 1] + cost);
            }
            // swap rows: p now holds the row just computed
            int[] swap = p;
            p = d;
            d = swap;
        }
        // our last action in the loop was to swap d and p, so p now
        // actually has the most recent cost counts
        return p[n];
    }

    /**
     * Calculates the word-based, case-insensitive similarity of two strings
     * (range 0.0 to 1.0), ignoring stop-words.
     *
     * @param s1 first string
     * @param s2 second string
     * @return 1.0 for identical token sequences (including the case where
     *         both strings contain only stop-words), 0.0 for nothing in common
     */
    public static double getSimilarity(final String s1, final String s2) {
        String[] s1s = tokenise(s1);
        String[] s2s = tokenise(s2);
        int maxDistance = Math.max(s1s.length, s2s.length);
        // both strings consist entirely of stop-words: treat them as
        // identical rather than dividing by zero (which would return NaN)
        if (maxDistance == 0) {
            return 1.0;
        }
        int levDistance = getLevenshteinDistanceInWords(s1s, s2s);
        return (maxDistance - levDistance) / (double) maxDistance;
    }

    /**
     * Splits into tokens (lower-case), discarding stop words.
     *
     * @param s the string to tokenise
     * @return an array of lowercase tokens (words)
     */
    static String[] tokenise(String s) {
        // Locale.ROOT keeps lower-casing stable regardless of the default
        // locale (e.g. the Turkish dotless-i problem), so tokens reliably
        // match the (lower-case) stop-word list
        String[] tokens = SPLIT_PATTERN.split(s.toLowerCase(Locale.ROOT));
        List<String> kept = new ArrayList<String>(tokens.length);
        for (String tok : tokens) {
            if (!stopwords.contains(tok)) {
                kept.add(tok);
            }
        }
        return kept.toArray(new String[kept.size()]);
    }

    /**
     * Like tokenise(String) but does not discard stop words.
     *
     * @param s the string to tokenise
     * @return an array of lowercase tokens (words)
     */
    static String[] tokeniseAndKeepStopWords(String s) {
        return SPLIT_PATTERN.split(s.toLowerCase(Locale.ROOT));
    }

    /**
     * Counts the total number of (non-stop-word) tokens in the strings from
     * fromIndex to the end of the list.
     *
     * NOTE(review): this always tokenises with stop-words removed, even when
     * the caller is on its keep-stop-words retry pass — confirm whether the
     * extra strings should also keep stop-words in that case.
     *
     * @param strings the list of strings to count tokens in
     * @param fromIndex index of the first "extra" string
     * @return total token count of the extra strings
     */
    private static int countExtraStringLengths(List<String> strings,
            int fromIndex) {
        int total = 0;
        for (int i = fromIndex; i < strings.size(); i++) {
            String s = strings.get(i);
            String[] ss = tokenise(s);
            total += ss.length;
        }
        return total;
    }

    /**
     * Quick and dirty similarity score for a query string against a list of
     * strings. Returns the mean similarity of s1 against each string in the
     * list.
     *
     * @param s1 string to compare against each other string
     * @param strings2 other strings to compare s1 against
     * @return mean similarity between s1 and each of strings2, or 0.0 if
     *         strings2 is empty (instead of NaN from a 0/0 division)
     */
    public static double getSimilarity(final String s1,
            final List<String> strings2) {
        // nothing to compare against: report no similarity rather than NaN
        if (strings2.isEmpty()) {
            return 0.0;
        }
        double totalSimilarity = 0.0;
        for (String s2 : strings2) {
            totalSimilarity += getSimilarity(s1, s2);
        }
        return totalSimilarity / strings2.size();
    }

    /**
     * Calculate the word-based case-insensitive similarity of two lists of
     * strings (range 0.0 to 1.0).
     *
     * - Strings at the same index are compared.
     * - Stop-words are ignored in comparisons. See #stopwords.
     * - When both lists are empty, they are considered identical (returns 1.0)
     * - Empty strings are considered identical to other empty strings.
     *
     * If a string is made up only of stop-words, the calculation will be
     * repeated without ignoring stop-words. This is so that a sensible score
     * can be returned when there is nothing else to compare.
     *
     * If comparisons with and without stop-words generate no usable
     * information, 0.0 is returned as a fallback.
     *
     * TODO review use of stop-words in these comparisons, since results can
     * often be confusing to end-users.
     *
     * @param strings1 a list of strings to compare
     * @param strings2 the other list of strings to compare
     * @return average similarity between the strings, between 0.0 and 1.0
     */
    public static double getSimilarity(final List<String> strings1,
            final List<String> strings2) {
        return getSimilarity(strings1, strings2, true);
    }

    /**
     * Calculate word-based similarity of two lists of strings, optionally
     * ignoring stop-words for all comparisons.
     *
     * If stop-words are ignored but no usable data remains for comparison,
     * the calculation is repeated without ignoring stop-words.
     *
     * @param strings1 a list of strings to compare
     * @param strings2 the other list of strings to compare
     * @param ignoreStopWords whether stop-words should be ignored for the
     *        first attempt at comparison.
     * @return average similarity between the strings, between 0.0 and 1.0
     */
    private static double getSimilarity(List<String> strings1,
            List<String> strings2, boolean ignoreStopWords) {
        // all empty lists are identical
        if (strings1.isEmpty() && strings2.isEmpty()) {
            return 1.0;
        }
        // length of the shorter list
        final int minListSize = Math.min(strings1.size(), strings2.size());
        final List<String> longestList = strings1.size() > minListSize ?
                strings1 : strings2;
        // total tokens in the "extra" strings of the longer list; these count
        // fully against both the distance and the maximum distance
        final int extraStringLengths =
                countExtraStringLengths(longestList, minListSize);
        // running total of Levenshtein distance between corresponding strings
        // in the two lists
        int cumulativeLevDistance = 0;
        // running total of max editing distance between all the corresponding
        // strings.
        int cumulativeMaxDistance = 0;
        // count the strings which correspond between both lists
        for (int i = 0; i < minListSize; i++) {
            final String string1 = strings1.get(i);
            final String string2 = strings2.get(i);
            String[] tokens1 = ignoreStopWords ? tokenise(string1)
                    : tokeniseAndKeepStopWords(string1);
            String[] tokens2 = ignoreStopWords ? tokenise(string2)
                    : tokeniseAndKeepStopWords(string2);
            final int levenshteinDistance =
                    getLevenshteinDistanceInWords(tokens1, tokens2);
            cumulativeLevDistance += levenshteinDistance;
            // When a string contains only stop words, tokenise returns an
            // empty array, so this value can remain at 0.
            cumulativeMaxDistance += Math.max(tokens1.length, tokens2.length);
        }
        final int totalLevDistance = cumulativeLevDistance + extraStringLengths;
        final int totalMaxDistance = cumulativeMaxDistance + extraStringLengths;
        // if there would be a divide-by-zero situation due to all strings
        // being only stop-words, compare the stop words instead. If this does
        // not work, all strings must contain no tokens.
        if (totalMaxDistance == 0) {
            if (ignoreStopWords) {
                return getSimilarity(strings1, strings2, false);
            }
            // TODO fall back on plain string comparison instead.
            return 0.0;
        }
        return (totalMaxDistance - totalLevDistance)
                / (double) totalMaxDistance;
    }
}