From 276f310a6561d2be8860583b8f0bdcff8b0dbe9d Mon Sep 17 00:00:00 2001 From: David Mason Date: Tue, 9 Jun 2015 08:53:54 +1000 Subject: [PATCH] Fall back on stop-words comparison for similarity when nothing else is available. A divide-by-zero condition was generating a NaN value for similarity for strings that contain only stop-words. When that condition would be met, this will instead repeat the comparison without ignoring stop-words, then finally just return 0.0 if there are still no tokens to compare. --- .../zanata/search/LevenshteinTokenUtil.java | 141 +++++++++++++----- .../search/LevenshteinTokenUtilTest.java | 57 +++++++ 2 files changed, 161 insertions(+), 37 deletions(-) diff --git a/zanata-war/src/main/java/org/zanata/search/LevenshteinTokenUtil.java b/zanata-war/src/main/java/org/zanata/search/LevenshteinTokenUtil.java index de2278f33d..223fd10740 100644 --- a/zanata-war/src/main/java/org/zanata/search/LevenshteinTokenUtil.java +++ b/zanata-war/src/main/java/org/zanata/search/LevenshteinTokenUtil.java @@ -129,8 +129,8 @@ public static double getSimilarity(final String s1, final String s2) { /** * Splits into tokens (lower-case). * - * @param s - * @return + * @param s the string to tokenise + * @return an array of lowercase tokens (words) */ static String[] tokenise(String s) { String[] tokens = s.toLowerCase().split(SPLIT_REGEX); @@ -143,6 +143,16 @@ static String[] tokenise(String s) { return list.toArray(new String[list.size()]); } + /** + * Like tokenise(String) but does not discard stop words. + * + * @param s the string to tokenise + * @return an array of lowercase tokens (words) + */ + static String[] tokeniseAndKeepStopWords(String s) { + return s.toLowerCase().split(SPLIT_REGEX); + } + private static int countExtraStringLengths(List strings, int fromIndex) { int total = 0; @@ -159,56 +169,113 @@ private static int countExtraStringLengths(List strings, * strings. Returns the mean similarity of s1 against each string in the * list. 
* - * @param s1 - * @param strings2 - * @return + * @param s1 string to compare against each other string + * @param strings2 other strings to compare s1 against + * @return mean similarity between s1 and each of strings2 */ public static double getSimilarity(final String s1, final List strings2) { double totalSimilarity = 0.0; - int stringCount = strings2.size(); - for (int i = 0; i < stringCount; i++) { - String s2 = strings2.get(i); + for (String s2 : strings2) { totalSimilarity += getSimilarity(s1, s2); } - double meanSimilarity = totalSimilarity / stringCount; - return meanSimilarity; + return totalSimilarity / strings2.size(); } + /** + * Calculate the word-based case-insensitive similarity of two lists of + * strings (range 0.0 to 1.0). + * + * - Strings at the same index are compared. + * - Stop-words are ignored in comparisons. See #stopwords. + * - When both lists are empty, they are considered identical (returns 1.0). + * - Empty strings are considered identical to other empty strings. + * + * If all strings are made up only of stop-words, the calculation will be + * repeated without ignoring stop-words. This is so that a sensible score + * can be returned when there is nothing else to compare. + * + * If comparisons with and without stop-words generate no usable information, + * 0.0 is returned as a fallback. + * + * TODO review use of stop-words in these comparisons, since results can + * often be confusing to end-users. 
+ * + * @param strings1 a list of strings to compare + * @param strings2 the other list of strings to compare + * @return average similarity between the strings, between 0.0 and 1.0 + */ public static double getSimilarity(final List strings1, final List strings2) { - // length of the shorter list - int minListSize; - - // count the extra strings first: - int extraStringLengths; // total of "extra" strings in the longer list - if (strings1.size() < strings2.size()) { - minListSize = strings1.size(); - extraStringLengths = countExtraStringLengths(strings2, minListSize); - } else { - minListSize = strings2.size(); - extraStringLengths = countExtraStringLengths(strings1, minListSize); + return getSimilarity(strings1, strings2, true); + } + + /** + * Calculate word-based similarity of two lists of strings, optionally + * ignoring stop-words for all comparisons. + * + * If stop-words are ignored but no usable data remains for comparison, + * the calculation is repeated without ignoring stop-words. + * + * @param ignoreStopWords whether stop-words should be ignored for the first + * attempt at comparison. + * @return average similarity between the strings, between 0.0 and 1.0 + */ + private static double getSimilarity(List strings1, + List strings2, boolean ignoreStopWords) { + // all empty lists are identical + if (strings1.isEmpty() && strings2.isEmpty()) { + return 1.0; } - // total of Levenshtein distance between corresponding strings in the - // two lists, plus the length of any extra strings if one list is longer - int totalLevDistance = extraStringLengths; - // total of max editing distance between all the corresponding strings, - // plus length of extra strings - int totalMaxDistance = extraStringLengths; + // length of the shorter list + final int minListSize = Math.min(strings1.size(), strings2.size()); + final List longestList = strings1.size() > minListSize ? 
+ strings1 : strings2; + + // total of "extra" strings in the longer list + final int extraStringLengths = + countExtraStringLengths(longestList, minListSize); + + // running total of Levenshtein distance between corresponding strings + // in the two lists + int cumulativeLevDistance = 0; - // now count the strings which correspond between both lists + // running total of max editing distance between all the corresponding + // strings. + int cumulativeMaxDistance = 0; + + // count the strings which correspond between both lists for (int i = 0; i < minListSize; i++) { - String[] s1 = tokenise(strings1.get(i)); - String[] s2 = tokenise(strings2.get(i)); - int levenshteinDistance = getLevenshteinDistanceInWords(s1, s2); - totalLevDistance += levenshteinDistance; - totalMaxDistance += Math.max(s1.length, s2.length); + final String string1 = strings1.get(i); + final String string2 = strings2.get(i); + String[] tokens1 = ignoreStopWords ? tokenise(string1) + : tokeniseAndKeepStopWords(string1); + String[] tokens2 = ignoreStopWords ? tokenise(string2) + : tokeniseAndKeepStopWords(string2); + final int levenshteinDistance = + getLevenshteinDistanceInWords(tokens1, tokens2); + cumulativeLevDistance += levenshteinDistance; + + // When a string contains only stop words, tokenise returns an empty + // array, so this value can remain at 0. + cumulativeMaxDistance += Math.max(tokens1.length, tokens2.length); } - double similarity = - (totalMaxDistance - totalLevDistance) - / (double) totalMaxDistance; - return similarity; + final int totalLevDistance = cumulativeLevDistance + extraStringLengths; + final int totalMaxDistance = cumulativeMaxDistance + extraStringLengths; + + // if there would be a divide-by-zero situation due to all strings being + // only stop-words, compare the stop words instead. If this does not + // work, all strings must contain no tokens. 
+ if (totalMaxDistance == 0) { + if (ignoreStopWords) { + return getSimilarity(strings1, strings2, false); + } + // TODO fall back on plain string comparison instead. + return 0.0; + } + + return (totalMaxDistance - totalLevDistance) / (double) totalMaxDistance; } } diff --git a/zanata-war/src/test/java/org/zanata/search/LevenshteinTokenUtilTest.java b/zanata-war/src/test/java/org/zanata/search/LevenshteinTokenUtilTest.java index 4cf0b02ead..b864f21228 100644 --- a/zanata-war/src/test/java/org/zanata/search/LevenshteinTokenUtilTest.java +++ b/zanata-war/src/test/java/org/zanata/search/LevenshteinTokenUtilTest.java @@ -120,4 +120,61 @@ public void testTokenise() { assert thefoobar.length == 2; } + @Test + public void testStopWordsIgnoredWhenOtherWordsPresent() { + assertDifferentStringSimilarity("The foo is not bar", "A foo bar", 1.0); + assertDifferentStringSimilarity("An bar is an baz", "My bar is foo", 0.5); + } + + @Test + public void testStopWordsNotIgnoredWhenNoOtherWordsPresent() { + assertDifferentStringSimilarity("The is not", "A", 0.0); + assertDifferentStringSimilarity("The not is and", "It not is but", 0.5); + } + + @Test + public void testIdenticalStringsSimilarity() { + assertIdenticalStringsAreSimilar( + "I am the very model of a modern major general"); + } + + @Test + public void testEmptyListOfStringsSimilarity() { + double similarity = LevenshteinTokenUtil.getSimilarity( + Arrays.asList(), Arrays.asList()); + assertThat(similarity).isEqualTo(1.0, DELTA); + } + + @Test + public void testEmptyStringsSimilarity() { + assertIdenticalStringsAreSimilar(""); + } + + /** + * Asserts that getSimilarity gives 1.00 when matching the given string + * against itself. 
+ * + * @param s the string to test + */ + private void assertIdenticalStringsAreSimilar(String s) { + double similarity = LevenshteinTokenUtil.getSimilarity(Arrays.asList(s), + Arrays.asList(s)); + assertThat(similarity).isEqualTo(1.0, DELTA); + } + + /** + * Asserts that getSimilarity gives the expected value when matching the + * given strings against each other. + * + * @param s1 the first string to test + * @param s2 the other string to test + */ + private void assertDifferentStringSimilarity(String s1, String s2, + double expected) { + double similarity = LevenshteinTokenUtil + .getSimilarity(Arrays.asList(s1), Arrays.asList(s2)); + assertThat(similarity).isEqualTo(expected, DELTA); + } + + }