From 276f310a6561d2be8860583b8f0bdcff8b0dbe9d Mon Sep 17 00:00:00 2001
From: David Mason <drdmason@gmail.com>
Date: Tue, 9 Jun 2015 08:53:54 +1000
Subject: [PATCH] Fall back on stop-words comparison for similarity when
 nothing else is available.

A divide-by-zero condition was generating a NaN value for similarity for strings
that contain only stop-words. When that condition would be met, this will instead
repeat the comparison without ignoring stop-words, then finally just return 0.0
if there are still no tokens to compare.
---
 .../zanata/search/LevenshteinTokenUtil.java   | 141 +++++++++++++-----
 .../search/LevenshteinTokenUtilTest.java      |  57 +++++++
 2 files changed, 161 insertions(+), 37 deletions(-)
diff --git a/zanata-war/src/main/java/org/zanata/search/LevenshteinTokenUtil.java b/zanata-war/src/main/java/org/zanata/search/LevenshteinTokenUtil.java
index de2278f33d..223fd10740 100644
--- a/zanata-war/src/main/java/org/zanata/search/LevenshteinTokenUtil.java
+++ b/zanata-war/src/main/java/org/zanata/search/LevenshteinTokenUtil.java
@@ -129,8 +129,8 @@ public static double getSimilarity(final String s1, final String s2) {
     /**
      * Splits into tokens (lower-case).
      *
-     * @param s
-     * @return
+     * @param s the string to tokenise
+     * @return an array of lowercase tokens (words)
      */
     static String[] tokenise(String s) {
         String[] tokens = s.toLowerCase().split(SPLIT_REGEX);
@@ -143,6 +143,16 @@ static String[] tokenise(String s) {
         return list.toArray(new String[list.size()]);
     }
 
+    /**
+     * Like tokenise(String) but does not discard stop words.
+     *
+     * @param s the string to tokenise
+     * @return an array of lowercase tokens (words)
+     */
+    static String[] tokeniseAndKeepStopWords(String s) {
+        return s.toLowerCase().split(SPLIT_REGEX);
+    }
+
     private static int countExtraStringLengths(List<String> strings,
             int fromIndex) {
         int total = 0;
@@ -159,56 +169,113 @@ private static int countExtraStringLengths(List<String> strings,
      * strings. Returns the mean similarity of s1 against each string in the
      * list.
      *
-     * @param s1
-     * @param strings2
-     * @return
+     * @param s1 string to compare against each other string
+     * @param strings2 other strings to compare s1 against
+     * @return mean similarity between s1 and each of strings2
      */
     public static double getSimilarity(final String s1,
             final List<String> strings2) {
         double totalSimilarity = 0.0;
-        int stringCount = strings2.size();
-        for (int i = 0; i < stringCount; i++) {
-            String s2 = strings2.get(i);
+        for (String s2 : strings2) {
             totalSimilarity += getSimilarity(s1, s2);
         }
-        double meanSimilarity = totalSimilarity / stringCount;
-        return meanSimilarity;
+        return totalSimilarity / strings2.size();
     }
 
+    /**
+     * Calculate the word-based case-insensitive similarity of two lists of
+     * strings (range 0.0 to 1.0).
+     *
+     *  - Strings at the same index are compared.
+     *  - Stop-words are ignored in comparisons. See #stopwords.
+     *  - When both lists are empty, they are considered identical (returns 1.0)
+     *  - Empty strings are considered identical to other empty strings.
+     *
+     * If a string is made up only of stop-words, the calculation will be
+     * repeated without ignoring stop-words. This is so that a sensible score
+     * can be returned when there is nothing else to compare.
+     *
+     * If comparisons with and without stop-words generate no usable information,
+     * 0.0 is returned as a fallback.
+     *
+     * TODO review use of stop-words in these comparisons, since results can
+     * often be confusing to end-users.
+     *
+     * @param strings1 a list of strings to compare
+     * @param strings2 the other list of strings to compare
+     * @return average similarity between the strings, between 0.0 and 1.0
+     */
     public static double getSimilarity(final List<String> strings1,
             final List<String> strings2) {
-        // length of the shorter list
-        int minListSize;
-
-        // count the extra strings first:
-        int extraStringLengths; // total of "extra" strings in the longer list
-        if (strings1.size() < strings2.size()) {
-            minListSize = strings1.size();
-            extraStringLengths = countExtraStringLengths(strings2, minListSize);
-        } else {
-            minListSize = strings2.size();
-            extraStringLengths = countExtraStringLengths(strings1, minListSize);
+        return getSimilarity(strings1, strings2, true);
+    }
+
+    /**
+     * Calculate word-based similarity of two lists of strings, optionally
+     * ignoring stop-words for all comparisons.
+     *
+     * If stop-words are ignored but no usable data remains for comparison,
+     * the calculation is repeated without ignoring stop-words.
+     *
+     * @param ignoreStopWords whether stop-words should be ignored for the first
+     *                        attempt at comparison.
+     * @return average similarity between the strings, between 0.0 and 1.0
+     */
+    private static double getSimilarity(List<String> strings1,
+            List<String> strings2, boolean ignoreStopWords) {
+        // all empty lists are identical
+        if (strings1.isEmpty() && strings2.isEmpty()) {
+            return 1.0;
         }
 
-        // total of Levenshtein distance between corresponding strings in the
-        // two lists, plus the length of any extra strings if one list is longer
-        int totalLevDistance = extraStringLengths;
-        // total of max editing distance between all the corresponding strings,
-        // plus length of extra strings
-        int totalMaxDistance = extraStringLengths;
+        // length of the shorter list
+        final int minListSize = Math.min(strings1.size(), strings2.size());
+        final List<String> longestList = strings1.size() > minListSize ?
+                strings1 : strings2;
+
+        // total of "extra" strings in the longer list
+        final int extraStringLengths =
+                countExtraStringLengths(longestList, minListSize);
+
+        // running total of Levenshtein distance between corresponding strings
+        // in the two lists
+        int cumulativeLevDistance = 0;
 
-        // now count the strings which correspond between both lists
+        // running total of max editing distance between all the corresponding
+        // strings.
+        int cumulativeMaxDistance = 0;
+
+        // count the strings which correspond between both lists
         for (int i = 0; i < minListSize; i++) {
-            String[] s1 = tokenise(strings1.get(i));
-            String[] s2 = tokenise(strings2.get(i));
-            int levenshteinDistance = getLevenshteinDistanceInWords(s1, s2);
-            totalLevDistance += levenshteinDistance;
-            totalMaxDistance += Math.max(s1.length, s2.length);
+            final String string1 = strings1.get(i);
+            final String string2 = strings2.get(i);
+            String[] tokens1 = ignoreStopWords ? tokenise(string1)
+                    : tokeniseAndKeepStopWords(string1);
+            String[] tokens2 = ignoreStopWords ? tokenise(string2)
+                    : tokeniseAndKeepStopWords(string2);
+            final int levenshteinDistance =
+                    getLevenshteinDistanceInWords(tokens1, tokens2);
+            cumulativeLevDistance += levenshteinDistance;
+
+            // When a string contains only stop words, tokenise returns an empty
+            // array, so this value can remain at 0.
+            cumulativeMaxDistance += Math.max(tokens1.length, tokens2.length);
         }
-        double similarity =
-                (totalMaxDistance - totalLevDistance)
-                        / (double) totalMaxDistance;
-        return similarity;
+        final int totalLevDistance = cumulativeLevDistance + extraStringLengths;
+        final int totalMaxDistance = cumulativeMaxDistance + extraStringLengths;
+
+        // if there would be a divide-by-zero situation due to all strings being
+        // only stop-words, compare the stop words instead. If this does not
+        // work, all strings must contain no tokens.
+        if (totalMaxDistance == 0) {
+            if (ignoreStopWords) {
+                return getSimilarity(strings1, strings2, false);
+            }
+            // TODO fall back on plain string comparison instead.
+            return 0.0;
+        }
+
+        return (totalMaxDistance - totalLevDistance) / (double) totalMaxDistance;
     }
 
 }
diff --git a/zanata-war/src/test/java/org/zanata/search/LevenshteinTokenUtilTest.java b/zanata-war/src/test/java/org/zanata/search/LevenshteinTokenUtilTest.java
index 4cf0b02ead..b864f21228 100644
--- a/zanata-war/src/test/java/org/zanata/search/LevenshteinTokenUtilTest.java
+++ b/zanata-war/src/test/java/org/zanata/search/LevenshteinTokenUtilTest.java
@@ -120,4 +120,61 @@ public void testTokenise() {
         assert thefoobar.length == 2;
     }
 
+    @Test
+    public void testStopWordsIgnoredWhenOtherWordsPresent() {
+        assertDifferentStringSimilarity("The foo is not bar", "A foo bar", 1.0);
+        assertDifferentStringSimilarity("An bar is an baz", "My bar is foo", 0.5);
+    }
+
+    @Test
+    public void testStopWordsNotIgnoredWhenNoOtherWordsPresent() {
+        assertDifferentStringSimilarity("The is not", "A", 0.0);
+        assertDifferentStringSimilarity("The not is and", "It not is but", 0.5);
+    }
+
+    @Test
+    public void testIdenticalStringsSimilarity() {
+        assertIdenticalStringsAreSimilar(
+                "I am the very model of a modern major general");
+    }
+
+    @Test
+    public void testEmptyListOfStringsSimilarity() {
+        double similarity = LevenshteinTokenUtil.getSimilarity(
+                Arrays.<String>asList(), Arrays.<String>asList());
+        assertThat(similarity).isEqualTo(1.0, DELTA);
+    }
+
+    @Test
+    public void testEmptyStringsSimilarity() {
+        assertIdenticalStringsAreSimilar("");
+    }
+
+    /**
+     * Asserts that getSimilarity gives 1.00 when matching the given string
+     * against itself.
+     *
+     * @param s the string to test
+     */
+    private void assertIdenticalStringsAreSimilar(String s) {
+        double similarity = LevenshteinTokenUtil.getSimilarity(Arrays.asList(s),
+                Arrays.asList(s));
+        assertThat(similarity).isEqualTo(1.0, DELTA);
+    }
+
+    /**
+     * Asserts that getSimilarity gives the expected value when matching the
+     * given strings against each other.
+     *
+     * @param s1 the first string to test
+     * @param s2 the other string to test
+     */
+    private void assertDifferentStringSimilarity(String s1, String s2,
+            double expected) {
+        double similarity = LevenshteinTokenUtil
+                .getSimilarity(Arrays.asList(s1), Arrays.asList(s2));
+        assertThat(similarity).isEqualTo(expected, DELTA);
+    }
+
+
 }