Merge pull request #13 from vickumar1981/tversky-tanimoto-scores

Adding symmetric version of Tversky index/score.
vickumar1981 · Nov 18, 2018 · d3b32ba · d3b32ba
2 parents 7f81e8e + 2d59376
commit d3b32ba
Show file tree

Hide file tree

Showing 9 changed files with 57 additions and 6 deletions.
diff --git a/src/main/java/com/github/vickumar1981/stringdistance/util/StringDistance.java b/src/main/java/com/github/vickumar1981/stringdistance/util/StringDistance.java
@@ -90,4 +90,8 @@ public static Integer longestCommonSeq(String s1, String s2) {
     public static Double overlap(String s1, String s2) { return overlap(s1, s2, 1); }
 
     public static Double overlap(String s1, String s2, Integer n) { return overLap.overlap(s1, s2, n); }
+
+    public static Double tversky(String s1, String s2) { return tversky(s1, s2, 1d); }
+
+    public static Double tversky(String s1, String s2, Double n) { return jaccard.tversky(s1, s2, n); }
 }
diff --git a/src/main/scala/com/github/vickumar1981/stringdistance/StringDistance.scala b/src/main/scala/com/github/vickumar1981/stringdistance/StringDistance.scala
@@ -20,6 +20,7 @@ import com.github.vickumar1981.stringdistance.impl._
   * val ngramSimilarity: Double = NGram.score("karolin", "kathrin")
   * val bigramSimilarity: Double = NGram.score("karolin", "kathrin", 2)
   * val overlap: Double = Overlap.score("karolin", "kathrin")
+  * val tversky: Double = Tversky.score("karolin", "kathrin", 0.5)
   *
   * // Distances between strings
   * val damerauDist: Int = Damerau.distance("martha", "marhta")
@@ -42,6 +43,7 @@ object StringDistance {
   object LongestCommonSeq extends StringMetric[LongestCommonSeqAlorithm]
   object NGram extends StringMetric[NGramAlgorithm]
   object Overlap extends StringMetric[OverlapAlgorithm]
+  object Tversky extends StringMetric[TverskyAlgorithm]
 }
 
 /**

diff --git a/src/main/scala/com/github/vickumar1981/stringdistance/impl/JaccardImpl.scala b/src/main/scala/com/github/vickumar1981/stringdistance/impl/JaccardImpl.scala
@@ -5,7 +5,17 @@ import com.github.vickumar1981.stringdistance.interfaces.NGramTokenizer
 trait JaccardImpl extends NGramTokenizer {
   protected def jaccard(s1: String, s2: String, n: Int = 1): Double = {
     foldNGram(s1, s2, n)(0d)(_ => 1d) {
-      (s1TokLen, s2TokenLen, dist) => dist.toDouble / (s1TokLen + s2TokenLen - dist)
+      (s1Tok, s2Tok, dist) => dist.toDouble / (s1Tok.length + s2Tok.length - dist)
+    }
+  }
+
+  /**
+    * Symmetric Tversky index of two strings, computed over their bigrams.
+    *
+    * The score is `common / (common + n * only1 + n * only2)`, where `common` is the
+    * intersection length supplied by `foldNGram` and `only1` / `only2` count the bigrams
+    * of one string that do not occur in the other (repeated bigrams are counted each time).
+    *
+    * @param s1 The 1st String.
+    * @param s2 The 2nd String.
+    * @param n  Weight applied to both complements. This is not an n-gram size: the
+    *           tokenization is always done with bigrams.
+    * @return The Tversky score. 0 is returned when either string has fewer than two
+    *         characters, and also when the weighted denominator is zero (for example a
+    *         weight of 0 with no shared bigrams), which would otherwise evaluate to NaN.
+    */
+  protected def tversky(s1: String, s2: String, n: Double = 1): Double = {
+    foldNGram(s1, s2, 2)(0d)(_ => 1d) {
+      (s1Tok, s2Tok, dist) => {
+        // Count directly instead of materialising intermediate Boolean sequences.
+        val s1Only = s1Tok.count(tok => !s2Tok.contains(tok))
+        val s2Only = s2Tok.count(tok => !s1Tok.contains(tok))
+        val denominator = dist.toDouble + (n * s1Only) + (n * s2Only)
+        // Guard the degenerate 0 / 0 case so callers never receive NaN.
+        if (denominator == 0d) 0d else dist.toDouble / denominator
+      }
     }
   }
 }
diff --git a/src/main/scala/com/github/vickumar1981/stringdistance/impl/NGramImpl.scala b/src/main/scala/com/github/vickumar1981/stringdistance/impl/NGramImpl.scala
@@ -5,7 +5,7 @@ import com.github.vickumar1981.stringdistance.interfaces.NGramTokenizer
 trait NGramImpl extends NGramTokenizer {
   protected def nGram(s1: String, s2: String, n: Int = 1): Double = {
     foldNGram(s1, s2, n)(0d)(_ => 1d) {
-      (s1TokLen, s2TokenLen, dist) => 1 - dist.toDouble / math.max(s1TokLen, s2TokenLen)
+      (s1Tok, s2Tok, dist) => 1 - dist.toDouble / math.max(s1Tok.length, s2Tok.length)
     }
   }
 

diff --git a/src/main/scala/com/github/vickumar1981/stringdistance/impl/OverlapImpl.scala b/src/main/scala/com/github/vickumar1981/stringdistance/impl/OverlapImpl.scala
@@ -5,7 +5,7 @@ import com.github.vickumar1981.stringdistance.interfaces.NGramTokenizer
 trait OverlapImpl extends NGramTokenizer {
   protected def overlap(s1: String, s2: String, n: Int = 1): Double = {
     foldNGram(s1, s2, n)(0d)(_ => 1d) {
-      (s1TokLen, s2TokenLen, dist) => 1 - dist.toDouble / math.min(s1TokLen, s2TokenLen)
+      (s1Tok, s2Tok, dist) => 1 - dist.toDouble / math.min(s1Tok.length, s2Tok.length)
     }
   }
 }
diff --git a/src/main/scala/com/github/vickumar1981/stringdistance/implicits/package.scala b/src/main/scala/com/github/vickumar1981/stringdistance/implicits/package.scala
@@ -205,4 +205,18 @@ package object implicits {
       */
     override def score(s1: String, s2: String): Boolean = soundex(s1, s2)
   }
+
+  /**
+    * Implicit definition of tversky score for [[TverskyAlgorithm]].
+    */
+  implicit object TverskyScore extends JaccardImpl with WeightedScoringAlgorithm[TverskyAlgorithm, Double] {
+    /**
+      * The score method takes two strings and a weight, and returns the tversky score between them.
+      *
+      * @param s1 The 1st String.
+      * @param s2 The 2nd String.
+      * @param n The weight forwarded to `tversky`; defaults to 1.
+      * @return Returns the tversky score between Strings s1 and s2.
+      */
+    override def score(s1: String, s2: String, n: Double = 1): Double = tversky(s1, s2, n)
+  }
 }
diff --git a/src/main/scala/com/github/vickumar1981/stringdistance/interfaces/NGramTokenizer.scala b/src/main/scala/com/github/vickumar1981/stringdistance/interfaces/NGramTokenizer.scala
@@ -9,13 +9,13 @@ trait NGramTokenizer {
   protected def tokenizeNGram(a: String, n: Int): Array[String] = tokenize(a.toCharArray, n).map(_.mkString)
 
   @annotation.tailrec
-  private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) =>
+  private val sequence: (Array[Char], Array[Array[Char]], Int) => Array[Array[Char]] = (i, o, n) =>
     if (i.length <= n) o :+ i
     else sequence(i.tail, o :+ i.take(n), n)
 
   protected def foldNGram[T](s1: String, s2: String, n: Int = 1)
                             (err: => T)(success: (Int) => T)
-                            (fuzzy: (Int, Int, Int) => T): T = {
+                            (fuzzy: (Seq[String], Seq[String], Int) => T): T = {
     if (n <= 0 || s1.length < n || s2.length < n) err
     else if (s1.sameElements(s2)) {
       val s1Tokenized = tokenizeNGram(s1, n)
@@ -25,7 +25,7 @@ trait NGramTokenizer {
       val s1Tokenized = tokenizeNGram(s1, n)
       val s2Tokenized = tokenizeNGram(s2, n)
       val intersectionLength = intersectLength(s1Tokenized, s2Tokenized)
-      fuzzy(s1Tokenized.length, s2Tokenized.length, intersectionLength)
+      fuzzy(s1Tokenized.toSeq, s2Tokenized.toSeq, intersectionLength)
     }
   }
 }
diff --git a/src/main/scala/com/github/vickumar1981/stringdistance/package.scala b/src/main/scala/com/github/vickumar1981/stringdistance/package.scala
@@ -73,6 +73,11 @@ package object stringdistance {
     */
   trait SoundexAlgorithm extends StringMetricAlgorithm
 
+  /**
+    * A marker interface for the tversky similarity algorithm.
+    * It declares no members of its own and only extends [[StringMetricAlgorithm]].
+    */
+  trait TverskyAlgorithm extends StringMetricAlgorithm
+
   /**
     * The Strategy object has two strategies(reg ex) expressions on which to split input.
     * [[Strategy.splitWord]] splits a word into a sequence of characters.
@@ -248,6 +253,7 @@ package object stringdistance {
       def nGram(s2: String, nGram: Int = 1): Double = NGram.score(s1, s2, nGram)
       def nGramDist(s2: String, nGram: Int = 1): Double = NGram.distance(s1, s2, nGram)
       def overlap(s2: String, nGram: Int = 1): Double = Overlap.score(s1, s2, nGram)
+      def tversky(s2: String, n: Double = 1): Double = Tversky.score(s1, s2, n) // n: weight, not an n-gram size
 
       def metaphone(s2: String): Boolean = Metaphone.score(s1, s2)
       def soundex(s2: String): Boolean = Soundex.score(s1, s2)

diff --git a/src/test/scala/TestStringDistance.scala b/src/test/scala/TestStringDistance.scala
@@ -111,5 +111,20 @@ class TestStringDistance extends FlatSpec with Matchers {
       roundToPrecision(overlap) should be (t.overlap.get)
     })
   }
+
+  "The Tversky Score with weight 0.5" should "match the dice coefficient" in {
+    // Only fixtures that define a dice coefficient are checked. The for-comprehension
+    // unwraps the Option directly instead of filtering and then calling .get, and it
+    // iterates for its side effects rather than building a discarded collection.
+    for {
+      t <- testCases
+      expected <- t.diceCoefficient
+    } {
+      val tversky = t.s1.tversky(t.s2, 0.5)
+      roundToPrecision(tversky) should be (expected)
+    }
+  }
+
+  "The Tversky Score with weight 1.0" should "match the jaccard score using bigrams" in {
+    // Iterate for side effects only: every fixture that defines a jaccard value is
+    // asserted in turn, comparing the default-weight tversky score to bigram jaccard.
+    for (t <- testCases if t.jaccard.isDefined) {
+      val tversky = t.s1.tversky(t.s2)
+      val bigramJaccard = t.s1.jaccard(t.s2, 2)
+      roundToPrecision(tversky) should be (roundToPrecision(bigramJaccard))
+    }
+  }
 }