Skip to content

Commit

Permalink
Merge pull request #13 from vickumar1981/tversky-tanimoto-scores
Browse files Browse the repository at this point in the history
Adding symmetric version of Tversky index/score.
  • Loading branch information
vickumar1981 committed Nov 18, 2018
2 parents 7f81e8e + 2d59376 commit d3b32ba
Show file tree
Hide file tree
Showing 9 changed files with 57 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,8 @@ public static Integer longestCommonSeq(String s1, String s2) {
/**
 * Overlap score of two strings using {@code n = 1}.
 *
 * Convenience overload that forwards to {@code overlap(s1, s2, 1)}.
 *
 * @param s1 the first string
 * @param s2 the second string
 * @return the value returned by the three-argument overload
 */
public static Double overlap(String s1, String s2) {
    return overlap(s1, s2, 1);
}

/**
 * Overlap score of two strings, forwarded to {@code overLap.overlap}.
 *
 * @param s1 the first string
 * @param s2 the second string
 * @param n  passed through unchanged; presumably the n-gram size — confirm against
 *           the {@code overLap} implementation
 * @return the value returned by {@code overLap.overlap(s1, s2, n)}
 */
public static Double overlap(String s1, String s2, Integer n) {
    return overLap.overlap(s1, s2, n);
}

/**
 * Tversky score of two strings using {@code n = 1d}.
 *
 * Convenience overload that forwards to {@code tversky(s1, s2, 1d)}.
 *
 * @param s1 the first string
 * @param s2 the second string
 * @return the value returned by the three-argument overload
 */
public static Double tversky(String s1, String s2) {
    return tversky(s1, s2, 1d);
}

/**
 * Tversky score of two strings, forwarded to {@code jaccard.tversky}.
 *
 * @param s1 the first string
 * @param s2 the second string
 * @param n  passed through unchanged; presumably the Tversky weight rather than an
 *           n-gram size — confirm against the {@code jaccard} implementation
 * @return the value returned by {@code jaccard.tversky(s1, s2, n)}
 */
public static Double tversky(String s1, String s2, Double n) {
    return jaccard.tversky(s1, s2, n);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import com.github.vickumar1981.stringdistance.impl._
* val ngramSimilarity: Double = NGram.score("karolin", "kathrin")
* val bigramSimilarity: Double = NGram.score("karolin", "kathrin", 2)
* val overlap: Double = Overlap.score("karolin", "kathrin")
* val tversky: Double = Tversky.score("karolin", "kathrin", 0.5)
*
* // Distances between strings
* val damerauDist: Int = Damerau.distance("martha", "marhta")
Expand All @@ -42,6 +43,7 @@ object StringDistance {
// Each object below extends StringMetric, parameterised by the marker type of its algorithm.
object LongestCommonSeq extends StringMetric[LongestCommonSeqAlorithm]
object NGram extends StringMetric[NGramAlgorithm]
object Overlap extends StringMetric[OverlapAlgorithm]
// Tversky metric, keyed by the TverskyAlgorithm marker type.
object Tversky extends StringMetric[TverskyAlgorithm]
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,17 @@ import com.github.vickumar1981.stringdistance.interfaces.NGramTokenizer
trait JaccardImpl extends NGramTokenizer {
/**
 * Jaccard score of `s1` and `s2` over their n-gram tokens.
 *
 * Delegates to `foldNGram` with `0d` as the error value and `1d` as the success value;
 * otherwise the score is `dist / (|s1 tokens| + |s2 tokens| - dist)`, where `dist` is the
 * third value `foldNGram` hands to the callback.
 *
 * @param s1 The 1st String.
 * @param s2 The 2nd String.
 * @param n  n-gram size passed to `foldNGram`; defaults to 1.
 * @return The Jaccard score as a Double.
 */
protected def jaccard(s1: String, s2: String, n: Int = 1): Double = {
foldNGram(s1, s2, n)(0d)(_ => 1d) {
// NOTE(review): the next two lines are two forms of the same callback (token counts vs.
// token sequences) — presumably the removed and added lines of the diff; confirm only one is kept.
(s1TokLen, s2TokenLen, dist) => dist.toDouble / (s1TokLen + s2TokenLen - dist)
(s1Tok, s2Tok, dist) => dist.toDouble / (s1Tok.length + s2Tok.length - dist)
}
}

/**
 * Symmetric Tversky score of `s1` and `s2`, computed over bigram tokens.
 *
 * The score is `shared / (shared + n * |s1 only| + n * |s2 only|)`, where `shared` is the
 * count `foldNGram` hands to the callback and the "only" counts are the tokens of one string
 * that do not occur in the other (repeated tokens are counted each time they appear).
 *
 * `foldNGram` supplies `0d` as the error value and `1d` as the success value.
 *
 * @param s1 The 1st String.
 * @param s2 The 2nd String.
 * @param n  Weight applied to both "only" counts. This is not an n-gram size: the n-gram
 *           size is fixed at 2.
 * @return The weighted score; `0d` when the strings share no bigrams.
 */
protected def tversky(s1: String, s2: String, n: Double = 1): Double = {
  foldNGram(s1, s2, 2)(0d)(_ => 1d) {
    (s1Tok, s2Tok, dist) =>
      if (dist == 0) {
        // Nothing shared: the score is zero for every weight. Returning early also avoids
        // the 0 / 0 = NaN result that a weight of 0 would otherwise produce.
        0d
      } else {
        // Set lookups keep the membership tests constant-time per token.
        val s1TokenSet = s1Tok.toSet
        val s2TokenSet = s2Tok.toSet
        val s1Only = s1Tok.count(tok => !s2TokenSet.contains(tok))
        val s2Only = s2Tok.count(tok => !s1TokenSet.contains(tok))
        dist.toDouble / (dist.toDouble + (n * s1Only) + (n * s2Only))
      }
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import com.github.vickumar1981.stringdistance.interfaces.NGramTokenizer
trait NGramImpl extends NGramTokenizer {
/**
 * N-gram score of `s1` and `s2`.
 *
 * Delegates to `foldNGram` with `0d` as the error value and `1d` as the success value;
 * otherwise returns `1 - dist / max(|s1 tokens|, |s2 tokens|)`, where `dist` is the third
 * value `foldNGram` hands to the callback.
 *
 * @param s1 The 1st String.
 * @param s2 The 2nd String.
 * @param n  n-gram size passed to `foldNGram`; defaults to 1.
 * @return The score as a Double.
 */
protected def nGram(s1: String, s2: String, n: Int = 1): Double = {
foldNGram(s1, s2, n)(0d)(_ => 1d) {
// NOTE(review): the next two lines are two forms of the same callback (token counts vs.
// token sequences) — presumably the removed and added lines of the diff; confirm only one is kept.
(s1TokLen, s2TokenLen, dist) => 1 - dist.toDouble / math.max(s1TokLen, s2TokenLen)
(s1Tok, s2Tok, dist) => 1 - dist.toDouble / math.max(s1Tok.length, s2Tok.length)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import com.github.vickumar1981.stringdistance.interfaces.NGramTokenizer
trait OverlapImpl extends NGramTokenizer {
/**
 * Overlap score of `s1` and `s2` over their n-gram tokens.
 *
 * Delegates to `foldNGram` with `0d` as the error value and `1d` as the success value;
 * otherwise returns `1 - dist / min(|s1 tokens|, |s2 tokens|)`, where `dist` is the third
 * value `foldNGram` hands to the callback.
 *
 * NOTE(review): if `dist` is the size of the token intersection, the usual overlap
 * coefficient is `dist / min(...)` without the `1 -`; confirm the subtraction is intended.
 *
 * @param s1 The 1st String.
 * @param s2 The 2nd String.
 * @param n  n-gram size passed to `foldNGram`; defaults to 1.
 * @return The score as a Double.
 */
protected def overlap(s1: String, s2: String, n: Int = 1): Double = {
foldNGram(s1, s2, n)(0d)(_ => 1d) {
// NOTE(review): the next two lines are two forms of the same callback (token counts vs.
// token sequences) — presumably the removed and added lines of the diff; confirm only one is kept.
(s1TokLen, s2TokenLen, dist) => 1 - dist.toDouble / math.min(s1TokLen, s2TokenLen)
(s1Tok, s2Tok, dist) => 1 - dist.toDouble / math.min(s1Tok.length, s2Tok.length)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -205,4 +205,18 @@ package object implicits {
*/
override def score(s1: String, s2: String): Boolean = soundex(s1, s2)
}

/**
 * Implicit scoring instance for [[TverskyAlgorithm]], built on [[JaccardImpl]].
 */
implicit object TverskyScore extends JaccardImpl with WeightedScoringAlgorithm[TverskyAlgorithm, Double] {
  /**
   * Scores two strings by delegating to `tversky`.
   *
   * @param s1 The 1st String.
   * @param s2 The 2nd String.
   * @param n  Value forwarded to `tversky` as its third argument; defaults to 1.
   * @return The result of `tversky(s1, s2, n)`.
   */
  override def score(s1: String, s2: String, n: Double = 1): Double = {
    tversky(s1, s2, n)
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ trait NGramTokenizer {
/**
 * Splits a string into n-gram tokens.
 *
 * Runs `tokenize` over the characters of `a` and turns each resulting character group
 * back into a String.
 *
 * @param a The string to tokenize.
 * @param n The n-gram size handed to `tokenize`.
 * @return One String per character group produced by `tokenize`.
 */
protected def tokenizeNGram(a: String, n: Int): Array[String] = {
  val charGroups = tokenize(a.toCharArray, n)
  charGroups.map(group => group.mkString)
}

/**
 * Recursive function value that accumulates the n-grams of a character array.
 *
 * Arguments are (remaining input `i`, n-grams collected so far `o`, n-gram size `n`).
 * When no more than `n` characters remain, the remainder is appended as the final element
 * and the accumulator is returned; otherwise the first `n` characters are appended and the
 * function recurses on the input without its first character.
 *
 * NOTE(review): `@tailrec` is documented as a method annotation; confirm it is honoured on
 * a function-valued `val`.
 */
@annotation.tailrec
// NOTE(review): the next two lines declare the same val (the first with redundant outer
// parentheses) — presumably the removed and added lines of the diff; confirm only one is kept.
private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) =>
private val sequence: (Array[Char], Array[Array[Char]], Int) => Array[Array[Char]] = (i, o, n) =>
if (i.length <= n) o :+ i
else sequence(i.tail, o :+ i.take(n), n)

protected def foldNGram[T](s1: String, s2: String, n: Int = 1)
(err: => T)(success: (Int) => T)
(fuzzy: (Int, Int, Int) => T): T = {
(fuzzy: (Seq[String], Seq[String], Int) => T): T = {
if (n <= 0 || s1.length < n || s2.length < n) err
else if (s1.sameElements(s2)) {
val s1Tokenized = tokenizeNGram(s1, n)
Expand All @@ -25,7 +25,7 @@ trait NGramTokenizer {
val s1Tokenized = tokenizeNGram(s1, n)
val s2Tokenized = tokenizeNGram(s2, n)
val intersectionLength = intersectLength(s1Tokenized, s2Tokenized)
fuzzy(s1Tokenized.length, s2Tokenized.length, intersectionLength)
fuzzy(s1Tokenized.toSeq, s2Tokenized.toSeq, intersectionLength)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ package object stringdistance {
*/
trait SoundexAlgorithm extends StringMetricAlgorithm

/**
 * A marker interface for the Tversky similarity algorithm.
 *
 * It declares no members of its own; it only extends [[StringMetricAlgorithm]] and serves
 * as a type tag for the Tversky metric.
 */
trait TverskyAlgorithm extends StringMetricAlgorithm

/**
* The Strategy object has two strategies (regular expressions) on which to split input.
* [[Strategy.splitWord]] splits a word into a sequence of characters.
Expand Down Expand Up @@ -248,6 +253,7 @@ package object stringdistance {
/**
 * N-gram score between `s1` (taken from the enclosing scope) and `s2`.
 *
 * @param s2    The string to compare against.
 * @param nGram Value forwarded to `NGram.score` as its third argument; defaults to 1.
 * @return The result of `NGram.score(s1, s2, nGram)`.
 */
def nGram(s2: String, nGram: Int = 1): Double = {
  NGram.score(s1, s2, nGram)
}
/**
 * N-gram distance between `s1` (taken from the enclosing scope) and `s2`.
 *
 * @param s2    The string to compare against.
 * @param nGram Value forwarded to `NGram.distance` as its third argument; defaults to 1.
 * @return The result of `NGram.distance(s1, s2, nGram)`.
 */
def nGramDist(s2: String, nGram: Int = 1): Double = {
  NGram.distance(s1, s2, nGram)
}
/**
 * Overlap score between `s1` (taken from the enclosing scope) and `s2`.
 *
 * @param s2    The string to compare against.
 * @param nGram Value forwarded to `Overlap.score` as its third argument; defaults to 1.
 * @return The result of `Overlap.score(s1, s2, nGram)`.
 */
def overlap(s2: String, nGram: Int = 1): Double = {
  Overlap.score(s1, s2, nGram)
}
/**
 * Tversky score between `s1` (taken from the enclosing scope) and `s2`.
 *
 * @param s2 The string to compare against.
 * @param n  Value forwarded to `Tversky.score` as its third argument; defaults to 1.
 * @return The result of `Tversky.score(s1, s2, n)`.
 */
def tversky(s2: String, n: Double = 1): Double = {
  Tversky.score(s1, s2, n)
}

def metaphone(s2: String): Boolean = Metaphone.score(s1, s2)
def soundex(s2: String): Boolean = Soundex.score(s1, s2)
Expand Down
15 changes: 15 additions & 0 deletions src/test/scala/TestStringDistance.scala
Original file line number Diff line number Diff line change
Expand Up @@ -111,5 +111,20 @@ class TestStringDistance extends FlatSpec with Matchers {
roundToPrecision(overlap) should be (t.overlap.get)
})
}

// Checks that a Tversky weight of 0.5 gives the expected dice coefficient of each test case.
"The Tversky Score with weight 0.5" should "match the dice coefficient" in {
  // Binding the Option in the generator skips cases without an expected value and removes
  // the need for `.get`; the loop form runs every assertion for its side effect instead of
  // collecting results with `map`.
  for {
    t <- testCases
    expectedDice <- t.diceCoefficient
  } {
    val score = t.s1.tversky(t.s2, 0.5)
    roundToPrecision(score) should be (expectedDice)
  }
}

// Checks that the default Tversky weight gives the same rounded value as the bigram Jaccard score.
"The Tversky Score with weight 1.0" should "match the jaccard score using bigrams" in {
  // Only cases that define a jaccard expectation are exercised; the loop form runs every
  // assertion for its side effect instead of collecting results with `map`.
  for (t <- testCases if t.jaccard.isDefined) {
    val tverskyScore = t.s1.tversky(t.s2)
    val bigramJaccard = t.s1.jaccard(t.s2, 2)
    roundToPrecision(tverskyScore) should be (roundToPrecision(bigramJaccard))
  }
}
}

0 comments on commit d3b32ba

Please sign in to comment.