Skip to content

Commit

Permalink
Merge pull request #16 from vickumar1981/smith-waterman-score
Browse files Browse the repository at this point in the history
Smith waterman score
  • Loading branch information
vickumar1981 committed Dec 1, 2018
2 parents 614d316 + ebcb87d commit 85ee8de
Show file tree
Hide file tree
Showing 12 changed files with 293 additions and 25 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ val levenshtein: Double = Levenshtein.score("martha", "marhta")
// Longest Common Subsequence
val longestCommonSubSeq: Int = LongestCommonSeq.distance("martha", "marhta")

// Needleman Wunsch
val needlemanWunsch: Double = NeedlemanWunsch.score("martha", "marhta")

// N-Gram Similarity and Distance
val ngramDist: Int = NGram.distance("karolin", "kathrin")
val bigramDist: Int = NGram.distance("karolin", "kathrin", 2)
Expand All @@ -98,6 +101,10 @@ val bigramSimilarity: Double = NGram.score("karolin", "kathrin", 2)
val overlap: Double = Overlap.score("karolin", "kathrin")
val overlapBiGram: Double = Overlap.score("karolin", "kathrin", 2)

// Smith Waterman Similarities
val smithWaterman: Double = SmithWaterman.score("martha", "marhta")
val smithWatermanGotoh: Double = SmithWatermanGotoh.score("martha", "marhta")

// Tversky Similarity
val tversky: Double = Tversky.score("karolin", "kathrin", 0.5)

Expand All @@ -123,10 +130,13 @@ val jaccard: Double = "karolin".jaccard("kathrin")
val jaro: Double = "martha".jaro("marhta")
val jaroWinkler: Double = "martha".jaroWinkler("marhta")
val levenshtein: Double = "martha".levenshtein("marhta")
val needlemanWunsch: Double = "martha".needlemanWunsch("marhta")
val ngramSimilarity: Double = "karolin".nGram("kathrin")
val bigramSimilarity: Double = "karolin".nGram("kathrin", 2)
val overlap: Double = "karolin".overlap("kathrin")
val overlapBiGram: Double = "karolin".overlap("kathrin", 2)
val smithWaterman: Double = "martha".smithWaterman("marhta")
val smithWatermanGotoh: Double = "martha".smithWatermanGotoh("marhta")
val tversky: Double = "karolin".tversky("kathrin", 0.5)

// Distances between two strings
Expand Down Expand Up @@ -160,10 +170,13 @@ Double jaccard = StringDistance.jaccard("karolin", "kathrin");
Double jaro = StringDistance.jaro("martha", "marhta");
Double jaroWinkler = StringDistance.jaroWinkler("martha", "marhta");
Double levenshtein = StringDistance.levenshtein("martha", "marhta");
Double needlemanWunsch = StringDistance.needlemanWunsch("martha", "marhta");
Double ngramSimilarity = StringDistance.nGram("karolin", "kathrin");
Double bigramSimilarity = StringDistance.nGram("karolin", "kathrin", 2);
Double overlap = StringDistance.overlap("karolin", "kathrin");
Double overlapBiGram = StringDistance.overlap("karolin", "kathrin", 2);
Double smithWaterman = StringDistance.smithWaterman("martha", "marhta");
Double smithWatermanGotoh = StringDistance.smithWatermanGotoh("martha", "marhta");
Double tversky = StringDistance.tversky("karolin", "kathrin", 0.5);

// Distances between two strings
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package com.github.vickumar1981.stringdistance.util;

import com.github.vickumar1981.stringdistance.SmithWatermanImplWrapper;

/**
 * Java-side concrete class for the Smith-Waterman implementation.
 *
 * It adds no behavior of its own; everything is inherited from
 * {@link SmithWatermanImplWrapper}.
 */
public class SmithWatermanImpl extends SmithWatermanImplWrapper {
    /** Creates an instance; simply invokes the wrapper's no-argument constructor. */
    public SmithWatermanImpl() {
        super();
    }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package com.github.vickumar1981.stringdistance.util;

import com.github.vickumar1981.stringdistance.impl.ConstantGap;
import com.github.vickumar1981.stringdistance.impl.Gap;
import com.github.vickumar1981.stringdistance.impl.LinearGap;

public class StringDistance {
private final static CosSimilarityImpl cosine = new CosSimilarityImpl();
Expand All @@ -13,6 +15,7 @@ public class StringDistance {
private final static NeedlemanWunschImpl needlemanWunsch = new NeedlemanWunschImpl();
private final static NGramImpl ngram = new NGramImpl();
private final static OverlapImpl overLap = new OverlapImpl();
private final static SmithWatermanImpl smithWaterman = new SmithWatermanImpl();

private final static String splitOnWord = "(?!^)";
private final static String splitOnSentence = "\\W+";
Expand Down Expand Up @@ -102,6 +105,26 @@ public static Double needlemanWunsch(String s1, String s2, ConstantGap gap) {

public static Double overlap(String s1, String s2, Integer n) { return overLap.overlap(s1, s2, n); }

/**
 * Smith-Waterman score of two strings using the default gap, a
 * {@code LinearGap} built with match value 1, mismatch value -1 and gap value 1.
 *
 * @param s1 the first string
 * @param s2 the second string
 * @return the score produced by the three-argument overload for the default gap
 */
public static Double smithWaterman(String s1, String s2) {
    final Gap defaultGap = new LinearGap(1d, -1d, 1d);
    return smithWaterman(s1, s2, defaultGap);
}

/**
 * Smith-Waterman score of two strings with a caller-supplied gap and the
 * largest possible window size ({@code Integer.MAX_VALUE}).
 *
 * @param s1  the first string
 * @param s2  the second string
 * @param gap the gap/match/mismatch weights to use
 * @return the score computed with an effectively unbounded window
 */
public static Double smithWaterman(String s1, String s2, Gap gap) {
    // Same call as before, routed through the four-argument overload.
    return smithWaterman(s1, s2, gap, Integer.MAX_VALUE);
}

/**
 * Smith-Waterman score of two strings with a caller-supplied gap and window size.
 *
 * @param s1         the first string
 * @param s2         the second string
 * @param gap        the gap/match/mismatch weights to use
 * @param windowSize the window size handed to the underlying implementation;
 *                   it is unboxed, so {@code null} raises a NullPointerException
 * @return the score returned by the underlying implementation
 */
public static Double smithWaterman(String s1, String s2, Gap gap, Integer windowSize) {
    final double score = smithWaterman.smithWaterman(s1, s2, gap, windowSize);
    return score;
}

/**
 * Smith-Waterman-Gotoh score of two strings using the default gap, a
 * {@code ConstantGap} built with match value 1, mismatch value -1 and gap value 1.
 *
 * @param s1 the first string
 * @param s2 the second string
 * @return the score produced by the three-argument overload for the default gap
 */
public static Double smithWatermanGotoh(String s1, String s2) {
    final ConstantGap defaultGap = new ConstantGap(1d, -1d, 1d);
    return smithWatermanGotoh(s1, s2, defaultGap);
}

/**
 * Smith-Waterman-Gotoh score of two strings with a caller-supplied constant gap.
 *
 * @param s1  the first string
 * @param s2  the second string
 * @param gap the constant gap/match/mismatch weights to use
 * @return the score returned by the underlying implementation
 */
public static Double smithWatermanGotoh(String s1, String s2, ConstantGap gap) {
    final double score = smithWaterman.smithWatermanGotoh(s1, s2, gap);
    return score;
}

public static Double tversky(String s1, String s2) { return tversky(s1, s2, 1d); }

public static Double tversky(String s1, String s2, Double n) { return jaccard.tversky(s1, s2, n); }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import com.github.vickumar1981.stringdistance.impl._
* val ngramSimilarity: Double = NGram.score("karolin", "kathrin")
* val bigramSimilarity: Double = NGram.score("karolin", "kathrin", 2)
* val overlap: Double = Overlap.score("karolin", "kathrin")
* val smithWaterman: Double = SmithWaterman.score("martha", "marhta")
* val smithWatermanGotoh: Double = SmithWatermanGotoh.score("martha", "marhta")
* val tversky: Double = Tversky.score("karolin", "kathrin", 0.5)
*
* // Distances between strings
Expand All @@ -45,6 +47,8 @@ object StringDistance {
object NeedlemanWunsch extends StringMetric[NeedlemanWunschAlgorithm]
object NGram extends StringMetric[NGramAlgorithm]
object Overlap extends StringMetric[OverlapAlgorithm]
/** String metric entry point typed by [[SmithWatermanAlgorithm]], e.g. `SmithWaterman.score("martha", "marhta")`. */
object SmithWaterman extends StringMetric[SmithWatermanAlgorithm]
/** String metric entry point typed by [[SmithWatermanGotohAlgorithm]], e.g. `SmithWatermanGotoh.score("martha", "marhta")`. */
object SmithWatermanGotoh extends StringMetric[SmithWatermanGotohAlgorithm]
object Tversky extends StringMetric[TverskyAlgorithm]
}

Expand Down Expand Up @@ -98,3 +102,9 @@ class NGramImplWrapper extends NGramImpl
*/
class OverlapImplWrapper extends OverlapImpl


/**
 * Java wrapper for Smith-Waterman similarity.
 *
 * A concrete class that mixes in `SmithWatermanImpl` and adds nothing else,
 * giving Java code a class it can extend and instantiate.
 */
class SmithWatermanImplWrapper extends SmithWatermanImpl

Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
package com.github.vickumar1981.stringdistance.impl

/**
 * Mixin providing the character substitution score shared by the
 * gap-based alignment algorithms.
 */
trait GapSubstitution {
  /**
   * Scores the pairing of one character from each string.
   *
   * @param a      The 1st String.
   * @param aIndex Index of the character to read from `a`.
   * @param b      The 2nd String.
   * @param bIndex Index of the character to read from `b`.
   * @param gap    Supplies the match and mismatch values.
   * @return `gap.matchValue` when the two characters are equal, otherwise `gap.misMatchValue`.
   */
  protected def subst(a: String, aIndex: Int, b: String, bIndex: Int, gap: Gap): Double = {
    val sameChar = a.charAt(aIndex) == b.charAt(bIndex)
    if (sameChar) {
      gap.matchValue
    } else {
      gap.misMatchValue
    }
  }
}

trait Gap {
def matchValue: Double
def misMatchValue: Double
Expand All @@ -10,18 +16,19 @@ trait Gap {

/**
 * A gap whose value does not depend on the gap's position or length:
 * every accessor below returns `gapValue`.
 *
 * @param matchValue    Value for two equal characters (defaults to 1).
 * @param misMatchValue Value for two different characters (defaults to -1).
 * @param gapValue      The constant gap value (defaults to 0).
 */
case class ConstantGap(
  matchValue: Double = 1d,
  misMatchValue: Double = -1d,
  gapValue: Double = 0d
) extends Gap {
  /** Gap value between two indices; the indices are ignored. */
  def value(fromIndex: Double, toIndex: Double): Double = gapValue

  /** The constant gap value, without needing indices. */
  def value: Double = gapValue

  /** Largest gap value; equal to `gapValue`. */
  def max: Double = gapValue

  /** Smallest gap value; equal to `gapValue`. */
  def min: Double = gapValue
}

case class LinearGap(matchValue: Double = 1, misMatchValue: Double = -1, gapValue: Double = 0) extends Gap {
case class LinearGap(matchValue: Double = 1, misMatchValue: Double = -1, gapValue: Double = 1) extends Gap {
def value(fromIndex: Double, toIndex: Double): Double = gapValue * (toIndex - fromIndex - 1)
def max: Double = 0
def min: Double = Double.NegativeInfinity
}

case class AffineGap(matchValue: Double = 1, misMatchValue: Double = -1,
startValue: Double = 0, gapValue: Double = 0) extends Gap {
startValue: Double = 0, gapValue: Double = 1) extends Gap {
def value(fromIndex: Double, toIndex: Double): Double = startValue + gapValue * (toIndex - fromIndex - 1)
def max: Double = startValue
def min: Double = Double.NegativeInfinity
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package com.github.vickumar1981.stringdistance.impl

import scala.math.{max, min}

trait NeedlemanWunschImpl {
trait NeedlemanWunschImpl extends GapSubstitution {
def needleman(s1: String, s2: String, gap: ConstantGap = ConstantGap()): Double = {
require(gap.matchValue > 0, "NeedlmanWunsch match value must be a number > 0.")
require(gap.misMatchValue < 0, "NeedlemanWunsh mismatch value must be a number < 0.")
Expand All @@ -11,15 +11,11 @@ trait NeedlemanWunschImpl {
else {
val maxDist = max(s1.length, s2.length) * max(gap.matchValue, gap.gapValue)
val minDist = max(s1.length, s2.length) * min(gap.misMatchValue, gap.gapValue)
(-1d * needlemanW(s1, s2, gap) - minDist) / (maxDist - minDist)
(-1d * calculateNeedlemanW(s1, s2, gap) - minDist) / (maxDist - minDist)
}
}

private def subst(a: String, aIndex: Int, b: String, bIndex: Int, gap: Gap): Double = {
if (a(aIndex) == b(bIndex)) gap.matchValue else gap.misMatchValue
}

private def needlemanW(s1: String, s2: String, gap: ConstantGap): Double = {
private def calculateNeedlemanW(s1: String, s2: String, gap: ConstantGap): Double = {
(s1.length, s2.length) match {
case (0, s2Len) => -gap.gapValue * s2Len
case (s1Len, 0) => -gap.gapValue * s1Len
Expand All @@ -33,16 +29,11 @@ trait NeedlemanWunschImpl {
(1 until s1Len).foreach {
i => {
(1 until v0.length).foreach {
j => {
j =>
v1(j) = min(min(v0(j) - gap.gapValue, v1(j - 1) - gap.gapValue),
v0(j - 1) - subst(s1, i - 1, s2, j - 1, gap))
}
}
v0.indices.foreach {
j => {
v0(j) = v1(j)
}
}
v0.indices.foreach { j => v0(j) = v1(j) }
}
}
v1(v1.length - 1)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package com.github.vickumar1981.stringdistance.impl

import scala.math.{max, min}

trait SmithWatermanImpl extends GapSubstitution {
/**
 * Normalised Smith-Waterman score of two strings.
 *
 * Returns 0 when either string is empty. Otherwise the raw alignment score
 * is normalised against `min(s1.length, s2.length) * max(gap.matchValue, gap.min)`.
 *
 * NOTE(review): the returned expression `1 - ((raw - maxDist) / maxDist)` is
 * algebraically `2 - raw / maxDist`; confirm that this is the intended range.
 *
 * @param s1         The 1st String.
 * @param s2         The 2nd String.
 * @param gap        Match, mismatch and gap weights; match must be > 0 and mismatch < 0.
 * @param windowSize Window size forwarded to the alignment; must be > 0.
 * @return The normalised score described above.
 * @throws IllegalArgumentException if any of the `require` checks fails.
 */
def smithWaterman(s1: String, s2: String,
                  gap: Gap = ConstantGap(),
                  windowSize: Int = Integer.MAX_VALUE): Double = {
  require(gap.matchValue > 0, "Smith Waterman match value must be a number > 0.")
  require(gap.misMatchValue < 0, "Smith Waterman mismatch value must be a number < 0.")
  require(windowSize > 0, "Smith Waterman window size must be a number > 0")

  val anyEmpty = s1.isEmpty || s2.isEmpty
  if (anyEmpty) {
    0d
  } else {
    val shorterLength = min(s1.length, s2.length)
    val maxDist = shorterLength * max(gap.matchValue, gap.min)
    val rawScore = calculateSmithWaterman(s1, s2, gap, windowSize)
    1 - ((rawScore - maxDist) / maxDist)
  }
}

/**
 * Normalised Smith-Waterman-Gotoh score of two strings.
 *
 * Returns 0 when either string is empty. Otherwise the raw alignment score
 * is normalised against `min(s1.length, s2.length) * max(gap.matchValue, gap.gapValue)`.
 *
 * NOTE(review): the result is `(maxDist - raw) / maxDist`, so a larger raw
 * alignment score gives a smaller return value; confirm this orientation is
 * intended for a value exposed as a similarity score.
 *
 * @param s1  The 1st String.
 * @param s2  The 2nd String.
 * @param gap Constant gap weights; match must be > 0 and mismatch < 0.
 * @return The normalised score described above.
 * @throws IllegalArgumentException if any of the `require` checks fails.
 */
def smithWatermanGotoh(s1: String, s2: String, gap: ConstantGap = ConstantGap()): Double = {
  require(gap.matchValue > 0, "Smith Waterman Gotoh match value must be a number > 0.")
  require(gap.misMatchValue < 0, "Smith Waterman Gotoh mismatch value must be a number < 0.")

  val anyEmpty = s1.isEmpty || s2.isEmpty
  if (anyEmpty) {
    0d
  } else {
    val shorterLength = min(s1.length, s2.length)
    val maxDist = shorterLength * max(gap.matchValue, gap.gapValue)
    val rawScore = calculateSmithWatermanGotoh(s1, s2, gap)
    (maxDist - rawScore) / maxDist
  }
}

// scalastyle:off
/**
 * Fills a full `s1.length x s2.length` score table and returns the largest
 * cell value seen. Every cell is floored at 0.
 *
 * Both strings must be non-empty: cell (0, 0) is read unconditionally.
 *
 * NOTE(review): the gap loops iterate `k` over `max(1, i - windowSize) until i`
 * and then look at cell `i - k`; confirm this is the intended window.
 *
 * @param s1         The 1st String (rows of the table).
 * @param s2         The 2nd String (columns of the table).
 * @param gap        Supplies match/mismatch values and the gap value between two indices.
 * @param windowSize Bounds the range of `k` used when scanning for gaps.
 * @return The maximum value stored in the table.
 */
private def calculateSmithWaterman(s1: String, s2: String, gap: Gap, windowSize: Int): Double = {
  val rows = s1.length
  val cols = s2.length
  val table = Array.ofDim[Double](rows, cols)
  var best: Double = max(0d, subst(s1, 0, s2, 0, gap))
  table(0)(0) = best

  // First column: best gap along s1 versus substituting against s2's first character.
  for (i <- 0 until rows) {
    // Get the optimal deletion
    var gapBest = 0d
    for (k <- max(1, i - windowSize) until i) {
      gapBest = max(gapBest, table(i - k)(0) + gap.value(i - k, i))
    }
    table(i)(0) = max(max(0d, gapBest), subst(s1, i, s2, 0, gap))
    best = max(best, table(i)(0))
  }

  // First row: best gap along s2 versus substituting against s1's first character.
  for (j <- 1 until cols) {
    // Get the optimal insertion
    var gapBest = 0d
    for (k <- max(1, j - windowSize) until j) {
      gapBest = max(gapBest, table(0)(j - k) + gap.value(j - k, j))
    }
    table(0)(j) = max(max(0d, gapBest), subst(s1, 0, s2, j, gap))
    best = max(best, table(0)(j))
  }

  // Interior cells: best of a vertical gap, a horizontal gap, or a diagonal step.
  for (i <- 1 until rows; j <- 1 until cols) {
    var gapBest = 0d
    for (k <- max(1, i - windowSize) until i) {
      gapBest = max(gapBest, table(i - k)(j) + gap.value(i - k, i))
    }
    for (k <- max(1, j - windowSize) until j) {
      gapBest = max(gapBest, table(i)(j - k) + gap.value(j - k, j))
    }

    table(i)(j) = max(max(0d, gapBest), table(i - 1)(j - 1) + subst(s1, i, s2, j, gap))
    best = max(best, table(i)(j))
  }

  best
}

// scalastyle:on

/**
 * Computes the raw Smith-Waterman alignment score with a constant gap value,
 * keeping only two rows of the score table in memory.
 *
 * Each cell is the largest of: 0, the cell above plus the gap value, the cell
 * to the left plus the gap value, and the diagonal cell plus the substitution
 * score. The result is the largest cell value encountered anywhere.
 *
 * Both strings must be non-empty: cell (0, 0) is read unconditionally.
 *
 * @param s1  The 1st String (rows).
 * @param s2  The 2nd String (columns).
 * @param gap Supplies the match, mismatch and constant gap values.
 * @return The maximum cell value of the score table.
 */
private def calculateSmithWatermanGotoh(s1: String, s2: String, gap: ConstantGap): Double = {
  val (s1Len, s2Len) = (s1.length, s2.length)
  val v0 = Array.ofDim[Double](s2Len)
  val v1 = Array.ofDim[Double](s2Len)

  // Store the first cell in the row as well as in the running maximum;
  // leaving v0(0) at 0 makes every cell that depends on it too small.
  v0(0) = max(max(0d, gap.gapValue), subst(s1, 0, s2, 0, gap))
  var maxValue = v0(0)

  // First row. Its cells count towards the maximum too, otherwise a
  // one-character s1 never sees a match beyond column 0.
  (1 until s2Len).foreach { j =>
    v0(j) = max(max(0d, v0(j - 1) + gap.gapValue), subst(s1, 0, s2, j, gap))
    maxValue = max(maxValue, v0(j))
  }

  (1 until s1Len).foreach { i =>
    v1(0) = max(max(0d, v0(0) + gap.gapValue), subst(s1, i, s2, 0, gap))
    maxValue = max(maxValue, v1(0))

    (1 until s2Len).foreach { j =>
      // A gap may come from the cell above (v0(j)) or the cell to the left
      // (v1(j - 1)); considering both keeps the score symmetric in s1 and s2.
      val gapScore = max(v0(j), v1(j - 1)) + gap.gapValue
      v1(j) = max(max(0d, gapScore), v0(j - 1) + subst(s1, i, s2, j, gap))
      maxValue = max(maxValue, v1(j))
    }

    // The finished row becomes the "previous" row for the next iteration.
    Array.copy(v1, 0, v0, 0, s2Len)
  }
  maxValue
}
}


Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,39 @@ package object implicits {
override def score(s1: String, s2: String): Boolean = metaphone(s1, s2)
}

/**
 * Implicit definition of smith waterman score for [[SmithWatermanAlgorithm]].
 */
implicit object SmithWatermanScore extends SmithWatermanImpl
  with WeightedScoringAlgorithm[SmithWatermanAlgorithm, (Gap, Int)] {
  /**
   * Scores two strings with the smith waterman algorithm.
   *
   * @param s1 The 1st String.
   * @param s2 The 2nd String.
   * @param gapAndWindowSize Pair of the gap to use and the window size; defaults to
   *                         a [[LinearGap]] with gap value 1 and `Integer.MAX_VALUE`.
   * @return Returns the smith waterman similarity between Strings s1 and s2.
   */
  override def score(s1: String, s2: String,
                     gapAndWindowSize: (Gap, Int) = (LinearGap(gapValue = 1), Integer.MAX_VALUE)): Double = {
    val gap = gapAndWindowSize._1
    val windowSize = gapAndWindowSize._2
    smithWaterman(s1, s2, gap, windowSize)
  }
}

/**
 * Implicit definition of smith waterman gotoh score for [[SmithWatermanGotohAlgorithm]].
 */
implicit object SmithWatermanGotohScore extends SmithWatermanImpl
  with WeightedScoringAlgorithm[SmithWatermanGotohAlgorithm, ConstantGap] {
  /**
   * Scores two strings with the smith waterman gotoh algorithm.
   *
   * @param s1 The 1st String.
   * @param s2 The 2nd String.
   * @param gap The constant gap to use; defaults to `ConstantGap()`.
   * @return Returns the smith waterman gotoh similarity between Strings s1 and s2.
   */
  override def score(s1: String, s2: String, gap: ConstantGap = ConstantGap()): Double = {
    val result = smithWatermanGotoh(s1, s2, gap)
    result
  }
}

/**
* Implicit definition of soundex score for [[SoundexAlgorithm]].
*/
Expand Down

0 comments on commit 85ee8de

Please sign in to comment.