-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16 from vickumar1981/smith-waterman-score
Smith waterman score
- Loading branch information
Showing
12 changed files
with
293 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
7 changes: 7 additions & 0 deletions
7
src/main/java/com/github/vickumar1981/stringdistance/util/SmithWatermanImpl.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package com.github.vickumar1981.stringdistance.util; | ||
|
||
import com.github.vickumar1981.stringdistance.SmithWatermanImplWrapper; | ||
|
||
public class SmithWatermanImpl extends SmithWatermanImplWrapper { | ||
public SmithWatermanImpl() {} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
118 changes: 118 additions & 0 deletions
118
src/main/scala/com/github/vickumar1981/stringdistance/impl/SmithWatermanImpl.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
package com.github.vickumar1981.stringdistance.impl | ||
|
||
import scala.math.{max, min} | ||
|
||
/**
 * Smith-Waterman style local-alignment scoring for a pair of strings.
 *
 * Two public entry points are provided: [[smithWaterman]], which scans a
 * bounded window of earlier cells for gap costs, and [[smithWatermanGotoh]],
 * which applies a single constant gap value. Character comparison is delegated
 * to `subst`, inherited from `GapSubstitution`.
 */
trait SmithWatermanImpl extends GapSubstitution {

  /**
   * Scores `s1` against `s2` using the windowed Smith-Waterman alignment.
   *
   * @param s1         first string
   * @param s2         second string
   * @param gap        gap / match configuration; defaults to `ConstantGap()`
   * @param windowSize how many earlier cells are scanned when searching for a gap;
   *                   defaults to `Integer.MAX_VALUE`
   * @return `0d` when either string is empty, otherwise
   *         `1 - ((score - maxDist) / maxDist)` where `score` is the raw alignment
   *         score and `maxDist = min(len1, len2) * max(gap.matchValue, gap.min)`
   * @throws IllegalArgumentException if `gap.matchValue <= 0`, `gap.misMatchValue >= 0`
   *                                  or `windowSize <= 0`
   */
  def smithWaterman(s1: String, s2: String,
                    gap: Gap = ConstantGap(),
                    windowSize: Int = Integer.MAX_VALUE): Double = {
    require(gap.matchValue > 0, "Smith Waterman match value must be a number > 0.")
    require(gap.misMatchValue < 0, "Smith Waterman mismatch value must be a number < 0.")
    require(windowSize > 0, "Smith Waterman window size must be a number > 0")

    if (s1.isEmpty || s2.isEmpty) 0d
    else {
      val maxDist = min(s1.length, s2.length) * max(gap.matchValue, gap.min)
      val calcScore = calculateSmithWaterman(s1, s2, gap, windowSize)
      // NOTE(review): algebraically this is `2 - calcScore / maxDist`, and it uses a
      // different normalisation from smithWatermanGotoh below - confirm this is intended.
      1 - ((calcScore - maxDist) / maxDist)
    }
  }

  /**
   * Scores `s1` against `s2` using the constant-gap (Gotoh) variant.
   *
   * @param s1  first string
   * @param s2  second string
   * @param gap constant gap / match configuration; defaults to `ConstantGap()`
   * @return `0d` when either string is empty, otherwise `(maxDist - score) / maxDist`
   *         where `score` is the raw alignment score and
   *         `maxDist = min(len1, len2) * max(gap.matchValue, gap.gapValue)`
   * @throws IllegalArgumentException if `gap.matchValue <= 0` or `gap.misMatchValue >= 0`
   */
  def smithWatermanGotoh(s1: String, s2: String, gap: ConstantGap = ConstantGap()): Double = {
    require(gap.matchValue > 0, "Smith Waterman Gotoh match value must be a number > 0.")
    require(gap.misMatchValue < 0, "Smith Waterman Gotoh mismatch value must be a number < 0.")
    if (s1.isEmpty || s2.isEmpty) 0d
    else {
      val maxDist = min(s1.length, s2.length) * max(gap.matchValue, gap.gapValue)
      val calcScore = calculateSmithWatermanGotoh(s1, s2, gap)
      (maxDist - calcScore) / maxDist
    }
  }

  // scalastyle:off
  /**
   * Fills an `s1.length x s2.length` score matrix and returns the largest cell value.
   *
   * Both strings must be non-empty: cell (0)(0) is read unconditionally.
   * The gap penalty for each candidate comes from `gap.value(from, to)`
   * (presumably the cost of a gap spanning those indices - defined outside this trait).
   */
  private def calculateSmithWaterman(s1: String, s2: String, gap: Gap, windowSize: Int): Double = {
    val (s1Len, s2Len) = (s1.length, s2.length)
    val d = Array.ofDim[Double](s1Len, s2Len)
    var maxValue: Double = max(0d, subst(s1, 0, s2, 0, gap))
    d(0)(0) = maxValue

    // First column: every character of s1 against s2(0). The i = 0 pass has an
    // empty window and simply recomputes d(0)(0).
    s1.indices.foreach { i =>
      // Get the optimal deletion
      var maxGapCost = 0d
      (max(1, i - windowSize) until i).foreach { k =>
        maxGapCost = max(maxGapCost, d(i - k)(0) + gap.value(i - k, i))
      }
      d(i)(0) = max(max(0, maxGapCost), subst(s1, i, s2, 0, gap))
      maxValue = max(maxValue, d(i)(0))
    }

    // First row: s1(0) against every remaining character of s2.
    (1 until s2Len).foreach { j =>
      // Get the optimal insertion
      var maxGapCost = 0d
      (max(1, j - windowSize) until j).foreach { k =>
        maxGapCost = max(maxGapCost, d(0)(j - k) + gap.value(j - k, j))
      }
      d(0)(j) = max(max(0d, maxGapCost), subst(s1, 0, s2, j, gap))
      maxValue = max(maxValue, d(0)(j))
    }

    // Build 2-d array: each interior cell takes the best of zero, a gap along the
    // column, a gap along the row, or the diagonal predecessor plus substitution.
    (1 until s1Len).foreach { i =>
      (1 until s2Len).foreach { j =>
        var maxGapCost = 0d
        (max(1, i - windowSize) until i).foreach { k =>
          maxGapCost = max(maxGapCost, d(i - k)(j) + gap.value(i - k, i))
        }

        (max(1, j - windowSize) until j).foreach { k =>
          maxGapCost = max(maxGapCost, d(i)(j - k) + gap.value(j - k, j))
        }

        d(i)(j) = max(max(0d, maxGapCost), d(i - 1)(j - 1) + subst(s1, i, s2, j, gap))
        maxValue = max(maxValue, d(i)(j))
      }
    }
    maxValue
  }
  // scalastyle:on

  /**
   * Computes the constant-gap alignment score with two rolling rows
   * (`v0` = previous row, `v1` = current row) and returns the largest cell value.
   *
   * Both strings must be non-empty: index 0 of each is read unconditionally.
   */
  private def calculateSmithWatermanGotoh(s1: String, s2: String, gap: ConstantGap): Double = {
    val (s1Len, s2Len) = (s1.length, s2.length)
    val v0 = Array.ofDim[Double](s2Len)
    val v1 = Array.ofDim[Double](s2Len)
    var maxValue: Double = max(max(0, gap.gapValue), subst(s1, 0, s2, 0, gap))
    // Seed the first cell so later diagonal and gap terms that read v0(0)
    // see the real score instead of the array's default 0.
    v0(0) = maxValue

    // First row: s1(0) against every remaining character of s2.
    (1 until s2Len).foreach { j =>
      v0(j) = max(max(0, v0(j - 1) + gap.gapValue), subst(s1, 0, s2, j, gap))
      // The best cell may sit in the first row (always the case when s1 has length 1).
      maxValue = max(maxValue, v0(j))
    }

    (1 until s1Len).foreach { i =>
      v1(0) = max(max(0, v0(0) + gap.gapValue), subst(s1, i, s2, 0, gap))
      maxValue = max(maxValue, v1(0))

      (1 until s2Len).foreach { j =>
        // A gap may be opened from the cell above (v0(j)) or from the cell
        // to the left (v1(j - 1)); take whichever scores higher.
        val bestGap = max(v0(j), v1(j - 1)) + gap.gapValue
        v1(j) = max(max(0, bestGap), v0(j - 1) + subst(s1, i, s2, j, gap))
        maxValue = max(maxValue, v1(j))
      }

      // The current row becomes the previous row for the next iteration.
      s2.indices.foreach { j =>
        v0(j) = v1(j)
      }
    }
    maxValue
  }
}
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.