Skip to content

Commit

Permalink
handle null / empty / missing text edge cases (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
reynoldsm88 committed Mar 2, 2022
1 parent 398fd61 commit 5d72991
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 11 deletions.
2 changes: 1 addition & 1 deletion build.sbt
Expand Up @@ -3,7 +3,7 @@ import sbt._

// Build-wide coordinates. The version is assigned exactly once; a second
// assignment to the same key would simply override the first.
organization in ThisBuild := "com.github.reynoldsm88"
name := "dedup"
version in ThisBuild := "1.0.2-SNAPSHOT"

scalaVersion in ThisBuild := "2.12.7"

Expand Down
2 changes: 1 addition & 1 deletion project/build.properties
@@ -1 +1 @@
sbt.version = 1.4.1
sbt.version = 1.4.4
Expand Up @@ -16,23 +16,30 @@ class ShingleprintDedup( val maxWords : Int,

/**
  * Searches the cache for entries matching the shingleprints of `text`.
  *
  * The cache is consulted only when `shingleprints` produced at least one
  * value, and it is consulted exactly once.
  *
  * @param text document text to look up
  * @return one `Duplicate( m, 1.0 )` for every match `m` returned by
  *         `cache.search`, or an empty set when there are no shingleprints
  *         or no matches
  */
override def check( text : String ) : Set[ Duplicate ] = {
    val docShingles = shingleprints( text )
    // Nothing to look up: skip the cache entirely.
    if ( docShingles.isEmpty ) Set.empty[ Duplicate ]
    else {
        val matches = cache.search( docShingles )
        if ( matches.nonEmpty ) matches.map( m => Duplicate( m, 1.0 ) )
        else Set.empty[ Duplicate ]
    }
}

/**
  * Records the shingleprints of `text` in the cache under `id`.
  *
  * The shingleprints are computed once. When none are produced the cache is
  * left untouched, so no entry is written for that `id`.
  *
  * @param id   identifier to store the shingleprints under
  * @param text document text to fingerprint
  */
override def update( id : String, text : String ) : Unit = {
    val shingles = shingleprints( text )
    // The guard is the only path to the cache: no unconditional write precedes it.
    if ( shingles.nonEmpty ) cache.update( id, shingles )
}

/**
  * Computes the shingleprints of `text`.
  *
  * The text is split at `text.length / 2`; each half is passed to
  * `minMaxHash`, and four pairings of the resulting values are folded with
  * `hashCombine`.
  *
  * @param text document text; may be null or empty
  * @return the combined hashes (at most four, fewer if some coincide), or an
  *         empty set when `text` is null or empty
  */
private def shingleprints( text : String ) : Set[ Int ] = {
    // The null/empty guard must run before any dereference of `text`.
    if ( text != null && text.nonEmpty ) {
        val middle = text.length / 2
        val shingleOne = text.substring( 0, middle )
        val shingleTwo = text.substring( middle, text.length )

        // NOTE(review): `max2` is bound from the first half and `max1` from the
        // second half. The bindings are kept exactly as they are because changing
        // them would change the produced hashes — confirm whether the naming is intended.
        val (min1, max2) = minMaxHash( shingleOne )
        val (min2, max1) = minMaxHash( shingleTwo )

        Set( hashCombine( min1, min2 ), hashCombine( min1, max2 ), hashCombine( max1, min2 ), hashCombine( max1, max2 ) )
    } else Set.empty[ Int ]
}

private def minMaxHash( text : String ) : (Int, Int) = {
Expand Down
Expand Up @@ -93,6 +93,17 @@ class ShingleprintDedupTestSuite extends TestBase {
results.head.docId shouldBe "nytimes-nasa-moon"
}

// A null document must yield no duplicates rather than an exception.
"Shingleprint Deduplication" should "ignore documents with null text" in {
    val dedup : Dedup = init()
    val results = dedup.check( null )
    // `shouldBe empty` reports the offending contents on failure, unlike a bare boolean comparison.
    results shouldBe empty
}

// An empty document must yield no duplicates.
"Shingleprint Deduplication" should "ignore documents with empty text" in {
    val dedup : Dedup = init()
    val results = dedup.check( "" )
    // `shouldBe empty` reports the offending contents on failure, unlike a bare boolean comparison.
    results shouldBe empty
}

private def init( ) : Dedup = {
val dedup = new ShingleprintDedup( maxWords = 100, threshold = 0.9 )
Expand Down

0 comments on commit 5d72991

Please sign in to comment.