From 5d7299114e39ed8db2a4f55064e5735998894c20 Mon Sep 17 00:00:00 2001 From: Michael Reynolds Date: Wed, 2 Mar 2022 11:15:51 -0500 Subject: [PATCH] handle null / empty / missing text edge cases (#1) --- build.sbt | 2 +- project/build.properties | 2 +- .../shingleprint/ShingleprintDedup.scala | 25 ++++++++++++------- .../ShingleprintDedupTestSuite.scala | 11 ++++++++ 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/build.sbt b/build.sbt index 4a445bd..1bff5fa 100644 --- a/build.sbt +++ b/build.sbt @@ -3,7 +3,7 @@ import sbt._ organization in ThisBuild := "com.github.reynoldsm88" name := "dedup" -version in ThisBuild := "1.0.0-SNAPSHOT" +version in ThisBuild := "1.0.2-SNAPSHOT" scalaVersion in ThisBuild := "2.12.7" diff --git a/project/build.properties b/project/build.properties index 1afe89b..efae80e 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version = 1.4.1 \ No newline at end of file +sbt.version = 1.4.4 \ No newline at end of file diff --git a/src/main/scala/com/github/reynoldsm88/dedup/shingleprint/ShingleprintDedup.scala b/src/main/scala/com/github/reynoldsm88/dedup/shingleprint/ShingleprintDedup.scala index 0cacd55..463eb07 100644 --- a/src/main/scala/com/github/reynoldsm88/dedup/shingleprint/ShingleprintDedup.scala +++ b/src/main/scala/com/github/reynoldsm88/dedup/shingleprint/ShingleprintDedup.scala @@ -16,23 +16,30 @@ class ShingleprintDedup( val maxWords : Int, override def check( text : String ) : Set[ Duplicate ] = { val docShingles = shingleprints( text ) - val matches = cache.search( docShingles ) - if ( matches.nonEmpty ) matches.map( m => Duplicate( m, 1.0 ) ) - else Set() + if ( docShingles.nonEmpty ) { + val matches = cache.search( docShingles ) + if ( matches.nonEmpty ) matches.map( m => Duplicate( m, 1.0 ) ) + else Set() + } else Set() } override def update( id : String, text : String ) : Unit = { - cache.update( id, shingleprints( text ) ) + val shingles = shingleprints( text ) + if ( shingles.nonEmpty ) { + cache.update( id, shingles ) + } } private def shingleprints( text : String ) : Set[ Int ] = { - val shingleOne = text.substring( 0, text.length / 2 ) - val shingleTwo = text.substring( text.length / 2, text.length ) + if ( text != null && text.nonEmpty ) { + val shingleOne = text.substring( 0, text.length / 2 ) + val shingleTwo = text.substring( text.length / 2, text.length ) - val (min1, max2) = minMaxHash( shingleOne ) - val (min2, max1) = minMaxHash( shingleTwo ) + val (min1, max2) = minMaxHash( shingleOne ) + val (min2, max1) = minMaxHash( shingleTwo ) - Set( hashCombine( min1, min2 ), hashCombine( min1, max2 ), hashCombine( max1, min2 ), hashCombine( max1, max2 ) ) + Set( hashCombine( min1, min2 ), hashCombine( min1, max2 ), hashCombine( max1, min2 ), hashCombine( max1, max2 ) ) + } else Set() } private def minMaxHash( text : String ) : (Int, Int) = { diff --git a/src/test/scala/com/github/reynoldsm88/dedup/shingleprint/ShingleprintDedupTestSuite.scala b/src/test/scala/com/github/reynoldsm88/dedup/shingleprint/ShingleprintDedupTestSuite.scala index 65648f0..e6ea823 100644 --- a/src/test/scala/com/github/reynoldsm88/dedup/shingleprint/ShingleprintDedupTestSuite.scala +++ b/src/test/scala/com/github/reynoldsm88/dedup/shingleprint/ShingleprintDedupTestSuite.scala @@ -93,6 +93,17 @@ class ShingleprintDedupTestSuite extends TestBase { results.head.docId shouldBe "nytimes-nasa-moon" } + "Shingleprint Deduplication" should "ignore documents with null text" in { + val dedup : Dedup = init() + val results = dedup.check( null ) + results.isEmpty shouldBe true + } + + "Shingleprint Deduplication" should "ignore documents with empty text" in { + val dedup : Dedup = init() + val results = dedup.check( "" ) + results.isEmpty shouldBe true + } private def init( ) : Dedup = { val dedup = new ShingleprintDedup( maxWords = 100, threshold = 0.9 )