From b7d582d42fd2a1eaa0d7b52cd061cbfbc505012f Mon Sep 17 00:00:00 2001 From: phpisciuneri Date: Mon, 4 Nov 2019 11:46:36 -0500 Subject: [PATCH 1/4] consistent indentation --- .../validator/StringLengthCheck.scala | 50 +++++++++---------- .../validator/StringRegexCheck.scala | 22 ++++---- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala b/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala index 2293338..40e137d 100644 --- a/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala +++ b/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala @@ -12,28 +12,28 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.{IntegerType, StringType, StructType} case class StringLengthCheck( - column: String, - minLength: Option[Json], - maxLength: Option[Json], - threshold: Option[String] - ) extends RowBased { + column: String, + minLength: Option[Json], + maxLength: Option[Json], + threshold: Option[String] +) extends RowBased { override def substituteVariables(dict: VarSubstitution): ValidatorBase = { val ret = StringLengthCheck( - getVarSub(column, "column", dict), - minLength.map(getVarSubJson(_, "minLength", dict)), - maxLength.map(getVarSubJson(_, "maxLength", dict)), - threshold.map(getVarSub(_, "threshold", dict)) - ) + getVarSub(column, "column", dict), + minLength.map(getVarSubJson(_, "minLength", dict)), + maxLength.map(getVarSubJson(_, "maxLength", dict)), + threshold.map(getVarSub(_, "threshold", dict)) + ) getEvents.foreach(ret.addEvent) ret } private def cmpExpr(colExpr: Expression, - value: Option[Json], - cmp: (Expression, Expression) => Expression - ): Option[Expression] = { + value: Option[Json], + cmp: (Expression, Expression) => Expression + ): Option[Expression] = { value.map { v => cmp(colExpr, createLiteralOrUnresolvedAttribute(IntegerType, v)) } } @@ -57,20 +57,20 @@ case class StringLengthCheck( private def checkMinLessThanOrEqualToMax(values: List[Json]): Unit = { if (values.forall(_.isNumber)) { - values.flatMap(_.asNumber) match { - case mv :: xv :: Nil if mv.toDouble > xv.toDouble => - addEvent(ValidatorError(s"min: ${minLength.get} must be less than or equal to max: ${maxLength.get}")) - case _ => - } + values.flatMap(_.asNumber) match { + case mv :: xv :: Nil if mv.toDouble > xv.toDouble => + addEvent(ValidatorError(s"min: ${minLength.get} must be less than or equal to max: ${maxLength.get}")) + case _ => + } } else if (values.forall(_.isString)) { - values.flatMap(_.asString) match { - case mv :: xv :: Nil if mv == xv => - addEvent(ValidatorError(s"Min[String]: $mv must be less than max[String]: $xv")) - case _ => - } + values.flatMap(_.asString) match { + case mv :: xv :: Nil if mv == xv => + addEvent(ValidatorError(s"Min[String]: $mv must be less than max[String]: $xv")) + case _ => + } } else { - // Not Strings or Numbers - addEvent(ValidatorError(s"Unsupported type in ${values.map(debugJson).mkString(", ")}")) + // Not Strings or Numbers + addEvent(ValidatorError(s"Unsupported type in ${values.map(debugJson).mkString(", ")}")) } } diff --git a/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala b/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala index db4ee42..735aa94 100644 --- a/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala +++ b/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala @@ -12,10 +12,10 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.{StringType, StructType} case class StringRegexCheck( - column: String, - regex: Option[Json], - threshold: Option[String] - ) extends RowBased { + column: String, + regex: Option[Json], + threshold: Option[String] +) extends RowBased { override def substituteVariables(dict: VarSubstitution): ValidatorBase = { @@ -35,13 +35,13 @@ case class StringRegexCheck( val regexExpression = regex.map { r => RLike(colExp, createLiteralOrUnresolvedAttribute(StringType, r)) } val ret = regexExpression match { - /* - RLike returns false if the column value is null. - To avoid counting null values as validation failures (like other validations), - an explicit non null check on the column value is required. - */ - case Some(x) => And(Not(x), IsNotNull(colExp)) - case _ => throw new RuntimeException("Must define a regex.") + /* + RLike returns false if the column value is null. + To avoid counting null values as validation failures (like other validations), + an explicit non null check on the column value is required. + */ + case Some(x) => And(Not(x), IsNotNull(colExp)) + case _ => throw new RuntimeException("Must define a regex.") } logger.debug(s"Expr: $ret") ret From 687e2768bca9dccbc2b14282b72a9cf2874152f4 Mon Sep 17 00:00:00 2001 From: phpisciuneri Date: Mon, 4 Nov 2019 11:48:45 -0500 Subject: [PATCH 2/4] optimize imports --- .../target/data_validator/validator/StringLengthCheck.scala | 4 ++-- .../target/data_validator/validator/StringRegexCheck.scala | 5 ++--- .../data_validator/validator/StringLengthCheckSpec.scala | 4 ++-- .../data_validator/validator/StringRegexCheckSpec.scala | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala b/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala index 40e137d..362f2a6 100644 --- a/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala +++ b/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala @@ -1,11 +1,11 @@ package com.target.data_validator.validator -import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} import com.target.data_validator.JsonUtils.debugJson import com.target.data_validator.validator.ValidatorBase._ +import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} import com.typesafe.scalalogging.LazyLogging -import io.circe.{DecodingFailure, HCursor, Json} import io.circe.syntax._ +import io.circe.{DecodingFailure, HCursor, Json} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ diff --git a/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala b/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala index 735aa94..deb597d 100644 --- a/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala +++ b/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala @@ -1,11 +1,10 @@ package com.target.data_validator.validator -import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} -import com.target.data_validator.JsonUtils.debugJson import com.target.data_validator.validator.ValidatorBase._ +import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} import com.typesafe.scalalogging.LazyLogging -import io.circe.{DecodingFailure, HCursor, Json} import io.circe.syntax._ +import io.circe.{DecodingFailure, HCursor, Json} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ diff --git a/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala b/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala index c6a07de..9b52b2d 100644 --- a/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala +++ b/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala @@ -3,10 +3,10 @@ package com.target.data_validator.validator import com.target.TestingSparkSession import com.target.data_validator._ import io.circe.Json -import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.scalatest.{FunSpec, Matchers} import scala.util.Random @@ -428,4 +428,4 @@ class StringLengthCheckSpec extends FunSpec with Matchers with TestingSparkSessi } } } -} \ No newline at end of file +} diff --git a/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala b/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala index 0e26613..0f27d4f 100644 --- a/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala +++ b/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala @@ -3,7 +3,7 @@ package com.target.data_validator.validator import com.target.TestingSparkSession import com.target.data_validator._ import io.circe.Json -import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ From ca360813496da548f3c7995e1fb4ce62d9ce3c11 Mon Sep 17 00:00:00 2001 From: phpisciuneri Date: Mon, 4 Nov 2019 12:02:46 -0500 Subject: [PATCH 3/4] fix scalastyle warnings --- .../validator/StringLengthCheck.scala | 6 +++--- .../validator/StringRegexCheck.scala | 6 +++--- .../validator/StringLengthCheckSpec.scala | 4 ++-- .../validator/StringRegexCheckSpec.scala | 19 ++++++++++++------- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala b/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala index 362f2a6..be33a06 100644 --- a/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala +++ b/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala @@ -1,11 +1,11 @@ package com.target.data_validator.validator +import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} import com.target.data_validator.JsonUtils.debugJson import com.target.data_validator.validator.ValidatorBase._ -import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} import com.typesafe.scalalogging.LazyLogging -import io.circe.syntax._ import io.circe.{DecodingFailure, HCursor, Json} +import io.circe.syntax._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ @@ -89,7 +89,7 @@ case class StringLengthCheck( val colType = findColumnInDataFrame(df, column) if (colType.isDefined) { val dataType = colType.get.dataType - if (!(dataType.isInstanceOf[StringType])) { + if (!dataType.isInstanceOf[StringType]) { addEvent(ValidatorError(s"Data type of column '$column' must be String, but was found to be $dataType")) } } diff --git a/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala b/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala index deb597d..02ac5a7 100644 --- a/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala +++ b/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala @@ -1,10 +1,10 @@ package com.target.data_validator.validator -import com.target.data_validator.validator.ValidatorBase._ import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} +import com.target.data_validator.validator.ValidatorBase._ import com.typesafe.scalalogging.LazyLogging -import io.circe.syntax._ import io.circe.{DecodingFailure, HCursor, Json} +import io.circe.syntax._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ @@ -58,7 +58,7 @@ case class StringRegexCheck( val colType = findColumnInDataFrame(df, column) if (colType.isDefined) { val dataType = colType.get.dataType - if (!(dataType.isInstanceOf[StringType])) { + if (!dataType.isInstanceOf[StringType]) { addEvent(ValidatorError(s"Data type of column '$column' must be String, but was found to be $dataType")) } } diff --git a/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala b/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala index 9b52b2d..d68d5fc 100644 --- a/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala +++ b/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala @@ -3,10 +3,10 @@ package com.target.data_validator.validator import com.target.TestingSparkSession import com.target.data_validator._ import io.circe.Json +import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.scalatest.{FunSpec, Matchers} import scala.util.Random @@ -364,7 +364,7 @@ class StringLengthCheckSpec extends FunSpec with Matchers with TestingSparkSessi assert(config.quickChecks(spark, dict)) assert(sut.failed) assert(sut.getEvents contains - ValidatorCheckEvent(failure = true, "StringLengthCheck on column 'item'", 4, 2)) + ValidatorCheckEvent(failure = true, "StringLengthCheck on column 'item'", 4, 2)) // scalastyle:ignore assert(sut.getEvents contains ValidatorQuickCheckError(("item", "Item1") :: Nil, "Item1", diff --git a/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala b/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala index 0f27d4f..b2d347b 100644 --- a/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala +++ b/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala @@ -22,8 +22,8 @@ class StringRegexCheckSpec extends FunSpec with Matchers with TestingSparkSessio Row("Item1", 2.99), Row("Item23", 5.35), Row("I", 1.00), - Row(null, 1.00), - Row(null, 2.00) + Row(null, 1.00), // scalastyle:ignore + Row(null, 2.00) // scalastyle:ignore ) describe("StringRegexCheck") { @@ -40,7 +40,7 @@ class StringRegexCheckSpec extends FunSpec with Matchers with TestingSparkSessio it("error if column is not found in df") { val df = mkDataFrame(spark, defData, schema) - val sut = StringRegexCheck( "bad_column_name", Some(Json.fromString("I%")), None) + val sut = StringRegexCheck("bad_column_name", Some(Json.fromString("I%")), None) assert(sut.configCheck(df)) assert(sut.getEvents contains ValidatorError("Column: 'bad_column_name' not found in schema.")) assert(sut.failed) @@ -50,7 +50,10 @@ class StringRegexCheckSpec extends FunSpec with Matchers with TestingSparkSessio val df = mkDataFrame(spark, defData, schema) val sut = StringRegexCheck("baseprice", Some(Json.fromString("I%")), None) assert(sut.configCheck(df)) - assert(sut.getEvents contains ValidatorError("Data type of column 'baseprice' must be String, but was found to be DoubleType")) + assert( + sut.getEvents contains + ValidatorError("Data type of column 'baseprice' must be String, but was found to be DoubleType") + ) assert(sut.failed) } } @@ -71,7 +74,7 @@ class StringRegexCheckSpec extends FunSpec with Matchers with TestingSparkSessio } it("substitute with threshold") { - val dict = mkParams(List(("column", "item"), ("regex", "I%"), ("threshold", Json.fromInt(100)))) + val dict = mkParams(List(("column", "item"), ("regex", "I%"), ("threshold", Json.fromInt(100)))) // scalastyle:ignore val sut = StringRegexCheck("$column", Some(Json.fromString("${regex}")), Some("${threshold}")) assert(sut.substituteVariables(dict) == StringRegexCheck("item", Some(Json.fromString("I%")), Some("100"))) assert(!sut.failed) @@ -82,8 +85,10 @@ class StringRegexCheckSpec extends FunSpec with Matchers with TestingSparkSessio it("regex pattern ab%") { val sut = StringRegexCheck("item", Some(Json.fromString("ab%")), None) - assert(sut.colTest(schema, mkParams()).sql == - And(Not(RLike(UnresolvedAttribute("item"), Literal.create("ab%", StringType))), IsNotNull(UnresolvedAttribute("item"))).sql) + assert(sut.colTest(schema, mkParams()).sql == And( + Not(RLike(UnresolvedAttribute("item"), Literal.create("ab%", StringType))), + IsNotNull(UnresolvedAttribute("item"))).sql + ) } } From 20b2449a36349b1696f535d1e1916074124f400e Mon Sep 17 00:00:00 2001 From: phpisciuneri Date: Mon, 4 Nov 2019 12:07:36 -0500 Subject: [PATCH 4/4] remove log msgs that print config to console --- .../com/target/data_validator/validator/StringLengthCheck.scala | 2 -- .../com/target/data_validator/validator/StringRegexCheck.scala | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala b/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala index be33a06..199f3c3 100644 --- a/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala +++ b/src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala @@ -123,8 +123,6 @@ object StringLengthCheck extends LazyLogging { logger.debug(s"minLength: $minLengthJ type: ${minLengthJ.getClass.getCanonicalName}") logger.debug(s"maxLength: $maxLengthJ type: ${maxLengthJ.getClass.getCanonicalName}") logger.debug(s"threshold: $threshold type: ${threshold.getClass.getCanonicalName}") - - c.focus.foreach {f => logger.info(s"StringLengthCheckJson: ${f.spaces2}")} scala.util.Right(StringLengthCheck(column, minLengthJ, maxLengthJ, threshold)) } } diff --git a/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala b/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala index 02ac5a7..157d634 100644 --- a/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala +++ b/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala @@ -89,8 +89,6 @@ object StringRegexCheck extends LazyLogging { logger.debug(s"column: $column") logger.debug(s"regex: $regex type: ${regex.getClass.getCanonicalName}") logger.debug(s"threshold: $threshold type: ${threshold.getClass.getCanonicalName}") - - c.focus.foreach {f => logger.info(s"StringRegexCheckJson: ${f.spaces2}")} scala.util.Right(StringRegexCheck(column, regex, threshold)) } }