From 1dd6bb8fecf231318c6ef2c74546635af713e3ee Mon Sep 17 00:00:00 2001 From: peng Date: Sat, 10 Feb 2024 16:04:26 -0500 Subject: [PATCH] parser related impl is now moved into its own package, not useful in most cases cleanup common change NullSafeMagnet type alias to avoid Scala 3 syntax conflict evict DSL component for Spark ML pipeline, not useful in heterogeneous inference --- parent/benchmark/build.gradle.kts | 1 + .../spookystuff/relay/AutomaticRelay.scala | 2 +- .../spookystuff/relay/RelayRegistry.scala | 3 +- .../spookystuff/relay/RootTagged.scala | 2 +- .../tribbloids/spookystuff/relay/TreeIR.scala | 2 +- .../tribbloids/spookystuff/utils/Types.scala | 2 +- .../utils/UnsafeReflections.scala} | 4 +- .../spookystuff/utils/data/AttrLike.scala | 4 +- .../spookystuff/utils/data/EAVLike.scala | 6 +- .../utils/refl/CatalystTypeMixin.scala | 2 +- .../utils/refl/CatalystTypeOps.scala | 2 +- .../utils/refl/FromClassMixin.scala | 2 +- .../utils/refl/FromClassTagMixin.scala | 2 +- .../utils/refl/ReflectionUtils.scala | 2 +- .../spookystuff}/utils/refl/ScalaUDT.scala | 2 +- .../spookystuff}/utils/refl/ToCatalyst.scala | 2 +- .../spookystuff}/utils/refl/TypeMagnet.scala | 2 +- .../spookystuff}/utils/refl/TypeUtils.scala | 2 +- .../spookystuff}/utils/refl/Unerase.scala | 2 +- .../utils/refl/UnreifiedObjectType.scala | 2 +- .../spark/ml/dsl/AbstractNamedStage.scala | 103 --- .../org/apache/spark/ml/dsl/Compactions.scala | 63 -- .../scala/org/apache/spark/ml/dsl/DFD.scala | 173 ---- .../apache/spark/ml/dsl/DFDComponent.scala | 744 ------------------ .../spark/ml/dsl/DynamicParamsMixin.scala | 76 -- .../spark/ml/dsl/PipelineModelShim.scala | 17 - .../spark/ml/dsl/SchemaAdaptation.scala | 28 - .../org/apache/spark/ml/dsl/ShimViews.scala | 30 - .../org/apache/spark/ml/dsl/StepGraph.scala | 198 ----- .../org/apache/spark/ml/dsl/StepLike.scala | 259 ------ .../org/apache/spark/ml/dsl/StepMapView.scala | 9 - .../apache/spark/ml/dsl/StepTreeNode.scala | 77 -- .../org/apache/spark/ml/dsl/TrieNode.scala | 88 --- .../apache/spark/ml/dsl/UDFTransformer.scala | 73 -- .../org/apache/spark/ml/dsl/package.scala | 24 - .../spark/ml/dsl/utils/ClassOpsMixin.scala | 33 + .../apache/spark/ml/dsl/utils/EnumMixin.scala | 6 - .../apache/spark/ml/dsl/utils/LazyVar.scala | 4 +- .../spark/ml/dsl/utils/NullSafeMagnet.scala | 62 ++ .../spark/ml/dsl/utils/NullSafety.scala | 75 -- .../ml/dsl/utils/ObjectSimpleNameMixin.scala | 22 - .../spark/ml/dsl/utils/OptionConversion.scala | 8 - .../apache/spark/ml/dsl/utils/package.scala | 14 +- .../spark/sql/utils/DataTypeRelay.scala | 34 - .../spookystuff/relay/RelaySuite.scala | 19 +- .../relay/xml}/XMLWeakDeserializerSuite.scala | 7 +- .../RecursiveEitherAsUnionToJSONSpike.scala | 8 +- .../utils/PairwiseConversionMixin.scala | 2 +- .../utils/refl/TypeMagnetSpike.scala | 3 +- .../utils/refl/TypeMagnetSuite.scala | 11 +- .../spookystuff}/utils/refl/TypeSpike.scala | 2 +- .../refl/UnreifiedObjectTypeSuite.scala} | 5 +- .../spark/ml/dsl/AbstractDFDSuite.scala | 84 -- .../org/apache/spark/ml/dsl/AppendSuite.scala | 178 ----- .../apache/spark/ml/dsl/CompactionSuite.scala | 138 ---- .../apache/spark/ml/dsl/ComposeSuite.scala | 481 ----------- .../spark/ml/dsl/DFDReadWriteSuite.scala | 100 --- .../org/apache/spark/ml/dsl/DFDSuite.scala | 504 ------------ .../apache/spark/ml/dsl/MapHeadSuite.scala | 231 ------ .../spark/ml/dsl/SchemaAdaptationSuite.scala | 76 -- .../apache/spark/ml/dsl/TrieNodeSuite.scala | 253 ------ .../spark/ml/dsl/UDFTransformerSuite.scala | 49 -- ...xinSuite.scala => 
ClassOpsMixinSpec.scala} | 12 +- .../spark/ml/dsl/utils/DSLUtilsSuite.scala | 31 - .../ml/dsl/utils/NullSafeMagnetSuite.scala | 32 + .../spark/ml/dsl/utils/NullSafetySuite.scala | 44 -- .../spookystuff/SpookyContext.scala | 2 +- .../spookystuff/actions/actions.scala | 2 +- .../spookystuff/doc/FetchedUDT.scala | 2 +- .../spookystuff/doc/UnstructuredUDT.scala | 2 +- .../spookystuff/execution/Delta.scala | 2 +- .../spookystuff/execution/DeltaPlan.scala | 2 +- .../spookystuff/extractors/GenExtractor.scala | 2 +- .../extractors/GenExtractorImplicits.scala | 2 +- .../extractors/ScalaDynamicExtractor.scala | 2 +- .../spookystuff/extractors/impl/Get.scala | 2 +- .../spookystuff/extractors/impl/Lit.scala | 2 +- .../spookystuff/metrics/AbstractMetrics.scala | 2 +- .../tribbloids/spookystuff/metrics/Acc.scala | 4 +- .../spookystuff/metrics/MetricLike.scala | 6 +- .../spookystuff/python/ref/ClassRef.scala | 2 +- .../spookystuff/rdd/FetchedDataset.scala | 2 +- .../spookystuff/row/SpookySchema.scala | 2 +- .../spookystuff/utils/SpookyUtils.scala | 5 +- .../tribbloids/spookystuff/TestBeans.scala | 2 +- .../execution/ExplodeDataPlanSpec.scala | 2 +- .../ScalaDynamicExtractorSuite.scala | 2 +- .../extractors/ScalaReflectionSpike.scala | 2 +- .../spookystuff/utils/RDDDisperseSuite.scala | 6 +- .../spookystuff/utils/ScalaUDTSuite.scala | 2 +- parent/parsing/build.gradle.kts | 7 + .../spookystuff/graph/Algebra.scala | 0 .../spookystuff/graph/DataAlgebra.scala | 0 .../tribbloids/spookystuff/graph/Domain.scala | 0 .../spookystuff/graph/EdgeFilter.scala | 0 .../spookystuff/graph/Element.scala | 6 +- .../spookystuff/graph/ElementTreeNode.scala | 0 .../spookystuff/graph/ElementView.scala | 0 .../spookystuff/graph/FlowLayout.scala | 0 .../spookystuff/graph/IDAlgebra.scala | 0 .../tribbloids/spookystuff/graph/Layout.scala | 0 .../spookystuff/graph/LocalGraph.scala | 8 +- .../tribbloids/spookystuff/graph/Module.scala | 4 +- .../spookystuff/graph/StaticGraph.scala | 8 +- .../spookystuff/graph/Visualisation.scala | 0 .../parsing/BacktrackingManager.scala | 0 .../spookystuff/parsing/FSMParserDSL.scala | 0 .../spookystuff/parsing/FSMParserGraph.scala | 0 .../spookystuff/parsing/FState.scala | 0 .../spookystuff/parsing/ParsingRun.scala | 0 .../spookystuff/parsing/Pattern.scala | 0 .../spookystuff/parsing/PhaseVec.scala | 0 .../spookystuff/parsing/RuleIO.scala | 0 .../spookystuff/parsing/RuleInput.scala | 0 .../spookystuff/parsing/RuleOutcome.scala | 0 .../spookystuff/parsing/Transitions.scala | 0 .../exception/BacktrackableFailure.scala | 0 .../exception/BacktrackableMixin.scala | 0 .../parsing/exception/ParsingError.scala | 0 .../spookystuff/parsing/package.scala | 0 .../spookystuff/graph/FlowLayoutSuite.scala | 0 .../graph/example/SimpleFlowGraph.scala | 0 .../parsing/FSMParserDSLSuite.scala | 0 .../spookystuff/parsing/ParsingRunSuite.scala | 0 .../spookystuff/web/actions/SelectorUDT.scala | 2 +- prover-commons | 2 +- settings.gradle.kts | 1 + 127 files changed, 242 insertions(+), 4395 deletions(-) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl/UnsafeUtils.scala => com/tribbloids/spookystuff/utils/UnsafeReflections.scala} (97%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/CatalystTypeMixin.scala (77%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/CatalystTypeOps.scala (99%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => 
com/tribbloids/spookystuff}/utils/refl/FromClassMixin.scala (98%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/FromClassTagMixin.scala (87%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/ReflectionUtils.scala (98%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/ScalaUDT.scala (96%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/ToCatalyst.scala (88%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/TypeMagnet.scala (98%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/TypeUtils.scala (98%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/Unerase.scala (94%) rename parent/commons/src/main/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/UnreifiedObjectType.scala (97%) delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/AbstractNamedStage.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/Compactions.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/DFD.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/DFDComponent.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/DynamicParamsMixin.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/PipelineModelShim.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/SchemaAdaptation.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/ShimViews.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepGraph.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepLike.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepMapView.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepTreeNode.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/TrieNode.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/UDFTransformer.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/package.scala create mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/ClassOpsMixin.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/EnumMixin.scala create mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/NullSafeMagnet.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/NullSafety.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/ObjectSimpleNameMixin.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/OptionConversion.scala delete mode 100644 parent/commons/src/main/scala/org/apache/spark/sql/utils/DataTypeRelay.scala rename parent/commons/src/test/scala/{org/apache/spark/ml/dsl/utils => com/tribbloids/spookystuff/relay/xml}/XMLWeakDeserializerSuite.scala (94%) rename parent/commons/src/test/scala/{org/apache/spark/ml/dsl/utils => com/tribbloids/spookystuff/spike}/RecursiveEitherAsUnionToJSONSpike.scala (92%) rename parent/commons/src/test/scala/{org/apache/spark/ml/dsl => 
com/tribbloids/spookystuff}/utils/PairwiseConversionMixin.scala (98%) rename parent/commons/src/test/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/TypeMagnetSpike.scala (96%) rename parent/commons/src/test/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/TypeMagnetSuite.scala (96%) rename parent/commons/src/test/scala/{org/apache/spark/ml/dsl => com/tribbloids/spookystuff}/utils/refl/TypeSpike.scala (91%) rename parent/commons/src/test/scala/{org/apache/spark/ml/dsl/utils/refl/UnReifiedObjectTypeSuite.scala => com/tribbloids/spookystuff/utils/refl/UnreifiedObjectTypeSuite.scala} (64%) delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/AbstractDFDSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/AppendSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/CompactionSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/ComposeSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/DFDReadWriteSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/DFDSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/MapHeadSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/SchemaAdaptationSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/TrieNodeSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/UDFTransformerSuite.scala rename parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/{ScalaNameMixinSuite.scala => ClassOpsMixinSpec.scala} (53%) delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/DSLUtilsSuite.scala create mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/NullSafeMagnetSuite.scala delete mode 100644 parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/NullSafetySuite.scala create mode 100644 parent/parsing/build.gradle.kts rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/Algebra.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/DataAlgebra.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/Domain.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/EdgeFilter.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/Element.scala (97%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/ElementTreeNode.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/ElementView.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/FlowLayout.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/IDAlgebra.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/Layout.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/LocalGraph.scala (96%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/Module.scala (95%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/StaticGraph.scala (89%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/graph/Visualisation.scala 
(100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/BacktrackingManager.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserDSL.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserGraph.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/FState.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/ParsingRun.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/Pattern.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/PhaseVec.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/RuleIO.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/RuleInput.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/RuleOutcome.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/Transitions.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableFailure.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableMixin.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/exception/ParsingError.scala (100%) rename parent/{commons => parsing}/src/main/scala/com/tribbloids/spookystuff/parsing/package.scala (100%) rename parent/{commons => parsing}/src/test/scala/com/tribbloids/spookystuff/graph/FlowLayoutSuite.scala (100%) rename parent/{commons/src/main => parsing/src/test}/scala/com/tribbloids/spookystuff/graph/example/SimpleFlowGraph.scala (100%) rename parent/{commons => parsing}/src/test/scala/com/tribbloids/spookystuff/parsing/FSMParserDSLSuite.scala (100%) rename parent/{commons => parsing}/src/test/scala/com/tribbloids/spookystuff/parsing/ParsingRunSuite.scala (100%) diff --git a/parent/benchmark/build.gradle.kts b/parent/benchmark/build.gradle.kts index ff6e87795..d8895df5f 100644 --- a/parent/benchmark/build.gradle.kts +++ b/parent/benchmark/build.gradle.kts @@ -4,5 +4,6 @@ val vs = versions() dependencies { api(project(":parent:core")) + api(project(":parent:parsing")) testFixturesApi(testFixtures(project(":parent:core"))) } \ No newline at end of file diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/AutomaticRelay.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/AutomaticRelay.scala index d4aa786a0..011dee3a0 100644 --- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/AutomaticRelay.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/AutomaticRelay.scala @@ -1,6 +1,6 @@ package com.tribbloids.spookystuff.relay -import org.apache.spark.ml.dsl.utils.refl.ReflectionUtils +import com.tribbloids.spookystuff.utils.refl.ReflectionUtils object AutomaticRelay {} diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/RelayRegistry.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/RelayRegistry.scala index 8f2b74e27..322c1bc1c 100644 --- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/RelayRegistry.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/RelayRegistry.scala @@ 
-1,7 +1,6 @@ package com.tribbloids.spookystuff.relay -import org.apache.spark.ml.dsl.utils.refl.TypeMagnet - +import com.tribbloids.spookystuff.utils.refl.TypeMagnet import scala.collection.mutable import scala.util.{Failure, Success, Try} diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/RootTagged.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/RootTagged.scala index 78af6d946..21cfaccd9 100644 --- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/RootTagged.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/RootTagged.scala @@ -1,6 +1,6 @@ package com.tribbloids.spookystuff.relay -import org.apache.spark.ml.dsl.utils.refl.TypeMagnet +import com.tribbloids.spookystuff.utils.refl.TypeMagnet trait RootTagged { diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/TreeIR.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/TreeIR.scala index d0d1e55cf..4cdcd6699 100644 --- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/TreeIR.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/relay/TreeIR.scala @@ -2,7 +2,7 @@ package com.tribbloids.spookystuff.relay import com.tribbloids.spookystuff.tree.TreeView import com.tribbloids.spookystuff.relay.io.Decoder -import org.apache.spark.ml.dsl.utils.refl.ReflectionUtils +import com.tribbloids.spookystuff.utils.refl.ReflectionUtils import scala.collection.immutable.ListMap import scala.language.implicitConversions diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/Types.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/Types.scala index 1bec57596..6816ef694 100644 --- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/Types.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/Types.scala @@ -2,5 +2,5 @@ package com.tribbloids.spookystuff.utils object Types { - type Binary[T] = (T, T) => T + type Compose[T] = (T, T) => T } diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/UnsafeUtils.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/UnsafeReflections.scala similarity index 97% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/UnsafeUtils.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/UnsafeReflections.scala index 4b6340dbe..56ebca563 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/UnsafeUtils.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/UnsafeReflections.scala @@ -15,9 +15,9 @@ * limitations under the License. */ -package org.apache.spark.ml.dsl +package com.tribbloids.spookystuff.utils -object UnsafeUtils { +object UnsafeReflections { def setSuperField(obj: Object, fieldName: String, fieldValue: Object): Unit = { setAncestorField(obj, 1, fieldName, fieldValue) diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/data/AttrLike.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/data/AttrLike.scala index 3d955beee..ff86c3e27 100644 --- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/data/AttrLike.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/data/AttrLike.scala @@ -1,7 +1,7 @@ package com.tribbloids.spookystuff.utils.data import ai.acyclic.prover.commons.same.EqualBy -import org.apache.spark.ml.dsl.utils.? +import org.apache.spark.ml.dsl.utils.?? 
import scala.util.Try @@ -17,7 +17,7 @@ trait AttrLike[T] extends Serializable with EqualBy { Magnets.AttrValueMag[T](this.name, Some(v)) } - def -?>(vOpt: T `?` _): Magnets.AttrValueMag[T] = { + def -?>(vOpt: T ?? _): Magnets.AttrValueMag[T] = { Magnets.AttrValueMag[T](this.name, vOpt.asOption) } diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/data/EAVLike.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/data/EAVLike.scala index 2fb4cb615..f824e4973 100644 --- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/data/EAVLike.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/data/EAVLike.scala @@ -3,7 +3,7 @@ package com.tribbloids.spookystuff.utils.data import com.tribbloids.spookystuff.relay.RootTagged import com.tribbloids.spookystuff.relay.xml.Xml import com.tribbloids.spookystuff.utils.{CommonUtils, TreeThrowable} -import org.apache.spark.ml.dsl.utils.{?, HasEagerInnerObjects} +import org.apache.spark.ml.dsl.utils.{??, HasEagerInnerObjects} import java.util.Properties import scala.collection.mutable @@ -136,8 +136,8 @@ trait EAVLike extends HasEagerInnerObjects with RootTagged with Serializable { // should only be used in setters val aliases: List[String] = Nil, nullable: Boolean = false, - default: T `?` _ = None, - nameOverride: String `?` _ = None + default: T ?? _ = None, + nameOverride: String ?? _ = None )( implicit ev: T <:< Any diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/CatalystTypeMixin.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/CatalystTypeMixin.scala similarity index 77% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/CatalystTypeMixin.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/CatalystTypeMixin.scala index 43ff7cf5f..f2f90de74 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/CatalystTypeMixin.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/CatalystTypeMixin.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import org.apache.spark.sql.types.DataType diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/CatalystTypeOps.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/CatalystTypeOps.scala similarity index 99% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/CatalystTypeOps.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/CatalystTypeOps.scala index 4a317a632..ca911c5e5 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/CatalystTypeOps.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/CatalystTypeOps.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import org.apache.spark.sql.types._ diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/FromClassMixin.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/FromClassMixin.scala similarity index 98% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/FromClassMixin.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/FromClassMixin.scala index d23102b73..f7bbdc22e 100644 --- 
a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/FromClassMixin.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/FromClassMixin.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import ai.acyclic.prover.commons.util.Caching diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/FromClassTagMixin.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/FromClassTagMixin.scala similarity index 87% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/FromClassTagMixin.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/FromClassTagMixin.scala index 66ea3989f..e53af66c0 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/FromClassTagMixin.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/FromClassTagMixin.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import scala.language.implicitConversions import scala.reflect.ClassTag diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/ReflectionUtils.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/ReflectionUtils.scala similarity index 98% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/ReflectionUtils.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/ReflectionUtils.scala index b201f906a..aab59838c 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/ReflectionUtils.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/ReflectionUtils.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl //TODO: simply by using a common relay that different type representation can be cast into /** diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/ScalaUDT.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/ScalaUDT.scala similarity index 96% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/ScalaUDT.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/ScalaUDT.scala index 557ccf980..b95da1a5f 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/ScalaUDT.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/ScalaUDT.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import java.nio.ByteBuffer diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/ToCatalyst.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/ToCatalyst.scala similarity index 88% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/ToCatalyst.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/ToCatalyst.scala index 9cb24f824..aa8a80c3d 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/ToCatalyst.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/ToCatalyst.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import org.apache.spark.sql.types.DataType diff --git 
a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/TypeMagnet.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/TypeMagnet.scala similarity index 98% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/TypeMagnet.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/TypeMagnet.scala index e0cf74020..9180b7232 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/TypeMagnet.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/TypeMagnet.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import com.tribbloids.spookystuff.utils.serialization.{SerializerEnv, SerializerOverride} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/TypeUtils.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/TypeUtils.scala similarity index 98% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/TypeUtils.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/TypeUtils.scala index 06ef425a7..2aff687a1 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/TypeUtils.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/TypeUtils.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.ScalaReflection diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/Unerase.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/Unerase.scala similarity index 94% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/Unerase.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/Unerase.scala index df88cc75d..d20527c6b 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/Unerase.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/Unerase.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import scala.reflect.runtime.universe import scala.collection.concurrent.TrieMap diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/UnreifiedObjectType.scala b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/UnreifiedObjectType.scala similarity index 97% rename from parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/UnreifiedObjectType.scala rename to parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/UnreifiedObjectType.scala index 92e5ae4e4..6cf202531 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/refl/UnreifiedObjectType.scala +++ b/parent/commons/src/main/scala/com/tribbloids/spookystuff/utils/refl/UnreifiedObjectType.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import org.apache.spark.sql.catalyst.ScalaReflection.universe._ import org.apache.spark.sql.types._ diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/AbstractNamedStage.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/AbstractNamedStage.scala deleted file mode 100644 index 3149fb306..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/AbstractNamedStage.scala +++ /dev/null 
@@ -1,103 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.ml.PipelineStage -import org.apache.spark.ml.param.{ParamMap, Params} -import org.apache.spark.ml.param.shared.{HasInputCol, HasInputCols, HasOutputCol} - -import scala.util.Random - -case class AbstractNamedStage[+T <: PipelineStage]( - stage: T, - name: String, - tags: Set[String] = Set(), - outputColOverride: Option[String] = None, // set to manually override output column name - // intermediate: Boolean = false //TODO: enable - _id: String = "" + Random.nextLong() // TODO: multiple Stages with same uid can't be used together? -) { - - import ShimViews._ - - // create a new PipelineStage that doesn't share the same parameter - def replicate: AbstractNamedStage[T] = { - val result = this.copy( - stage = this.stage.copy(ParamMap.empty).asInstanceOf[T], - _id = "" + Random.nextLong() - ) - result - } - - def id: String = outputColOverride.getOrElse(_id) - - def outputOpt: Option[String] = stage match { - case s: HasOutputCol => - Some(s.getOutputCol) - case _ => - None - } - def hasOutputs: Boolean = stage match { - case _: HasOutputCol => true // TODO: do we really need this? implementation is inconsistent - case _ => false - } - def setOutput(v: String): Params = { - stage.trySetOutputCol(v) - } - - def inputs: Seq[String] = stage match { - case s: HasInputCol => Seq(s.getInputCol) - case ss: HasInputCols => ss.getInputCols - case _ => Seq() - } - - // always have inputs - // def hasInputs = stage match { - // case s: HasInputCol => true - // case ss: HasInputCols => true - // case _ => false - // } - def setInputs(v: Seq[String]): AbstractNamedStage[T] = { - if (v.nonEmpty) { // otherwise it can be assumed that the input of this stage is already set. - stage.trySetInputCols(v) - } - this - } - - def show( - showID: Boolean = true, - showInputs: Boolean = true, - showOutput: Boolean = true - ): String = { - - val in = - try { - inputs - } catch { - case _: Exception => - Seq("Pending...") - } - - val inStr = if (showInputs) { - in.mkString("[", ",", "]") + " > " - } else "" - - val out = - try { - outputOpt - } catch { - case _: Exception => - Some("Pending...") - } - - val outStr = if (showOutput) { - " > " + out.mkString("[", ",", "]") - } else "" - - val body = name + { - if (showID) ":" + id - else "" - } - - inStr + body + outStr - } - - override def toString: String = show() -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/Compactions.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/Compactions.scala deleted file mode 100644 index a298a835b..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/Compactions.scala +++ /dev/null @@ -1,63 +0,0 @@ -package org.apache.spark.ml.dsl - -/** - * Created by peng on 28/04/16. 
- */ -object Compactions { - - object DoNotCompact extends PathCompaction { - - override def apply(v1: Set[Seq[String]]): Map[Seq[String], Seq[String]] = { - Map(v1.map(v => v -> v).toSeq: _*) - } - } - - object PruneDownPath extends PathCompaction { - - override def apply(names: Set[Seq[String]]): Map[Seq[String], Seq[String]] = { - - val trie = TrieNode.build( - names - .map(_.reverse) - .map(v => v -> v) - ) - - val pairs = trie.pruneUp - .flatMap { node => - val k = node.key - node.value.map(_ -> k) - } - .map(tuple => tuple._1.reverse -> tuple._2.reverse) - val lookup: Map[Seq[String], Seq[String]] = Map(pairs: _*) - lookup - } - } - - object PruneDownPathKeepRoot extends PathCompaction { - - override def apply(names: Set[Seq[String]]): Map[Seq[String], Seq[String]] = { - - val trie = TrieNode.build( - names - .map(_.reverse) - .map(v => v -> v) - ) - - val pairs = trie.pruneUp - .flatMap { node => - val k = node.key - node.value.map(_ -> k) - } - .map(tuple => - if (!tuple._2.endsWith(tuple._1.lastOption.toSeq)) { - tuple._1 -> (tuple._2 ++ tuple._1.lastOption) - } else { - tuple._1 -> tuple._2 - } - ) - .map(tuple => tuple._1.reverse -> tuple._2.reverse) - val lookup: Map[Seq[String], Seq[String]] = Map(pairs: _*) - lookup - } - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/DFD.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/DFD.scala deleted file mode 100644 index 0cfb796c2..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/DFD.scala +++ /dev/null @@ -1,173 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.ml.PipelineStage -import com.tribbloids.spookystuff.relay.{MessageAPI, Relay, TreeIR} -import org.apache.spark.sql.types.StructField - -import scala.collection.mutable -import scala.language.implicitConversions - -object DFD extends Relay.<<[DFD] { - - final val DEFAULT_COMPACTION: PathCompaction = Compactions.PruneDownPath - final val DEFAULT_SCHEMA_ADAPTATION: SchemaAdaptation = SchemaAdaptation.FailFast - - final val COMPACTION_FOR_TYPECHECK: PathCompaction = Compactions.DoNotCompact - final val SCHEMA_ADAPTATION_FOR_TYPECHECK: SchemaAdaptation = SchemaAdaptation.IgnoreIrrelevant_ValidateSchema - - def apply(v: PipelineStage): Step = v - def apply(tuple: (PipelineStage, Any)): Step = tuple - def apply(s: Symbol): Source = s - def apply(s: StructField): Source = s - - override def toMessage_>>(flow: DFD): IR_>> = { - - flow.propagateCols(DFD.DEFAULT_COMPACTION) - - val steps: Seq[Step] = flow.coll.values.collect { - case st: Step => st - }.toSeq - - val leftWrappers = flow.leftTails.map(SimpleStepWrapper) - val leftTrees = leftWrappers.map(flow.ForwardNode) - - val rightWrappers = flow.rightTails.map(SimpleStepWrapper) - val rightTrees = rightWrappers.map(flow.ForwardNode) - - TreeIR - .leaf( - this.Msg( - Declaration( - steps.map(Step.toMessageBody) - ), - Seq( - GraphRepr( - leftTrees.map(StepTreeNode.toMessageBody), - `@direction` = Some(FORWARD_LEFT) - ), - GraphRepr( - rightTrees.map(StepTreeNode.toMessageBody), - `@direction` = Some(FORWARD_RIGHT) - ) - ), - HeadIDs(flow.headIDs) - ) - ) - } - - def FORWARD_RIGHT: String = "forwardRight" - def FORWARD_LEFT: String = "forwardLeft" - - case class Msg( - declarations: Declaration, - flowLines: Seq[GraphRepr], - headIDs: HeadIDs - ) extends MessageAPI.<< { - - implicit def stepsToView(steps: StepMap[String, StepLike]): StepMapView = new StepMapView(steps) - - override def toProto_<< : DFD = { - - val steps = declarations.stage.map(_.toProto_<<) - 
var buffer: StepMap[String, StepLike] = StepMap(steps.map(v => v.id -> v): _*) - - def treeNodeReprToLink(repr: StepTreeNode.Msg): Unit = { - if (!buffer.contains(repr.id)) { - buffer = buffer.updated(repr.id, Source(repr.id, repr.dataTypes.map(_.toProto_<<))) - } - val children = repr.stage - buffer = buffer.connectAll(Seq(repr.id), children.map(_.id)) - children.foreach(treeNodeReprToLink) - } - - for ( - graph <- flowLines; - tree <- graph.flowLine - ) { - treeNodeReprToLink(tree) - } - - val leftTailIDs = flowLines.filter(_.`@direction`.exists(_ == FORWARD_LEFT)).flatMap(_.flowLine.map(_.id)) - val rightTailIDs = flowLines.filter(_.`@direction`.exists(_ == FORWARD_RIGHT)).flatMap(_.flowLine.map(_.id)) - - DFD( - buffer, - leftTailIDs = leftTailIDs, - rightTailIDs = rightTailIDs, - headIDs = headIDs.headID - ) - } - } - - case class Declaration( - stage: Seq[Step.Msg] - ) - - case class GraphRepr( - flowLine: Seq[StepTreeNode.Msg], - `@direction`: Option[String] = None - ) - - case class HeadIDs( - headID: Seq[String] - ) -} - -//TODO: should I be using decorator/mixin? -/** - * End result of the DSL that can be converted to a Spark ML pipeline - * @param coll - * @param leftTailIDs - * @param rightTailIDs - * @param headIDs - * @param fromIDsOpt - */ -case class DFD( - coll: StepMap[String, StepLike], - leftTailIDs: Seq[String], - rightTailIDs: Seq[String], - headIDs: Seq[String], - fromIDsOpt: Option[Seq[String]] = None // overrridden by using "from" function -) extends DFDComponent { - - override def fromIDs: Seq[String] = fromIDsOpt.getOrElse(headIDs) - - lazy val stages: Array[PipelineStage] = coll.values - .collect { - case st: Step => - st.stage.stage - } - .toArray - .distinct - - def from(name: String): DFD = { - val newFromIDs = coll.values.filter(_.name == name).map(_.id).toSeq - this.copy( - fromIDsOpt = Some(newFromIDs) - ) - } - def :>-(name: String): DFD = from(name) - - def and(name: String): DFD = { - val newFromIDs = coll.values.filter(_.name == name).map(_.id).toSeq - this.copy( - fromIDsOpt = Some(this.fromIDs ++ newFromIDs) - ) - } - def :&&(name: String): DFD = and(name) - - def replicate(suffix: String = ""): DFD = { - - val idConversion = mutable.Map[String, String]() - - val newSteps: StepMap[String, StepLike] = replicateColl(suffix = suffix, idConversion = idConversion) - - new DFD( - newSteps, - leftTailIDs.map(idConversion), - rightTailIDs.map(idConversion), - headIDs.map(idConversion), - this.fromIDsOpt - ) - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/DFDComponent.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/DFDComponent.scala deleted file mode 100644 index 482673e07..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/DFDComponent.scala +++ /dev/null @@ -1,744 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.ml.dsl.StepGraph.{MayHaveHeads, MayHaveTails} -import org.apache.spark.ml.dsl.utils.DSLUtils -import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage, Transformer} -import org.apache.spark.sql.catalyst.expressions.NamedExpression -import org.apache.spark.sql.types.{StructField, StructType} -import org.apache.spark.sql.{Column, DataFrame} -import org.scalameta.ascii.graph.Graph -import org.scalameta.ascii.layout.GraphLayout -import org.scalameta.ascii.layout.prefs.LayoutPrefsImpl - -import scala.collection.mutable.ArrayBuffer -import scala.language.implicitConversions - -object DFDComponent { - - implicit def pipelineStageToStep(v: PipelineStage): Step = { - val 
namedStage = NamedStage( - v, - v.getClass.getSimpleName, - Set(v.getClass.getSimpleName) - ) - Step(namedStage) - } - - // TODO: why bother importing SQLContext.Implicits? - implicit def pipelineStageTupleToStep(tuple: (PipelineStage, Any)): Step = { - val namedStage = tuple match { - case (v, s: Symbol) => - NamedStage( - v, - s.name, - Set(s.name), - Some(s.name) - ) - case (v, s: Column) => - val clazz = s.getClass - val name = UnsafeUtils.invoke(clazz, s, "named").asInstanceOf[NamedExpression].name - NamedStage( - v, - name, - Set(name), - Some(name) - ) - case (v, s: String) => - NamedStage( - v, - s, - Set(s) - ) - } - Step(namedStage) - } - - implicit def symbolToSource(s: Symbol): Source = { - val name = s.name - Source(name) - } - - implicit def structFieldToSource(s: StructField): Source = { - val name = s.name - val dataType = s.dataType - Source(name, Set(dataType)) - } - - // viewbound parameter - // TODO: why bother importing SQLContext.Implicits? - implicit def columnToSource(s: Column): Source = { - val col: Column = s - val clazz = col.getClass - val name = UnsafeUtils.invoke(clazz, col, "named").asInstanceOf[NamedExpression].name - Source(name) - } - - def declare(flows: DFD*): DFD = { - flows.reduce(_ union _) - } -} - -trait DFDComponent extends MayHaveHeads with MayHaveTails { - - // validations - { - coll.values.foreach { stage => - if (!this.tailIDs.contains(stage.id)) - assume(stage.dependencyIDs.nonEmpty, "non-tail stage should have non-empty dependency") - } - - if (coll.values.toSeq.contains(PASSTHROUGH)) { - assume(hasPASSTHROUGHOutput, "PASSTHROUGH should be detached") - assume(leftTails.contains(PASSTHROUGH), "PASSTHROUGH should be detached") - assume(rightTails.contains(PASSTHROUGH), "PASSTHROUGH should be detached") - } - } - - // no replicate - // ~> this <~ - // | - // "~> right <~ - // | - def composeImpl_>(fromIDs: Seq[String], right: DFDComponent): DFD = { - checkConnectivity_>(fromIDs, right) - val effectiveFromIDs = fromIDs.map(coll).filter(_ != PASSTHROUGH).map(_.id) - val toIDs = right.leftIntakes.map(_.id) - - // detached port should not have any tail removed - val newLeftTailIDs = ( - this.leftTails.flatMap { - case PASSTHROUGH => right.leftTailIDs - case v: StepLike => Seq(v.id) - } - ++ right.leftDetached.map(_.id) - ).distinct - val newRightTailIDs = if (right.headExists) { - ( - right.rightTails.flatMap { - case PASSTHROUGH => this.rightTailIDs - case v: StepLike => Seq(v.id) - } - ++ this.rightRoots.map(_.id) - ).distinct - } else { - this.rightTailIDs - } - - val newTailIDs = newLeftTailIDs ++ newRightTailIDs - val obsoleteIDs = (right.leftConnectors ++ this.PASSTHROUGHOutput) - .filterNot(v => newTailIDs.contains(v.id)) - .map(_.id) // if in the new TailIDs, cannot be deleted which causes not found error. 
- - val allSteps = (coll ++ right.coll).remove(obsoleteIDs: _*) - val newSteps = allSteps.connectAll(effectiveFromIDs, toIDs) - - val newHeadIDs = if (right.headExists) { - this.headIDs.toBuffer --= fromIDs ++= right.heads.flatMap { - case PASSTHROUGH => effectiveFromIDs - case v: StepLike => Seq(v.id) - } - } else { - this.headIDs - } - - val result = new DFD( - newSteps, - leftTailIDs = newLeftTailIDs, - rightTailIDs = newRightTailIDs, - headIDs = newHeadIDs.toSeq - ) - - result.validateOnSources() - result - } - - // no replicate - // ~> this <~ - // | - // ~> left <~ - // | - def composeImpl_<(fromIDs: Seq[String], left: DFDComponent): DFD = { - checkConnectivity_<(fromIDs, left) - val effectiveFromIDs = fromIDs.map(coll).filter(_ != PASSTHROUGH).map(_.id) - val toIDs = left.rightIntakes.map(_.id) - - // detached port should not have any tail removed - - val newLeftTailIDs: Seq[String] = if (left.headExists) { - ( - left.leftTails.flatMap { - case PASSTHROUGH => this.leftTailIDs - case v: StepLike => Seq(v.id) - } - ++ this.leftRoots.map(_.id) - ).distinct - } else { - this.leftTailIDs - } - val newRightTailIDs = ( - this.rightTails.flatMap { - case PASSTHROUGH => left.rightTailIDs - case v: StepLike => Seq(v.id) - } - ++ left.rightDetached.map(_.id) - ).distinct - - val newTailIDs = newLeftTailIDs ++ newRightTailIDs - val obsoleteIDs = (left.rightConnectors ++ this.PASSTHROUGHOutput) - .filterNot(v => newTailIDs.contains(v.id)) - .map(_.id) // if in the new TailIDs, cannot be deleted which causes not found error. - - val allSteps = (coll ++ left.coll).remove(obsoleteIDs: _*) - val newSteps = allSteps.connectAll(effectiveFromIDs, toIDs) - - val newHeadIDs: Seq[String] = if (left.headExists) { - this.headIDs.toBuffer.--=(fromIDs).toSeq ++ left.heads.flatMap { - case PASSTHROUGH => effectiveFromIDs - case v: StepLike => Seq(v.id) - } - } else { - this.headIDs - } - - val result = new DFD( - newSteps, - leftTailIDs = newLeftTailIDs, - rightTailIDs = newRightTailIDs, - headIDs = newHeadIDs.toSeq - ) - - result.validateOnSources() - result - } - - def compose_>(right: DFDComponent): DFD = composeImpl_>(this.fromIDs, right) - def compose(right: DFDComponent): DFD = compose_>(right) - def :>>(right: DFDComponent): DFD = compose_>(right) -// def >(right: FlowComponent) = compose_>(right) - - // TODO: fast-forward handling: if right is reused for many times, ensure that only the part that doesn't overlap with this got duplicated (conditional duplicate) - def mapHead_>(right: DFDComponent): DFD = { - - // checkConnectivity_>(fromIDs, right) - val firstResult: DFD = this.composeImpl_>(Seq(fromIDs.head), right) - - this.fromIDs.slice(1, Int.MaxValue).foldLeft(firstResult) { (flow, id) => - flow.composeImpl_>(Seq(id), right.replicate()) - } - } - def mapHead(right: DFDComponent): DFD = mapHead_>(right) - def :=>>(right: DFDComponent): DFD = mapHead_>(right) - - def compose_<(left: DFDComponent): DFD = composeImpl_<(this.fromIDs, left) - def <<:(left: DFDComponent): DFD = compose_<(left) -// def <(left: FlowComponent) = compose_<(left) - - def replicate(suffix: String = ""): DFDComponent - - def mapHead_<(left: DFDComponent): DFD = { - - // checkConnectivity_<(fromIDs, left) - val firstResult: DFD = this.composeImpl_<(Seq(fromIDs.head), left) - - this.fromIDs.slice(1, Int.MaxValue).foldLeft(firstResult) { (flow, id) => - flow.composeImpl_<(Seq(id), left.replicate()) - } - } - def <<=:(prev: DFDComponent): DFD = mapHead_<(prev) - - def union(another: DFDComponent): DFD = { - val result = DFD( - coll 
= this.coll UU another.coll, - leftTailIDs = (this.leftTailIDs ++ another.leftTailIDs).distinct, - rightTailIDs = (this.rightTailIDs ++ another.rightTailIDs).distinct, - headIDs = (this.headIDs ++ another.headIDs).distinct - ) - result.validateOnSources() - result - } - def U(another: DFDComponent): DFD = union(another) - - def append_>(right: DFDComponent): DFD = { - val intakes = right.leftIntakes - require(intakes.size <= 1, "non-linear right operand, please use compose_>, mapHead_> or union instead") - intakes.headOption match { - case Some(_) => - this.mapHead_>(right) - case _ => - this.union(right) - } - } - def append(right: DFDComponent): DFD = append_>(right) - def :->(right: DFDComponent): DFD = append_>(right) - - def append_<(left: DFDComponent): DFD = { - val intakes = left.rightIntakes - require(intakes.size <= 1, "non-linear left operand, please use compose_<, mapHead_< or union instead") - intakes.headOption match { - case Some(_) => - this.mapHead_<(left) - case _ => - this.union(left) - } - } - def <-:(left: DFDComponent): DFD = append_<(left) - - case class StepVisualWrapper( - override val self: StepLike, - showID: Boolean = true, - showInputs: Boolean = true, - showOutput: Boolean = true, - showPrefix: Boolean = true - ) extends StepWrapperLike(self) { - - def prefixes: Seq[String] = - if (showPrefix) { - val buffer = ArrayBuffer[String]() - if (DFDComponent.this.headIDs contains self.id) buffer += "HEAD" - // else { - val isLeftTail = DFDComponent.this.leftTailIDs contains self.id - val isRightTail = DFDComponent.this.rightTailIDs contains self.id - if (isLeftTail && isRightTail) buffer += "TAIL" - else { - if (isLeftTail) buffer += "TAIL>" - if (isRightTail) buffer += " v.stage.show(showID, showInputs, showOutput) - case v: Connector => "[" + v.id + "]" - } - } - - override def copy(self: StepLike): StepWrapperLike = - StepVisualWrapper(self, showID, showInputs, showOutput, showPrefix) - } - // TODO: not optimized, children are repeatedly created when calling .path - // TODO: use mapChildren to recursively get TreeNode[(Seq[String] -> Tree)] efficiently - case class ForwardNode( - wrapper: StepWrapperLike - ) extends StepTreeNode[ForwardNode] { - - // def prefix = if (this.children.nonEmpty) "v " - def prefix: String = - if (this.children.nonEmpty) "> " - else "> " - - override def nodeName: String = prefix + super.nodeName - - override val self: StepLike = wrapper.self - - override lazy val children: Seq[ForwardNode] = { - self.usageIDs - .map { id => - DFDComponent.this.coll(id) - } - .toList - .sortBy(_.name) - .map { v => - ForwardNode( - wrapper.copy( - v - ) - ) - } - } - } - - case class BackwardNode( - wrapper: StepWrapperLike - ) extends StepTreeNode[BackwardNode] { - - // def prefix = if (this.children.nonEmpty) "^ " - def prefix: String = - if (this.children.nonEmpty) "< " - else "< " - - override def nodeName: String = prefix + super.nodeName - - override val self: StepLike = wrapper.self - - override lazy val children: Seq[BackwardNode] = { - self.dependencyIDs - .map { id => - DFDComponent.this.coll(id) - } - .map(v => BackwardNode(wrapper.copy(v))) - } - } - - def disambiguateNames[T <: PipelineStage](ids_MultiPartNames: Map[String, Seq[String]]): Map[String, Seq[String]] = { - val ids_disambiguatedNames = ids_MultiPartNames - .groupBy(_._2) - .map { tuple => - val coNamed = tuple._2 - val revised: Map[String, Seq[String]] = if (coNamed.size > 1) { - coNamed.zipWithIndex.map { withIndex => - val id = withIndex._1._1 - val names = withIndex._1._2 - val 
lastName = names.last - val withSuffix = lastName + withIndex._2 - val namesWithSuffix = names.slice(0, names.size - 1) :+ withSuffix - id -> namesWithSuffix - }.toMap - } else coNamed - revised - } - .reduce(_ ++ _) - ids_disambiguatedNames - } - - // this operation IS stateful & destructive, any other options? - // TODO: should generate deep copy to become stateless - def propagateCols[T <: PipelineStage](compaction: PathCompaction): Unit = { - val ids_MultiPartNames = coll.mapValues(v => this.BackwardNode(StepVisualWrapper(v)).mergedPath) - - val lookup = compaction(ids_MultiPartNames.values.toSet) - val compactNames = lookup.values.toSeq - require(compactNames.size == compactNames.distinct.size) - - val ids_compactNames = ids_MultiPartNames.mapValues(lookup).toMap - val ids_disambiguatedNames = disambiguateNames(ids_compactNames) - val disambiguatedNames = ids_disambiguatedNames.values.toSeq - require(disambiguatedNames.size == disambiguatedNames.distinct.size) - - val ids_cols = ids_disambiguatedNames.mapValues(_.mkString("$")) - - this.coll.foreach { - case (_, step: Step) => - val stage = step.stage - - if (stage.hasOutputs) { - val outCol = ids_cols(step.id) - stage.setOutput(outCol) - } - - val inCols = step.dependencyIDs.map(ids_cols) - stage.setInputs(inCols) - case _ => // do nothing - } - } - - // algorithm that starts from tail and gradually append by exploring all directed edges, - // it only append steps that has all dependencies in the list - // it is fast and can be used whenever a new Flow is constructed and has typed sources. - def buildStagesImpl[T <: PipelineStage]( - compaction: PathCompaction = DFD.DEFAULT_COMPACTION, - fieldsEvidenceOpt: Option[Array[StructField]] = - None, // set this to make pipeline adaptive to df being transformed. - adaptation: SchemaAdaptation = DFD.DEFAULT_SCHEMA_ADAPTATION - ): Pipeline = { - propagateCols(compaction) - - val stageBuffer = ArrayBuffer[T]() - - val effectiveAdaptation = fieldsEvidenceOpt match { - case None => SchemaAdaptation.Force - case _ => adaptation - } - - // has to preserve order of insertion. - val queue: StepBuffer[String, StepLike] = effectiveAdaptation match { - case SchemaAdaptation.Force => - StepBuffer - .newBuilder[String, StepLike] - .++= { - sourceColl - } - .result() - case _ => - StepBuffer - .newBuilder[String, StepLike] - .++= { - fieldsEvidenceOpt.get.map { field => - val source = Source(field.name, dataTypes = Set(field.dataType)) - source.id -> source - } - } - .result() - } - - // if nonEmpty, validate sink in each iteration by performing a PipelineStage.transformSchema - var currentSchemaOpt: Option[StructType] = effectiveAdaptation match { - case _: SchemaAdaptation.TypeUnsafe => - None - case _ => - fieldsEvidenceOpt.map { fields => - new StructType(fields) - } - } - - val allSteps = this.coll.collect { - case (id: String, step: Step) => id -> step - } - val warehouse: StepBuffer[String, Step] = { - StepBuffer.newBuilder - .++= { - allSteps - } - .result() - } - - // has 2 resolutions: - // if dependency is fulfilled and pass the schema check, return it - // if dependency is fulfilled but but doesn't pass schema check (if any), do not return it/fail fast depending on adaptation - - def nextOptImpl(): Option[(String, Step)] = { - val candidate = warehouse.find { v => - if (v._2.dependencyIDs.forall(queue.contains)) { - - // schema validation here. 
- try { - currentSchemaOpt = currentSchemaOpt.map { schema => - v._2.stage.stage.transformSchema(schema) - } - true - } catch { - case e: Exception => - effectiveAdaptation match { - case _: SchemaAdaptation.FailOnInconsistentSchema => - throw e - case SchemaAdaptation.IgnoreIrrelevant => - false - case _ => - sys.error("impossible") - } - } - } else false - } - candidate - } - - var nextOpt = nextOptImpl() - while (nextOpt.nonEmpty) { - val next = nextOpt.get - - stageBuffer += next._2.stage.stage.asInstanceOf[T] - - warehouse -= next._1 - queue += next - - nextOpt = nextOptImpl() - } - - val result = new Pipeline() - .setStages(stageBuffer.toArray[PipelineStage]) - - effectiveAdaptation match { - case _: SchemaAdaptation.IgnoreIrrelevant => - // add assumed sources into fulfilled dependency list and try again to exhaust warehouse - queue ++= this.coll.collect { - case (id: String, src: Source) => id -> src - } - currentSchemaOpt = None // typeCheck no longer required - - nextOpt = nextOptImpl() - while (nextOpt.nonEmpty) { - val next = nextOpt.get - - warehouse -= next._1 - queue += next - - nextOpt = nextOptImpl() - } - case _ => - require( - warehouse.isEmpty, - s"Missing dependency:\n" + warehouse.values.map(_.stage).mkString("\n") - ) - } - - require( - warehouse.isEmpty, - "Cyclic pipeline stage dependency:\n" + warehouse.values.map(_.stage).mkString("\n") - ) - - result - } - - def build( - compaction: PathCompaction = DFD.DEFAULT_COMPACTION, - fieldsEvidence: Array[StructField] = null, // set this to make pipeline adaptive to df being transformed. - schemaEvidence: StructType = null, // set this to make pipeline adaptive to df being transformed. - dfEvidence: DataFrame = null, // set this to make pipeline adaptive to df being transformed. - adaptation: SchemaAdaptation = DFD.DEFAULT_SCHEMA_ADAPTATION - ): Pipeline = { - - buildStagesImpl[PipelineStage]( - compaction, - Option(fieldsEvidence) - .orElse { - Option(schemaEvidence).map(_.fields) - } - .orElse { - Option(dfEvidence).map(_.schema.fields) - }, - adaptation - ) - } - - def buildModel( - compaction: PathCompaction = DFD.DEFAULT_COMPACTION, - fieldsEvidence: Array[StructField] = null, // set this to make pipeline adaptive to df being transformed. - schemaEvidence: StructType = null, // set this to make pipeline adaptive to df being transformed. - dfEvidence: DataFrame = null, // set this to make pipeline adaptive to df being transformed. - adaptation: SchemaAdaptation = DFD.DEFAULT_SCHEMA_ADAPTATION - ): PipelineModel = { - - coll.foreach { - case (_, v: Step) => require(v.stage.stage.isInstanceOf[Transformer]) - case _ => - } - - val pipeline = buildStagesImpl[Transformer]( - compaction, - Option(fieldsEvidence) - .orElse { - Option(schemaEvidence).map(_.fields) - } - .orElse { - Option(dfEvidence).map(_.schema.fields) - }, - adaptation - ) - - new PipelineModel(pipeline.uid, pipeline.getStages.map(_.asInstanceOf[Transformer])) - .setParent(pipeline) - } - - // preemptive buildStage with type safety check, always fail fast - // use to validate in FlowComponent Constructor and fail early. - // stateless, replicate self before applying propagateCols, stateful changes are discarded. 
- protected def validateOnSchema(fieldsEvidence: Array[StructField]): Unit = { - this - .replicate() - .buildStagesImpl[PipelineStage]( - DFD.COMPACTION_FOR_TYPECHECK, - fieldsEvidenceOpt = Some(fieldsEvidence), - adaptation = DFD.SCHEMA_ADAPTATION_FOR_TYPECHECK - ) - } - - protected def validateOnSources(): Unit = { - val fields: List[Set[StructField]] = this.sourceColl - .filter(_._2.dataTypes.nonEmpty) - .values - .map { source => - source.dataTypes.map(t => StructField(source.name, t)) - } - .toList - - val cartesian: Set[List[StructField]] = DSLUtils.cartesianProductSet(fields) - val schemas = cartesian.map(v => new StructType(v.toArray)) - schemas.foreach { schema => - if (schema.fields.nonEmpty) validateOnSchema(schema.fields) - } - } - - def showForwardTree( - tails: Seq[StepLike], - showID: Boolean, - showInputs: Boolean, - showOutput: Boolean, - showPrefix: Boolean - ): String = { - tails - .map { tail => - val prettyTail = StepVisualWrapper(tail, showID, showInputs, showOutput, showPrefix) - val treeNode = ForwardNode(prettyTail) - treeNode.treeString(verbose = false) - } - .mkString("") - } - - def showBackwardTree( - heads: Seq[StepLike], - showID: Boolean, - showInputs: Boolean, - showOutput: Boolean, - showPrefix: Boolean - ): String = { - heads - .map { head => - val prettyHead = StepVisualWrapper(head, showID, showInputs, showOutput, showPrefix) - val treeNode = BackwardNode(prettyHead) - treeNode.treeString(verbose = false) - } - .mkString("") - } - - final protected val mirrorImgs: List[(Char, Char)] = List( - 'v' -> '^', - '┌' -> '└', - '┘' -> '┐', - '┬' -> '┴' - ) - - protected def flipChar(char: Char): Char = { - mirrorImgs.find(_._1 == char).map(_._2).getOrElse { - mirrorImgs.find(_._2 == char).map(_._1).getOrElse { - char - } - } - } - - final protected val layoutPrefs: LayoutPrefsImpl = LayoutPrefsImpl(unicode = true, explicitAsciiBends = false) - - def showASCIIArt( - showID: Boolean = true, - showInputs: Boolean = true, - showOutput: Boolean = true, - showPrefix: Boolean = true, - forward: Boolean = true - ): String = { - - val prettyColl = coll.mapValues { v => - StepVisualWrapper(v, showID, showInputs, showOutput, showPrefix) - } - - val vertices: Set[StepVisualWrapper] = prettyColl.values.toSet - val edges: List[(StepVisualWrapper, StepVisualWrapper)] = prettyColl.values.toList.flatMap { v => - v.self.usageIDs.map(prettyColl).map(vv => v -> vv) - } - val graph: Graph[StepVisualWrapper] = Graph[StepVisualWrapper](vertices = vertices, edges = edges) - - val forwardStr = GraphLayout.renderGraph(graph, layoutPrefs = layoutPrefs) - if (forward) forwardStr - else { - forwardStr - .split('\n') - .reverse - .mkString("\n") - .map(flipChar) - } - } - - def show( - showID: Boolean = true, - showInputs: Boolean = true, - showOutput: Boolean = true, - showPrefix: Boolean = true, - forward: Boolean = true, - asciiArt: Boolean = false, - compactionOpt: Option[PathCompaction] = Some(DFD.DEFAULT_COMPACTION) - ): String = { - compactionOpt.foreach(this.propagateCols) - - if (!asciiArt) { - if (forward) { - - "\\ left >\n" + showForwardTree(leftTails, showID, showInputs, showOutput, showPrefix) + - "/ right <\n" + showForwardTree(rightTails, showID, showInputs, showOutput, showPrefix) - } else { - - showBackwardTree(this.heads, showID, showInputs, showOutput, showPrefix) - } - } else { - showASCIIArt(showID, showInputs, showOutput, showPrefix, forward) - } - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/DynamicParamsMixin.scala 
b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/DynamicParamsMixin.scala deleted file mode 100644 index 543d4c8a9..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/DynamicParamsMixin.scala +++ /dev/null @@ -1,76 +0,0 @@ -package org.apache.spark.ml.dsl - -import ai.acyclic.prover.commons.debug.Debug.CallStackRef -import com.tribbloids.spookystuff.relay.{MessageMLParam, Relay} -import org.apache.spark.ml.dsl.utils.DSLUtils -import com.tribbloids.spookystuff.relay.io.FallbackSerializer -import org.apache.spark.ml.param.{Param, Params} -import org.json4s.Formats - -import scala.language.{dynamics, implicitConversions} -import scala.reflect.ClassTag - -/** - * Created by peng on 10/04/16. - */ -trait DynamicParamsMixin extends Params with Dynamic { - - implicit protected def unwrap[T](v: Param[T]): T = this.getOrDefault(v) - - def applyDynamic(methodName: String)(args: Any*): this.type = { - - if (methodName.startsWith("set")) { - assert(args.length == 1) - val arg = args.head - - val fieldName = methodName.stripPrefix("set") - val fieldOption = - this.params.find(v => (v.name == fieldName) || (DSLUtils.liftCamelCase(v.name) == fieldName)) - - fieldOption match { - case Some(field) => - set(field.asInstanceOf[Param[Any]], arg) - case None => - throw new IllegalArgumentException(s"parameter $fieldName doesn't exist") - // dynamicParams.put(fieldName, arg) - } - - this - } else throw new IllegalArgumentException(s"function $methodName doesn't exist") - } - - protected def Param[T: ClassTag]( - name: String = CallStackRef.here.pop { v => - v.isArgDefault || v.isLazyCompute - }.fnName, - doc: String = "Pending ...", - default: T = null - ): Param[T] = { - - val result = new Param[T](this, name, doc) - - Option(default).foreach(v => this.setDefault(result, v)) - - result - } - - protected def GenericParam[T: Manifest]( - name: String = CallStackRef.here.pop { v => - v.isArgDefault || v.isLazyCompute - }.fnName, - doc: String = "Pending ...", - default: T = null - ): Param[T] = { - - val reader = new Relay.ToSelf[T]() { - - override def fallbackFormats: Formats = super.fallbackFormats + FallbackSerializer - } - - val result: MessageMLParam[T] = reader.Param(this, name, doc) - - Option(default).foreach(v => this.setDefault(result, v)) - - result - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/PipelineModelShim.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/PipelineModelShim.scala deleted file mode 100644 index cdf4dc4cd..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/PipelineModelShim.scala +++ /dev/null @@ -1,17 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.ml.{PipelineModel, Transformer} - -/** - * Created by peng on 11/04/16. - */ -object PipelineModelShim { - - def create( - stages: Array[Transformer], - uid: String = Identifiable.randomUID(classOf[PipelineModel].getSimpleName) - ): PipelineModel = new PipelineModel(uid, stages) - - def apply(stages: Transformer*): PipelineModel = create(stages.toArray) -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/SchemaAdaptation.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/SchemaAdaptation.scala deleted file mode 100644 index 47708aa82..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/SchemaAdaptation.scala +++ /dev/null @@ -1,28 +0,0 @@ -package org.apache.spark.ml.dsl - -/** - * Created by peng on 29/04/16. 
- */ -sealed abstract class SchemaAdaptation - -object SchemaAdaptation { - - // disable schema validations ( e.g. Transformer.transformSchema) - sealed trait TypeUnsafe extends SchemaAdaptation - - sealed trait FailOnInconsistentSchema extends SchemaAdaptation - sealed trait FailOnNonExistingInputCol extends SchemaAdaptation - - object FailFast extends FailOnInconsistentSchema with FailOnNonExistingInputCol - object FailFast_TypeUnsafe extends FailOnNonExistingInputCol with TypeUnsafe - - // allow incomplete output - sealed abstract class IgnoreIrrelevant extends SchemaAdaptation - - object IgnoreIrrelevant extends IgnoreIrrelevant - object IgnoreIrrelevant_TypeUnsafe extends IgnoreIrrelevant with TypeUnsafe - - object IgnoreIrrelevant_ValidateSchema extends IgnoreIrrelevant with FailOnInconsistentSchema - - object Force extends TypeUnsafe -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/ShimViews.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/ShimViews.scala deleted file mode 100644 index a47dc7ba4..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/ShimViews.scala +++ /dev/null @@ -1,30 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.ml.param.Params -import org.apache.spark.ml.param.shared.{HasInputCol, HasInputCols, HasOutputCol} - -object ShimViews { - - implicit class ParamsView(params: Params) { - def trySetInputCols(v: Seq[String]): Params = { - params match { - case s: HasInputCol => - require(v.size == 1, s"${s.getClass.getSimpleName} can only have 1 inputCol") - s.set(s.inputCol, v.head) - case ss: HasInputCols => - ss.set(ss.inputCols, v.toArray) - case _ => - params - } - } - - def trySetOutputCol(v: String): Params = { - params match { - case s: HasOutputCol => - s.set(s.outputCol, v) - case _ => - params - } - } - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepGraph.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepGraph.scala deleted file mode 100644 index 8c87c9ec6..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepGraph.scala +++ /dev/null @@ -1,198 +0,0 @@ -package org.apache.spark.ml.dsl - -import scala.collection.mutable -import scala.language.implicitConversions - -trait StepGraph { - - def coll: StepMap[String, StepLike] - - def sourceColl: StepMap[String, Source] = { - this.coll.collect { - case (id: String, src: Source) => id -> src - } - } - - // generate new copy for each PipelineStage in this collection to - // prevent one set of parameters (particularly InputCol & OutputCol) being used in multiple steps in the pipeline, - // and attempts to set them interfere with each other - def replicate(suffix: String = ""): StepGraph - - protected def replicateColl( - idConversion: mutable.Map[String, String] = mutable.Map[String, String](), - suffix: String = "", - condition: ((String, StepLike)) => Boolean = { _: (String, StepLike) => - true - } // TODO: use it! 
- ): StepMap[String, StepLike] = { - - val replicatedSteps = coll.map { tuple => - val step = tuple._2.replicate(suffix) - idConversion += (tuple._1 -> step.id) - step - } - - val newStepList = replicatedSteps.map { step => - step.wth( - dependencyIDs = step.dependencyIDs.map(idConversion), - usageIDs = step.usageIDs.map(idConversion) - ) - }.toSeq - - val newSteps: StepMap[String, StepLike] = StepMap(newStepList.map { step => - step.id -> step - }: _*) - newSteps - } - - def connect(fromID: String, toID: String): StepMap[String, StepLike] = { - val from = coll(fromID) - val to = coll(toID) - - require(from != PASSTHROUGH) - require(to != PASSTHROUGH) - - val updatedFrom = from.wth(usageIDs = from.usageIDs + toID) - val updatedTo = to.wth(dependencyIDs = to.dependencyIDs :+ fromID) - val updatedSteps = coll ++ Seq(fromID -> updatedFrom, toID -> updatedTo) - updatedSteps - } - - // TODO: optimize - def connectAll(fromIDs: Seq[String], toIDs: Seq[String]): StepMap[String, StepLike] = { - var result = coll - for ( - i <- fromIDs; - j <- toIDs - ) { - result = result.connect(i, j) - } - result - } - - def cutInputs(id: String): StepMap[String, StepLike] = { - val step = coll(id) - val inSteps = step.dependencyIDs.map(coll) - coll + - (id -> step.wth(dependencyIDs = Nil)) ++ - inSteps.map(in => in.id -> in.wth(usageIDs = in.usageIDs - id)) - } - - def cutOutputs(id: String): StepMap[String, StepLike] = { - val step = coll(id) - val outSteps = step.usageIDs.map(coll) - coll + - (id -> step.wth(usageIDs = Set.empty)) ++ - outSteps.map(out => out.id -> out.wth(dependencyIDs = out.dependencyIDs.toBuffer.-=(id).toSeq)) - } - - def remove1(id: String): StepMap[String, StepLike] = { - this.cutInputs(id).cutOutputs(id) - id - } - - def remove(ids: String*): StepMap[String, StepLike] = ids.foldLeft(coll) { (coll, id) => - coll.remove1(id) - } - - protected def unionImpl(coll2: StepMap[String, StepLike]): StepMap[String, StepLike] = { - val allSteps = coll ++ coll2 - val result: StepMap[String, StepLike] = StepMap[String, StepLike](allSteps.mapValues { step => - val id = step.id - step.wth( - dependencyIDs = (coll.get(id) ++ coll2.get(id)).map(_.dependencyIDs).reduce(_ ++ _).distinct, - usageIDs = (coll.get(id) ++ coll2.get(id)).map(_.usageIDs).reduce(_ ++ _) - ) - }.toSeq: _*) - result - } - - def UU(another: StepMap[String, StepLike]): StepMap[String, StepLike] = unionImpl(another) - - implicit def stepsToView(steps: StepMap[String, StepLike]): StepMapView = new StepMapView(steps) -} - -object StepGraph { - - trait MayHaveTails extends StepGraph { - - def leftTailIDs: Seq[String] - final lazy val leftTails: Seq[StepLike] = leftTailIDs.map(coll) - final lazy val leftConnectors: Seq[Connector] = leftTails.collect { - case v: Connector => v - } - - // root: has no src itself & is not a right tail - final lazy val leftRoots: Seq[StepLike] = leftTails.collect { - case v if v.dependencyIDs.isEmpty && (!rightTails.contains(v)) => v - } - - // detached: a source that has no target, it is a tail but already end of the lineage - // always a source - final lazy val leftDetached: Seq[Source] = leftTails.collect { - case v: Source if v.usageIDs.isEmpty => v - } - - // intake: if tail is a source (rather than a step) go 1 step ahead to reach the real step - // always a step - final lazy val leftIntakes: Seq[Step] = leftTails.flatMap { - case tail: Step => - Seq(tail) - case source: Source => - source.usageIDs.map(coll).map(_.asInstanceOf[Step]) - case PASSTHROUGH => Nil - } - - final def canConnectFromLeft: Boolean 
= leftIntakes.nonEmpty || leftTails.contains(PASSTHROUGH) - - def rightTailIDs: Seq[String] - final lazy val rightTails: Seq[StepLike] = rightTailIDs.map(coll) - final lazy val rightConnectors: Seq[Connector] = rightTails.collect { - case v: Connector => v - } - final lazy val rightRoots: Seq[StepLike] = rightTails.collect { - case v if v.dependencyIDs.isEmpty && (!leftTails.contains(v)) => v - } - final lazy val rightDetached: Seq[Source] = rightTails.collect { - case v: Source if v.usageIDs.isEmpty => v - } - final lazy val rightIntakes: Seq[Step] = rightTails.flatMap { - case tail: Step => - Seq(tail) - case source: Source => - source.usageIDs.map(coll).map(_.asInstanceOf[Step]) - case PASSTHROUGH => Nil - } - - final def canConnectFromRight: Boolean = rightIntakes.nonEmpty || rightTails.contains(PASSTHROUGH) - - def tailIDs: Seq[String] = leftTailIDs ++ rightTailIDs - def tails: Seq[StepLike] = leftTails ++ rightTails - } - - trait MayHaveHeads extends StepGraph { - - def headIDs: Seq[String] - def fromIDs: Seq[String] = headIDs - def headExists: Boolean = headIDs.nonEmpty - - final lazy val heads: Seq[StepLike] = headIDs.map(coll) - final lazy val PASSTHROUGHOutput: Option[Connector] = heads.find(_ == PASSTHROUGH) map (_.asInstanceOf[Connector]) - final lazy val hasPASSTHROUGHOutput: Boolean = heads.contains(PASSTHROUGH) - // all heads must have outIDs - - // TODO: separate outlet (head with outIDs) with head, which should simply denotes end of a pipe - heads.foreach(v => require(v.canBeHead)) - - protected def checkConnectivity_>(fromIDs: Seq[String], right: MayHaveTails): Unit = { - val froms: Seq[StepLike] = fromIDs.map(coll) - require(froms.nonEmpty, "has no from") - require(right.canConnectFromLeft, "has no left intake") - } - - protected def checkConnectivity_<(fromIDs: Seq[String], left: MayHaveTails): Unit = { - val froms = fromIDs.map(coll) - require(froms.nonEmpty, "has no from") - require(left.canConnectFromRight, "has no right intake") - } - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepLike.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepLike.scala deleted file mode 100644 index 032eb7e99..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepLike.scala +++ /dev/null @@ -1,259 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.ml.PipelineStage -import com.tribbloids.spookystuff.relay.{MessageAPI, Relay, TreeIR} -import org.apache.spark.ml.param.{ParamPair, Params} -import org.apache.spark.sql.ColumnName -import org.apache.spark.sql.types.DataType -import org.apache.spark.util.Utils -import org.json4s.JsonAST.JObject -import org.json4s.jackson.JsonMethods.{compact, parse, pretty, render} -import org.json4s.{JArray, JBool, JDecimal, JDouble, JInt, JNull, JString, JValue} - -import scala.collection.mutable -import scala.util.Try -import scala.collection.immutable.ListMap - -/** - * Created by peng on 24/04/16. 
- */ -trait StepLike extends DFDComponent { - - def id: String - def name: String - - override def coll: ListMap[String, StepLike] = StepMap(id -> this) - - override def replicate(suffix: String = ""): StepLike - - // TODO: generalized into Map[Param, Seq[String]] - def dependencyIDs: Seq[String] - - // unlike inIDs, sequence of outIDs & parameter types (if not InputCol(s)) are not important - def usageIDs: Set[String] - def canBeHead: Boolean - - if (!canBeHead) assert(usageIDs.isEmpty) - - def wth(dependencyIDs: Seq[String] = dependencyIDs, usageIDs: Set[String] = usageIDs): StepLike - - override def headIDs: Seq[String] = - if (canBeHead) Seq(id) - else Nil - - override def leftTailIDs: Seq[String] = Seq(id) - - override def rightTailIDs: Seq[String] = Seq(id) -} - -object Step extends Relay.<<[Step] { - - val paramMap: Option[JValue] = None - - override def toMessage_>>(v: Step): IR_>> = { - import org.json4s.JsonDSL._ - import v._ - - val instance = stage.stage - val params = instance.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]] - val jsonParams: JValue = paramMap.getOrElse( - render( - params.map { - case ParamPair(p, vv) => - p.name -> parse(p.jsonEncode(vv)) - }.toList - ) - ) - - TreeIR - .leaf( - Msg( - id, - stage.name, - stage.tags, - stage.outputColOverride, - instance.getClass.getCanonicalName, - Some(instance.uid), - params = Some(jsonParams) - ) - ) - } - - case class Msg( - id: String, - name: String, - tag: Set[String], - forceOutput: Option[String], - implementation: String, - uid: Option[String] = None, - params: Option[JValue] = None - ) extends MessageAPI.<< { - - override lazy val toProto_<< : Step = { - - val cls = Utils.classForName(implementation) - val instance = cls.getConstructor(classOf[String]).newInstance(uid.toSeq: _*).asInstanceOf[PipelineStage] - getAndSetParams(instance, params.getOrElse(JNull)) - - val stage = NamedStage( - instance, - name, - tag, - forceOutput, - id - ) - - Step(stage) - } - - // TODO: can we merge this into Relay? 
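The relay above leans entirely on Spark's own per-parameter JSON codec: toMessage_>> stores each ParamPair with Param.jsonEncode, and the getAndSetParams helper that follows restores values by looking each param up by name and calling Param.jsonDecode. A self-contained sketch of that round trip on an ordinary stage (Tokenizer and the column names are arbitrary choices for the example):

import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.param.{ParamMap, ParamPair}

val original = new Tokenizer().setInputCol("text").setOutputCol("tokens")

// encode every param to (name -> json) using Spark's own codec
val encoded: Seq[(String, String)] =
  original.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]].map { pp =>
    pp.param.name -> pp.param.jsonEncode(pp.value)
  }

// decode by resolving each param by name on a fresh instance, then apply via ParamMap
val fresh = new Tokenizer()
val decoded = encoded.map { case (name, json) =>
  val param = fresh.getParam(name)
  ParamPair(param, param.jsonDecode(json))
}
val restored = fresh.copy(ParamMap(decoded: _*))
assert(restored.getInputCol == "text" && restored.getOutputCol == "tokens")
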
- def getAndSetParams(instance: Params, params: JValue): Unit = { -// implicit val format = Xml.defaultFormats - params match { - case JObject(pairs) => - pairs.foreach { - case (paramName, jsonValue) => - val param = instance.getParam(paramName) - val valueTry = Try { - param.jsonDecode(compact(render(jsonValue))) - }.orElse { - Try { - param.jsonDecode(compact(render(JArray(List(jsonValue))))) - } - } - - val value = jsonValue match { - case js: JString => - valueTry - .orElse { - Try { - param.jsonDecode(compact(render(JInt(js.values.toLong)))) - } - } - .orElse { - Try { - param.jsonDecode(compact(render(JDouble(js.values.toDouble)))) - } - } - .orElse { - Try { - param.jsonDecode(compact(render(JDecimal(js.values.toDouble)))) - } - } - .orElse { - Try { - param.jsonDecode(compact(render(JBool(js.values.toBoolean)))) - } - } - .get - case _ => - valueTry.get - } - - instance.set(param, value) - } - case _ => - throw new IllegalArgumentException(s"Cannot recognize JSON metadata:\n ${pretty(params)}.") - } - } - } -} - -case class Step( - stage: NamedStage, - dependencyIDs: Seq[String] = Seq(), - usageIDs: Set[String] = Set.empty -) extends StepLike { - - { - assert(this.id != PASSTHROUGH.id) - assert(!this.dependencyIDs.contains(PASSTHROUGH.id)) - assert(!this.usageIDs.contains(PASSTHROUGH.id)) - } - - override def canBeHead: Boolean = stage.hasOutputs - - val replicas: mutable.Set[Step] = mutable.Set.empty - - override def replicate(suffix: String = ""): Step = { - val replica = stage.replicate - val newStage = replica.copy(name = replica.name + suffix, outputColOverride = replica.outputColOverride.map(_ + "")) - - val result = this.copy( - stage = newStage - ) - this.replicas += result - result - } - - def id: String = stage.id - def name: String = stage.name - - def recursiveReplicas: Set[Step] = { - val set = this.replicas.toSet - set ++ set.flatMap(_.recursiveReplicas) - } - - override def wth(inputIDs: Seq[String], outputIDs: Set[String]): Step = this.copy( - dependencyIDs = inputIDs, - usageIDs = outputIDs - ) -} - -abstract class StepWrapperLike(val self: StepLike) { - - def copy(self: StepLike = self): StepWrapperLike -} - -case class SimpleStepWrapper(override val self: StepLike) extends StepWrapperLike(self) { - - override def copy(self: StepLike): StepWrapperLike = SimpleStepWrapper(self) -} - -trait Connector extends StepLike - -case class Source( - name: String, - dataTypes: Set[DataType] = - Set.empty, // used to validate & fail early when stages for different data types are appended. 
- usageIDs: Set[String] = Set.empty -) extends ColumnName(name) - with Connector { - - { - assert(this.id != PASSTHROUGH.id) - assert(!this.usageIDs.contains(PASSTHROUGH.id)) - } - - override def dependencyIDs: Seq[String] = Nil - - override def canBeHead: Boolean = true - - override def replicate(suffix: String = ""): Source = this - - def id: String = name - - override def wth(inputIDs: Seq[String], outputIDs: Set[String]): Source = { - this.copy( - usageIDs = outputIDs - ) - } - - override def toString: String = "'" + name -} - -case object PASSTHROUGH extends Connector { - - override def name: String = this.getClass.getSimpleName.stripSuffix("$") - - override val id: String = name // unique & cannot be referenced by others - - def dependencyIDs: Seq[String] = Nil - def usageIDs: Set[String] = Set.empty - - override def wth(inIDs: Seq[String], outIDs: Set[String]): this.type = this - - override def canBeHead: Boolean = true - - override def replicate(suffix: String = ""): this.type = this -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepMapView.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepMapView.scala deleted file mode 100644 index 85eae08d3..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepMapView.scala +++ /dev/null @@ -1,9 +0,0 @@ -package org.apache.spark.ml.dsl - -class StepMapView(val coll: StepMap[String, StepLike]) extends StepGraph { - - // generate new copy for each PipelineStage in this collection to - override def replicate(suffix: String = ""): StepMapView = new StepMapView( - coll = this.replicateColl(suffix = suffix) - ) -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepTreeNode.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepTreeNode.scala deleted file mode 100644 index 760bb85a7..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/StepTreeNode.scala +++ /dev/null @@ -1,77 +0,0 @@ -package org.apache.spark.ml.dsl - -import com.tribbloids.spookystuff.relay.{MessageAPI, Relay, TreeIR} -import com.tribbloids.spookystuff.tree.TreeView -import org.apache.spark.sql.utils.DataTypeRelay - -trait StepTreeNode[BaseType <: StepTreeNode[BaseType]] extends TreeView.Immutable[StepTreeNode[BaseType]] { - - val self: StepLike - - lazy val paths: Seq[Seq[String]] = { - val rootPath = Seq(self.name) - if (children.nonEmpty) { - children.flatMap { child => - child.paths.map(_ ++ rootPath) - } - } else Seq(rootPath) - } - - lazy val mergedPath: Seq[String] = { - - val numPaths = paths.map(_.size) - assert(numPaths.nonEmpty, "impossible") - val result = { - val maxBranchLength = numPaths.max - val commonAncestorLength = maxBranchLength - .to(0, -1) - .find { v => - paths.map(_.slice(0, v)).distinct.size == 1 - } - .getOrElse(0) - val commonAncestor = paths.head.slice(0, commonAncestorLength) - - val commonParentLength = maxBranchLength - .to(0, -1) - .find { v => - paths.map(_.reverse.slice(0, v)).distinct.size == 1 - } - .getOrElse(0) - val commonParent = paths.head.reverse.slice(0, commonParentLength).reverse - - if (commonAncestor.size + commonParent.size > maxBranchLength) commonParent - else commonAncestor ++ commonParent - } - result - } -} - -object StepTreeNode extends Relay.<<[StepTreeNode[_]] { - - override def toMessage_>>(v: StepTreeNode[_]): IR_>> = { - val base = v.self match { - case source: Source => - Msg( - source.id, - dataTypes = source.dataTypes - .map(DataTypeRelay.toMessageBody) - ) - case _ => - Msg(v.self.id) - } - TreeIR - .leaf( - 
base.copy( - stage = v.children.map(this.toMessageBody) - ) - ) - } - - case class Msg( - id: String, - dataTypes: Set[DataTypeRelay.Msg] = Set.empty, - stage: Seq[Msg] = Nil - ) extends MessageAPI.<< { - override def toProto_<< : StepTreeNode[_] = ??? - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/TrieNode.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/TrieNode.scala deleted file mode 100644 index fd0df5505..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/TrieNode.scala +++ /dev/null @@ -1,88 +0,0 @@ -package org.apache.spark.ml.dsl - -import com.tribbloids.spookystuff.tree.TreeView - -import scala.annotation.tailrec - -/** - * Can be compressed into radix tree - */ -object TrieNode { - - private def buildChildren[K, V]( - map: Iterable[(Seq[K], V)], - prefix: Seq[K] = Seq(), - depth: Int = 0 - ): Seq[TrieNode[K, Option[V]]] = { - - val grouped = map.groupBy(_._1.head).toSeq.sortBy(v => "" + v._1) - val result = grouped.map { triplet => - val key = prefix ++ Seq(triplet._1) - val value = map.toMap.get(Seq(triplet._1)) - val children = buildChildren[K, V]( - triplet._2 - .map(tuple => tuple._1.slice(1, Int.MaxValue) -> tuple._2) - .filter(_._1.nonEmpty), - key, - depth + 1 - ) - TrieNode(key, value, children, depth + 1) - } - result - } - - def build[K, V](map: Iterable[(Seq[K], V)]): TrieNode[K, Option[V]] = { - TrieNode( - key = Nil, - value = map.toMap.get(Nil), - children = buildChildren(map), - 0 - ) - } -} - -case class TrieNode[K, V]( - key: Seq[K], - value: V, - children: Seq[TrieNode[K, V]], - depth: Int -) extends TreeView[TrieNode[K, V]] { - - @tailrec - final def lastSingleDescendant: TrieNode[K, V] = - if (this.children.size == 1) this.children.head.lastSingleDescendant - else this - - def compact: TrieNode[K, V] = { - this.transform { - case vv if vv.children.size == 1 => - vv.lastSingleDescendant - } - } - - def pruneUp: TrieNode[K, V] = { - this.transform { - case vv if vv.children.size == 1 => - vv.copy(children = vv.children.map { v => - v.copy[K, V](key = vv.key).pruneUp - }) - case vv if vv.children.size > 1 => - vv.copy(children = vv.children.map { v => - v.copy[K, V](key = vv.key ++ v.key.lastOption).pruneUp - }) - } - } - - def rebuildDepth(i: Int = 0): TrieNode[K, V] = { - this.copy( - depth = i, - children = this.children.map { child => - child.rebuildDepth(i + 1) - } - ) - } - - override protected def withNewChildrenInternal(newChildren: IndexedSeq[TrieNode[K, V]]): TrieNode[K, V] = { - this.copy(children = newChildren) - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/UDFTransformer.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/UDFTransformer.scala deleted file mode 100644 index 1681ffc2a..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/UDFTransformer.scala +++ /dev/null @@ -1,73 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCol} -import org.apache.spark.ml.param.{Param, ParamMap} -import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} -import org.apache.spark.sql.expressions.{SparkUserDefinedFunction, UserDefinedFunction} -import org.apache.spark.sql.types.{DataType, StructField, StructType} -import org.apache.spark.sql.{DataFrame, Dataset} - -abstract class UDFTransformerLike extends Transformer with HasOutputCol with DynamicParamsMixin { - - def 
udfImpl: UserDefinedFunction - - def setUDFSafely(_udfImpl: UserDefinedFunction): UDFTransformerLike.this.type = { - this.setUDF(_udfImpl) - } - - def getInputCols: Array[String] - - import org.apache.spark.sql.functions._ - - override def transform(dataset: Dataset[_]): DataFrame = { - val newCol = udfImpl( - (getInputCols: Array[String]) - .map(v => col(v)): _* - ) - - val result = dataset.withColumn(outputCol, newCol) - result - } - - lazy val outDataType: DataType = udfImpl match { - case v: SparkUserDefinedFunction => - v.dataType - case _ => - throw new UnsupportedOperationException(s"$udfImpl is not a SparkUserDefinedFunction") - } - - @DeveloperApi - override def transformSchema(schema: StructType): StructType = { - StructType(schema.fields :+ StructField(getOutputCol, outDataType, nullable = true)) - } -} - -object UDFTransformer extends DefaultParamsReadable[UDFTransformer] { - - def apply(udf: UserDefinedFunction): UDFTransformer = new UDFTransformer().setUDFSafely(udf) - - override def load(path: String): UDFTransformer = super.load(path) -} - -/** - * Created by peng on 09/04/16. TODO: use UDF registry's name as uid & name - */ -case class UDFTransformer( - uid: String = Identifiable.randomUID("udf") -) extends UDFTransformerLike - with HasInputCols - with DefaultParamsWritable { - - lazy val UDF: Param[UserDefinedFunction] = GenericParam[UserDefinedFunction]() - def udfImpl: UserDefinedFunction = UDF - - override def copy(extra: ParamMap): Transformer = this.defaultCopy(extra) - - @DeveloperApi - override def transformSchema(schema: StructType): StructType = { - StructType(schema.fields :+ StructField(outputCol, outDataType, nullable = true)) - } - -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/package.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/package.scala deleted file mode 100644 index ae5398eee..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/package.scala +++ /dev/null @@ -1,24 +0,0 @@ -package org.apache.spark.ml - -import scala.collection.immutable.ListMap -import scala.collection.mutable - -/** - * Created by peng on 10/04/16. 
- */ -package object dsl { - - type NamedStage = AbstractNamedStage[PipelineStage] - val NamedStage: AbstractNamedStage.type = AbstractNamedStage - - type StepMap[A, B] = ListMap[A, B] - val StepMap: ListMap.type = ListMap - - type StepBuffer[A, B] = scala.collection.mutable.LinkedHashMap[A, B] - val StepBuffer: mutable.LinkedHashMap.type = scala.collection.mutable.LinkedHashMap - - type MultiPartCompaction[V] = Set[Seq[V]] => Map[Seq[V], Seq[V]] - - type PathCompaction = MultiPartCompaction[String] - -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/ClassOpsMixin.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/ClassOpsMixin.scala new file mode 100644 index 000000000..e8b9b9e2f --- /dev/null +++ b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/ClassOpsMixin.scala @@ -0,0 +1,33 @@ +package org.apache.spark.ml.dsl.utils + +import scala.language.implicitConversions + +trait ClassOpsMixin {} + +object ClassOpsMixin { + + implicit def toClassOps[T](self: Class[T]): ClassOps[T] = { + // it will be automatically included in scope when working on Class[_ <: ClassOpsMixin] + // see __ImplicitSearchOrder + + ClassOpsMixin.ClassOps(self) + } + + case class ClassOps[T](self: Class[T]) { + + lazy val simpleName_Scala: String = { + + // TODO: need to decode to Scala name instead of JVM name + self.getSimpleName + .stripSuffix("$") + .split('$') + .filter(_.nonEmpty) + .head + + } + } + + object ClassOps { + implicit def unbox[T](self: ClassOps[T]): Unit = self.self + } +} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/EnumMixin.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/EnumMixin.scala deleted file mode 100644 index fa59d48b2..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/EnumMixin.scala +++ /dev/null @@ -1,6 +0,0 @@ -package org.apache.spark.ml.dsl.utils - -trait EnumMixin extends ObjectSimpleNameMixin with Serializable { - - override def toString: String = objectSimpleName -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/LazyVar.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/LazyVar.scala index 2213eac0c..abe7d7d10 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/LazyVar.scala +++ b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/LazyVar.scala @@ -16,7 +16,7 @@ class LazyVar[T]( ) extends Serializable with EqualBy { - protected val cached: T `?` Var = None + @volatile protected var cached: T ?? 
_ = null.asInstanceOf[T] def peek: Option[T] = cached.asOption @@ -34,7 +34,7 @@ class LazyVar[T]( } def :=(v: T): Unit = { - cached := v + cached = v } def isCached: Boolean = cached.asOption.nonEmpty diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/NullSafeMagnet.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/NullSafeMagnet.scala new file mode 100644 index 000000000..c7adfa9ea --- /dev/null +++ b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/NullSafeMagnet.scala @@ -0,0 +1,62 @@ +package org.apache.spark.ml.dsl.utils + +import scala.language.implicitConversions + +sealed trait NullSafeMagnet[+T] extends Product with Serializable { + + def asOption: Option[T] +} + +/** + * a magnet wrapper for Option + */ +object NullSafeMagnet { + // TODO: should be a special case of + // the following design can reduce overhead and improve Scala 3 readiness: + // https://github.com/sjrd/scala-unboxed-option + + /** + * capability mixin + */ + sealed trait Cap extends Serializable + + case class CanBeNull[T, +M <: Cap](private var _self: Option[T]) extends NullSafeMagnet[T] { + + override def asOption: Option[T] = _self + } + + trait CanBeNull_Imp0 {} + + object CanBeNull extends CanBeNull_Imp0 { + + implicit def fromV[T, M <: Cap](v: T): CanBeNull[T, M] = CanBeNull[T, M](Option(v)) + + implicit def fromOpt[T, M <: Cap](v: Option[T]): CanBeNull[T, M] = CanBeNull[T, M](v) + + implicit def toOption[T](magnet: CanBeNull[T, _]): Option[T] = magnet.asOption + } + + case class NotNull[T, +M <: Cap](var value: T) extends NullSafeMagnet[T] { + + { + validate(value) + } + + def validate(value: T): Unit = { + require(value != null, "value cannot be null") + } + + override def asOption: Some[T] = Some(value) + } + + object NotNull extends CanBeNull_Imp0 { + + implicit def fromV[T, M <: Cap](v: T): NotNull[T, M] = NotNull[T, M](v) + + implicit def fromSome[T, M <: Cap](v: Some[T]): NotNull[T, M] = NotNull[T, M](v.get) + + implicit def toOption[T](magnet: NotNull[T, _]): Some[T] = Some(magnet.value) + + implicit def toV[T](magnet: NotNull[T, _]): T = magnet.value + } +} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/NullSafety.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/NullSafety.scala deleted file mode 100644 index 813c0c4b8..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/NullSafety.scala +++ /dev/null @@ -1,75 +0,0 @@ -package org.apache.spark.ml.dsl.utils - -import scala.language.implicitConversions - -/** - * a magnet wrapper for Option - */ -object NullSafety { - -// object Val extends IsMutable - final class Var extends Serializable - -// def apply[T](asOption: Option[T]): Immutable[T] = Immutable(asOption) - - trait Magnet[+T] extends Product with Serializable { - - def asOption: Option[T] - } - - case class CanBeNull[T, +M](private var _self: Option[T]) extends Magnet[T] { - - override def asOption: Option[T] = _self - - def :=(v: T)( - implicit - ev: M <:< Var - ): Unit = { - _self = Option(v) - } - } - - trait LowLevelImplicits {} - - object CanBeNull extends LowLevelImplicits { - - implicit def fromV[T, M](v: T): CanBeNull[T, M] = CanBeNull[T, M](Option(v)) - - implicit def fromOpt[T, M](v: Option[T]): CanBeNull[T, M] = CanBeNull[T, M](v) - - implicit def toOption[T](magnet: CanBeNull[T, _]): Option[T] = magnet.asOption - } - - case class CannotBeNull[T, +M](var value: T) extends Magnet[T] { - - { - validate(value) - } - - def validate(value: T): Unit = { - 
require(value != null, "value cannot be null") - } - - override def asOption: Some[T] = Some(value) - - def :=(value: T)( - implicit - ev: M <:< Var - ): Unit = { - validate(value) - - this.value = value - } - } - - object CannotBeNull extends LowLevelImplicits { - - implicit def fromV[T, M](v: T): CannotBeNull[T, M] = CannotBeNull[T, M](v) - - implicit def fromSome[T, M](v: Some[T]): CannotBeNull[T, M] = CannotBeNull[T, M](v.get) - - implicit def toOption[T](magnet: CannotBeNull[T, _]): Some[T] = Some(magnet.value) - - implicit def toV[T](magnet: CannotBeNull[T, _]): T = magnet.value - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/ObjectSimpleNameMixin.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/ObjectSimpleNameMixin.scala deleted file mode 100644 index 41307ecda..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/ObjectSimpleNameMixin.scala +++ /dev/null @@ -1,22 +0,0 @@ -package org.apache.spark.ml.dsl.utils - -trait ObjectSimpleNameMixin { - // TODO: cleanup, encoding may change in scala 3 - @transient lazy val objectSimpleName: String = ObjectSimpleNameMixin.get(this) -} - -object ObjectSimpleNameMixin { - - def get(v: Any): String = { - - if (v == null) "null" - else { - // TODO: need to decode to Scala name instead of JVM name - v.getClass.getSimpleName - .stripSuffix("$") - .split('$') - .filter(_.nonEmpty) - .head - } - } -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/OptionConversion.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/OptionConversion.scala deleted file mode 100644 index 7162d14c3..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/OptionConversion.scala +++ /dev/null @@ -1,8 +0,0 @@ -package org.apache.spark.ml.dsl.utils - -import scala.language.implicitConversions - -trait OptionConversion { - - implicit def box[T](v: T): Option[T] = Option(v) -} diff --git a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/package.scala b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/package.scala index a99ed0597..3d60a726d 100644 --- a/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/package.scala +++ b/parent/commons/src/main/scala/org/apache/spark/ml/dsl/utils/package.scala @@ -1,20 +1,20 @@ package org.apache.spark.ml.dsl -package object utils { +import org.apache.spark.ml.dsl.utils.NullSafeMagnet.Cap - type Var = NullSafety.Var +package object utils { - type `?`[T, M] = NullSafety.CanBeNull[T, M] + type ??[T, M <: Cap] = NullSafeMagnet.CanBeNull[T, M] - type ![T, M] = NullSafety.CannotBeNull[T, M] + type !![T, M <: Cap] = NullSafeMagnet.NotNull[T, M] // TODO: the following should be obsolete - type Nullable[T] = NullSafety.CanBeNull[T, Any] + type Nullable[T] = NullSafeMagnet.CanBeNull[T, Cap] object Nullable { - type NOT[T] = NullSafety.CannotBeNull[T, Any] + type NOT[T] = NullSafeMagnet.NotNull[T, Cap] - def NOT: NullSafety.CannotBeNull.type = NullSafety.CannotBeNull + def NOT: NullSafeMagnet.NotNull.type = NullSafeMagnet.NotNull } } diff --git a/parent/commons/src/main/scala/org/apache/spark/sql/utils/DataTypeRelay.scala b/parent/commons/src/main/scala/org/apache/spark/sql/utils/DataTypeRelay.scala deleted file mode 100644 index 30fc08bbb..000000000 --- a/parent/commons/src/main/scala/org/apache/spark/sql/utils/DataTypeRelay.scala +++ /dev/null @@ -1,34 +0,0 @@ -package org.apache.spark.sql.utils - -import com.tribbloids.spookystuff.relay.{MessageAPI, Relay, TreeIR} -import 
org.apache.spark.sql.types.DataType -import org.json4s.JValue - -/** - * Created by peng on 31/01/17. - */ -object DataTypeRelay extends Relay.<<[DataType] { - - def toJsonAST(dataType: DataType): JValue = { - dataType.jsonValue - } - - def fromJsonAST(jv: JValue): DataType = { - DataType.parseDataType(jv) - } - - override def toMessage_>>(v: DataType): IR_>> = { - - TreeIR - .leaf( - Msg(toJsonAST(v)) - ) - } - - case class Msg( - dataType: JValue - ) extends MessageAPI.<< { - - override def toProto_<< : DataType = fromJsonAST(dataType) - } -} diff --git a/parent/commons/src/test/scala/com/tribbloids/spookystuff/relay/RelaySuite.scala b/parent/commons/src/test/scala/com/tribbloids/spookystuff/relay/RelaySuite.scala index 316838082..d4a256584 100644 --- a/parent/commons/src/test/scala/com/tribbloids/spookystuff/relay/RelaySuite.scala +++ b/parent/commons/src/test/scala/com/tribbloids/spookystuff/relay/RelaySuite.scala @@ -1,27 +1,14 @@ package com.tribbloids.spookystuff.relay -import java.util.Date -import org.apache.spark.ml.dsl.AbstractDFDSuite import com.tribbloids.spookystuff.relay.TestBeans._ import com.tribbloids.spookystuff.relay.io.Encoder +import com.tribbloids.spookystuff.testutils.BaseSpec import org.json4s.MappingException import org.json4s.reflect.{Executable, ParanamerReader} -class RelaySuite extends AbstractDFDSuite { - - // TODO: disabled before FallbackSerializer is really put to use - ignore("SerializingParam[Function1] should work") { - val fn = { k: Int => - 2 * k - } - val reader = new Relay.ToSelf[Int => Int]() - val param = reader.Param("id", "name", "") +import java.util.Date - val json = param.jsonEncode(fn) - println(json) // TODO: assert - val fn2 = param.jsonDecode(json) - assert(fn2(2) == 4) - } +class RelaySuite extends BaseSpec { val date: Date = new Date() diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/XMLWeakDeserializerSuite.scala b/parent/commons/src/test/scala/com/tribbloids/spookystuff/relay/xml/XMLWeakDeserializerSuite.scala similarity index 94% rename from parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/XMLWeakDeserializerSuite.scala rename to parent/commons/src/test/scala/com/tribbloids/spookystuff/relay/xml/XMLWeakDeserializerSuite.scala index 7052fc4d0..84f2f6af5 100644 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/XMLWeakDeserializerSuite.scala +++ b/parent/commons/src/test/scala/com/tribbloids/spookystuff/relay/xml/XMLWeakDeserializerSuite.scala @@ -1,7 +1,6 @@ -package org.apache.spark.ml.dsl.utils +package com.tribbloids.spookystuff.relay.xml -import org.apache.spark.ml.dsl.AbstractDFDSuite -import com.tribbloids.spookystuff.relay.xml.XMLFormats +import com.tribbloids.spookystuff.testutils.BaseSpec import org.json4s.{DefaultFormats, Formats, JObject} object XMLWeakDeserializerSuite { @@ -40,7 +39,7 @@ object XMLWeakDeserializerSuite { ) } -class XMLWeakDeserializerSuite extends AbstractDFDSuite { +class XMLWeakDeserializerSuite extends BaseSpec { implicit val formats: Formats = XMLFormats.defaultFormats diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/RecursiveEitherAsUnionToJSONSpike.scala b/parent/commons/src/test/scala/com/tribbloids/spookystuff/spike/RecursiveEitherAsUnionToJSONSpike.scala similarity index 92% rename from parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/RecursiveEitherAsUnionToJSONSpike.scala rename to parent/commons/src/test/scala/com/tribbloids/spookystuff/spike/RecursiveEitherAsUnionToJSONSpike.scala index 1079d460f..1749b2465 
100644 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/RecursiveEitherAsUnionToJSONSpike.scala +++ b/parent/commons/src/test/scala/com/tribbloids/spookystuff/spike/RecursiveEitherAsUnionToJSONSpike.scala @@ -1,10 +1,9 @@ -package org.apache.spark.ml.dsl.utils +package com.tribbloids.spookystuff.spike -import com.tribbloids.spookystuff.testutils.BaseSpec -import org.apache.spark.ml.dsl.utils.RecursiveEitherAsUnionToJSONSpike._ import com.tribbloids.spookystuff.relay.Relay import com.tribbloids.spookystuff.relay.io.Encoder -import org.scalatest.Ignore +import com.tribbloids.spookystuff.spike.RecursiveEitherAsUnionToJSONSpike._ +import com.tribbloids.spookystuff.testutils.BaseSpec import org.slf4j.LoggerFactory object RecursiveEitherAsUnionToJSONSpike { @@ -26,7 +25,6 @@ object RecursiveEitherAsUnionToJSONSpike { case class InclusiveOpt(v: Option[Union], x: String) } -@Ignore class RecursiveEitherAsUnionToJSONSpike extends BaseSpec { val u1: Union = Right(Test1("abc", 2)) diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/PairwiseConversionMixin.scala b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/PairwiseConversionMixin.scala similarity index 98% rename from parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/PairwiseConversionMixin.scala rename to parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/PairwiseConversionMixin.scala index 679a0a516..3a4cdd95b 100644 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/PairwiseConversionMixin.scala +++ b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/PairwiseConversionMixin.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils +package com.tribbloids.spookystuff.utils import com.tribbloids.spookystuff.testutils.BaseSpec import com.tribbloids.spookystuff.utils.collection.MultiMapOps diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/TypeMagnetSpike.scala b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/TypeMagnetSpike.scala similarity index 96% rename from parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/TypeMagnetSpike.scala rename to parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/TypeMagnetSpike.scala index ba9801bd2..307c6c929 100644 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/TypeMagnetSpike.scala +++ b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/TypeMagnetSpike.scala @@ -1,6 +1,7 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import com.tribbloids.spookystuff.testutils.BaseSpec +import com.tribbloids.spookystuff.utils.refl.TypeUtils import com.tribbloids.spookystuff.utils.serialization.AssertSerializable import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.ScalaReflection.universe diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/TypeMagnetSuite.scala b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/TypeMagnetSuite.scala similarity index 96% rename from parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/TypeMagnetSuite.scala rename to parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/TypeMagnetSuite.scala index b6a1ea2e7..c302e2828 100644 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/TypeMagnetSuite.scala +++ b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/TypeMagnetSuite.scala @@ -1,13 
+1,13 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import java.sql.Timestamp - import com.tribbloids.spookystuff.testutils.BaseSpec import com.tribbloids.spookystuff.utils.serialization.AssertSerializable -import org.apache.spark.ml.dsl.utils.PairwiseConversionMixin -import org.apache.spark.ml.dsl.utils.PairwiseConversionMixin.Repr +import com.tribbloids.spookystuff.utils.PairwiseConversionMixin.Repr import com.tribbloids.spookystuff.relay.TestBeans._ -import org.apache.spark.ml.dsl.utils.refl.TypeMagnetSuite.TypeTagRepr +import com.tribbloids.spookystuff.utils.PairwiseConversionMixin +import com.tribbloids.spookystuff.utils.refl.TypeMagnetSuite.TypeTagRepr +import com.tribbloids.spookystuff.utils.refl.{CatalystTypeOps, ToCatalyst, TypeMagnet} import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.ScalaReflection.universe import org.apache.spark.sql.types._ @@ -46,6 +46,7 @@ object TypeMagnetSuite { implicit def fromDelegate[T](v: Repr[TypeTag[T]]): TypeTagRepr = new TypeTagRepr(v.copy()) } + } class TypeMagnetSuite extends BaseSpec with PairwiseConversionMixin { diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/TypeSpike.scala b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/TypeSpike.scala similarity index 91% rename from parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/TypeSpike.scala rename to parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/TypeSpike.scala index 63c3cebd3..ee6b55525 100644 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/TypeSpike.scala +++ b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/TypeSpike.scala @@ -1,4 +1,4 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import com.tribbloids.spookystuff.testutils.BaseSpec import org.apache.spark.sql.catalyst.ScalaReflection diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/UnReifiedObjectTypeSuite.scala b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/UnreifiedObjectTypeSuite.scala similarity index 64% rename from parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/UnReifiedObjectTypeSuite.scala rename to parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/UnreifiedObjectTypeSuite.scala index 6c3f67f24..bfc179b03 100644 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/refl/UnReifiedObjectTypeSuite.scala +++ b/parent/commons/src/test/scala/com/tribbloids/spookystuff/utils/refl/UnreifiedObjectTypeSuite.scala @@ -1,9 +1,10 @@ -package org.apache.spark.ml.dsl.utils.refl +package com.tribbloids.spookystuff.utils.refl import com.tribbloids.spookystuff.testutils.BaseSpec +import com.tribbloids.spookystuff.utils.refl.UnreifiedObjectType import org.apache.spark.sql.catalyst.ScalaReflection -class UnReifiedObjectTypeSuite extends BaseSpec { +class UnreifiedObjectTypeSuite extends BaseSpec { import ScalaReflection.universe._ diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/AbstractDFDSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/AbstractDFDSuite.scala deleted file mode 100644 index 8f16c75c5..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/AbstractDFDSuite.scala +++ /dev/null @@ -1,84 +0,0 @@ -package org.apache.spark.ml.dsl - -import com.tribbloids.spookystuff.testutils.{BaseSpec, TestHelper} -import 
org.apache.spark.ml.PipelineStage -import org.apache.spark.ml.param.shared.{HasInputCol, HasInputCols, HasOutputCol} -import org.scalatest.BeforeAndAfterAll - -import scala.util.matching.Regex - -/** - * Created by peng on 18/04/16. - */ -abstract class AbstractDFDSuite extends BaseSpec with BeforeAndAfterAll { - - implicit class StringView(str: String) extends super._StringOps(str) { - - def treeNodeShouldBe( - groundTruth: String = null, - sort: Boolean = false - ): Unit = { - val compactedGT = Option(groundTruth).map(compactGroundTruth).orNull - this.shouldBe(compactedGT, sort = sort) - } - } - - def compaction: PathCompaction = Compactions.DoNotCompact - lazy val compactionOpt: Some[PathCompaction] = Some(compaction) - - def compactGroundTruth(str: String): String = { - - val regex: Regex = "(?<=[\\[\\,])[\\w\\$]*(?=[\\]\\,])".r - val matches = regex.findAllIn(str).toList - val cols = matches.map(_.split('$').toSeq).toSet - val lookup = compaction(cols) - - val replaced = regex.replaceAllIn( - str, - { m => - val original: String = m.matched - val multiPart = original.split('$').toSeq - lookup(multiPart).mkString("\\$") - } - ) - - replaced - } - - def getInputsOutputs(stages: Seq[PipelineStage]): Seq[(String, String, String)] = { - val input_output = stages.map { v => - val className = v.getClass.getSimpleName - val input: Array[String] = v match { - case v: HasInputCol => Array(v.getInputCol) - case v: HasInputCols => v.getInputCols - case _ => Array[String]() - } - - val output = v match { - case v: HasOutputCol => Array(v.getOutputCol) - case _ => Array[String]() - } - - (className, input.toSeq.mkString("|"), output.toSeq.mkString("|")) - } - input_output - } - - override def afterAll(): Unit = { - - TestHelper.cleanTempDirs() - super.afterAll() - } -} - -trait UsePruneDownPath { - self: AbstractDFDSuite => - - override def compaction: Compactions.PruneDownPath.type = Compactions.PruneDownPath -} - -trait UsePruneDownPathKeepRoot { - self: AbstractDFDSuite => - - override def compaction: Compactions.PruneDownPathKeepRoot.type = Compactions.PruneDownPathKeepRoot -} diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/AppendSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/AppendSuite.scala deleted file mode 100644 index a25c6b7ab..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/AppendSuite.scala +++ /dev/null @@ -1,178 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.ml.feature._ - -/** - * Created by peng on 27/04/16. 
- */ -class AppendSuite extends AbstractDFDSuite { - - import DFDComponent._ - - it("can automatically generate names") { - - val flow = ( - 'input - :>> new Tokenizer() - :=>> new Tokenizer() - :-> new Tokenizer() - :>> new Tokenizer() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode [input] > Tokenizer > [input$Tokenizer] - | +- > ForwardNode [input$Tokenizer] > Tokenizer > [input$Tokenizer$Tokenizer] - | +- > ForwardNode [input$Tokenizer$Tokenizer] > Tokenizer > [input$Tokenizer$Tokenizer$Tokenizer] - | +- > ForwardNode (HEAD)( Tokenizer > [input$Tokenizer$Tokenizer$Tokenizer$Tokenizer] - |/ right < - |> ForwardNode (HEAD)( Tokenizer > [input$Tokenizer$Tokenizer$Tokenizer$Tokenizer] - """.stripMargin - ) - } - - it("pincer topology can be defined by A :-> B <-: A") { - val input: DFDComponent = 'input - val flow = input :-> new VectorAssembler() <-: input - - flow - .show(showID = false, forward = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |< BackwardNode (HEAD) [input,input] > VectorAssembler > [input$VectorAssembler] - |:- < BackwardNode (TAIL) [input] - |+- < BackwardNode (TAIL) [input] - """.stripMargin - ) - } - - it("A :-> B :-> Source is associative") { - val flow1 = 'input :-> new Tokenizer() :-> 'dummy // resolve to rebase then union - val flow2 = 'input :-> (new Tokenizer() :-> 'dummy) // resolve to union then rebase - flow1 - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe(flow2.show(showID = false, compactionOpt = compactionOpt)) - } - - it("A <-: B <-: Source is associative") { - val flow1 = 'dummy <-: new Tokenizer() <-: 'input - val flow2 = 'dummy <-: (new Tokenizer() <-: 'input) - flow1 - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe(flow2.show(showID = false, compactionOpt = compactionOpt)) - } - - it("A :-> B :-> detached Stage is associative") { - val flow1 = 'input :-> new Tokenizer() :-> new NGram() // resolve to rebase then union - val flow2 = 'input :-> (new Tokenizer() :-> new NGram()) // resolve to union then rebase - flow1 - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe(flow2.show(showID = false, compactionOpt = compactionOpt)) - } - - it("A <-: B <-: detached Stage is associative") { - val flow1 = new NGram() <-: new Tokenizer() <-: 'input - val flow2 = new NGram() <-: (new Tokenizer() <-: 'input) - flow1 - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe(flow2.show(showID = false, compactionOpt = compactionOpt)) - } - - it(":-> Stage is cast to rebase") { - - val flow = ( - ( - 'input - :-> new Tokenizer() - :-> new StopWordsRemover() - ).from("Tokenizer") - .and("StopWordsRemover") - :-> new NGram() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode [input] > Tokenizer > [input$Tokenizer] - | :- > ForwardNode (HEAD) [input$Tokenizer] > NGram > [input$Tokenizer$NGram] - | +- > ForwardNode [input$Tokenizer] > StopWordsRemover > [input$Tokenizer$StopWordsRemover] - | +- > ForwardNode (HEAD)( NGram > [input$Tokenizer$StopWordsRemover$NGram] - |/ right < - |> ForwardNode (HEAD)( NGram > [input$Tokenizer$StopWordsRemover$NGram] - """.stripMargin - ) - } - - it("<-: Stage is cast to rebase") { - - val flow = - new SQLTransformer() <-: - new NGram() <-: ( - new StopWordsRemover() <-: new Tokenizer() <-: 'input - 
).from("Tokenizer") - .and("StopWordsRemover") - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input$Tokenizer$StopWordsRemover] > NGram > [input$Tokenizer$StopWordsRemover$NGram] - |+- > ForwardNode [] > SQLTransformer > [] - |/ right < - |> ForwardNode ( ForwardNode [input] > Tokenizer > [input$Tokenizer] - | :- > ForwardNode (HEAD) [input$Tokenizer] > NGram > [input$Tokenizer$NGram] - | : +- > ForwardNode [] > SQLTransformer > [] - | +- > ForwardNode [input$Tokenizer] > StopWordsRemover > [input$Tokenizer$StopWordsRemover] - | +- > ForwardNode (HEAD)(TAIL>) [input$Tokenizer$StopWordsRemover] > NGram > [input$Tokenizer$StopWordsRemover$NGram] - | +- > ForwardNode [] > SQLTransformer > [] - """.stripMargin - ) - } - - it(":-> Source is cast to union") { - val flow = 'input :-> new Tokenizer() :-> 'dummy - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode (HEAD)( Tokenizer > [input$Tokenizer] - |> ForwardNode (HEAD)(TAIL) [dummy] - |/ right < - |> ForwardNode (HEAD)( Tokenizer > [input$Tokenizer] - |> ForwardNode (HEAD)(TAIL) [dummy] - """.stripMargin - ) - } - - it("<-: Source is cast to union") { - val flow = 'dummy <-: new Tokenizer() <-: 'input - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input] > Tokenizer > [input$Tokenizer] - |> ForwardNode (HEAD)(TAIL) [dummy] - |/ right < - |> ForwardNode ( ForwardNode (HEAD)(TAIL>) [input] > Tokenizer > [input$Tokenizer] - |> ForwardNode (HEAD)(TAIL) [dummy] - """.stripMargin - ) - } -} - -class AppendSuite_PruneDownPath extends AppendSuite with UsePruneDownPath - -class AppendSuite_PruneDownPathKeepRoot extends AppendSuite with UsePruneDownPathKeepRoot diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/CompactionSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/CompactionSuite.scala deleted file mode 100644 index c7c1c0f84..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/CompactionSuite.scala +++ /dev/null @@ -1,138 +0,0 @@ -package org.apache.spark.ml.dsl - -/** - * Created by peng on 27/04/16. 
- */ -class CompactionSuite extends AbstractDFDSuite { - - trait TestCase extends Product { - - def original: String - - def compact1: String - - def compact2: String - - val s1: Array[Seq[String]] = original.trim.stripMargin - .split("\n") - .map(v => v.split(" ").toSeq) - } - - case object Case1 extends TestCase { - - override def original: String = - """ - |A B - |A B C - |A B C D - |A B C D E - |A B K - |A B C K - |A B C D K - |A B C D E K - |""".stripMargin - - override def compact1: String = - """ - |B - |C - |D - |E - |B K - |C K - |D K - |E K - |""".stripMargin - - override def compact2: String = - """ - |A B - |A C - |A D - |A E - |A B K - |A C K - |A D K - |A E K - |""".stripMargin - } - - case object Case2 extends TestCase { - override def original: String = - """ - |input Tokenizer1 - |input Tokenizer0 - |input Tokenizer HashingTF - |input Tokenizer StopWordsRemover HashingTF - |input Tokenizer HashingTF VectorAssembler - |input Tokenizer StopWordsRemover HashingTF VectorAssembler - |input - |input Tokenizer StopWordsRemover - |""".stripMargin - - override def compact1: String = - """ - |Tokenizer1 - |Tokenizer0 - |Tokenizer HashingTF - |StopWordsRemover HashingTF - |Tokenizer VectorAssembler - |StopWordsRemover VectorAssembler - |input - |StopWordsRemover - |""".stripMargin - - override def compact2: String = - """ - |input Tokenizer1 - |input Tokenizer0 - |input Tokenizer HashingTF - |input StopWordsRemover HashingTF - |input Tokenizer VectorAssembler - |input StopWordsRemover VectorAssembler - |input - |input StopWordsRemover - |""".stripMargin - } - - def testCompaction(compaction: PathCompaction, source: Array[Seq[String]], expected: String = null): Unit = { - - assert(source.distinct.length == source.length) - - val lookup = compaction(source.toSet) - val compare = source.map { v => - v -> lookup(v) - } - compare - .map { v => - v._2.mkString(" ") - } - .mkString("\n") - .shouldBe( - expected - ) - val compactedValues = compare.map(_._2).toList - assert(compactedValues.distinct.size == compactedValues.size) - } - - Seq( - Compactions.DoNotCompact -> { v: TestCase => - v.original - }, - Compactions.PruneDownPath -> { v: TestCase => - v.compact1 - }, - Compactions.PruneDownPathKeepRoot -> { v: TestCase => - v.compact2 - } - ).foreach { - case (v, selector) => - Seq(Case1, Case2).foreach { c => - it(s"${v.getClass.getSimpleName.stripSuffix("$")} should work on ${c.toString} ...") { - - testCompaction(v, c.s1, selector(c)) - } - } - - } -} diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/ComposeSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/ComposeSuite.scala deleted file mode 100644 index 7bad4312e..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/ComposeSuite.scala +++ /dev/null @@ -1,481 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.ml.feature._ - -class ComposeSuite extends AbstractDFDSuite { - - import DFDComponent._ - - it("compose_> Source doesn't work") { - intercept[IllegalArgumentException] { - 'input :>> new Tokenizer() :>> 'dummy - } - } - - it("compose_< Source doesn't work") { - intercept[IllegalArgumentException] { - 'input :>> new Tokenizer() :>> 'dummy - } - } - - it("compose_> PASSTHROUGH doesn't change the flow") { - val flow = 'input :>> new Tokenizer() :>> PASSTHROUGH - val flow2 = 'input :>> new Tokenizer() - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - flow2.show(showID = false, compactionOpt = compactionOpt) - ) - } - - 
it("PASSTHROUGH compose_> Stage doesn't change the flow") { - val flow1 = 'input :>> (PASSTHROUGH :>> new Tokenizer()) - val flow2 = 'input :>> new Tokenizer() - flow1 - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - flow2.show(showID = false, compactionOpt = compactionOpt) - ) - } - - it("compose_> (PASSTHROUGH || Stage) generates 2 heads") { - val flow = ( - 'input - :-> new Tokenizer() - :>> ( - PASSTHROUGH U - new StopWordsRemover() - ) - :=>> new HashingTF() - ) - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode [input] > Tokenizer > [input$Tokenizer] - | :- > ForwardNode (HEAD) [input$Tokenizer] > HashingTF > [input$Tokenizer$HashingTF] - | +- > ForwardNode [input$Tokenizer] > StopWordsRemover > [input$Tokenizer$StopWordsRemover] - | +- > ForwardNode (HEAD)( HashingTF > [input$Tokenizer$StopWordsRemover$HashingTF] - |/ right < - |> ForwardNode (HEAD)( HashingTF > [input$Tokenizer$StopWordsRemover$HashingTF] - """.stripMargin - ) - } - - it("declare API is equally effective") { - val flow1 = ( - new VectorAssembler() - <<: (new HashingTF() - <<=: ( - PASSTHROUGH - U new StopWordsRemover() - ) - <<: new Tokenizer() - <<: 'input) - ) - val part1 = declare( - new Tokenizer() <<: 'input - ) - val part2: DFDComponent = new HashingTF() - val part3: DFDComponent = new VectorAssembler() - - val flow2 = declare( - part3 <<: part2 <<: part1, - part3 <<: part2.replicate() <<: new StopWordsRemover() <<: part1 - ) - - // val flow3 = declare( - // part3 < part2 < part1, - // part3 < part2.replicate("_2") < new StopWordsRemover() < part1 - // ) - - flow1 - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - flow2.show(showID = false, compactionOpt = compactionOpt) - ) - } - - it("result of compose_> can be the first operand of compose_<") { - val flow = new VectorAssembler() <<: ( - 'input :>> new Tokenizer() :>> new HashingTF() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe(""" - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input$Tokenizer$HashingTF] > VectorAssembler > [input$Tokenizer$HashingTF$VectorAssembler] - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode [input] > Tokenizer > [input$Tokenizer] - | +- > ForwardNode ( HashingTF > [input$Tokenizer$HashingTF] - | +- > ForwardNode (HEAD)(TAIL>) [input$Tokenizer$HashingTF] > VectorAssembler > [input$Tokenizer$HashingTF$VectorAssembler] - |/ right < - |> ForwardNode ( HashingTF > [input$Tokenizer$HashingTF] - |+- > ForwardNode (HEAD)(TAIL>) [input$Tokenizer$HashingTF] > VectorAssembler > [input$Tokenizer$HashingTF$VectorAssembler] - """.stripMargin) -// flow.show(showID = false, compactionOpt = compactionOpt, asciiArt = true).treeNodeShouldBe() - } - - it("result of compose_< can be the first operand of compose_>") { - val flow = ( - new HashingTF() <<: new Tokenizer() <<: 'input - :>> new VectorAssembler() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input$VectorAssembler$Tokenizer] > HashingTF > [input$VectorAssembler$Tokenizer$HashingTF] - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode ( VectorAssembler > [input$VectorAssembler] - | +- > ForwardNode [input$VectorAssembler] > Tokenizer > [input$VectorAssembler$Tokenizer] - | +- > ForwardNode (HEAD)(TAIL>) [input$VectorAssembler$Tokenizer] > HashingTF > [input$VectorAssembler$Tokenizer$HashingTF] - |/ right 
< - |> ForwardNode ( VectorAssembler > [input$VectorAssembler] - |+- > ForwardNode [input$VectorAssembler] > Tokenizer > [input$VectorAssembler$Tokenizer] - | +- > ForwardNode (HEAD)(TAIL>) [input$VectorAssembler$Tokenizer] > HashingTF > [input$VectorAssembler$Tokenizer$HashingTF] - """.stripMargin - ) - } - - it("A compose_> (PASSTHROUGH || Stage) rebase_> B is associative") { - val flow1 = ( - new Tokenizer() - :>> ( - PASSTHROUGH U - new StopWordsRemover() - ) - :=>> new HashingTF() - ) - val flow2 = ( - new Tokenizer() - :>> (( - PASSTHROUGH U - new StopWordsRemover() - ) - :=>> new HashingTF()) - ) - - flow1 - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - flow2.show(showID = false, compactionOpt = compactionOpt) - ) - } - - it("compose_> can append a stage to 2 heads") { - val flow = ( - ('input1 U 'input2) - :>> new VectorAssembler() - ) - - flow - .show(showID = false) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input1] - |+- > ForwardNode (HEAD)( VectorAssembler > [VectorAssembler] - |> ForwardNode (TAIL>) [input2] - |+- > ForwardNode (HEAD)( VectorAssembler > [VectorAssembler] - |/ right < - |> ForwardNode (HEAD)( VectorAssembler > [VectorAssembler] - """.stripMargin - ) - } - - it("compose_< can append a stage to 2 heads") { - - val flow = ( - new VectorAssembler() - <<: ('input1 U 'input2) - ) - - flow - .show(showID = false) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input1,input2] > VectorAssembler > [VectorAssembler] - |/ right < - |> ForwardNode ( ForwardNode (HEAD)(TAIL>) [input1,input2] > VectorAssembler > [VectorAssembler] - |> ForwardNode ( ForwardNode (HEAD)(TAIL>) [input1,input2] > VectorAssembler > [VectorAssembler] - """.stripMargin - ) - } - - it("compose_> can append a stage to 2 heads from 1 tail") { - - val flow = ( - 'input - :>> new Tokenizer() - :>> ( - PASSTHROUGH - U new StopWordsRemover() - ) - :=>> new HashingTF() - :>> new VectorAssembler() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode [input] > Tokenizer > [input$Tokenizer] - | :- > ForwardNode [input$Tokenizer] > HashingTF > [input$Tokenizer$HashingTF] - | : +- > ForwardNode (HEAD)( VectorAssembler > [input$Tokenizer$HashingTF$VectorAssembler] - | +- > ForwardNode [input$Tokenizer] > StopWordsRemover > [input$Tokenizer$StopWordsRemover] - | +- > ForwardNode [input$Tokenizer$StopWordsRemover] > HashingTF > [input$Tokenizer$StopWordsRemover$HashingTF] - | +- > ForwardNode (HEAD)( VectorAssembler > [input$Tokenizer$HashingTF$VectorAssembler] - |/ right < - |> ForwardNode (HEAD)( VectorAssembler > [input$Tokenizer$HashingTF$VectorAssembler] - """.stripMargin - ) - } - - it("compose_< can append a stage to 2 heads from 1 tail") { - - val flow = ( - new VectorAssembler() - <<: ( - PASSTHROUGH - U new StopWordsRemover() - ) - <<: new Tokenizer() - <<: 'input - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input$Tokenizer,input$Tokenizer$StopWordsRemover] > VectorAssembler > [input$Tokenizer$VectorAssembler] - |/ right < - |> ForwardNode ( ForwardNode [input] > Tokenizer > [input$Tokenizer] - | :- > ForwardNode [input$Tokenizer] > StopWordsRemover > [input$Tokenizer$StopWordsRemover] - | : +- > ForwardNode (HEAD)(TAIL>) [input$Tokenizer,input$Tokenizer$StopWordsRemover] > VectorAssembler > [input$Tokenizer$VectorAssembler] - | +- > 
ForwardNode (HEAD)(TAIL>) [input$Tokenizer,input$Tokenizer$StopWordsRemover] > VectorAssembler > [input$Tokenizer$VectorAssembler] - """.stripMargin - ) - - flow - .show(showID = false, asciiArt = true) - .shouldBe( - """ - | ┌───────────────┐ - | │( Tokenizer > [Tokenizer]│ - | └────────────┬──────────┬──────────┘ - | │ │ - | │ └────────────────────────┐ - | v │ - | ┌────────────────────────────────────────────────────┐ │ - | │ [Tokenizer] > StopWordsRemover > [StopWordsRemover]│ │ - | └──────────────────┬─────────────────────────────────┘ │ - | │ │ - | v v - | ┌────────────────────────────────────────────────────────────────────────────────┐ - | │(HEAD)(TAIL>) [Tokenizer,StopWordsRemover] > VectorAssembler > [VectorAssembler]│ - | └────────────────────────────────────────────────────────────────────────────────┘ - |""".stripMargin - ) - } - - it("compose_> can append a stage to merged heads") { - val flow = ( - ('input1 U 'input2) - :>> new VectorAssembler() - :>> new IndexToString() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input1] - |+- > ForwardNode [input1,input2] > VectorAssembler > [VectorAssembler] - | +- > ForwardNode (HEAD)( IndexToString > [VectorAssembler$IndexToString] - |> ForwardNode (TAIL>) [input2] - |+- > ForwardNode [input1,input2] > VectorAssembler > [VectorAssembler] - | +- > ForwardNode (HEAD)( IndexToString > [VectorAssembler$IndexToString] - |/ right < - |> ForwardNode (HEAD)( IndexToString > [VectorAssembler$IndexToString] - """.stripMargin - ) - } - - it("compose_< can append a stage to merged heads") { - - val flow = ( - new IndexToString() - <<: new VectorAssembler() - <<: ('input1 U 'input2) - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [VectorAssembler] > IndexToString > [VectorAssembler$IndexToString] - |/ right < - |> ForwardNode ( ForwardNode [input1,input2] > VectorAssembler > [VectorAssembler] - | +- > ForwardNode (HEAD)(TAIL>) [VectorAssembler] > IndexToString > [VectorAssembler$IndexToString] - |> ForwardNode ( ForwardNode [input1,input2] > VectorAssembler > [VectorAssembler] - | +- > ForwardNode (HEAD)(TAIL>) [VectorAssembler] > IndexToString > [VectorAssembler$IndexToString] - """.stripMargin - ) - } - - it("compose_> can bypass Source of downstream") { - val flow = ( - 'input - :>> ( - 'dummy :>> - new Tokenizer() - ) - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode (HEAD)( Tokenizer > [input$Tokenizer] - |/ right < - |> ForwardNode (HEAD)( Tokenizer > [input$Tokenizer] - """.stripMargin - ) - } - - it("compose_< can bypass Source of downstream") { - val flow = ( - new Tokenizer() <<: - 'dummy - ) <<: - 'input - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input] > Tokenizer > [input$Tokenizer] - |/ right < - |> ForwardNode ( ForwardNode (HEAD)(TAIL>) [input] > Tokenizer > [input$Tokenizer] - """.stripMargin - ) - } - - // test("from can select by name") { - // - // "input") - // } - // - // test("from can select by qualified name") { - // - // } - // - // test("from can select by * wildcard qualified name") { - // - // } - // - // test("from can select by ** wildcard qualified name") { - // - // } - - it("Compose works when operand2 is type consistent") { - - val flow = ( - 
'input.string - :>> new Tokenizer() - :>> new StopWordsRemover() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode [input] > Tokenizer > [input$Tokenizer] - | +- > ForwardNode (HEAD)( StopWordsRemover > [input$Tokenizer$StopWordsRemover] - |/ right < - |> ForwardNode (HEAD)( StopWordsRemover > [input$Tokenizer$StopWordsRemover] - |""".stripMargin - ) - } - - it("Compose throws an exception when operand2 is type inconsistent with output of operand1 as a Source") { - - intercept[IllegalArgumentException] { - ( - 'input.string - :>> new VectorAssembler() - ) - } - } - - it("Compose throws an exception when operand2 is type inconsistent with output of operand1 as a Flow") { - - intercept[IllegalArgumentException] { - ( - 'input.string - :>> new Tokenizer() - :>> new VectorAssembler() - ) - } - } - - it("Union throws an exception when a stage in result is type inconsistent") { - - val part1 = declare( - new Tokenizer() <<: 'input.string - ) - val part2: DFDComponent = new HashingTF() - val part3: DFDComponent = new StopWordsRemover() - - intercept[IllegalArgumentException] { - (part2 <<: part1) U - (part3 <<: part2) - } - } - - it("Union throws an exception when a stage in result has incompatible number of inputCols") { - - val part1 = declare( - new Tokenizer() <<: 'input.string - ) - val part2: DFDComponent = new HashingTF() - val part3: DFDComponent = new IDF() - - intercept[IllegalArgumentException] { - (part3 <<: part2 <<: part1) U - (part3 <<: part2.replicate() <<: new StopWordsRemover() <<: part1) - } - } -} - -class ComposeSuite_PruneDownPath extends ComposeSuite with UsePruneDownPath - -class ComposeSuite_PruneDownPathKeepRoot extends ComposeSuite with UsePruneDownPathKeepRoot diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/DFDReadWriteSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/DFDReadWriteSuite.scala deleted file mode 100644 index 60d68e638..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/DFDReadWriteSuite.scala +++ /dev/null @@ -1,100 +0,0 @@ -package org.apache.spark.ml.dsl - -import com.tribbloids.spookystuff.testutils.TestHelper -import org.apache.spark.ml.feature._ -import org.apache.spark.ml.{Pipeline, PipelineModel} -import org.json4s.JsonAST.JObject - -/** - * Created by peng on 06/10/16. 
- */ -class DFDReadWriteSuite extends AbstractDFDSuite { - - import DFDComponent._ - import org.apache.spark.ml.dsl.DFDSuite._ - import com.tribbloids.spookystuff.relay.io.MLReadWriteSupports._ - - TestHelper.TestSC - - val pipelinePath: String = "temp/pipeline/pipeline" -// def sc: SparkContext = TestHelper.TestSpark - - it("Pipeline can be saved and loaded") { - val flow = (DFD('input) - :-> new Tokenizer() -> TOKEN - :-> stemming -> STEMMED - :-> tf -> TF - :-> new IDF() -> IDF - :>- STEMMED :&& TF :>> UDFTransformer(zipping) -> TF_ZIPPED) - .from(STEMMED) :&& IDF :>> UDFTransformer(zipping) -> IDF_ZIPPED - - val pipeline: Pipeline = flow.build() - - pipeline.write.overwrite().save(pipelinePath) - val pipeline2 = Pipeline.read.load(pipelinePath) - - pipeline.toString().shouldBe(pipeline2.toString()) - } - - it("PipelineModel can be saved and loaded") { - val model = ( - DFD('input) - :-> new Tokenizer() -> TOKEN - :-> stemming -> STEMMED - :-> tf -> TF - :>- STEMMED :&& TF :>> UDFTransformer(zipping) -> TF_ZIPPED - ).buildModel() - - model.write.overwrite().save(pipelinePath) - val model2 = PipelineModel.read.load(pipelinePath) - - model.toString().shouldBe(model2.toString()) - } - - it("Flow can be serialized into JSON and back") { - - val flow = DFD('input.string) :-> - new Tokenizer() -> 'token :-> - stemming -> 'stemmed :-> - tf -> 'tf :-> - new IDF() -> 'idf - - val prettyJSON = flow.write.message.prettyJSON - -// prettyJSON.shouldBe() - - val flow2 = DFD.fromJSON(prettyJSON) - - flow - .show() - .shouldBe( - flow2.show() - ) - } - - it("Flow can be serialized into XML and back") { - - val flow = DFD('input.string) :-> - new Tokenizer() -> 'token :-> - stemming -> 'stemmed :-> - tf -> 'tf :-> - new IDF() -> 'idf - - JObject("root" -> flow.write.message.toJValue) -// val jValue2 = Xml.toJson(Xml.toXml(jValue)) - - // pretty(jValue).shouldBe(pretty(jValue2)) - - val prettyXML = flow.write.message.prettyXML - -// prettyXML.shouldBe() - - val flow2 = DFD.fromXML(prettyXML) - - flow - .show() - .shouldBe( - flow2.show() - ) - } -} diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/DFDSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/DFDSuite.scala deleted file mode 100644 index ea69e16c1..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/DFDSuite.scala +++ /dev/null @@ -1,504 +0,0 @@ -package org.apache.spark.ml.dsl - -import com.tribbloids.spookystuff.testutils.TestHelper -import org.apache.spark.ml.feature._ -import org.apache.spark.ml.linalg.{Vector => MLVector} -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.functions._ - -object DFDSuite { - - val TOKEN: Symbol = Symbol("token") - val STEMMED: String = "stemmed" - val TF: String = "tf" - val IDF: String = "idf" - val TF_ZIPPED: String = "tf_zipped" - val IDF_ZIPPED: String = "idf_zipped" - - val stemming: StopWordsRemover = new StopWordsRemover() - val tf: HashingTF = new HashingTF() - - val zipping: UserDefinedFunction = udf { (v1s: Seq[String], v2: MLVector) => - v1s.map { v => - val index = tf.indexOf(v) - v -> v2(index) - } - } -} - -class DFDSuite extends AbstractDFDSuite { - - import DFDComponent._ - import DFDSuite._ - - override lazy val compactionOpt: Some[PathCompaction] = Some(Compactions.PruneDownPath) - - val training: DataFrame = TestHelper.TestSQL - .createDataFrame( - Seq( - (0L, "a b c d e spark", 1.0), - (1L, "b d", 0.0), - (2L, "spark f g h", 1.0), - (3L, "hadoop mapreduce", 0.0), 
- (3L, "hadoop mapreduce", 1.0), - (4L, "b spark who", 1.0), - (5L, "g d a y", 0.0), - (6L, "spark fly", 1.0), - (7L, "was mapreduce", 0.0), - (8L, "e spark program", 1.0), - (9L, "a e c l", 0.0), - (10L, "spark compile", 1.0), - (11L, "hadoop software", 0.0) - ) - ) - .toDF("id", "input", "label") - - it("Flow can build Pipeline") { - val part1 = ( - DFD('input) - :-> new Tokenizer() -> TOKEN - :-> stemming -> STEMMED - :-> tf -> TF - :-> new IDF() -> DFDSuite.IDF - :>- STEMMED :&& TF :>> UDFTransformer(zipping) -> TF_ZIPPED - ) - - val flow = part1 - .from(STEMMED) :&& DFDSuite.IDF :>> UDFTransformer(zipping) -> IDF_ZIPPED - - flow - .show(showID = false, compactionOpt = compactionOpt, asciiArt = true) - .shouldBe( - """ - | ┌───────────────┐ - | │(TAIL>) [input]│ - | └────────┬──────┘ - | │ - | v - | ┌──────────────────────────┐ - | │ [input] > token > [token]│ - | └─────────────┬────────────┘ - | │ - | v - | ┌──────────────────────────────┐ - | │ [token] > stemmed > [stemmed]│ - | └───────┬───────────────┬─────┬┘ - | │ │ │ - | ┌─────────────┘ │ │ - | │ │ │ - | v │ │ - | ┌──────────────────────┐ │ │ - | │ [stemmed] > tf > [tf]│ │ │ - | └─────┬─────────┬──────┘ │ │ - | │ │ │ │ - | │ │ └─────┼───────────────┐ - | │ │ ┌───────────────────────────┘ │ - | │ └───┼───────────────────────────────────┐ │ - | v │ │ │ - | ┌───────────────────┐ │ │ │ - | │ [tf] > idf > [idf]│ │ │ │ - | └────┬──────────────┘ │ │ │ - | │ │ │ │ - | v v v v - | ┌───────────────────────────────────────────────────────┐ ┌─────────────────────────────────────────────┐ - | │(HEAD)( idf_zipped > [idf_zipped]│ │(HEAD) [stemmed,tf] > tf_zipped > [tf_zipped]│ - | └───────────────────────────────────────────────────────┘ └─────────────────────────────────────────────┘ - |""".stripMargin - ) - - val pipeline = flow.build() - - val stages = pipeline.getStages - val input_output = getInputsOutputs(stages) - input_output - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(StopWordsRemover,token,stemmed) - |(HashingTF,stemmed,tf) - |(IDF,tf,idf) - |(UDFTransformer,stemmed|tf,tf_zipped) - |(UDFTransformer,stemmed|idf,idf_zipped) - """.stripMargin - ) - -// val outDF = pipeline.fit(training).transform(training) -// outDF.show() - } - - it("Pipeline can be visualized as ASCII art") { - val flow = ( - DFD('input) - :-> new Tokenizer() -> TOKEN - :-> stemming -> STEMMED - :-> tf -> TF - :-> new IDF() -> DFDSuite.IDF - :>- STEMMED :&& TF :>> UDFTransformer(zipping) -> TF_ZIPPED - ).from(STEMMED) :&& DFDSuite.IDF :>> UDFTransformer(zipping) -> IDF_ZIPPED - - flow - .show(showID = false, showInputs = false, asciiArt = true) - .shouldBe( - """ - | ┌───────────────┐ - | │(TAIL>) [input]│ - | └───────┬───────┘ - | │ - | v - | ┌────────────────┐ - | │ token > [token]│ - | └────────┬───────┘ - | │ - | v - | ┌────────────────────┐ - | │ stemmed > [stemmed]│ - | └─────┬───────┬─┬────┘ - | │ │ │ - | ┌───────────┘ │ └─────────────────┐ - | │ ┌───────────┘ │ - | v │ │ - | ┌──────────┐ │ │ - | │ tf > [tf]│ │ │ - | └───┬───┬──┘ │ │ - | │ │ │ │ - | │ └─────┼─────────────────────────┐ │ - | v │ │ │ - | ┌────────────┐ │ │ │ - | │ idf > [idf]│ │ │ │ - | └───┬────────┘ │ │ │ - | │ │ │ │ - | v v v v - | ┌───────────────────────────────────────┐ ┌──────────────────────────────┐ - | │(HEAD)( [idf_zipped]│ │(HEAD) tf_zipped > [tf_zipped]│ - | └───────────────────────────────────────┘ └──────────────────────────────┘ - """.stripMargin - ) - } - - it("Pipeline can be visualized as ASCII art backwards") { - val flow = ( - DFD('input) - :-> new 
Tokenizer() -> TOKEN - :-> stemming -> STEMMED - :-> tf -> TF - :-> new IDF() -> DFDSuite.IDF - :>- STEMMED :&& TF :>> UDFTransformer(zipping) -> TF_ZIPPED - ).from(STEMMED) :&& DFDSuite.IDF :>> UDFTransformer(zipping) -> IDF_ZIPPED - - flow - .show(showID = false, forward = false, asciiArt = true) - .shouldBe( - """ - | ┌───────────────────────────────────────────────────────┐ ┌─────────────────────────────────────────────┐ - | │(HEAD)( idf_zipped > [idf_zipped]│ │(HEAD) [stemmed,tf] > tf_zipped > [tf_zipped]│ - | └───────────────────────────────────────────────────────┘ └─────────────────────────────────────────────┘ - | ^ ^ ^ ^ - | │ │ │ │ - | ┌────┴──────────────┐ │ │ │ - | │ [tf] > idf > [idf]│ │ │ │ - | └───────────────────┘ │ │ │ - | ^ │ │ │ - | │ ┌───┼───────────────────────────────────┘ │ - | │ │ └───────────────────────────┐ │ - | │ │ ┌─────┼───────────────┘ - | │ │ │ │ - | ┌─────┴─────────┴──────┐ │ │ - | │ [stemmed] > tf > [tf]│ │ │ - | └──────────────────────┘ │ │ - | ^ │ │ - | │ │ │ - | └─────────────┐ │ │ - | │ │ │ - | ┌───────┴───────────────┴─────┴┐ - | │ [token] > stemmed > [stemmed]│ - | └──────────────────────────────┘ - | ^ - | │ - | ┌─────────────┴────────────┐ - | │ [input] > token > [token]│ - | └──────────────────────────┘ - | ^ - | │ - | ┌────────┴──────┐ - | │(TAIL>) [input]│ - | └───────────────┘ - """.stripMargin - ) - } - - it("Flow can build PipelineModel") { - val model = ( - DFD('input) - :-> new Tokenizer() -> TOKEN - :-> stemming -> STEMMED - :-> tf -> TF - :>- STEMMED :&& TF :>> UDFTransformer(zipping) -> TF_ZIPPED - ).buildModel() - - val stages = model.stages - val input_output = getInputsOutputs(stages) - input_output - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(StopWordsRemover,token,stemmed) - |(HashingTF,stemmed,tf) - |(UDFTransformer,stemmed|tf,tf_zipped) - """.stripMargin - ) - - val transformed = model.transform(training) - - transformed.schema.treeString.shouldBe( - """ - |root - | |-- id: long (nullable = false) - | |-- input: string (nullable = true) - | |-- label: double (nullable = false) - | |-- token: array (nullable = true) - | | |-- element: string (containsNull = true) - | |-- stemmed: array (nullable = true) - | | |-- element: string (containsNull = true) - | |-- tf: vector (nullable = true) - | |-- tf_zipped: array (nullable = true) - | | |-- element: struct (containsNull = true) - | | | |-- _1: string (nullable = true) - | | | |-- _2: double (nullable = false) - |""".stripMargin - ) - - transformed.show(false) - - transformed - .collect() - .mkString("\n") - .shouldBe( - """ - |[0,a b c d e spark,1.0,ArraySeq(a, b, c, d, e, spark),ArraySeq(b, c, d, e, spark),(262144,[74920,89530,148981,167694,173558],[1.0,1.0,1.0,1.0,1.0]),ArraySeq([b,1.0], [c,1.0], [d,1.0], [e,1.0], [spark,1.0])] - |[1,b d,0.0,ArraySeq(b, d),ArraySeq(b, d),(262144,[89530,148981],[1.0,1.0]),ArraySeq([b,1.0], [d,1.0])] - |[2,spark f g h,1.0,ArraySeq(spark, f, g, h),ArraySeq(spark, f, g, h),(262144,[36803,173558,209078,228158],[1.0,1.0,1.0,1.0]),ArraySeq([spark,1.0], [f,1.0], [g,1.0], [h,1.0])] - |[3,hadoop mapreduce,0.0,ArraySeq(hadoop, mapreduce),ArraySeq(hadoop, mapreduce),(262144,[132966,198017],[1.0,1.0]),ArraySeq([hadoop,1.0], [mapreduce,1.0])] - |[3,hadoop mapreduce,1.0,ArraySeq(hadoop, mapreduce),ArraySeq(hadoop, mapreduce),(262144,[132966,198017],[1.0,1.0]),ArraySeq([hadoop,1.0], [mapreduce,1.0])] - |[4,b spark who,1.0,ArraySeq(b, spark, who),ArraySeq(b, spark),(262144,[148981,173558],[1.0,1.0]),ArraySeq([b,1.0], [spark,1.0])] - |[5,g d 
a y,0.0,ArraySeq(g, d, a, y),ArraySeq(g, d, y),(262144,[36803,89530,220451],[1.0,1.0,1.0]),ArraySeq([g,1.0], [d,1.0], [y,1.0])] - |[6,spark fly,1.0,ArraySeq(spark, fly),ArraySeq(spark, fly),(262144,[39928,173558],[1.0,1.0]),ArraySeq([spark,1.0], [fly,1.0])] - |[7,was mapreduce,0.0,ArraySeq(was, mapreduce),ArraySeq(mapreduce),(262144,[132966],[1.0]),ArraySeq([mapreduce,1.0])] - |[8,e spark program,1.0,ArraySeq(e, spark, program),ArraySeq(e, spark, program),(262144,[76285,167694,173558],[1.0,1.0,1.0]),ArraySeq([e,1.0], [spark,1.0], [program,1.0])] - |[9,a e c l,0.0,ArraySeq(a, e, c, l),ArraySeq(e, c, l),(262144,[1303,74920,167694],[1.0,1.0,1.0]),ArraySeq([e,1.0], [c,1.0], [l,1.0])] - |[10,spark compile,1.0,ArraySeq(spark, compile),ArraySeq(spark, compile),(262144,[109869,173558],[1.0,1.0]),ArraySeq([spark,1.0], [compile,1.0])] - |[11,hadoop software,0.0,ArraySeq(hadoop, software),ArraySeq(hadoop, software),(262144,[123474,198017],[1.0,1.0]),ArraySeq([hadoop,1.0], [software,1.0])] - |""".stripMargin - ) - - } - - val validPart: DFD = ( - DFD('input) - :-> new Tokenizer() -> TOKEN - :-> tf -> TF - ) - - val validPart2: DFD = DFD('label) :>> new OneHotEncoder() -> "label_one_hot" - val irrelevantPart: DFD = DFD('dummy) :>> new OneHotEncoder() -> "dummy_one_hot" - val typeInconsistentPart: DFD = DFD('label) :>> new Tokenizer() -> "label_cannot_be_tokenized" - - it("If adaptation = IgnoreIrrelevant, Flow can build a full pipeline given a valid schema evidence") { - - val complete = ((validPart U validPart2) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.IgnoreIrrelevant - ) - - getInputsOutputs(complete.getStages) - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(HashingTF,token,tf) - |(OneHotEncoder,label,label_one_hot) - |(VectorAssembler,tf|label_one_hot,VectorAssembler) - """.stripMargin - ) - } - - it("If adaptation = IgnoreIrrelevant, Flow can build an incomplete pipeline when some of the sources are missing") { - - val incomplete = ((validPart U irrelevantPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.IgnoreIrrelevant - ) - - getInputsOutputs(incomplete.getStages) - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(HashingTF,token,tf) - """.stripMargin - ) - } - - it( - "If adaptation = IgnoreIrrelevant, Flow can build an incomplete pipeline when some of the sources have inconsistent type" - ) { - - val incomplete = ((validPart U typeInconsistentPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.IgnoreIrrelevant - ) - - getInputsOutputs(incomplete.getStages) - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(HashingTF,token,tf) - """.stripMargin - ) - } - - it( - "If adaptation = IgnoreIrrelevant_TypeUnsafe, Flow can still build a full pipeline when some of the sources have inconsistent type" - ) { - - val incomplete = ((validPart U typeInconsistentPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.IgnoreIrrelevant_TypeUnsafe - ) - - getInputsOutputs(incomplete.getStages) - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(HashingTF,token,tf) - |(Tokenizer,label,label_cannot_be_tokenized) - |(VectorAssembler,tf|label_cannot_be_tokenized,VectorAssembler) - """.stripMargin - ) - } - - it("If adaptation = Force, Flow can still build a full pipeline when some of the sources are missing") { - - val forced = ((validPart U 
irrelevantPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.Force - ) - - getInputsOutputs(forced.getStages) - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(HashingTF,token,tf) - |(OneHotEncoder,dummy,dummy_one_hot) - |(VectorAssembler,tf|dummy_one_hot,VectorAssembler) - """.stripMargin - ) - } - - it("If adaptation = Force, Flow can still build a full pipeline when some of the sources have inconsistent type") { - - val forced = ((validPart U typeInconsistentPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.Force - ) - - getInputsOutputs(forced.getStages) - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(HashingTF,token,tf) - |(Tokenizer,label,label_cannot_be_tokenized) - |(VectorAssembler,tf|label_cannot_be_tokenized,VectorAssembler) - """.stripMargin - ) - } - - it("If adaption = FailFast, throw an exception when some of the sources are missing") { - - intercept[IllegalArgumentException]( - ((validPart U irrelevantPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.FailFast - ) - ) - } - - it("If adaption = FailFast, throw an exception when some of the sources have inconsistent type") { - - intercept[IllegalArgumentException]( - ((validPart U typeInconsistentPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.FailFast - ) - ) - } - - it( - "If adaptation = FailFast_TypeUnsafe, Flow can still build a full pipeline when some of the sources have inconsistent type" - ) { - - val incomplete = ((validPart U typeInconsistentPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.FailFast_TypeUnsafe - ) - - getInputsOutputs(incomplete.getStages) - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(HashingTF,token,tf) - |(Tokenizer,label,label_cannot_be_tokenized) - |(VectorAssembler,tf|label_cannot_be_tokenized,VectorAssembler) - """.stripMargin - ) - } - - it( - "If adaption = IgnoreIrrelevant_ValidateSchema, Flow can build an incomplete pipeline when some of the sources are missing" - ) { - - val incomplete = ((validPart U irrelevantPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.IgnoreIrrelevant_ValidateSchema - ) - - getInputsOutputs(incomplete.getStages) - .mkString("\n") - .shouldBe( - """ - |(Tokenizer,input,token) - |(HashingTF,token,tf) - """.stripMargin - ) - } - - it( - "If adaption = IgnoreIrrelevant_ValidateSchema, throw an exception when some of the sources have inconsistent type" - ) { - - intercept[IllegalArgumentException]( - ((validPart U typeInconsistentPart) :>> new VectorAssembler()) - .build( - dfEvidence = training, - adaptation = SchemaAdaptation.IgnoreIrrelevant_ValidateSchema - ) - ) - } -} diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/MapHeadSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/MapHeadSuite.scala deleted file mode 100644 index b55ce155d..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/MapHeadSuite.scala +++ /dev/null @@ -1,231 +0,0 @@ -package org.apache.spark.ml.dsl - -import org.apache.spark.ml.feature._ - -/** - * Created by peng on 27/04/16. 
- */ -class MapHeadSuite extends AbstractDFDSuite { - - import DFDComponent._ - - it("mapHead_> Source doesn't work") { - intercept[IllegalArgumentException] { - 'input :=>> new Tokenizer() :=>> 'dummy - } - } - - it("mapHead_< Source doesn't work") { - intercept[IllegalArgumentException] { - 'input <<=: new Tokenizer() <<=: 'dummy - } - } - - it("mapHead_> can append to 2 heads") { - val flow = ( - ('input1 U 'input2) - :=>> new VectorAssembler() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input1] - |+- > ForwardNode (HEAD) [input1] > VectorAssembler > [input1$VectorAssembler] - |> ForwardNode (TAIL>) [input2] - |+- > ForwardNode (HEAD)( VectorAssembler > [input2$VectorAssembler] - |/ right < - |> ForwardNode (HEAD)( VectorAssembler > [input2$VectorAssembler] - """.stripMargin - ) - } - - it("mapHead_< can append to 2 heads") { - - val flow = ( - new VectorAssembler() - <<=: ('input1 U 'input2) - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input2] > VectorAssembler > [input2$VectorAssembler] - |/ right < - |> ForwardNode ( ForwardNode (HEAD) [input1] > VectorAssembler > [input1$VectorAssembler] - |> ForwardNode ( ForwardNode (HEAD)(TAIL>) [input2] > VectorAssembler > [input2$VectorAssembler] - """.stripMargin - ) - } - - it("mapHead_> can generate 2 stage replicas and append to 2 selected") { - - val flow = ( - ( - 'input - :>> new Tokenizer() - :>> new StopWordsRemover() - ).from("Tokenizer") - .and("StopWordsRemover") - :=>> new NGram() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode [input] > Tokenizer > [input$Tokenizer] - | :- > ForwardNode (HEAD) [input$Tokenizer] > NGram > [input$Tokenizer$NGram] - | +- > ForwardNode [input$Tokenizer] > StopWordsRemover > [input$Tokenizer$StopWordsRemover] - | +- > ForwardNode (HEAD)( NGram > [input$Tokenizer$StopWordsRemover$NGram] - |/ right < - |> ForwardNode (HEAD)( NGram > [input$Tokenizer$StopWordsRemover$NGram] - """.stripMargin - ) - } - - it("mapHead_< can generate 2 stage replicas and append to 2 selected") { - - val flow = ( - new NGram() - <<=: ( - new StopWordsRemover() - <<: new Tokenizer() - <<: 'input - ).from("Tokenizer") - .and("StopWordsRemover") - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input$Tokenizer$StopWordsRemover] > NGram > [input$Tokenizer$StopWordsRemover$NGram] - |/ right < - |> ForwardNode ( ForwardNode [input] > Tokenizer > [input$Tokenizer] - | :- > ForwardNode (HEAD) [input$Tokenizer] > NGram > [input$Tokenizer$NGram] - | +- > ForwardNode [input$Tokenizer] > StopWordsRemover > [input$Tokenizer$StopWordsRemover] - | +- > ForwardNode (HEAD)(TAIL>) [input$Tokenizer$StopWordsRemover] > NGram > [input$Tokenizer$StopWordsRemover$NGram] - """.stripMargin - ) - } - - it("mapHead_> can generate 2 stage replicas and append to 2 heads") { - - val flow = ( - ( - 'input - :>> new Tokenizer() - :>> new StopWordsRemover() - ).from("Tokenizer") - .and("StopWordsRemover") - :=>> new NGram() - :=>> new HashingTF() - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (TAIL>) [input] - |+- > ForwardNode [input] > Tokenizer > [input$Tokenizer] - | :- > ForwardNode [input$Tokenizer] > 
NGram > [input$Tokenizer$NGram] - | : +- > ForwardNode (HEAD) [input$Tokenizer$NGram] > HashingTF > [input$Tokenizer$NGram$HashingTF] - | +- > ForwardNode [input$Tokenizer] > StopWordsRemover > [input$Tokenizer$StopWordsRemover] - | +- > ForwardNode [input$Tokenizer$StopWordsRemover] > NGram > [input$Tokenizer$StopWordsRemover$NGram] - | +- > ForwardNode (HEAD)( HashingTF > [input$Tokenizer$StopWordsRemover$NGram$HashingTF] - |/ right < - |> ForwardNode (HEAD)( HashingTF > [input$Tokenizer$StopWordsRemover$NGram$HashingTF] - """.stripMargin - ) - } - - it("mapHead_< can generate 2 stage replicas and append to 2 heads") { - - val flow = ( - new HashingTF() - <<=: new NGram() - <<=: ( - new StopWordsRemover() - <<: new Tokenizer() - <<: 'input - ).from("Tokenizer") - .and("StopWordsRemover") - ) - - flow - .show(showID = false, compactionOpt = compactionOpt) - .treeNodeShouldBe( - """ - |\ left > - |> ForwardNode (HEAD)(TAIL>) [input$Tokenizer$StopWordsRemover$NGram] > HashingTF > [input$Tokenizer$StopWordsRemover$NGram$HashingTF] - |/ right < - |> ForwardNode ( ForwardNode [input] > Tokenizer > [input$Tokenizer] - | :- > ForwardNode [input$Tokenizer] > NGram > [input$Tokenizer$NGram] - | : +- > ForwardNode (HEAD) [input$Tokenizer$NGram] > HashingTF > [input$Tokenizer$NGram$HashingTF] - | +- > ForwardNode [input$Tokenizer] > StopWordsRemover > [input$Tokenizer$StopWordsRemover] - | +- > ForwardNode [input$Tokenizer$StopWordsRemover] > NGram > [input$Tokenizer$StopWordsRemover$NGram] - | +- > ForwardNode (HEAD)(TAIL>) [input$Tokenizer$StopWordsRemover$NGram] > HashingTF > [input$Tokenizer$StopWordsRemover$NGram$HashingTF] - """.stripMargin - ) - } - - it("mapHead_> won't remove Source of downstream if it's in tails of both side") { - - val dummy: DFDComponent = 'dummy - val down = dummy :-> new VectorAssembler() <-: dummy - - val flow = ( - 'input - :=>> down - ) - - flow - .show(showID = false, compactionOpt = compactionOpt, forward = false) - .treeNodeShouldBe( - """ - |< BackwardNode (HEAD) [dummy,dummy,input] > VectorAssembler > [VectorAssembler] - |:- < BackwardNode () [input] - """.stripMargin - ) - } - - it("mapHead_< won't remove Source of downstream if it's in tails of both side") { - - val dummy: DFDComponent = 'dummy - val down = dummy :-> new VectorAssembler() <-: dummy - - val flow = - down <<=: - 'input - - flow - .show(showID = false, compactionOpt = compactionOpt, forward = false) - .treeNodeShouldBe( - """ - |< BackwardNode (HEAD) [dummy,dummy,input] > VectorAssembler > [VectorAssembler] - |:- < BackwardNode (TAIL>) [dummy] - |:- < BackwardNode (TAIL>) [dummy] - |+- < BackwardNode ( v -> v.mkString) - - val trie = TrieNode.build(map) - - trie - .toString() - .shouldBe( - """ - |TrieNode 0 - |:- TrieNode [1], 1, 1 - |: +- TrieNode [1, 2], 12, 2 - |: :- TrieNode [1, 2, 3], 123, 3 - |: +- TrieNode [1, 2, 4], 124, 3 - |+- TrieNode [A], A, 1 - | +- TrieNode [A, B], AB, 2 - | +- TrieNode [A, B, C], ABC, 3 - | +- TrieNode [A, B, C, D], ABCD, 4 - | :- TrieNode [A, B, C, D, E], ABCDE, 5 - | +- TrieNode [A, B, C, D, F], ABCDF, 5 - """.stripMargin - ) - - trie.compact - .rebuildDepth() - .toString() - .shouldBe( - """ - |TrieNode 0 - |:- TrieNode [1, 2], 12, 1 - |: :- TrieNode [1, 2, 3], 123, 2 - |: +- TrieNode [1, 2, 4], 124, 2 - |+- TrieNode [A, B, C, D], ABCD, 1 - | :- TrieNode [A, B, C, D, E], ABCDE, 2 - | +- TrieNode [A, B, C, D, F], ABCDF, 2 - """.stripMargin - ) - } - - it("pruneUp can rename single children") { - val map = Seq( - "A", - "AB", - "ABC", - "ABCD", - "ABCDE", - 
"ABCDF", - "1", - "12", - "123", - "124" - ).map(_.split("").toSeq) - .map(v => v -> v.mkString) - - val trie = TrieNode.build(map) - - trie - .toString() - .shouldBe( - """ - |TrieNode 0 - |:- TrieNode [1], 1, 1 - |: +- TrieNode [1, 2], 12, 2 - |: :- TrieNode [1, 2, 3], 123, 3 - |: +- TrieNode [1, 2, 4], 124, 3 - |+- TrieNode [A], A, 1 - | +- TrieNode [A, B], AB, 2 - | +- TrieNode [A, B, C], ABC, 3 - | +- TrieNode [A, B, C, D], ABCD, 4 - | :- TrieNode [A, B, C, D, E], ABCDE, 5 - | +- TrieNode [A, B, C, D, F], ABCDF, 5 - """.stripMargin - ) - - trie.pruneUp - .rebuildDepth() - .toString() - .shouldBe( - """ - |TrieNode 0 - |:- TrieNode [1], 1, 1 - |: +- TrieNode [1], 12, 2 - |: :- TrieNode [1, 3], 123, 3 - |: +- TrieNode [1, 4], 124, 3 - |+- TrieNode [A], A, 1 - | +- TrieNode [A], AB, 2 - | +- TrieNode [A], ABC, 3 - | +- TrieNode [A], ABCD, 4 - | :- TrieNode [A, E], ABCDE, 5 - | +- TrieNode [A, F], ABCDF, 5 - """.stripMargin - ) - } - - it("reversed pruneUp can minimize names") { - val names = - """ - |AB - |ABC - |ABCD - |ABCDE - |ABK - |ABCK - |ABCDK - |ABCDEK - """.trim.stripMargin - .split("\n") - .map(_.split("").toSeq) - - val trie = TrieNode.build( - names - .map(_.reverse) - .map(v => v -> v) - ) - - val pairs = trie.pruneUp - .flatMap { node => - val k = node.key - node.value.map(_ -> k) - } - .map(tuple => tuple._1.reverse -> tuple._2.reverse) - - val map = Map(pairs: _*) - val result = names.map { v => - v.mkString -> map(v).mkString - } - result - .mkString("\n") - .shouldBe( - """ - |(AB,B) - |(ABC,C) - |(ABCD,D) - |(ABCDE,E) - |(ABK,BK) - |(ABCK,CK) - |(ABCDK,DK) - |(ABCDEK,EK) - """.stripMargin - ) - } - - it("reversed compact can minimize repeated names") { - val names = - """ - |A - |AA - |AAA - |AAAA - |AAAAA - |AAAAAA - |AAAAAB - """.trim.stripMargin - .split("\n") - .map(_.split("").toSeq) - - val trie = TrieNode.build( - names - .map(_.reverse) - .map(v => v -> v) - ) - - val pairs = trie.pruneUp - .flatMap { node => - val k = node.key - node.value.map(_ -> k) - } - .map(tuple => tuple._1.reverse -> tuple._2.reverse) - - val map = Map(pairs: _*) - val result = names.map { v => - v.mkString -> map(v).mkString - } - result - .mkString("\n") - .shouldBe( - """ - |(A,A) - |(AA,A) - |(AAA,A) - |(AAAA,A) - |(AAAAA,A) - |(AAAAAA,A) - |(AAAAAB,B) - """.stripMargin - ) - } - - it("reversed compact can minimize some names") { - val names = - """ - |input Tokenizer - |input Tokenizer HashingTF - |input Tokenizer StopWordsRemover HashingTF - |input Tokenizer HashingTF VectorAssembler - |input Tokenizer StopWordsRemover HashingTF VectorAssembler - |input - |input Tokenizer StopWordsRemover - """.trim.stripMargin - .split("\n") - .map(_.split(" ").toSeq) - - val trie = TrieNode.build( - names - .map(_.reverse) - .map(v => v -> v) - ) - - val pairs = trie.pruneUp - .flatMap { node => - val k = node.key - node.value.map(_ -> k) - } - .map(tuple => tuple._1.reverse -> tuple._2.reverse) - - val map = Map(pairs: _*) - val result = names.map { v => - v.mkString(" ") -> map(v).mkString(" ") - } - result - .mkString("\n") - .shouldBe( - """ - |(input Tokenizer,Tokenizer) - |(input Tokenizer HashingTF,Tokenizer HashingTF) - |(input Tokenizer StopWordsRemover HashingTF,StopWordsRemover HashingTF) - |(input Tokenizer HashingTF VectorAssembler,Tokenizer VectorAssembler) - |(input Tokenizer StopWordsRemover HashingTF VectorAssembler,StopWordsRemover VectorAssembler) - |(input,input) - |(input Tokenizer StopWordsRemover,StopWordsRemover) - """.stripMargin - ) - } -} diff --git 
a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/UDFTransformerSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/UDFTransformerSuite.scala deleted file mode 100644 index 932015f09..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/UDFTransformerSuite.scala +++ /dev/null @@ -1,49 +0,0 @@ -package org.apache.spark.ml.dsl - -import com.tribbloids.spookystuff.testutils.{BaseSpec, TestHelper} -import org.apache.spark.ml.feature.Tokenizer -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.expressions.UserDefinedFunction - -case class User( - name: String, - age: Int -) - -class UDFTransformerSuite extends BaseSpec { - - val df1 = TestHelper.TestSQL.createDataFrame( - Seq( - User("Reza$", 25), - User("Holden$", 25) - ) - ) - - val tokenizer: Tokenizer = new Tokenizer().setInputCol("name").setOutputCol("name_token") - val stemming: UserDefinedFunction = udf { v: Seq[String] => - v.map(_.stripSuffix("$")) - } - val arch = UDFTransformer().setUDFSafely(stemming).setInputCols(Array("name_token")).setOutputCol("name_stemmed") - val src: DataFrame = tokenizer.transform(df1) - - it("transformer has consistent schema") { - val end = arch.transform(src) - val endSchema = end.schema - val endSchema2 = arch.transformSchema(src.schema) - assert(endSchema.toString() == endSchema2.toString()) - } - - it("transformer can add new column") { - val end = arch.transform(src) - end - .collect() - .mkString("\n") - .shouldBe( - """ - |[Reza$,25,ArraySeq(reza$),ArraySeq(reza)] - |[Holden$,25,ArraySeq(holden$),ArraySeq(holden)] - |""".stripMargin - ) - } -} diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/ScalaNameMixinSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/ClassOpsMixinSpec.scala similarity index 53% rename from parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/ScalaNameMixinSuite.scala rename to parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/ClassOpsMixinSpec.scala index 5e5556436..faf7aefc8 100644 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/ScalaNameMixinSuite.scala +++ b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/ClassOpsMixinSpec.scala @@ -2,24 +2,26 @@ package org.apache.spark.ml.dsl.utils import com.tribbloids.spookystuff.testutils.BaseSpec -class ScalaNameMixinSuite extends BaseSpec { +class ClassOpsMixinSpec extends BaseSpec { it("can process anonymous function dependent object") { - object impl extends ObjectSimpleNameMixin + object impl extends ClassOpsMixin def getImpl = { - object impl extends ObjectSimpleNameMixin + object impl extends ClassOpsMixin impl } - val vs = (0 to 3).flatMap { _ => + val vs: Seq[ClassOpsMixin] = (0 to 3).flatMap { _ => Seq(impl, getImpl) } vs.foreach { v => - v.objectSimpleName.shouldBe( + val clz = v.getClass + + clz.simpleName_Scala.shouldBe( "impl" ) } diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/DSLUtilsSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/DSLUtilsSuite.scala deleted file mode 100644 index a703a76f9..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/DSLUtilsSuite.scala +++ /dev/null @@ -1,31 +0,0 @@ -package org.apache.spark.ml.dsl.utils - -import com.tribbloids.spookystuff.testutils.BaseSpec - -/** - * Created by peng on 10/04/16. 
- */ -class DSLUtilsSuite extends BaseSpec { - - def get1(): Array[StackTraceElement] = { - DSLUtils.getBreakpointInfo() - } - - lazy val get2: Array[StackTraceElement] = get1() - - val get3: Array[StackTraceElement] = get2 - - def defaultParamCaller( - c: Array[StackTraceElement] = get2 - ): Array[StackTraceElement] = c - - it("methodName should return caller's name") { - assert(get3.head.getMethodName == "get1") - assert(get3(1).getMethodName == "get2") - assert(get3(2).isNativeMethod) - - val dpc = defaultParamCaller() - assert(dpc.head.getMethodName == "get1") - assert(dpc.apply(1).getMethodName == "get2") - } -} diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/NullSafeMagnetSuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/NullSafeMagnetSuite.scala new file mode 100644 index 000000000..b3323324f --- /dev/null +++ b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/NullSafeMagnetSuite.scala @@ -0,0 +1,32 @@ +package org.apache.spark.ml.dsl.utils + +import com.tribbloids.spookystuff.testutils.BaseSpec + +class NullSafeMagnetSuite extends BaseSpec { + + val v: String = "abc" + + def validate(nullSafe: => NullSafeMagnet[String]): NullSafeMagnet[String] = { + + assert(nullSafe.asOption.get == v) + nullSafe + } + + it("can be converted from option") { + + validate(Some("abc"): String ?? _) + } + + it("can be converted from value") { + + validate("abc": String ?? _) + } + + it("CannotBeNull can only be converted from Some") { + + validate(Some("abc"): String !! _) + + // this will fail +// validate(Option("abc"): String ! _) + } +} diff --git a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/NullSafetySuite.scala b/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/NullSafetySuite.scala deleted file mode 100644 index dc04dad17..000000000 --- a/parent/commons/src/test/scala/org/apache/spark/ml/dsl/utils/NullSafetySuite.scala +++ /dev/null @@ -1,44 +0,0 @@ -package org.apache.spark.ml.dsl.utils - -import com.tribbloids.spookystuff.testutils.BaseSpec - -class NullSafetySuite extends BaseSpec { - - val v: String = "abc" - - def validate(nullSafe: => NullSafety.Magnet[String]): NullSafety.Magnet[String] = { - - assert(nullSafe.asOption.get == v) - nullSafe - } - - it("can be converted from option") { - - validate(Some("abc"): String `?` _) - } - - it("can be converted from value") { - - validate("abc": String `?` _) - } - - it("CannotBeNull can only be converted from Some") { - - validate(Some("abc"): String ! _) - - // this will fail -// validate(Option("abc"): String ! _) - } - - it("String ? Var supports mutation") { - validate { - val v: String `?` Var = "def" - `v` := "abc" - v - } - - // this will fail -// val v: String ? 
_ = "def" -// v := "abc" - } -} diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/SpookyContext.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/SpookyContext.scala index ee6b59b30..7b46e0ea5 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/SpookyContext.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/SpookyContext.scala @@ -9,12 +9,12 @@ import com.tribbloids.spookystuff.relay.io.Encoder import com.tribbloids.spookystuff.row._ import com.tribbloids.spookystuff.agent.Agent import com.tribbloids.spookystuff.utils.io.HDFSResolver +import com.tribbloids.spookystuff.utils.refl.{ToCatalyst, TypeMagnet} import com.tribbloids.spookystuff.utils.serialization.{NOTSerializable, SerializerOverride} import com.tribbloids.spookystuff.utils.{ShippingMarks, TreeThrowable} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.broadcast.Broadcast -import org.apache.spark.ml.dsl.utils.refl.{ToCatalyst, TypeMagnet} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext} diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/actions/actions.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/actions/actions.scala index 825e3bcec..62e98fc9f 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/actions/actions.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/actions/actions.scala @@ -4,8 +4,8 @@ import com.tribbloids.spookystuff.actions.Trace.DryRun import com.tribbloids.spookystuff.doc.{Doc, Observation} import com.tribbloids.spookystuff.agent.Agent import com.tribbloids.spookystuff.utils.CommonUtils +import com.tribbloids.spookystuff.utils.refl.ScalaUDT import com.tribbloids.spookystuff.{ActionException, SpookyContext} -import org.apache.spark.ml.dsl.utils.refl.ScalaUDT import org.apache.spark.sql.types.SQLUserDefinedType import org.slf4j.LoggerFactory diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/doc/FetchedUDT.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/doc/FetchedUDT.scala index 8d6caf9d8..7a4d18f03 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/doc/FetchedUDT.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/doc/FetchedUDT.scala @@ -1,5 +1,5 @@ package com.tribbloids.spookystuff.doc -import org.apache.spark.ml.dsl.utils.refl.ScalaUDT +import com.tribbloids.spookystuff.utils.refl.ScalaUDT class FetchedUDT extends ScalaUDT[Observation] diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/doc/UnstructuredUDT.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/doc/UnstructuredUDT.scala index 17b39bac8..b9266b1b0 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/doc/UnstructuredUDT.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/doc/UnstructuredUDT.scala @@ -1,5 +1,5 @@ package com.tribbloids.spookystuff.doc -import org.apache.spark.ml.dsl.utils.refl.ScalaUDT +import com.tribbloids.spookystuff.utils.refl.ScalaUDT class UnstructuredUDT extends ScalaUDT[Unstructured] diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/execution/Delta.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/execution/Delta.scala index d8212d10a..6f97f814d 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/execution/Delta.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/execution/Delta.scala @@ -7,7 +7,7 @@ import com.tribbloids.spookystuff.dsl.ForkType import 
com.tribbloids.spookystuff.extractors.impl.Get import com.tribbloids.spookystuff.extractors.{Extractor, Resolved} import com.tribbloids.spookystuff.row.{DataRow, Field, Sampler, SpookySchema, SquashedRow, TypedField} -import org.apache.spark.ml.dsl.utils.refl.CatalystTypeOps +import com.tribbloids.spookystuff.utils.refl.CatalystTypeOps import org.apache.spark.sql.types.{ArrayType, IntegerType} trait Delta extends Serializable { diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/execution/DeltaPlan.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/execution/DeltaPlan.scala index d9cb07d5e..e70a74c85 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/execution/DeltaPlan.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/execution/DeltaPlan.scala @@ -2,7 +2,7 @@ package com.tribbloids.spookystuff.execution import com.tribbloids.spookystuff.execution.Delta.ToDelta import com.tribbloids.spookystuff.row._ -import org.apache.spark.ml.dsl.utils.refl.CatalystTypeOps +import com.tribbloids.spookystuff.utils.refl.CatalystTypeOps case class DeltaPlan( override val child: ExecutionPlan, diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/GenExtractor.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/GenExtractor.scala index b12558cae..c702357fd 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/GenExtractor.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/GenExtractor.scala @@ -4,7 +4,7 @@ import com.tribbloids.spookystuff.Const import com.tribbloids.spookystuff.row.Field import com.tribbloids.spookystuff.tree.TreeView import com.tribbloids.spookystuff.utils.SpookyUtils -import org.apache.spark.ml.dsl.utils.refl.{CatalystTypeOps, TypeMagnet, UnreifiedObjectType} +import com.tribbloids.spookystuff.utils.refl.{CatalystTypeOps, TypeMagnet, UnreifiedObjectType} import org.apache.spark.sql.catalyst.ScalaReflection.universe import org.apache.spark.sql.catalyst.ScalaReflection.universe.TypeTag diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/GenExtractorImplicits.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/GenExtractorImplicits.scala index 1778deb6f..632db7689 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/GenExtractorImplicits.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/GenExtractorImplicits.scala @@ -6,7 +6,7 @@ import com.tribbloids.spookystuff.extractors.GenExtractor.AndThen import com.tribbloids.spookystuff.extractors.impl.Extractors._ import com.tribbloids.spookystuff.extractors.impl.{AppendSeq, Get, Zipped} import com.tribbloids.spookystuff.row.Field -import org.apache.spark.ml.dsl.utils.refl.{CatalystTypeOps, UnreifiedObjectType} +import com.tribbloids.spookystuff.utils.refl.{CatalystTypeOps, UnreifiedObjectType} import org.apache.spark.sql.types.MapType import java.sql.Timestamp diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/ScalaDynamicExtractor.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/ScalaDynamicExtractor.scala index ddb570503..17c49e969 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/ScalaDynamicExtractor.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/ScalaDynamicExtractor.scala @@ -1,7 +1,7 @@ package com.tribbloids.spookystuff.extractors +import 
com.tribbloids.spookystuff.utils.refl.{CatalystTypeOps, TypeMagnet, TypeUtils, UnreifiedObjectType} import org.apache.spark.ml.dsl.utils.DSLUtils -import org.apache.spark.ml.dsl.utils.refl._ import org.apache.spark.sql.catalyst.ScalaReflection.universe._ import java.lang.reflect.Method diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/impl/Get.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/impl/Get.scala index e4120d47c..7707b7ef6 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/impl/Get.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/impl/Get.scala @@ -3,7 +3,7 @@ package com.tribbloids.spookystuff.extractors.impl import com.tribbloids.spookystuff.extractors.GenExtractor.Leaf import com.tribbloids.spookystuff.extractors._ import com.tribbloids.spookystuff.row._ -import org.apache.spark.ml.dsl.utils.refl.CatalystTypeOps +import com.tribbloids.spookystuff.utils.refl.CatalystTypeOps import org.apache.spark.sql.types._ /** diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/impl/Lit.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/impl/Lit.scala index 999d64bcc..ad6876f99 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/impl/Lit.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/extractors/impl/Lit.scala @@ -5,7 +5,7 @@ import com.tribbloids.spookystuff.extractors.GenExtractor.Static import com.tribbloids.spookystuff.extractors._ import com.tribbloids.spookystuff.relay.IR.Aux import com.tribbloids.spookystuff.relay.{Relay, TreeIR} -import org.apache.spark.ml.dsl.utils.refl.UnreifiedObjectType +import com.tribbloids.spookystuff.utils.refl.UnreifiedObjectType import org.apache.spark.sql.catalyst.ScalaReflection.universe.TypeTag import org.apache.spark.sql.types._ diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/AbstractMetrics.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/AbstractMetrics.scala index 2e981f8e9..59869bf3c 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/AbstractMetrics.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/AbstractMetrics.scala @@ -2,7 +2,7 @@ package com.tribbloids.spookystuff.metrics import com.tribbloids.spookystuff.relay.TreeIR import com.tribbloids.spookystuff.utils.CommonUtils -import org.apache.spark.ml.dsl.utils.refl.ReflectionUtils +import com.tribbloids.spookystuff.utils.refl.ReflectionUtils import org.apache.spark.util.AccumulatorV2 import scala.collection.mutable diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/Acc.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/Acc.scala index d38afbe6e..6355a0c52 100644 --- a/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/Acc.scala +++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/Acc.scala @@ -2,7 +2,7 @@ package com.tribbloids.spookystuff.metrics import com.tribbloids.spookystuff.utils.accumulator.MapAccumulator import org.apache.spark.SparkContext -import org.apache.spark.ml.dsl.utils.? +import org.apache.spark.ml.dsl.utils.?? 
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.execution.streaming.EventTimeStatsAccum
 import org.apache.spark.util.{AccumulatorV2, DoubleAccumulator, LongAccumulator}
@@ -144,7 +144,7 @@ object Acc {
     }
   }
 
-  def create[IN, T <: AccumulatorV2[_, _]](value: IN, displayNameOvrd: String `?` _ = None)(
+  def create[IN, T <: AccumulatorV2[_, _]](value: IN, displayNameOvrd: String ?? _ = None)(
       implicit canBuild: CanInit[IN, T]
   ): Acc[T] = {
diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/MetricLike.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/MetricLike.scala
index 0ab746655..25db3c363 100644
--- a/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/MetricLike.scala
+++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/metrics/MetricLike.scala
@@ -1,11 +1,11 @@
 package com.tribbloids.spookystuff.metrics
 
 import ai.acyclic.prover.commons.same.EqualBy
-import org.apache.spark.ml.dsl.utils.ObjectSimpleNameMixin
+import org.apache.spark.ml.dsl.utils.ClassOpsMixin
 
-trait MetricLike extends Product with ObjectSimpleNameMixin with Serializable with EqualBy {
+trait MetricLike extends Product with ClassOpsMixin with Serializable with EqualBy {
 
   def displayNameOvrd: Option[String] = None
 
-  lazy val displayName: String = displayNameOvrd.getOrElse(this.objectSimpleName)
+  lazy val displayName: String = displayNameOvrd.getOrElse(this.getClass.simpleName_Scala)
 }
diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/python/ref/ClassRef.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/python/ref/ClassRef.scala
index 717f0e5ab..051e58dce 100644
--- a/parent/core/src/main/scala/com/tribbloids/spookystuff/python/ref/ClassRef.scala
+++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/python/ref/ClassRef.scala
@@ -2,7 +2,7 @@ package com.tribbloids.spookystuff.python.ref
 
 import com.tribbloids.spookystuff.utils.SpookyUtils
 import com.tribbloids.spookystuff.relay.MessageAPI
-import org.apache.spark.ml.dsl.utils.refl.ReflectionUtils
+import com.tribbloids.spookystuff.utils.refl.ReflectionUtils
 
 trait ClassRef extends PyRef {
diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/rdd/FetchedDataset.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/rdd/FetchedDataset.scala
index 492461679..3a73a42b4 100644
--- a/parent/core/src/main/scala/com/tribbloids/spookystuff/rdd/FetchedDataset.scala
+++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/rdd/FetchedDataset.scala
@@ -11,10 +11,10 @@ import com.tribbloids.spookystuff.extractors._
 import com.tribbloids.spookystuff.extractors.impl.Get
 import com.tribbloids.spookystuff.row._
 import com.tribbloids.spookystuff.utils.SpookyViews
+import com.tribbloids.spookystuff.utils.refl.CatalystTypeOps
 import com.tribbloids.spookystuff.{Const, SpookyContext}
 import org.apache.spark.SparkContext
 import org.apache.spark.sql._SQLHelper
-import org.apache.spark.ml.dsl.utils.refl.CatalystTypeOps
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/row/SpookySchema.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/row/SpookySchema.scala
index 9747ced94..cd9df64df 100644
--- a/parent/core/src/main/scala/com/tribbloids/spookystuff/row/SpookySchema.scala
+++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/row/SpookySchema.scala
@@ -3,7 +3,7 @@ package com.tribbloids.spookystuff.row
 import com.tribbloids.spookystuff.SpookyContext
 import com.tribbloids.spookystuff.execution._
 import com.tribbloids.spookystuff.extractors._
-import org.apache.spark.ml.dsl.utils.refl.ScalaUDT
+import com.tribbloids.spookystuff.utils.refl.ScalaUDT
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
 
 import scala.collection.immutable.ListMap
diff --git a/parent/core/src/main/scala/com/tribbloids/spookystuff/utils/SpookyUtils.scala b/parent/core/src/main/scala/com/tribbloids/spookystuff/utils/SpookyUtils.scala
index d01e51bd4..9d1adbf48 100644
--- a/parent/core/src/main/scala/com/tribbloids/spookystuff/utils/SpookyUtils.scala
+++ b/parent/core/src/main/scala/com/tribbloids/spookystuff/utils/SpookyUtils.scala
@@ -2,10 +2,9 @@ package com.tribbloids.spookystuff.utils
 
 import java.io.File
 import java.net._
-import java.nio.file.{Files, _}
+import java.nio.file._
 
 import com.tribbloids.spookystuff.utils.io.LocalResolver
 import org.apache.commons.io.IOUtils
-import org.apache.spark.ml.dsl.UnsafeUtils
 import org.apache.spark.rdd.RDD
 import org.slf4j.LoggerFactory
@@ -126,7 +125,7 @@ object SpookyUtils {
 
     assert(url.toString.startsWith("file"))
 
-    UnsafeUtils.invoke(
+    UnsafeReflections.invoke(
       classOf[URLClassLoader],
       ClassLoader.getSystemClassLoader,
       "addURL",
diff --git a/parent/core/src/test/scala/com/tribbloids/spookystuff/TestBeans.scala b/parent/core/src/test/scala/com/tribbloids/spookystuff/TestBeans.scala
index a60a7a8f4..453deecc5 100644
--- a/parent/core/src/test/scala/com/tribbloids/spookystuff/TestBeans.scala
+++ b/parent/core/src/test/scala/com/tribbloids/spookystuff/TestBeans.scala
@@ -1,7 +1,7 @@
 package com.tribbloids.spookystuff
 
+import com.tribbloids.spookystuff.utils.refl.ScalaUDT
 import com.tribbloids.spookystuff.utils.serialization.{AssertSerializable, NOTSerializable}
-import org.apache.spark.ml.dsl.utils.refl.ScalaUDT
 import org.apache.spark.sql.types.SQLUserDefinedType
 import org.scalatest.Assertions
diff --git a/parent/core/src/test/scala/com/tribbloids/spookystuff/execution/ExplodeDataPlanSpec.scala b/parent/core/src/test/scala/com/tribbloids/spookystuff/execution/ExplodeDataPlanSpec.scala
index ac94b31b1..089e2573f 100644
--- a/parent/core/src/test/scala/com/tribbloids/spookystuff/execution/ExplodeDataPlanSpec.scala
+++ b/parent/core/src/test/scala/com/tribbloids/spookystuff/execution/ExplodeDataPlanSpec.scala
@@ -2,7 +2,7 @@ package com.tribbloids.spookystuff.execution
 
 import com.tribbloids.spookystuff.extractors.impl.Lit
 import com.tribbloids.spookystuff.testutils.SpookyBaseSpec
-import org.apache.spark.ml.dsl.utils.refl.UnreifiedObjectType
+import com.tribbloids.spookystuff.utils.refl.UnreifiedObjectType
 import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
 
 /**
diff --git a/parent/core/src/test/scala/com/tribbloids/spookystuff/extractors/ScalaDynamicExtractorSuite.scala b/parent/core/src/test/scala/com/tribbloids/spookystuff/extractors/ScalaDynamicExtractorSuite.scala
index b74b65b5a..846311661 100644
--- a/parent/core/src/test/scala/com/tribbloids/spookystuff/extractors/ScalaDynamicExtractorSuite.scala
+++ b/parent/core/src/test/scala/com/tribbloids/spookystuff/extractors/ScalaDynamicExtractorSuite.scala
@@ -8,7 +8,7 @@ import com.tribbloids.spookystuff.rdd.FetchedDataset
 import com.tribbloids.spookystuff.row.SpookySchema
 import com.tribbloids.spookystuff.testutils.{FileDocsFixture, SpookyBaseSpec}
 import com.tribbloids.spookystuff.utils.CommonUtils
-import org.apache.spark.ml.dsl.utils.refl.CatalystTypeOps
+import com.tribbloids.spookystuff.utils.refl.CatalystTypeOps
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.types._
diff --git a/parent/core/src/test/scala/com/tribbloids/spookystuff/extractors/ScalaReflectionSpike.scala b/parent/core/src/test/scala/com/tribbloids/spookystuff/extractors/ScalaReflectionSpike.scala
index c398f0c75..15c2c41e4 100644
--- a/parent/core/src/test/scala/com/tribbloids/spookystuff/extractors/ScalaReflectionSpike.scala
+++ b/parent/core/src/test/scala/com/tribbloids/spookystuff/extractors/ScalaReflectionSpike.scala
@@ -4,7 +4,7 @@ import com.tribbloids.spookystuff.extractors.impl.Lit
 import com.tribbloids.spookystuff.row.FetchedRow
 import com.tribbloids.spookystuff.TestBeans.{Example, GenericExample}
 import com.tribbloids.spookystuff.testutils.BaseSpec
-import org.apache.spark.ml.dsl.utils.refl.{CatalystTypeOps, TypeUtils, UnreifiedObjectType}
+import com.tribbloids.spookystuff.utils.refl.{CatalystTypeOps, TypeUtils, UnreifiedObjectType}
 import org.apache.spark.sql.types.{IntegerType, StringType}
 
 class ScalaReflectionSpike extends BaseSpec with CatalystTypeOps.ImplicitMixin {
diff --git a/parent/core/src/test/scala/com/tribbloids/spookystuff/utils/RDDDisperseSuite.scala b/parent/core/src/test/scala/com/tribbloids/spookystuff/utils/RDDDisperseSuite.scala
index 74dcbed11..13620a0f0 100644
--- a/parent/core/src/test/scala/com/tribbloids/spookystuff/utils/RDDDisperseSuite.scala
+++ b/parent/core/src/test/scala/com/tribbloids/spookystuff/utils/RDDDisperseSuite.scala
@@ -2,7 +2,7 @@ package com.tribbloids.spookystuff.utils
 
 import com.tribbloids.spookystuff.testutils.{SpookyBaseSpec, TestHelper}
 import com.tribbloids.spookystuff.utils.collection.BufferedShuffleIteratorV1
-import org.apache.spark.ml.dsl.utils.ObjectSimpleNameMixin
+import org.apache.spark.ml.dsl.utils.ClassOpsMixin
 import org.apache.spark.rdd.RDD
 import org.apache.spark.rdd.spookystuff.NarrowDispersedRDD
 import org.apache.spark.storage.StorageLevel
@@ -117,7 +117,7 @@ class RDDDisperseSuite extends SpookyBaseSpec {
     rdd.unpersist(true)
   }
 
-  trait Facet extends ObjectSimpleNameMixin {
+  trait Facet extends ClassOpsMixin {
 
     val acc: LongAccumulator = sc.longAccumulator(this.facetName)
     var nPart: Int = -1
@@ -144,7 +144,7 @@ class RDDDisperseSuite extends SpookyBaseSpec {
 
     def doAssert(rdd: RDD[Int]): scalatest.Assertion
 
-    def facetName: String = this.objectSimpleName
+    def facetName: String = this.getClass.simpleName_Scala
 
    describe(facetName) {
diff --git a/parent/core/src/test/scala/com/tribbloids/spookystuff/utils/ScalaUDTSuite.scala b/parent/core/src/test/scala/com/tribbloids/spookystuff/utils/ScalaUDTSuite.scala
index 9cbbe53c1..b7ea36189 100644
--- a/parent/core/src/test/scala/com/tribbloids/spookystuff/utils/ScalaUDTSuite.scala
+++ b/parent/core/src/test/scala/com/tribbloids/spookystuff/utils/ScalaUDTSuite.scala
@@ -3,8 +3,8 @@ package com.tribbloids.spookystuff.utils
 import com.tribbloids.spookystuff.actions.Action
 import com.tribbloids.spookystuff.doc.{Observation, Unstructured}
 import com.tribbloids.spookystuff.testutils.{BaseSpec, SpookyBaseSpec}
+import com.tribbloids.spookystuff.utils.refl.{CatalystTypeOps, TypeUtils, UnreifiedObjectType}
 import com.tribbloids.spookystuff.utils.serialization.AssertSerializable
-import org.apache.spark.ml.dsl.utils.refl.{CatalystTypeOps, TypeUtils, UnreifiedObjectType}
 import org.apache.spark.sql.types.DataType
 
 /**
diff --git a/parent/parsing/build.gradle.kts b/parent/parsing/build.gradle.kts
new file mode 100644
index 000000000..0b3474255
--- /dev/null
+++ b/parent/parsing/build.gradle.kts
@@ -0,0 +1,7 @@
+val vs = versions()
+
+dependencies {
+
+  api(project(":parent:commons"))
+  testFixturesApi(testFixtures(project(":parent:commons")))
+}
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Algebra.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Algebra.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Algebra.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Algebra.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/DataAlgebra.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/DataAlgebra.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/DataAlgebra.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/DataAlgebra.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Domain.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Domain.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Domain.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Domain.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/EdgeFilter.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/EdgeFilter.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/EdgeFilter.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/EdgeFilter.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Element.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Element.scala
similarity index 97%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Element.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Element.scala
index a50d703a7..5dcdd0c73 100644
--- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Element.scala
+++ b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Element.scala
@@ -44,7 +44,7 @@ object Element {
     override protected def _replicate(m: DataMutator)(
         implicit idRotator: Rotator[ID],
-        node_+ : Types.Binary[NodeData]
+        node_+ : Types.Compose[NodeData]
     ): Edge[T] = {
 
       val newIDs = idRotator(from) -> idRotator(to)
       if (newIDs == from_to) this
@@ -118,7 +118,7 @@ object Element {
     override def _replicate(m: DataMutator)(
         implicit idRotator: Rotator[ID],
-        node_+ : Types.Binary[NodeData]
+        node_+ : Types.Compose[NodeData]
     ): _Module = {
 
       val newID = idRotator(this.samenessDelegatedTo)
       if (newID == this.samenessDelegatedTo)
@@ -142,7 +142,7 @@ object Element {
     override protected def _replicate(m: DataMutator)(
         implicit idRotator: Rotator[ID],
-        node_+ : Types.Binary[NodeData]
+        node_+ : Types.Compose[NodeData]
     ): _NodeTriplet = {
 
       new NodeTriplet[D](
         node.replicate(m),
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/ElementTreeNode.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/ElementTreeNode.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/ElementTreeNode.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/ElementTreeNode.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/ElementView.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/ElementView.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/ElementView.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/ElementView.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/FlowLayout.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/FlowLayout.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/FlowLayout.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/FlowLayout.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/IDAlgebra.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/IDAlgebra.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/IDAlgebra.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/IDAlgebra.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Layout.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Layout.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Layout.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Layout.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/LocalGraph.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/LocalGraph.scala
similarity index 96%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/LocalGraph.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/LocalGraph.scala
index 357319ac4..ab405bdb8 100644
--- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/LocalGraph.scala
+++ b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/LocalGraph.scala
@@ -24,7 +24,7 @@ case class LocalGraph[D <: Domain] private (
   override def _replicate(m: DataMutator)(
       implicit idRotator: Rotator[ID],
-      node_+ : Types.Binary[NodeData]
+      node_+ : Types.Compose[NodeData]
   ): LocalGraph[D] = {
 
     new LocalGraph.BuilderImpl[D]()
@@ -96,7 +96,7 @@ object LocalGraph {
     override def fromSeq(
         nodes: Seq[_NodeLike],
         edges: Seq[_Edge],
-        node_+ : Types.Binary[NodeData]
+        node_+ : Types.Compose[NodeData]
     ): GG = {
 
       val existingIDs = nodes.map(_.samenessDelegatedTo).toSet
@@ -151,7 +151,7 @@ object LocalGraph {
     protected def linkedNode_+(
         v1: _NodeTriplet,
         v2: _NodeTriplet,
-        node_+ : Types.Binary[NodeData]
+        node_+ : Types.Compose[NodeData]
     ): _NodeTriplet = {
 
       require(
@@ -170,7 +170,7 @@ object LocalGraph {
     def union(
         v1: GG,
         v2: GG,
-        node_+ : Types.Binary[NodeData]
+        node_+ : Types.Compose[NodeData]
     ): GG = {
 
       val v2Reduced: mutable.Map[D#ID, _NodeTriplet] = v2.nodeMap.map {
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Module.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Module.scala
similarity index 95%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Module.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Module.scala
index 5113fcd62..d4633dc86 100644
--- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Module.scala
+++ b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Module.scala
@@ -11,13 +11,13 @@ trait Module[T <: Domain] extends Algebra.Aliases[T] {
 
   protected def _replicate(m: DataMutator)(
       implicit idRotator: Rotator[ID],
-      node_+ : Types.Binary[NodeData]
+      node_+ : Types.Compose[NodeData]
   ): _Module
 
   def replicate(m: DataMutator = DataAlgebra.Mutator.identity)(
      implicit idRotator: Rotator[ID],
-     node_+ : Types.Binary[NodeData] = nodeAlgebra.add
+     node_+ : Types.Compose[NodeData] = nodeAlgebra.add
  ): this.type = _replicate(m).asInstanceOf[this.type]
 
 }
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/StaticGraph.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/StaticGraph.scala
similarity index 89%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/StaticGraph.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/StaticGraph.scala
index 73dbe1270..41e6669a9 100644
--- a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/StaticGraph.scala
+++ b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/StaticGraph.scala
@@ -33,7 +33,7 @@ object StaticGraph {
     def fromSeq(
         nodes: Seq[_NodeLike],
         edges: Seq[_Edge],
-        node_+ : Types.Binary[NodeData] = nodeAlgebra.add
+        node_+ : Types.Compose[NodeData] = nodeAlgebra.add
     ): GG
 
     final def fromModule(graph: _Module): GG = {
@@ -44,7 +44,7 @@ object StaticGraph {
         case _ctg(v) => v
       }
     }
-    def union(v1: GG, v2: GG, node_+ : Types.Binary[NodeData] = nodeAlgebra.add): GG
+    def union(v1: GG, v2: GG, node_+ : Types.Compose[NodeData] = nodeAlgebra.add): GG
 
     // TODO: this API need to change to facilitate big Heads and Tails in the format of RDD
     /**
@@ -59,8 +59,8 @@ object StaticGraph {
     def serial(
         base: (GG, _Heads),
         top: (GG, _Tails),
-        node_+ : Types.Binary[NodeData] = nodeAlgebra.add,
-        edge_+ : Types.Binary[EdgeData] = edgeAlgebra.add
+        node_+ : Types.Compose[NodeData] = nodeAlgebra.add,
+        edge_+ : Types.Compose[EdgeData] = edgeAlgebra.add
     ): (GG, Map[_Edge, _Edge]) = {
 
       val uu: GG = union(base._1, top._1, node_+)
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Visualisation.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Visualisation.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/Visualisation.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/graph/Visualisation.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/BacktrackingManager.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/BacktrackingManager.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/BacktrackingManager.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/BacktrackingManager.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserDSL.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserDSL.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserDSL.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserDSL.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserGraph.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserGraph.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserGraph.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/FSMParserGraph.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/FState.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/FState.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/FState.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/FState.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/ParsingRun.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/ParsingRun.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/ParsingRun.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/ParsingRun.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/Pattern.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/Pattern.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/Pattern.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/Pattern.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/PhaseVec.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/PhaseVec.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/PhaseVec.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/PhaseVec.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/RuleIO.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/RuleIO.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/RuleIO.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/RuleIO.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/RuleInput.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/RuleInput.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/RuleInput.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/RuleInput.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/RuleOutcome.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/RuleOutcome.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/RuleOutcome.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/RuleOutcome.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/Transitions.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/Transitions.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/Transitions.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/Transitions.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableFailure.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableFailure.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableFailure.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableFailure.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableMixin.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableMixin.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableMixin.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/exception/BacktrackableMixin.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/exception/ParsingError.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/exception/ParsingError.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/exception/ParsingError.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/exception/ParsingError.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/package.scala b/parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/package.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/parsing/package.scala
rename to parent/parsing/src/main/scala/com/tribbloids/spookystuff/parsing/package.scala
diff --git a/parent/commons/src/test/scala/com/tribbloids/spookystuff/graph/FlowLayoutSuite.scala b/parent/parsing/src/test/scala/com/tribbloids/spookystuff/graph/FlowLayoutSuite.scala
similarity index 100%
rename from parent/commons/src/test/scala/com/tribbloids/spookystuff/graph/FlowLayoutSuite.scala
rename to parent/parsing/src/test/scala/com/tribbloids/spookystuff/graph/FlowLayoutSuite.scala
diff --git a/parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/example/SimpleFlowGraph.scala b/parent/parsing/src/test/scala/com/tribbloids/spookystuff/graph/example/SimpleFlowGraph.scala
similarity index 100%
rename from parent/commons/src/main/scala/com/tribbloids/spookystuff/graph/example/SimpleFlowGraph.scala
rename to parent/parsing/src/test/scala/com/tribbloids/spookystuff/graph/example/SimpleFlowGraph.scala
diff --git a/parent/commons/src/test/scala/com/tribbloids/spookystuff/parsing/FSMParserDSLSuite.scala b/parent/parsing/src/test/scala/com/tribbloids/spookystuff/parsing/FSMParserDSLSuite.scala
similarity index 100%
rename from parent/commons/src/test/scala/com/tribbloids/spookystuff/parsing/FSMParserDSLSuite.scala
rename to parent/parsing/src/test/scala/com/tribbloids/spookystuff/parsing/FSMParserDSLSuite.scala
diff --git a/parent/commons/src/test/scala/com/tribbloids/spookystuff/parsing/ParsingRunSuite.scala b/parent/parsing/src/test/scala/com/tribbloids/spookystuff/parsing/ParsingRunSuite.scala
similarity index 100%
rename from parent/commons/src/test/scala/com/tribbloids/spookystuff/parsing/ParsingRunSuite.scala
rename to parent/parsing/src/test/scala/com/tribbloids/spookystuff/parsing/ParsingRunSuite.scala
diff --git a/parent/web/src/main/scala/com/tribbloids/spookystuff/web/actions/SelectorUDT.scala b/parent/web/src/main/scala/com/tribbloids/spookystuff/web/actions/SelectorUDT.scala
index 1260cfaab..ed5732960 100644
--- a/parent/web/src/main/scala/com/tribbloids/spookystuff/web/actions/SelectorUDT.scala
+++ b/parent/web/src/main/scala/com/tribbloids/spookystuff/web/actions/SelectorUDT.scala
@@ -1,5 +1,5 @@
 package com.tribbloids.spookystuff.web.actions
 
-import org.apache.spark.ml.dsl.utils.refl.ScalaUDT
+import com.tribbloids.spookystuff.utils.refl.ScalaUDT
 
 class SelectorUDT extends ScalaUDT[Selector]
diff --git a/prover-commons b/prover-commons
index e4098cdd6..56b3a6392 160000
--- a/prover-commons
+++ b/prover-commons
@@ -1 +1 @@
-Subproject commit e4098cdd679d35217687870325c85e27c7a4e9cf
+Subproject commit 56b3a639209e3e549a40a841feb070a4ca19ec1b
diff --git a/settings.gradle.kts b/settings.gradle.kts
index 50b3b26cc..a7da7a310 100644
--- a/settings.gradle.kts
+++ b/settings.gradle.kts
@@ -35,6 +35,7 @@ include(
 
   // uses unstable & experimental scala features, should be modified very slowly & carefully
   ":parent:commons",
+  ":parent:parsing",
   ":parent:core",
   ":parent:web",
   ":parent:integration",