From b1c5ec7857650e28603fe2dfec79db55963b7950 Mon Sep 17 00:00:00 2001 From: Timur Abishev Date: Thu, 23 Sep 2021 23:54:20 -0400 Subject: [PATCH] reformat sbt and scala files --- build.sbt | 820 +-- project/plugins.sbt | 34 +- .../scala/com/twitter/scalding/Args.scala | 139 +- .../com/twitter/scalding/RangedArgs.scala | 28 +- .../scala/com/twitter/scalding/ArgTest.scala | 32 +- .../com/twitter/scalding/RangedArgsSpec.scala | 8 +- .../twitter/scalding/avro/AvroSource.scala | 47 +- .../twitter/scalding/avro/SchemaType.scala | 21 +- .../com/twitter/scalding/avro/package.scala | 32 +- .../scalding/beam_backend/BeamBackend.scala | 23 +- .../scalding/beam_backend/BeamFunctions.scala | 15 +- .../scalding/beam_backend/BeamMode.scala | 32 +- .../scalding/beam_backend/BeamOp.scala | 182 +- .../scalding/beam_backend/BeamWriter.scala | 32 +- .../scalding/beam_backend/KryoCoder.scala | 18 +- .../beam_backend/BeamBackendTests.scala | 221 +- .../twitter/scalding/hellcats/HellCats.scala | 64 +- .../scalding/hellcats/HellCatsTests.scala | 30 +- .../commons/extensions/Checkpoint.scala | 105 +- .../scheme/CombinedSequenceFileScheme.scala | 14 +- .../commons/source/BinaryConverters.scala | 9 +- .../commons/source/DailySources.scala | 85 +- .../commons/source/FixedPathSources.scala | 8 +- .../commons/source/GeneratedLzoTypedTsv.scala | 68 +- .../commons/source/HourlySources.scala | 23 +- .../source/LongThriftTransformer.scala | 12 +- .../commons/source/LzoCodecSource.scala | 5 +- .../commons/source/LzoGenericScheme.scala | 58 +- .../commons/source/LzoGenericSource.scala | 13 +- .../scalding/commons/source/LzoTraits.scala | 31 +- .../commons/source/LzoTypedText.scala | 53 +- .../commons/source/TsvWithHeader.scala | 19 +- .../source/VersionedKeyValSource.scala | 100 +- .../twitter/scalding/examples/KMeans.scala | 114 +- .../twitter/scalding/examples/MergeTest.scala | 21 +- .../twitter/scalding/examples/PageRank.scala | 112 +- .../scalding/examples/WeightedPageRank.scala | 127 +- .../examples/WeightedPageRankFromMatrix.scala | 69 +- .../scalding/examples/WordCountJob.scala | 7 +- .../scalding/ExecutionKMeansTest.scala | 23 +- .../com/twitter/scalding/PageRankTest.scala | 10 +- .../WeightedPageRankFromMatrixTest.scala | 79 +- .../scalding/WeightedPageRankTest.scala | 16 +- .../com/twitter/scalding/WordCountTest.scala | 6 +- .../commons/VersionedKeyValSourceTest.scala | 84 +- .../commons/extensions/CheckpointSpec.scala | 16 +- .../commons/scheme/ExecutionTest.scala | 22 +- .../commons/source/LzoGenericSourceSpec.scala | 4 +- .../source/typedtext/TypedTextTest.scala | 3 +- .../src/main/scala/com/twitter/package.scala | 26 +- .../scala/com/twitter/scalding/ArgHelp.scala | 93 +- .../BijectedOrderedSerialization.scala | 13 +- .../scala/com/twitter/scalding/CFuture.scala | 16 +- .../scala/com/twitter/scalding/CPromise.scala | 6 +- .../scalding/CancellationHandler.scala | 10 +- .../com/twitter/scalding/CascadeJob.scala | 7 +- .../com/twitter/scalding/CascadingMode.scala | 71 +- .../scalding/CascadingTokenUpdater.scala | 45 +- .../com/twitter/scalding/CoGroupBuilder.scala | 15 +- .../scala/com/twitter/scalding/Config.scala | 305 +- .../com/twitter/scalding/CumulativeSum.scala | 138 +- .../main/scala/com/twitter/scalding/Dsl.scala | 14 +- .../com/twitter/scalding/Execution.scala | 756 +-- .../com/twitter/scalding/ExecutionApp.scala | 14 +- .../twitter/scalding/ExecutionContext.scala | 63 +- .../scalding/ExecutionOptimizationRules.scala | 243 +- .../com/twitter/scalding/ExecutionUtil.scala | 73 +- 
.../twitter/scalding/FieldConversions.scala | 126 +- .../com/twitter/scalding/FileSource.scala | 333 +- .../com/twitter/scalding/FlowState.scala | 57 +- .../com/twitter/scalding/FoldOperations.scala | 21 +- .../twitter/scalding/FunctionImplicits.scala | 327 +- .../com/twitter/scalding/FutureCache.scala | 10 +- .../scalding/GeneratedConversions.scala | 1930 +++--- .../twitter/scalding/GeneratedMappable.scala | 104 +- .../scalding/GeneratedTupleAdders.scala | 5349 ++++++++++++++--- .../com/twitter/scalding/GroupBuilder.scala | 224 +- .../scalding/HfsConfPropertySetter.scala | 35 +- .../twitter/scalding/IntegralComparator.scala | 15 +- .../com/twitter/scalding/IterableSource.scala | 34 +- .../main/scala/com/twitter/scalding/Job.scala | 262 +- .../scala/com/twitter/scalding/JobStats.scala | 68 +- .../scala/com/twitter/scalding/JobTest.scala | 57 +- .../com/twitter/scalding/JoinAlgorithms.scala | 417 +- .../twitter/scalding/LibJarsExpansion.scala | 21 +- .../com/twitter/scalding/LineNumber.scala | 44 +- .../com/twitter/scalding/MemoryTap.scala | 13 +- .../scala/com/twitter/scalding/Mode.scala | 27 +- .../com/twitter/scalding/Operations.scala | 392 +- .../com/twitter/scalding/OptionalSource.scala | 9 +- .../twitter/scalding/PartitionSource.scala | 162 +- .../com/twitter/scalding/PipeDebug.scala | 22 +- .../twitter/scalding/ReduceOperations.scala | 318 +- .../scalding/ReferencedClassFinder.scala | 70 +- .../com/twitter/scalding/RichFlowDef.scala | 33 +- .../scala/com/twitter/scalding/RichPipe.scala | 442 +- .../twitter/scalding/SkewReplication.scala | 15 +- .../scala/com/twitter/scalding/Sortable.scala | 2 +- .../scala/com/twitter/scalding/Source.scala | 144 +- .../scala/com/twitter/scalding/Stats.scala | 52 +- .../twitter/scalding/StreamOperations.scala | 69 +- .../com/twitter/scalding/StringUtility.scala | 21 +- .../com/twitter/scalding/TemplateSource.scala | 85 +- .../com/twitter/scalding/TestTapFactory.scala | 50 +- .../twitter/scalding/TimePathedSource.scala | 64 +- .../scala/com/twitter/scalding/Tool.scala | 43 +- .../scala/com/twitter/scalding/Tracing.scala | 77 +- .../com/twitter/scalding/TupleArity.scala | 29 +- .../twitter/scalding/TupleConversions.scala | 2 +- .../com/twitter/scalding/TupleConverter.scala | 55 +- .../com/twitter/scalding/TupleGetter.scala | 13 +- .../com/twitter/scalding/TuplePacker.scala | 50 +- .../com/twitter/scalding/TupleSetter.scala | 26 +- .../com/twitter/scalding/TupleUnpacker.scala | 75 +- .../com/twitter/scalding/TypeDescriptor.scala | 41 +- .../com/twitter/scalding/TypedDelimited.scala | 63 +- .../twitter/scalding/TypedPipeChecker.scala | 7 +- .../scalding/WritableSequenceFile.scala | 53 +- .../scala/com/twitter/scalding/XHandler.scala | 27 +- .../com/twitter/scalding/bdd/BddDsl.scala | 20 +- .../bdd/PipeOperationsConversions.scala | 75 +- .../com/twitter/scalding/bdd/TBddDsl.scala | 53 +- .../bdd/TypedPipeOperationsConversions.scala | 43 +- .../twitter/scalding/estimation/Common.scala | 6 +- .../scalding/estimation/Estimator.scala | 17 +- .../scalding/estimation/HistoryService.scala | 57 +- .../memory/MemoryEstimatorConfig.scala | 8 +- .../memory/MemoryEstimatorStepStrategy.scala | 42 +- .../SmoothedHistoryMemoryEstimator.scala | 34 +- .../filecache/DistributedCacheFile.scala | 72 +- .../scalding/macros/MacroImplicits.scala | 10 +- .../com/twitter/scalding/macros/Macros.scala | 12 +- .../impl/CaseClassBasedSetterImpl.scala | 41 +- .../macros/impl/CaseClassFieldSetter.scala | 4 +- .../macros/impl/FieldsProviderImpl.scala | 67 +- 
.../macros/impl/TupleConverterImpl.scala | 35 +- .../macros/impl/TupleFieldSetter.scala | 20 +- .../macros/impl/TupleSetterImpl.scala | 13 +- .../impl/TypeDescriptorProviderImpl.scala | 128 +- .../scalding/mathematics/Combinatorics.scala | 234 +- .../scalding/mathematics/Histogram.scala | 8 +- .../twitter/scalding/mathematics/Matrix.scala | 698 ++- .../scalding/mathematics/Matrix2.scala | 403 +- .../scalding/mathematics/MatrixProduct.scala | 289 +- .../scalding/mathematics/Poisson.scala | 5 +- .../scalding/mathematics/SizeHint.scala | 70 +- .../mathematics/TypedSimilarity.scala | 239 +- .../InputSizeReducerEstimator.scala | 52 +- .../RatioBasedEstimator.scala | 43 +- .../ReducerEstimatorConfig.scala | 9 +- .../ReducerEstimatorStepStrategy.scala | 40 +- .../ReducerHistoryEstimator.scala | 2 +- .../RuntimeReducerEstimator.scala | 79 +- .../CascadingBinaryComparator.scala | 53 +- .../scalding/serialization/Externalizer.scala | 5 +- .../scalding/serialization/KryoHadoop.scala | 71 +- .../serialization/KryoSerializers.scala | 22 +- .../RequiredBinaryComparators.scala | 19 +- .../RequiredBinaryComparatorsConfig.scala | 5 +- .../serialization/WrappedSerialization.scala | 70 +- .../scalding/source/CheckedInversion.scala | 7 +- .../twitter/scalding/source/CodecSource.scala | 29 +- .../scalding/source/DailySources.scala | 73 +- .../scalding/source/HourlySources.scala | 45 +- .../scalding/source/MaxFailuresCheck.scala | 11 +- .../twitter/scalding/source/NullSink.scala | 7 +- .../scalding/source/TypedSequenceFile.scala | 17 +- .../twitter/scalding/source/TypedText.scala | 92 +- .../scalding/typed/BijectedSourceSink.scala | 13 +- .../typed/GeneratedFlattenGroup.scala | 1449 ++++- .../scalding/typed/GeneratedTypedSource.scala | 203 +- .../com/twitter/scalding/typed/Grouped.scala | 494 +- .../typed/HashEqualsArrayWrapper.scala | 143 +- .../com/twitter/scalding/typed/Joiner.scala | 129 +- .../twitter/scalding/typed/KeyedList.scala | 179 +- .../twitter/scalding/typed/KeyedPipe.scala | 7 +- .../twitter/scalding/typed/LookupJoin.scala | 203 +- .../twitter/scalding/typed/MemorySink.scala | 2 +- .../twitter/scalding/typed/MultiJoin.scala | 1692 +++++- .../scalding/typed/MultiJoinFunction.scala | 58 +- .../scalding/typed/NoStackAndThen.scala | 42 +- .../scalding/typed/OptimizationPhases.scala | 3 +- .../scalding/typed/OptimizationRules.scala | 805 +-- .../scalding/typed/PartitionSchemed.scala | 24 +- .../scalding/typed/PartitionUtil.scala | 17 +- .../typed/PartitionedDelimitedSource.scala | 73 +- .../scalding/typed/PartitionedTextLine.scala | 55 +- .../com/twitter/scalding/typed/Resolver.scala | 12 +- .../com/twitter/scalding/typed/Sketched.scala | 85 +- .../com/twitter/scalding/typed/TDsl.scala | 13 +- .../scalding/typed/TemplatePartition.scala | 13 +- .../twitter/scalding/typed/TypedPipe.scala | 666 +- .../scalding/typed/TypedPipeDiff.scala | 106 +- .../twitter/scalding/typed/TypedSink.scala | 13 +- .../twitter/scalding/typed/TypedSource.scala | 13 +- .../twitter/scalding/typed/ValuePipe.scala | 32 +- .../scalding/typed/WithDescription.scala | 5 +- .../twitter/scalding/typed/WithReducers.scala | 22 +- .../scalding/typed/WritePartitioner.scala | 361 +- .../AsyncFlowDefRunner.scala | 168 +- .../cascading_backend/CascadingBackend.scala | 550 +- .../cascading_backend/CoGroupJoiner.scala | 44 +- .../DistinctCoGroupJoiner.scala | 9 +- .../typed/cascading_backend/HashJoiner.scala | 16 +- .../scalding/typed/functions/EqTypes.scala | 4 +- .../typed/functions/FlatMappedFn.scala | 28 +- 
.../typed/functions/FlatMapping.scala | 7 +- .../scalding/typed/functions/Functions.scala | 50 +- .../scalding/typed/functions/SubTypes.scala | 12 +- .../typed/memory_backend/AtomicBox.scala | 4 +- .../typed/memory_backend/MemoryBackend.scala | 54 +- .../typed/memory_backend/MemoryMode.scala | 25 +- .../typed/memory_backend/MemoryPlanner.scala | 46 +- .../typed/memory_backend/MemoryWriter.scala | 64 +- .../scalding/typed/memory_backend/Op.scala | 38 +- .../scalding/AlgebraicReductionsTest.scala | 4 +- .../com/twitter/scalding/ArgHelpTest.scala | 5 +- .../com/twitter/scalding/BlockJoinTest.scala | 11 +- .../com/twitter/scalding/CascadeTest.scala | 34 +- .../com/twitter/scalding/CoGroupTest.scala | 15 +- .../com/twitter/scalding/ConfigTest.scala | 50 +- .../scala/com/twitter/scalding/CoreTest.scala | 633 +- .../twitter/scalding/CumulativeSumTest.scala | 34 +- .../com/twitter/scalding/DistinctByTest.scala | 7 +- .../scalding/ExecutionAppProperties.scala | 50 +- .../ExecutionOptimizationRulesTest.scala | 77 +- .../com/twitter/scalding/ExecutionTest.scala | 500 +- .../twitter/scalding/ExecutionUtilTest.scala | 2 +- .../scalding/ExpandLibJarsGlobsTest.scala | 40 +- .../com/twitter/scalding/FieldImpsTest.scala | 37 +- .../com/twitter/scalding/FileSourceTest.scala | 89 +- .../twitter/scalding/FlowStateMapTest.scala | 2 +- .../twitter/scalding/IntegralCompTest.scala | 12 +- .../IterableExecutionSerializationTest.scala | 5 +- .../com/twitter/scalding/JobTestTest.scala | 14 +- .../scala/com/twitter/scalding/KryoTest.scala | 70 +- .../com/twitter/scalding/LargePlanTest.scala | 43 +- .../com/twitter/scalding/LookupJoinTest.scala | 97 +- .../scala/com/twitter/scalding/PackTest.scala | 30 +- .../scalding/PartitionSourceTest.scala | 17 +- .../com/twitter/scalding/PathFilterTest.scala | 4 +- .../scalding/ReduceOperationsTest.scala | 40 +- .../scalding/ReferencedClassFinderTest.scala | 13 +- .../twitter/scalding/RegressionTests.scala | 10 +- .../scalding/RichPipeSpecification.scala | 104 +- .../com/twitter/scalding/ScanLeftTest.scala | 27 +- .../com/twitter/scalding/SideEffectTest.scala | 60 +- .../com/twitter/scalding/SkewJoinTest.scala | 37 +- .../com/twitter/scalding/SourceSpec.scala | 64 +- .../com/twitter/scalding/StatsTest.scala | 17 +- .../twitter/scalding/StringUtilityTest.scala | 13 +- .../twitter/scalding/TemplateSourceTest.scala | 8 +- .../twitter/scalding/TestTapFactoryTest.scala | 14 +- .../scalding/TimePathedSourceTest.scala | 10 +- .../com/twitter/scalding/TupleTest.scala | 30 +- .../twitter/scalding/TypedDelimitedTest.scala | 4 +- .../twitter/scalding/TypedFieldsTest.scala | 21 +- .../scalding/TypedPipeCheckerTest.scala | 6 +- .../com/twitter/scalding/TypedPipeTest.scala | 477 +- ...TypedSinkWithTypedImplementationTest.scala | 32 +- .../TypedSketchJoinJobForEmptyKeysTest.scala | 9 +- .../twitter/scalding/WrappedJoinerTest.scala | 8 +- .../com/twitter/scalding/XHandlerTest.scala | 28 +- .../bdd/MultipleSourcesSpecTest.scala | 170 +- .../scalding/bdd/SingleSourceSpecTest.scala | 110 +- .../scalding/bdd/SourceListSpecTest.scala | 137 +- .../twitter/scalding/bdd/TypedApiTest.scala | 209 +- .../MemoryEstimatorStepStrategyTest.scala | 7 +- .../SmoothedHistoryMemoryEstimatorTest.scala | 122 +- .../filecache/DistributedCacheFileSpec.scala | 8 +- .../scalding/macros/MacrosUnitTests.scala | 81 +- .../mathematics/CombinatoricsTest.scala | 4 +- .../scalding/mathematics/HistogramTest.scala | 14 +- .../mathematics/Matrix2OptimizationTest.scala | 244 +- .../scalding/mathematics/Matrix2Test.scala | 110 +- 
.../scalding/mathematics/MatrixTest.scala | 137 +- .../scalding/mathematics/SizeHintTest.scala | 47 +- .../mathematics/TypedSimilarityTest.scala | 34 +- .../typed/BijectedSourceSinkTest.scala | 24 +- .../scalding/typed/CoGroupableTest.scala | 4 +- .../typed/HashEqualsArrayWrapperTest.scala | 13 +- .../scalding/typed/InAnotherPackage.scala | 8 +- .../scalding/typed/MultiJoinTest.scala | 28 +- .../typed/NoStackLineNumberTest.scala | 9 +- .../typed/OptimizationRulesTest.scala | 279 +- .../PartitionedDelimitedSourceTest.scala | 6 +- .../typed/PartitionedTextLineTest.scala | 12 +- .../RequireOrderedSerializationTest.scala | 18 +- .../twitter/scalding/typed/ResolverTest.scala | 36 +- .../scalding/typed/TypedPipeDiffTest.scala | 62 +- .../scalding/typed/TypedPipeMonoidTest.scala | 2 +- .../scalding/typed/WritePartitionerTest.scala | 302 +- .../typed/memory_backend/MemoryTest.scala | 28 +- .../twitter/scalding/AbsoluteDuration.scala | 58 +- .../com/twitter/scalding/CalendarOps.scala | 15 +- .../scala/com/twitter/scalding/DateOps.scala | 49 +- .../com/twitter/scalding/DateParser.scala | 52 +- .../com/twitter/scalding/DateRange.scala | 59 +- .../scala/com/twitter/scalding/Duration.scala | 43 +- .../com/twitter/scalding/Globifier.scala | 29 +- .../scala/com/twitter/scalding/RichDate.scala | 45 +- .../twitter/scalding/CalendarOpsTest.scala | 110 +- .../com/twitter/scalding/DateProperties.scala | 63 +- .../scala/com/twitter/scalding/DateTest.scala | 154 +- .../com/twitter/scalding/GlobifierOps.scala | 94 +- .../scalding/GlobifierProperties.scala | 8 +- .../twitter/scalding/db/ColumnDefiner.scala | 72 +- .../scalding/db/ColumnDefinition.scala | 14 +- .../scalding/db/DBColumnTransformer.scala | 54 +- .../com/twitter/scalding/db/DBOptions.scala | 13 +- .../scalding/db/DBTypeDescriptor.scala | 2 +- .../db/extensions/VerticaExtensions.scala | 10 +- .../twitter/scalding/db/macros/DBMacro.scala | 2 +- .../impl/ColumnDefinitionProviderImpl.scala | 250 +- .../db/macros/impl/DBTypeDescriptorImpl.scala | 9 +- .../db/macros/impl/JdbcFieldSetter.scala | 20 +- .../macros/impl/JdbcStatementSetterImpl.scala | 6 +- .../impl/handler/AnnotationHelper.scala | 21 +- .../macros/impl/handler/BlobTypeHandler.scala | 24 +- .../db/macros/impl/handler/ColumnFormat.scala | 13 +- .../macros/impl/handler/DateTypeHandler.scala | 16 +- .../impl/handler/NumericTypeHandler.scala | 18 +- .../impl/handler/StringTypeHandler.scala | 47 +- .../com/twitter/scalding/db/package.scala | 9 +- .../twitter/scalding/db/DBOptionsTest.scala | 2 +- .../scalding/db/macros/MacrosUnitTests.scala | 512 +- .../memory/MemoryEstimatorTest.scala | 115 +- .../RatioBasedEstimatorTest.scala | 57 +- .../ReducerEstimatorTest.scala | 81 +- .../RuntimeReducerEstimatorTest.scala | 14 +- .../scalding/platform/HadoopPlatform.scala | 37 +- .../HadoopPlatformExecutionTest.scala | 17 +- .../platform/HadoopPlatformJobTest.scala | 34 +- .../platform/HadoopSharedPlatformTest.scala | 7 +- .../scalding/platform/LocalCluster.scala | 28 +- .../twitter/scalding/platform/MakeJar.scala | 18 +- .../twitter/scalding/platform/Scalatest.scala | 11 +- .../platform/PlatformExecutionTest.scala | 10 +- .../scalding/platform/PlatformTest.scala | 178 +- .../platform/TestJobsWithDescriptions.scala | 5 +- .../estimation/HRavenHistoryService.scala | 144 +- .../memory/HRavenMemoryService.scala | 24 +- .../HRavenBasedEstimator.scala | 23 +- .../estimation/HRavenHistoryServiceTest.scala | 38 +- .../twitter/scalding/jdbc/ColumnDefiner.scala | 36 +- .../scalding/jdbc/DriverColumnDefiner.scala | 18 
+- .../twitter/scalding/jdbc/JDBCDriver.scala | 44 +- .../twitter/scalding/jdbc/JDBCSource.scala | 42 +- .../scalding/jdbc/JDBCSourceCompileTest.scala | 6 +- .../scala/com/twitter/scalding/JsonLine.scala | 72 +- .../com/twitter/scalding/TypedJson.scala | 43 +- .../com/twitter/scalding/JsonLineTest.scala | 58 +- .../scrooge/Parquet346ScroogeScheme.scala | 74 +- .../parquet/scrooge/ParquetScrooge.scala | 21 +- .../PartitionedParquetScroogeSource.scala | 28 +- .../parquet/scrooge/ParquetScroogeTests.scala | 53 +- ...PartitionedParquetScroogeSourceTests.scala | 32 +- .../parquet/scrooge/PlanningTests.scala | 41 +- .../scrooge/ScroogeReadSupportTests.scala | 56 +- .../parquet/HasColumnProjection.scala | 10 +- .../thrift/Parquet346TBaseScheme.scala | 106 +- .../parquet/thrift/ParquetThrift.scala | 71 +- .../PartitionedParquetThriftSource.scala | 31 +- .../scalding/parquet/tuple/ParquetTuple.scala | 35 +- .../scalding/parquet/tuple/TypedParquet.scala | 67 +- .../parquet/tuple/macros/Macros.scala | 41 +- .../impl/ParquetReadSupportProvider.scala | 90 +- .../macros/impl/ParquetSchemaProvider.scala | 20 +- .../macros/impl/WriteSupportProvider.scala | 68 +- .../tuple/scheme/ParquetTupleConverter.scala | 80 +- .../scheme/TypedParquetTupleScheme.scala | 115 +- .../parquet/ParquetSourcesTests.scala | 103 +- .../PartitionedParquetThriftSourceTests.scala | 33 +- .../parquet/tuple/TypedParquetTupleTest.scala | 89 +- .../parquet/tuple/macros/MacroUnitTests.scala | 28 +- .../scalding/quotation/Liftables.scala | 30 +- .../scalding/quotation/Projection.scala | 44 +- .../scalding/quotation/ProjectionMacro.scala | 19 +- .../twitter/scalding/quotation/Quoted.scala | 12 +- .../scalding/quotation/QuotedMacro.scala | 21 +- .../scalding/quotation/TextMacro.scala | 13 +- .../twitter/scalding/quotation/TreeOps.scala | 11 +- .../scalding/quotation/LimitationsTest.scala | 16 +- .../twitter/scalding/quotation/Person.scala | 2 +- .../quotation/ProjectionMacroTest.scala | 60 +- .../scalding/quotation/ProjectionTest.scala | 14 +- .../scalding/quotation/TextMacroTest.scala | 18 +- .../com/twitter/scalding/ILoopCompat.scala | 3 +- .../com/twitter/scalding/ReplImplicits.scala | 168 +- .../com/twitter/scalding/ScaldingILoop.scala | 47 +- .../com/twitter/scalding/ScaldingShell.scala | 74 +- .../com/twitter/scalding/ShellPipe.scala | 18 +- .../scala/com/twitter/scalding/ReplTest.scala | 29 +- .../scalding/serialization/Boxed.scala | 28 +- .../scalding/serialization/Hasher.scala | 24 +- .../serialization/JavaStreamEnrichments.scala | 76 +- .../twitter/scalding/serialization/Laws.scala | 6 +- .../serialization/MurmurHashUtils.scala | 2 +- .../serialization/OrderedSerialization.scala | 148 +- .../serialization/PositionInputStream.scala | 6 +- .../scalding/serialization/Reader.scala | 16 +- .../serialization/Serialization.scala | 95 +- .../serialization/Serialization2.scala | 39 +- .../StringOrderedSerialization.scala | 37 +- .../serialization/UnsignedComparisons.scala | 14 +- .../scalding/serialization/Writer.scala | 17 +- .../macros/impl/BinaryOrdering.scala | 3 +- .../impl/OrderedBufferableProviderImpl.scala | 8 +- .../ordered_serialization/ProductLike.scala | 92 +- .../SealedTraitLike.scala | 177 +- .../TreeOrderedBuf.scala | 127 +- .../providers/CaseClassOrderedBuf.scala | 27 +- .../providers/CaseObjectOrderedBuf.scala | 3 +- .../providers/EitherOrderedBuf.scala | 16 +- .../providers/ImplicitOrderedBuf.scala | 6 +- .../providers/OptionOrderedBuf.scala | 10 +- .../providers/PrimitiveOrderedBuf.scala | 8 +- 
.../providers/ProductOrderedBuf.scala | 247 +- .../providers/SealedTraitOrderedBuf.scala | 28 +- .../StableKnownDirectSubclasses.scala | 9 +- .../providers/TraversablesOrderedBuf.scala | 29 +- .../runtime_helpers/LengthCalculations.scala | 11 +- .../MacroEqualityOrderedSerialization.scala | 8 +- .../runtime_helpers/TraversableHelpers.scala | 39 +- .../JavaStreamEnrichmentsProperties.scala | 41 +- .../SerializationProperties.scala | 7 +- .../UnsignedComparisonLaws.scala | 15 +- .../WriterReaderProperties.scala | 36 +- .../macros/MacroOrderingProperties.scala | 137 +- .../macros/TraversableHelperLaws.scala | 32 +- .../scalding/spark_backend/Iterators.scala | 46 +- .../twitter/scalding/spark_backend/Op.scala | 217 +- .../scalding/spark_backend/SparkBackend.scala | 71 +- .../scalding/spark_backend/SparkMode.scala | 76 +- .../scalding/spark_backend/SparkWriter.scala | 94 +- .../spark_backend/SparkBackendTests.scala | 165 +- .../scalding/thrift/macros/Macros.scala | 2 +- .../macros/RequiredBinaryComparators.scala | 11 +- ...oogeInternalOrderedSerializationImpl.scala | 24 +- .../ScroogeEnumOrderedBuf.scala | 4 +- .../ScroogeOrderedBuf.scala | 43 +- .../ScroogeOuterOrderedBuf.scala | 11 +- .../ScroogeUnionOrderedBuf.scala | 48 +- .../ordered_serialization/UnionLike.scala | 151 +- .../scalding/thrift/macros/PlatformTest.scala | 36 +- .../thrift/macros/ScroogeGenerators.scala | 179 +- .../macros/ScroogeMacrosUnitTests.scala | 5 +- .../scalding/thrift/macros/TestHelper.scala | 19 +- .../ExecutionTutorial.scala | 35 +- version.sbt | 2 +- 450 files changed, 25546 insertions(+), 15932 deletions(-) diff --git a/build.sbt b/build.sbt index 029f6f4e0f..a885b8341a 100644 --- a/build.sbt +++ b/build.sbt @@ -4,9 +4,9 @@ import scala.collection.JavaConverters._ import microsites.ExtraMdFileConfig def scalaBinaryVersion(scalaVersion: String) = scalaVersion match { - case version if version startsWith "2.11" => "2.11" - case version if version startsWith "2.12" => "2.12" - case _ => sys.error("unknown error") + case version if version.startsWith("2.11") => "2.11" + case version if version.startsWith("2.12") => "2.12" + case _ => sys.error("unknown error") } val algebirdVersion = "0.13.4" val apacheCommonsVersion = "2.2" val printDependencyClasspath = taskKey[Unit]("Prints location of the dependencie val sharedSettings = Seq( organization := "com.twitter", - scalaVersion := "2.11.12", - crossScalaVersions := Seq(scalaVersion.value, "2.12.14"), - javacOptions ++= Seq("-source", "1.8", "-target", "1.8"), - doc / javacOptions := Seq("-source", "1.8"), - Compile / compile / wartremoverErrors ++= Seq( Wart.OptionPartial, Wart.ExplicitImplicitTypes, Wart.LeakingSealed, Wart.Return, Wart.EitherProjectionPartial), - + Wart.OptionPartial, + Wart.ExplicitImplicitTypes, + Wart.LeakingSealed, + Wart.Return, + Wart.EitherProjectionPartial + ), libraryDependencies ++= Seq( "org.mockito" % "mockito-all" % "1.8.5" % "test", "org.scalacheck" %% "scalacheck" % scalaCheckVersion % "test", "org.scalatest" %% "scalatest" % scalaTestVersion % "test", "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "test", "com.novocode" % "junit-interface" % "0.10" % "test" ), - resolvers ++= Seq( Opts.resolver.sonatypeSnapshots, Opts.resolver.sonatypeReleases, - "Concurrent Maven Repo" at "https://conjars.org/repo", - "Twitter Maven" at "https://maven.twttr.com", - "Cloudera" at "https://repository.cloudera.com/artifactory/cloudera-repos/" + "Concurrent Maven 
Repo".at("https://conjars.org/repo"), + "Twitter Maven".at("https://maven.twttr.com"), + "Cloudera".at("https://repository.cloudera.com/artifactory/cloudera-repos/") ), - printDependencyClasspath := { val cp = (Compile / dependencyClasspath).value cp.foreach(f => println(s"${f.metadata.get(moduleID.key)} => ${f.data}")) }, - Test / fork := true, - updateOptions := updateOptions.value.withCachedResolution(true), - - update / aggregate := false, - + update / aggregate := false, Test / javaOptions ++= Seq("-Xmx2048m", "-XX:ReservedCodeCacheSize=384m", "-XX:MaxPermSize=384m"), - Global / concurrentRestrictions := Seq( Tags.limitAll(1) ), - Test / parallelExecution := false, - scalacOptions ++= Seq( - "-unchecked", - "-deprecation", - "-language:implicitConversions", - "-language:higherKinds", - "-language:existentials", - "-Ywarn-unused-import" - ), - + "-unchecked", + "-deprecation", + "-language:implicitConversions", + "-language:higherKinds", + "-language:existentials", + "-Ywarn-unused-import" + ), Compile / doc / scalacOptions ++= Seq(scalaVersion.value).flatMap { case v if v.startsWith("2.12") => Seq("-no-java-comments") //workaround for scala/scala-dev#249 - case _ => Seq() + case _ => Seq() }, // Enables full stack traces in scalatest @@ -118,7 +107,6 @@ val sharedSettings = Seq( publishMavenStyle := true, Test / publishArtifact := false, pomIncludeRepository := { x => false }, - releaseProcess := Seq[ReleaseStep]( checkSnapshotDependencies, inquireVersions, @@ -131,40 +119,38 @@ val sharedSettings = Seq( setNextVersion, commitNextVersion, ReleaseStep(action = Command.process("sonatypeReleaseAll", _)), - pushChanges), - + pushChanges + ), publishTo := Some( - if (version.value.trim.endsWith("SNAPSHOT")) - Opts.resolver.sonatypeSnapshots - else Opts.resolver.sonatypeStaging - ), + if (version.value.trim.endsWith("SNAPSHOT")) + Opts.resolver.sonatypeSnapshots + else Opts.resolver.sonatypeStaging + ), // Janino includes a broken signature, and is not needed: - assembly / assemblyExcludedJars := { - val excludes = Set("jsp-api-2.1-6.1.14.jar", "jsp-2.1-6.1.14.jar", - "jasper-compiler-5.5.12.jar", "janino-2.5.16.jar") - (assembly / fullClasspath).value filter { - jar => excludes(jar.data.getName) - } + assembly / assemblyExcludedJars := { + val excludes = + Set("jsp-api-2.1-6.1.14.jar", "jsp-2.1-6.1.14.jar", "jasper-compiler-5.5.12.jar", "janino-2.5.16.jar") + (assembly / fullClasspath).value.filter { jar => + excludes(jar.data.getName) + } }, // Some of these files have duplicates, let's ignore: - assembly / assemblyMergeStrategy := { - case s if s.endsWith(".class") => MergeStrategy.last - case s if s.endsWith("project.clj") => MergeStrategy.concat - case s if s.endsWith(".html") => MergeStrategy.last - case s if s.endsWith(".dtd") => MergeStrategy.last - case s if s.endsWith(".xsd") => MergeStrategy.last + assembly / assemblyMergeStrategy := { + case s if s.endsWith(".class") => MergeStrategy.last + case s if s.endsWith("project.clj") => MergeStrategy.concat + case s if s.endsWith(".html") => MergeStrategy.last + case s if s.endsWith(".dtd") => MergeStrategy.last + case s if s.endsWith(".xsd") => MergeStrategy.last case s if s.endsWith("pom.properties") => MergeStrategy.last - case s if s.endsWith("pom.xml") => MergeStrategy.last - case s if s.endsWith(".jnilib") => MergeStrategy.rename - case s if s.endsWith("jansi.dll") => MergeStrategy.rename - case s if s.endsWith("libjansi.so") => MergeStrategy.rename - case s if s.endsWith("properties") => MergeStrategy.filterDistinctLines - case x 
=> (assembly / assemblyMergeStrategy).value(x) + case s if s.endsWith("pom.xml") => MergeStrategy.last + case s if s.endsWith(".jnilib") => MergeStrategy.rename + case s if s.endsWith("jansi.dll") => MergeStrategy.rename + case s if s.endsWith("libjansi.so") => MergeStrategy.rename + case s if s.endsWith("properties") => MergeStrategy.filterDistinctLines + case x => (assembly / assemblyMergeStrategy).value(x) }, - - pomExtra := ( - https://github.com/twitter/scalding + pomExtra := (https://github.com/twitter/scalding Apache 2 @@ -196,66 +182,61 @@ val sharedSettings = Seq( ) ) ++ mimaDefaultSettings -lazy val scalding = Project( - id = "scalding", - base = file(".")) - .settings(sharedSettings ++ noPublishSettings) - .aggregate( - scaldingArgs, - scaldingDate, - scaldingQuotation, - scaldingCats, - scaldingCore, - scaldingCommons, - scaldingAvro, - scaldingParquet, - scaldingParquetScrooge, - scaldingHRaven, - scaldingRepl, - scaldingJson, - scaldingJdbc, - scaldingHadoopTest, - scaldingEstimatorsTest, - scaldingDb, - maple, - executionTutorial, - scaldingSerialization, - scaldingSpark, - scaldingBeam, - scaldingThriftMacros -) +lazy val scalding = Project(id = "scalding", base = file(".")) + .settings(sharedSettings ++ noPublishSettings) + .aggregate( + scaldingArgs, + scaldingDate, + scaldingQuotation, + scaldingCats, + scaldingCore, + scaldingCommons, + scaldingAvro, + scaldingParquet, + scaldingParquetScrooge, + scaldingHRaven, + scaldingRepl, + scaldingJson, + scaldingJdbc, + scaldingHadoopTest, + scaldingEstimatorsTest, + scaldingDb, + maple, + executionTutorial, + scaldingSerialization, + scaldingSpark, + scaldingBeam, + scaldingThriftMacros + ) -lazy val scaldingAssembly = Project( - id = "scalding-assembly", - base = file("assembly")) - .settings(sharedSettings ++ noPublishSettings) - .aggregate( - scaldingArgs, - scaldingDate, - scaldingQuotation, - scaldingCore, - scaldingCommons, - scaldingAvro, - scaldingParquet, - scaldingParquetScrooge, - scaldingHRaven, - scaldingRepl, - scaldingJson, - scaldingJdbc, - maple, - scaldingSerialization -) +lazy val scaldingAssembly = Project(id = "scalding-assembly", base = file("assembly")) + .settings(sharedSettings ++ noPublishSettings) + .aggregate( + scaldingArgs, + scaldingDate, + scaldingQuotation, + scaldingCore, + scaldingCommons, + scaldingAvro, + scaldingParquet, + scaldingParquetScrooge, + scaldingHRaven, + scaldingRepl, + scaldingJson, + scaldingJdbc, + maple, + scaldingSerialization + ) lazy val noPublishSettings = Seq( - publish := (()), - publishLocal := (()), - test := (()), - publishArtifact := false - ) + publish := (()), + publishLocal := (()), + test := (()), + publishArtifact := false +) /** - * This returns the youngest jar we released that is compatible with - * the current. + * This returns the youngest jar we released that is compatible with the current. 
*/ val ignoredModules = Set[String]("benchmarks") @@ -270,9 +251,8 @@ def youngestForwardCompatible(subProj: String) = def module(name: String) = { val id = "scalding-%s".format(name) - Project(id = id, base = file(id)).settings(sharedSettings ++ Seq( - Keys.name := id, - mimaPreviousArtifacts := youngestForwardCompatible(name).toSet) + Project(id = id, base = file(id)).settings( + sharedSettings ++ Seq(Keys.name := id, mimaPreviousArtifacts := youngestForwardCompatible(name).toSet) ) } @@ -289,12 +269,13 @@ lazy val cascadingJDBCVersion = lazy val scaldingBenchmarks = module("benchmarks") .settings( libraryDependencies ++= Seq( - "com.storm-enroute" %% "scalameter" % scalameterVersion % "test", - "org.scalacheck" %% "scalacheck" % scalaCheckVersion % "test" - ), + "com.storm-enroute" %% "scalameter" % scalameterVersion % "test", + "org.scalacheck" %% "scalacheck" % scalaCheckVersion % "test" + ), testFrameworks += new TestFramework("org.scalameter.ScalaMeterFramework"), Test / parallelExecution := false - ).dependsOn(scaldingCore) + ) + .dependsOn(scaldingCore) lazy val scaldingQuotation = module("quotation").settings( libraryDependencies ++= Seq( @@ -303,183 +284,203 @@ lazy val scaldingQuotation = module("quotation").settings( ) ) -lazy val scaldingCore = module("core").settings( - libraryDependencies ++= Seq( - "cascading" % "cascading-core" % cascadingVersion, - "cascading" % "cascading-hadoop" % cascadingVersion, - "cascading" % "cascading-local" % cascadingVersion, - "com.stripe" %% "dagon-core" % dagonVersion, - "com.twitter" % "chill-hadoop" % chillVersion, - "com.twitter" % "chill-java" % chillVersion, - "com.twitter" %% "chill-bijection" % chillVersion, - "com.twitter" %% "algebird-core" % algebirdVersion, - "com.twitter" %% "algebird-test" % algebirdVersion % "test", - "com.twitter" %% "bijection-core" % bijectionVersion, - "com.twitter" %% "bijection-macros" % bijectionVersion, - "com.twitter" %% "chill" % chillVersion, - "com.twitter" %% "chill-algebird" % chillVersion, - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", - "org.scala-lang" % "scala-library" % scalaVersion.value, - "org.scala-lang" % "scala-reflect" % scalaVersion.value, - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided"), - addCompilerPlugin("org.scalamacros" % "paradise" % paradiseVersion cross CrossVersion.full) -).dependsOn(scaldingArgs, scaldingDate, scaldingSerialization, maple, scaldingQuotation) - -lazy val scaldingCats = module("cats").settings( - libraryDependencies ++= Seq( - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", - "org.typelevel" %% "cats-core" % catsVersion, - "org.typelevel" %% "cats-laws" % catsVersion % "test", - "org.typelevel" %% "cats-effect" % catsEffectVersion, - "org.typelevel" %% "cats-effect-laws" % catsEffectVersion % "test" - )).dependsOn(scaldingArgs, scaldingDate, scaldingCore) +lazy val scaldingCore = module("core") + .settings( + libraryDependencies ++= Seq( + "cascading" % "cascading-core" % cascadingVersion, + "cascading" % "cascading-hadoop" % cascadingVersion, + "cascading" % "cascading-local" % cascadingVersion, + "com.stripe" %% "dagon-core" % dagonVersion, + "com.twitter" % "chill-hadoop" % chillVersion, + "com.twitter" % "chill-java" % chillVersion, + "com.twitter" %% "chill-bijection" % chillVersion, + "com.twitter" %% "algebird-core" % algebirdVersion, + "com.twitter" %% "algebird-test" % algebirdVersion % "test", + "com.twitter" %% "bijection-core" % bijectionVersion, + 
"com.twitter" %% "bijection-macros" % bijectionVersion, + "com.twitter" %% "chill" % chillVersion, + "com.twitter" %% "chill-algebird" % chillVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.scala-lang" % "scala-library" % scalaVersion.value, + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided" + ), + addCompilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)) + ) + .dependsOn(scaldingArgs, scaldingDate, scaldingSerialization, maple, scaldingQuotation) +lazy val scaldingCats = module("cats") + .settings( + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.typelevel" %% "cats-core" % catsVersion, + "org.typelevel" %% "cats-laws" % catsVersion % "test", + "org.typelevel" %% "cats-effect" % catsEffectVersion, + "org.typelevel" %% "cats-effect-laws" % catsEffectVersion % "test" + ) + ) + .dependsOn(scaldingArgs, scaldingDate, scaldingCore) -lazy val scaldingSpark = module("spark").settings( - libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-core" % sparkVersion, - "org.apache.spark" %% "spark-sql" % sparkVersion +lazy val scaldingSpark = module("spark") + .settings( + libraryDependencies ++= Seq( + "org.apache.spark" %% "spark-core" % sparkVersion, + "org.apache.spark" %% "spark-sql" % sparkVersion ) - ).dependsOn(scaldingCore) + ) + .dependsOn(scaldingCore) -lazy val scaldingBeam = module("beam").settings( - libraryDependencies ++= Seq( - "org.apache.beam" % "beam-sdks-java-core" % beamVersion, - "org.apache.beam" % "beam-sdks-java-extensions-google-cloud-platform-core" % beamVersion, - "org.apache.beam" % "beam-sdks-java-extensions-sorter" % beamVersion, - "org.apache.beam" % "beam-runners-direct-java" % beamVersion % "test", - // Including this dependency since scalding configuration depends on hadoop - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", +lazy val scaldingBeam = module("beam") + .settings( + libraryDependencies ++= Seq( + "org.apache.beam" % "beam-sdks-java-core" % beamVersion, + "org.apache.beam" % "beam-sdks-java-extensions-google-cloud-platform-core" % beamVersion, + "org.apache.beam" % "beam-sdks-java-extensions-sorter" % beamVersion, + "org.apache.beam" % "beam-runners-direct-java" % beamVersion % "test", + // Including this dependency since scalding configuration depends on hadoop + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided" + ) ) -).dependsOn(scaldingCore) + .dependsOn(scaldingCore) -lazy val scaldingCommons = module("commons").settings( - libraryDependencies ++= Seq( - // TODO: split into scalding-protobuf - "com.google.protobuf" % "protobuf-java" % protobufVersion, - "com.twitter" %% "bijection-core" % bijectionVersion, - "com.twitter" %% "algebird-core" % algebirdVersion, - "com.twitter" %% "chill" % chillVersion, - "com.twitter.elephantbird" % "elephant-bird-cascading2" % elephantbirdVersion, - "com.twitter.elephantbird" % "elephant-bird-core" % elephantbirdVersion, - "com.hadoop.gplcompression" % "hadoop-lzo" % hadoopLzoVersion, - // TODO: split this out into scalding-thrift - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", - "org.apache.thrift" % "libthrift" % thriftVersion, - // TODO: split this out into a scalding-scrooge - "com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided" - exclude("com.google.guava", "guava"), - "org.slf4j" % 
"slf4j-api" % slf4jVersion, - "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided", - "junit" % "junit" % junitVersion % "test" +lazy val scaldingCommons = module("commons") + .settings( + libraryDependencies ++= Seq( + // TODO: split into scalding-protobuf + "com.google.protobuf" % "protobuf-java" % protobufVersion, + "com.twitter" %% "bijection-core" % bijectionVersion, + "com.twitter" %% "algebird-core" % algebirdVersion, + "com.twitter" %% "chill" % chillVersion, + "com.twitter.elephantbird" % "elephant-bird-cascading2" % elephantbirdVersion, + "com.twitter.elephantbird" % "elephant-bird-core" % elephantbirdVersion, + "com.hadoop.gplcompression" % "hadoop-lzo" % hadoopLzoVersion, + // TODO: split this out into scalding-thrift + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.apache.thrift" % "libthrift" % thriftVersion, + // TODO: split this out into a scalding-scrooge + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + .exclude("com.google.guava", "guava"), + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided", + "junit" % "junit" % junitVersion % "test" + ) ) -).dependsOn(scaldingArgs, scaldingDate, scaldingCore, scaldingHadoopTest % "test") + .dependsOn(scaldingArgs, scaldingDate, scaldingCore, scaldingHadoopTest % "test") -lazy val scaldingAvro = module("avro").settings( - libraryDependencies ++= Seq( - "cascading.avro" % "avro-scheme" % cascadingAvroVersion, - "org.apache.avro" % "avro" % avroVersion, - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided" +lazy val scaldingAvro = module("avro") + .settings( + libraryDependencies ++= Seq( + "cascading.avro" % "avro-scheme" % cascadingAvroVersion, + "org.apache.avro" % "avro" % avroVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided" + ) ) -).dependsOn(scaldingCore) + .dependsOn(scaldingCore) lazy val scaldingParquetFixtures = module("parquet-fixtures") - .settings( - Test / scroogeThriftSourceFolder := baseDirectory.value / "src/test/resources", - Test / scroogeLanguages := Seq("java", "scala"), - libraryDependencies ++= Seq( - "com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided" - exclude("com.google.guava", "guava"), - "commons-lang" % "commons-lang" % apacheCommonsVersion, // needed for HashCodeBuilder used in thriftjava - "org.apache.thrift" % "libthrift" % thriftVersion - ) - ) - -lazy val scaldingParquet = module("parquet").settings( - libraryDependencies ++= Seq( - "org.apache.parquet" % "parquet-column" % parquetVersion, - "org.apache.parquet" % "parquet-hadoop" % parquetVersion, - "org.apache.parquet" % "parquet-thrift" % parquetVersion - // see https://issues.apache.org/jira/browse/PARQUET-143 for exclusions - exclude("org.apache.parquet", "parquet-pig") - exclude("com.twitter.elephantbird", "elephant-bird-pig") - exclude("com.twitter.elephantbird", "elephant-bird-core"), - "org.scala-lang" % "scala-compiler" % scalaVersion.value, - "org.apache.thrift" % "libthrift" % thriftVersion, - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", - "org.scala-lang" % "scala-reflect" % scalaVersion.value, - "com.twitter" %% "bijection-macros" % bijectionVersion, - "com.twitter" %% "chill-bijection" % chillVersion, - "com.twitter.elephantbird" % "elephant-bird-core" % elephantbirdVersion % "test" + .settings( + Test / 
scroogeThriftSourceFolder := baseDirectory.value / "src/test/resources", + Test / scroogeLanguages := Seq("java", "scala"), + libraryDependencies ++= Seq( + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + .exclude("com.google.guava", "guava"), + "commons-lang" % "commons-lang" % apacheCommonsVersion, // needed for HashCodeBuilder used in thriftjava + "org.apache.thrift" % "libthrift" % thriftVersion + ) + ) + +lazy val scaldingParquet = module("parquet") + .settings( + libraryDependencies ++= Seq( + "org.apache.parquet" % "parquet-column" % parquetVersion, + "org.apache.parquet" % "parquet-hadoop" % parquetVersion, + ("org.apache.parquet" % "parquet-thrift" % parquetVersion) + // see https://issues.apache.org/jira/browse/PARQUET-143 for exclusions + .exclude("org.apache.parquet", "parquet-pig") + .exclude("com.twitter.elephantbird", "elephant-bird-pig") + .exclude("com.twitter.elephantbird", "elephant-bird-core"), + "org.scala-lang" % "scala-compiler" % scalaVersion.value, + "org.apache.thrift" % "libthrift" % thriftVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "com.twitter" %% "bijection-macros" % bijectionVersion, + "com.twitter" %% "chill-bijection" % chillVersion, + "com.twitter.elephantbird" % "elephant-bird-core" % elephantbirdVersion % "test" ), - addCompilerPlugin("org.scalamacros" % "paradise" % paradiseVersion cross CrossVersion.full)) + addCompilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)) + ) .dependsOn(scaldingCore, scaldingHadoopTest % "test", scaldingParquetFixtures % "test->test") - - lazy val scaldingParquetScroogeFixtures = module("parquet-scrooge-fixtures") .settings( Test / scroogeThriftSourceFolder := baseDirectory.value / "src/test/resources", Test / scroogeLanguages := Seq("java", "scala"), libraryDependencies ++= Seq( - "com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided" - exclude("com.google.guava", "guava"), + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + .exclude("com.google.guava", "guava"), "commons-lang" % "commons-lang" % apacheCommonsVersion, // needed for HashCodeBuilder used in thriftjava "org.apache.thrift" % "libthrift" % thriftVersion + ) ) -) lazy val scaldingParquetScrooge = module("parquet-scrooge") .settings( libraryDependencies ++= Seq( "org.slf4j" % "slf4j-api" % slf4jVersion, // see https://issues.apache.org/jira/browse/PARQUET-143 for exclusions - "org.apache.parquet" % "parquet-thrift" % parquetVersion % "test" classifier "tests" - exclude("org.apache.parquet", "parquet-pig") - exclude("com.twitter.elephantbird", "elephant-bird-pig") - exclude("com.twitter.elephantbird", "elephant-bird-core"), - "com.twitter" %% "scrooge-serializer" % scroogeVersion - exclude("com.google.guava", "guava"), + ("org.apache.parquet" % "parquet-thrift" % parquetVersion % "test") + .classifier("tests") + .exclude("org.apache.parquet", "parquet-pig") + .exclude("com.twitter.elephantbird", "elephant-bird-pig") + .exclude("com.twitter.elephantbird", "elephant-bird-core"), + ("com.twitter" %% "scrooge-serializer" % scroogeVersion) + .exclude("com.google.guava", "guava"), "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", "com.novocode" % "junit-interface" % "0.11" % "test", "junit" % "junit" % junitVersion % "test" - ) -).dependsOn(scaldingCore, scaldingParquet % "compile->compile;test->test", 
scaldingParquetScroogeFixtures % "test->test") + ) + .dependsOn( + scaldingCore, + scaldingParquet % "compile->compile;test->test", + scaldingParquetScroogeFixtures % "test->test" + ) -lazy val scaldingHRaven = module("hraven").settings( - libraryDependencies ++= Seq( - "com.twitter.hraven" % "hraven-core" % hravenVersion - // These transitive dependencies cause sbt to give a ResolveException - // because they're not available on Maven. We don't need them anyway. - // See https://github.com/twitter/cassie/issues/13 - exclude("javax.jms", "jms") - exclude("com.sun.jdmk", "jmxtools") - exclude("com.sun.jmx", "jmxri") - - // These transitive dependencies of hRaven cause conflicts when - // running scalding-hraven/*assembly and aren't needed - // for the part of the hRaven API that we use anyway - exclude("com.twitter.common", "application-module-log") - exclude("com.twitter.common", "application-module-stats") - exclude("com.twitter.common", "args") - exclude("com.twitter.common", "application") - // Excluding this dependencies because they get resolved to incorrect version, - // and not needed during compilation. - exclude("com.twitter", "util-registry_2.10") - exclude("com.twitter", "util-core_2.10") - exclude("com.twitter", "util-jvm_2.10"), - "org.apache.hbase" % "hbase" % hbaseVersion, - "org.apache.hbase" % "hbase-client" % hbaseVersion % "provided", - "org.apache.hbase" % "hbase-common" % hbaseVersion % "provided", - "org.apache.hbase" % "hbase-server" % hbaseVersion % "provided", - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided" +lazy val scaldingHRaven = module("hraven") + .settings( + libraryDependencies ++= Seq( + ("com.twitter.hraven" % "hraven-core" % hravenVersion) + // These transitive dependencies cause sbt to give a ResolveException + // because they're not available on Maven. We don't need them anyway. + // See https://github.com/twitter/cassie/issues/13 + .exclude("javax.jms", "jms") + .exclude("com.sun.jdmk", "jmxtools") + .exclude("com.sun.jmx", "jmxri") + + // These transitive dependencies of hRaven cause conflicts when + // running scalding-hraven/*assembly and aren't needed + // for the part of the hRaven API that we use anyway + .exclude("com.twitter.common", "application-module-log") + .exclude("com.twitter.common", "application-module-stats") + .exclude("com.twitter.common", "args") + .exclude("com.twitter.common", "application") + // Excluding this dependencies because they get resolved to incorrect version, + // and not needed during compilation. 
+ .exclude("com.twitter", "util-registry_2.10") + .exclude("com.twitter", "util-core_2.10") + .exclude("com.twitter", "util-jvm_2.10"), + "org.apache.hbase" % "hbase" % hbaseVersion, + "org.apache.hbase" % "hbase-client" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-common" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-server" % hbaseVersion % "provided", + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided" + ) ) -).dependsOn(scaldingCore) + .dependsOn(scaldingCore) lazy val scaldingRepl = module("repl") .settings( @@ -495,77 +496,92 @@ lazy val scaldingRepl = module("repl") "org.scala-lang" % "scala-library" % scalaVersion.value, "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided", + "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided" ) -).dependsOn(scaldingCore) -.settings(inConfig(Compile)(Classpaths.configSettings ++ Seq( - // This is needed to make "provided" dependencies presented in repl, - // solution borrowed from: http://stackoverflow.com/a/18839656/1404395 - run := Defaults.runTask(Compile / fullClasspath, Compile / run / mainClass, Compile / run / runner).evaluated, - // we need to fork repl task, because scala repl doesn't work well with sbt classloaders. - run / fork := true, - run / connectInput := true, - run / outputStrategy := Some(OutputStrategy.StdoutOutput) -)): _*) + ) + .dependsOn(scaldingCore) + .settings( + inConfig(Compile)( + Classpaths.configSettings ++ Seq( + // This is needed to make "provided" dependencies presented in repl, + // solution borrowed from: http://stackoverflow.com/a/18839656/1404395 + run := Defaults + .runTask(Compile / fullClasspath, Compile / run / mainClass, Compile / run / runner) + .evaluated, + // we need to fork repl task, because scala repl doesn't work well with sbt classloaders. 
+ run / fork := true, + run / connectInput := true, + run / outputStrategy := Some(OutputStrategy.StdoutOutput) + ) + ): _* + ) // zero dependency serialization module lazy val scaldingSerialization = module("serialization").settings( libraryDependencies ++= Seq( "org.scala-lang" % "scala-reflect" % scalaVersion.value ), -addCompilerPlugin("org.scalamacros" % "paradise" % paradiseVersion cross CrossVersion.full) + addCompilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)) ) -lazy val scaldingJson = module("json").settings( - libraryDependencies ++= Seq( - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", - "com.fasterxml.jackson.module" %% "jackson-module-scala" % jacksonVersion, - "org.json4s" %% "json4s-native" % json4SVersion, - "com.twitter.elephantbird" % "elephant-bird-cascading2" % elephantbirdVersion % "provided" +lazy val scaldingJson = module("json") + .settings( + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "com.fasterxml.jackson.module" %% "jackson-module-scala" % jacksonVersion, + "org.json4s" %% "json4s-native" % json4SVersion, + "com.twitter.elephantbird" % "elephant-bird-cascading2" % elephantbirdVersion % "provided" ) -).dependsOn(scaldingCore) + ) + .dependsOn(scaldingCore) -lazy val scaldingJdbc = module("jdbc").settings( - libraryDependencies ++= Seq( - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", - "cascading" % "cascading-jdbc-core" % cascadingJDBCVersion, - "cascading" % "cascading-jdbc-mysql" % cascadingJDBCVersion +lazy val scaldingJdbc = module("jdbc") + .settings( + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "cascading" % "cascading-jdbc-core" % cascadingJDBCVersion, + "cascading" % "cascading-jdbc-mysql" % cascadingJDBCVersion + ) ) -).dependsOn(scaldingCore) + .dependsOn(scaldingCore) -lazy val scaldingHadoopTest = module("hadoop-test").settings( - libraryDependencies ++= Seq( - "org.apache.hadoop" % "hadoop-client" % hadoopVersion, - "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion, - "org.apache.hadoop" % "hadoop-yarn-server-tests" % hadoopVersion classifier "tests", - "org.apache.hadoop" % "hadoop-yarn-server" % hadoopVersion, - "org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion classifier "tests", - "org.apache.hadoop" % "hadoop-common" % hadoopVersion classifier "tests", - "org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion classifier "tests", - "com.twitter" %% "chill-algebird" % chillVersion, - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.slf4j" % "slf4j-log4j12" % slf4jVersion, - "org.scalacheck" %% "scalacheck" % scalaCheckVersion, - "org.scalatest" %% "scalatest" % scalaTestVersion +lazy val scaldingHadoopTest = module("hadoop-test") + .settings( + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion, + "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion, + ("org.apache.hadoop" % "hadoop-yarn-server-tests" % hadoopVersion).classifier("tests"), + "org.apache.hadoop" % "hadoop-yarn-server" % hadoopVersion, + ("org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-common" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion).classifier("tests"), + "com.twitter" %% "chill-algebird" % chillVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % 
slf4jVersion, + "org.scalacheck" %% "scalacheck" % scalaCheckVersion, + "org.scalatest" %% "scalatest" % scalaTestVersion + ) ) -).dependsOn(scaldingCore, scaldingSerialization) + .dependsOn(scaldingCore, scaldingSerialization) -lazy val scaldingEstimatorsTest = module("estimators-test").settings( - libraryDependencies ++= Seq( - "org.apache.hadoop" % "hadoop-client" % hadoopVersion, - "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion, - "org.apache.hadoop" % "hadoop-yarn-server-tests" % hadoopVersion classifier "tests", - "org.apache.hadoop" % "hadoop-yarn-server" % hadoopVersion, - "org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion classifier "tests", - "org.apache.hadoop" % "hadoop-common" % hadoopVersion classifier "tests", - "org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion classifier "tests", - "com.twitter" %% "chill-algebird" % chillVersion, - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.slf4j" % "slf4j-log4j12" % slf4jVersion, - "org.scalacheck" %% "scalacheck" % scalaCheckVersion, - "org.scalatest" %% "scalatest" % scalaTestVersion +lazy val scaldingEstimatorsTest = module("estimators-test") + .settings( + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion, + "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion, + ("org.apache.hadoop" % "hadoop-yarn-server-tests" % hadoopVersion).classifier("tests"), + "org.apache.hadoop" % "hadoop-yarn-server" % hadoopVersion, + ("org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-common" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion).classifier("tests"), + "com.twitter" %% "chill-algebird" % chillVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion, + "org.scalacheck" %% "scalacheck" % scalaCheckVersion, + "org.scalatest" %% "scalatest" % scalaTestVersion + ) ) -).dependsOn(scaldingHadoopTest % "test") + .dependsOn(scaldingHadoopTest % "test") // This one uses a different naming convention lazy val maple = Project( @@ -573,19 +589,20 @@ lazy val maple = Project( base = file("maple") ).settings( sharedSettings ++ Seq( - name := "maple", - mimaPreviousArtifacts := Set.empty, - crossPaths := false, - autoScalaLibrary := false, - publishArtifact := true, - libraryDependencies ++= Seq( - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", - "org.apache.hbase" % "hbase" % hbaseVersion % "provided", - "org.apache.hbase" % "hbase-client" % hbaseVersion % "provided", - "org.apache.hbase" % "hbase-common" % hbaseVersion % "provided", - "org.apache.hbase" % "hbase-server" % hbaseVersion % "provided", - "cascading" % "cascading-hadoop" % cascadingVersion - )) + name := "maple", + mimaPreviousArtifacts := Set.empty, + crossPaths := false, + autoScalaLibrary := false, + publishArtifact := true, + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.apache.hbase" % "hbase" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-client" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-common" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-server" % hbaseVersion % "provided", + "cascading" % "cascading-hadoop" % cascadingVersion + ) + ) ) lazy val executionTutorial = Project( @@ -593,80 +610,85 @@ lazy val executionTutorial = Project( base = file("tutorial/execution-tutorial") ).settings( sharedSettings ++ Seq( - name := 
"execution-tutorial", - libraryDependencies ++= Seq( - "org.scala-lang" % "scala-library" % scalaVersion.value, - "org.scala-lang" % "scala-reflect" % scalaVersion.value, - "org.apache.hadoop" % "hadoop-client" % hadoopVersion, - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.slf4j" % "slf4j-log4j12" % slf4jVersion, - "cascading" % "cascading-hadoop" % cascadingVersion - )) + name := "execution-tutorial", + libraryDependencies ++= Seq( + "org.scala-lang" % "scala-library" % scalaVersion.value, + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion, + "cascading" % "cascading-hadoop" % cascadingVersion + ) + ) ).dependsOn(scaldingCore) -lazy val scaldingDb = module("db").settings( - libraryDependencies ++= Seq( - "org.scala-lang" % "scala-library" % scalaVersion.value, - "org.scala-lang" % "scala-reflect" % scalaVersion.value, - "com.twitter" %% "bijection-macros" % bijectionVersion - ), - addCompilerPlugin("org.scalamacros" % "paradise" % paradiseVersion cross CrossVersion.full) -).dependsOn(scaldingCore) +lazy val scaldingDb = module("db") + .settings( + libraryDependencies ++= Seq( + "org.scala-lang" % "scala-library" % scalaVersion.value, + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "com.twitter" %% "bijection-macros" % bijectionVersion + ), + addCompilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)) + ) + .dependsOn(scaldingCore) lazy val scaldingThriftMacrosFixtures = module("thrift-macros-fixtures") .settings( Test / scroogeThriftSourceFolder := baseDirectory.value / "src/test/resources", libraryDependencies ++= Seq( - "com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided" - exclude("com.google.guava", "guava"), + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + .exclude("com.google.guava", "guava"), "org.apache.thrift" % "libthrift" % thriftVersion ) -) + ) lazy val scaldingThriftMacros = module("thrift-macros") .settings( - libraryDependencies ++= Seq( - "org.scala-lang" % "scala-reflect" % scalaVersion.value, - "com.twitter" %% "bijection-macros" % bijectionVersion, - "com.twitter" % "chill-thrift" % chillVersion % "test", - "com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided" - exclude("com.google.guava", "guava"), - "org.apache.thrift" % "libthrift" % thriftVersion, - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "test", - "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion % "test", - "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "test", - "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion % "test", - "org.apache.hadoop" % "hadoop-yarn-server-tests" % hadoopVersion classifier "tests", - "org.apache.hadoop" % "hadoop-yarn-server" % hadoopVersion % "test", - "org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion classifier "tests", - "org.apache.hadoop" % "hadoop-common" % hadoopVersion classifier "tests", - "org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion classifier "tests" - ), - addCompilerPlugin("org.scalamacros" % "paradise" % paradiseVersion cross CrossVersion.full) -).dependsOn( + libraryDependencies ++= Seq( + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "com.twitter" %% "bijection-macros" % bijectionVersion, + "com.twitter" % "chill-thrift" % chillVersion % "test", + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + 
.exclude("com.google.guava", "guava"), + "org.apache.thrift" % "libthrift" % thriftVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "test", + "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion % "test", + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "test", + "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion % "test", + ("org.apache.hadoop" % "hadoop-yarn-server-tests" % hadoopVersion).classifier("tests"), + "org.apache.hadoop" % "hadoop-yarn-server" % hadoopVersion % "test", + ("org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-common" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion).classifier("tests") + ), + addCompilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)) + ) + .dependsOn( scaldingCore, scaldingHadoopTest % "test", scaldingSerialization, - scaldingThriftMacrosFixtures % "test->test") + scaldingThriftMacrosFixtures % "test->test" + ) def docsSourcesAndProjects(sv: String): Seq[ProjectReference] = - Seq( - scaldingArgs, - scaldingDate, - scaldingCore - // scaldingCommons, - // scaldingAvro, - // scaldingParquet, - // scaldingParquetScrooge, - // scaldingHRaven, - // scaldingRepl, - // scaldingJson, - // scaldingJdbc, - // scaldingDb, - // maple, - // scaldingSerialization, - // scaldingThriftMacros - ) + Seq( + scaldingArgs, + scaldingDate, + scaldingCore + // scaldingCommons, + // scaldingAvro, + // scaldingParquet, + // scaldingParquetScrooge, + // scaldingHRaven, + // scaldingRepl, + // scaldingJson, + // scaldingJdbc, + // scaldingDb, + // maple, + // scaldingSerialization, + // scaldingThriftMacros + ) lazy val docsMappingsAPIDir = settingKey[String]("Name of subdirectory in site target directory for api docs") @@ -681,7 +703,7 @@ lazy val docSettings = Seq( micrositeGithubOwner := "twitter", micrositeExtraMdFiles := Map(file("CONTRIBUTING.md") -> ExtraMdFileConfig("contributing.md", "home")), micrositeGithubRepo := "scalding", - micrositePalette := Map( + micrositePalette := Map( "brand-primary" -> "#5B5988", "brand-secondary" -> "#292E53", "brand-tertiary" -> "#222749", @@ -689,24 +711,26 @@ lazy val docSettings = Seq( "gray" -> "#7B7B7E", "gray-light" -> "#E5E5E6", "gray-lighter" -> "#F4F3F4", - "white-color" -> "#FFFFFF"), + "white-color" -> "#FFFFFF" + ), autoAPIMappings := true, ScalaUnidoc / unidoc / unidocProjectFilter := - inProjects(docsSourcesAndProjects(scalaVersion.value):_*), + inProjects(docsSourcesAndProjects(scalaVersion.value): _*), docsMappingsAPIDir := "api", addMappingsToSiteDir(ScalaUnidoc / packageDoc / mappings, docsMappingsAPIDir), ghpagesNoJekyll := false, ScalaUnidoc / unidoc / fork := true, ScalaUnidoc / unidoc / scalacOptions ++= Seq( - "-doc-source-url", "https://github.com/twitter/scalding/tree/develop€{FILE_PATH}.scala", - "-sourcepath", (LocalRootProject / baseDirectory).value.getAbsolutePath, + "-doc-source-url", + "https://github.com/twitter/scalding/tree/develop€{FILE_PATH}.scala", + "-sourcepath", + (LocalRootProject / baseDirectory).value.getAbsolutePath, "-diagrams" ), mdocIn := new File((LocalRootProject / baseDirectory).value, "docs/src"), git.remoteRepo := "git@github.com:twitter/scalding.git", makeSite / includeFilter := "*.html" | "*.css" | "*.png" | "*.jpg" | "*.gif" | "*.js" | "*.swf" | "*.yml" | "*.md" - ) - +) // Documentation is generated for projects defined in // `docsSourcesAndProjects`. 
diff --git a/project/plugins.sbt b/project/plugins.sbt index 755c21658c..7f195f5ab5 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,20 +1,20 @@ resolvers ++= Seq( - "jgit-repo" at "https://download.eclipse.org/jgit/maven", - "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases", - "Twitter Maven" at "https://maven.twttr.com" + "jgit-repo".at("https://download.eclipse.org/jgit/maven"), + "sonatype-releases".at("https://oss.sonatype.org/content/repositories/releases"), + "Twitter Maven".at("https://maven.twttr.com") ) -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") -addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3") -addSbtPlugin("com.47deg" % "sbt-microsites" % "1.3.4") -addSbtPlugin("com.github.sbt" % "sbt-release" % "1.1.0") -addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") -addSbtPlugin("com.twitter" %% "scrooge-sbt-plugin" % "18.9.0") -addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.14") -addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") -addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") -addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3") -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.8.2") -addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.7") -addSbtPlugin("org.wartremover" % "sbt-wartremover" % "2.4.16") -addSbtPlugin("org.scalameta" % "sbt-mdoc" % "2.2.22") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") +addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3") +addSbtPlugin("com.47deg" % "sbt-microsites" % "1.3.4") +addSbtPlugin("com.github.sbt" % "sbt-release" % "1.1.0") +addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") +addSbtPlugin("com.twitter" %% "scrooge-sbt-plugin" % "18.9.0") +addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.14") +addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") +addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3") +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.8.2") +addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.7") +addSbtPlugin("org.wartremover" % "sbt-wartremover" % "2.4.16") +addSbtPlugin("org.scalameta" % "sbt-mdoc" % "2.2.22") diff --git a/scalding-args/src/main/scala/com/twitter/scalding/Args.scala b/scalding-args/src/main/scala/com/twitter/scalding/Args.scala index a069a1ce56..a9199756ae 100644 --- a/scalding-args/src/main/scala/com/twitter/scalding/Args.scala +++ b/scalding-args/src/main/scala/com/twitter/scalding/Args.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import scala.util.control.NonFatal @@ -20,9 +20,8 @@ import scala.util.control.NonFatal case class ArgsException(message: String) extends RuntimeException(message) /** - * The args class does a simple command line parsing. The rules are: - * keys start with one or more "-". Each key has zero or more values - * following. + * The args class does a simple command line parsing. The rules are: keys start with one or more "-". Each key + * has zero or more values following. */ object Args { val empty: Args = new Args(Map.empty) @@ -33,36 +32,35 @@ object Args { def apply(argString: String): Args = Args(argString.split("\\s+")) /** - * parses keys as starting with a dash, except single dashed digits. 
- * All following non-dashed args are a list of values. - * If the list starts with non-dashed args, these are associated with the - * empty string: "" + * parses keys as starting with a dash, except single dashed digits. All following non-dashed args are a + * list of values. If the list starts with non-dashed args, these are associated with the empty string: "" */ def apply(args: Iterable[String]): Args = { - def startingDashes(word: String) = word.takeWhile { _ == '-' }.length + def startingDashes(word: String) = word.takeWhile(_ == '-').length new Args( //Fold into a list of (arg -> List[values]) args - .filter{ a => !a.matches("\\s*") } + .filter(a => !a.matches("\\s*")) .foldLeft(List("" -> List[String]())) { (acc, arg) => - val noDashes = arg.dropWhile{ _ == '-' } + val noDashes = arg.dropWhile(_ == '-') if (arg == noDashes || isNumber(arg)) (acc.head._1 -> (arg :: acc.head._2)) :: acc.tail else (noDashes -> List()) :: acc } //Now reverse the values to keep the same order - .map { case (key, value) => key -> value.reverse }.toMap) + .map { case (key, value) => key -> value.reverse } + .toMap + ) } - def isNumber(arg: String): Boolean = { + def isNumber(arg: String): Boolean = try { arg.toDouble true } catch { case e: NumberFormatException => false } - } /** * By default, scalding will use reflection to try and identify classes to tokenize. Set to false to disable @@ -81,10 +79,9 @@ class Args(val m: Map[String, List[String]]) extends java.io.Serializable { def boolean(key: String): Boolean = m.contains(key) /** - * Get the list of values associated with a given key. - * if the key is absent, return the empty list. NOTE: empty - * does not mean the key is absent, it could be a key without - * a value. Use boolean() to check existence. + * Get the list of values associated with a given key. if the key is absent, return the empty list. NOTE: + * empty does not mean the key is absent, it could be a key without a value. Use boolean() to check + * existence. */ def list(key: String): List[String] = m.get(key).getOrElse(List()) @@ -111,13 +108,12 @@ class Args(val m: Map[String, List[String]]) extends java.io.Serializable { */ def apply(position: Int): String = required(position) - override def equals(other: Any): Boolean = { + override def equals(other: Any): Boolean = if (other.isInstanceOf[Args]) { other.asInstanceOf[Args].m.equals(m) } else { false } - } override def hashCode(): Int = m.hashCode() @@ -127,35 +123,34 @@ class Args(val m: Map[String, List[String]]) extends java.io.Serializable { def getOrElse(key: String, default: String): String = optional(key).getOrElse(default) /** - * return exactly one value for a given key. - * If there is more than one value, you get an exception + * return exactly one value for a given key. 
If there is more than one value, you get an exception */ def required(key: String): String = list(key) match { - case List() => throw ArgsException("Please provide a value for --" + key) + case List() => throw ArgsException("Please provide a value for --" + key) case List(a) => a - case _ => throw ArgsException("Please only provide a single value for --" + key) + case _ => throw ArgsException("Please only provide a single value for --" + key) } - def toList: List[String] = { + def toList: List[String] = m.foldLeft(List[String]()) { (args, kvlist) => val k = kvlist._1 val values = kvlist._2 if (k != "") { //Make sure positional args are first - args ++ ((("--" + k) :: values)) + args ++ (("--" + k) :: values) } else { // These are positional args (no key), put them first: values ++ args } } - } /** - * Asserts whether all the args belong to the given set of accepted arguments. - * If an arg does not belong to the given set, you get an error. + * Asserts whether all the args belong to the given set of accepted arguments. If an arg does not belong to + * the given set, you get an error. */ def restrictTo(acceptedArgs: Set[String]): Unit = { - val invalidArgs = m.keySet.filter(!_.startsWith("scalding.")) -- (acceptedArgs + "" + "tool.graph" + "hdfs" + "local") + val invalidArgs = + m.keySet.filter(!_.startsWith("scalding.")) -- (acceptedArgs + "" + "tool.graph" + "hdfs" + "local") if (!invalidArgs.isEmpty) throw ArgsException("Invalid args: " + invalidArgs.map("--" + _).mkString(", ")) } @@ -163,64 +158,84 @@ class Args(val m: Map[String, List[String]]) extends java.io.Serializable { override def toString: String = toList.mkString(" ") /** - * If there is zero or one element, return it as an Option. - * If there is a list of more than one item, you get an error + * If there is zero or one element, return it as an Option. 
If there is a list of more than one item, you + * get an error */ def optional(key: String): Option[String] = list(key) match { - case List() => None + case List() => None case List(a) => Some(a) - case _ => throw ArgsException("Please provide at most one value for --" + key) + case _ => throw ArgsException("Please provide at most one value for --" + key) } - def int(key: String, default: Int): Int = { - optional(key).map(value => try value.toInt catch { - case NonFatal(_) => throw ArgsException(s"Invalid value ${value} for -- ${key}") - }).getOrElse(default) - } + def int(key: String, default: Int): Int = + optional(key) + .map(value => + try value.toInt + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + ) + .getOrElse(default) def int(key: String): Int = { val value = required(key) - try value.toInt catch { - case NonFatal(_) => throw ArgsException(s"Invalid value ${value} for -- ${key}") + try value.toInt + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") } } - def long(key: String, default: Long): Long = { - optional(key).map(value => try value.toLong catch { - case NonFatal(_) => throw ArgsException(s"Invalid value ${value} for -- ${key}") - }).getOrElse(default) - } + def long(key: String, default: Long): Long = + optional(key) + .map(value => + try value.toLong + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + ) + .getOrElse(default) def long(key: String): Long = { val value = required(key) - try value.toLong catch { - case NonFatal(_) => throw ArgsException(s"Invalid value ${value} for -- ${key}") + try value.toLong + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") } } - def float(key: String, default: Float): Float = { - optional(key).map(value => try value.toFloat catch { - case NonFatal(_) => throw ArgsException(s"Invalid value ${value} for -- ${key}") - }).getOrElse(default) - } + def float(key: String, default: Float): Float = + optional(key) + .map(value => + try value.toFloat + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + ) + .getOrElse(default) def float(key: String): Float = { val value = required(key) - try value.toFloat catch { - case NonFatal(_) => throw ArgsException(s"Invalid value ${value} for -- ${key}") + try value.toFloat + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") } } - def double(key: String, default: Double): Double = { - optional(key).map(value => try value.toDouble catch { - case NonFatal(_) => throw ArgsException(s"Invalid value ${value} for -- ${key}") - }).getOrElse(default) - } + def double(key: String, default: Double): Double = + optional(key) + .map(value => + try value.toDouble + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + ) + .getOrElse(default) def double(key: String): Double = { val value = required(key) - try value.toDouble catch { - case NonFatal(_) => throw ArgsException(s"Invalid value ${value} for -- ${key}") + try value.toDouble + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") } } } diff --git a/scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala b/scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala index 65cb5585ea..319c9d6f09 100644 --- a/scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala +++ 
b/scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding @@ -23,35 +23,33 @@ object RangedArgs { case class Range[T](lower: T, upper: T)(implicit ord: Ordering[T]) { assert(ord.lteq(lower, upper), "Bad range: " + lower + " > " + upper) - def assertLowerBound(min: T): Unit = { + def assertLowerBound(min: T): Unit = assert(ord.lteq(min, lower), "Range out of bounds: " + lower + " < " + min) - } - def assertUpperBound(max: T): Unit = { + def assertUpperBound(max: T): Unit = assert(ord.gteq(max, upper), "Range out of bounds: " + upper + " > " + max) - } def assertBounds(min: T, max: T): Unit = { assertLowerBound(min) assertUpperBound(max) } - def mkString(sep: String) = { + def mkString(sep: String) = if (ord.equiv(lower, upper)) { lower.toString } else { lower.toString + sep + upper.toString } - } } class RangedArgs(args: Args) { - def range[T](argName: String)(cnv: String => T)(implicit ord: Ordering[T]): Range[T] = args.list(argName) match { - case List(v) => - Range(cnv(v), cnv(v)) - case List(v1, v2) => - Range(cnv(v1), cnv(v2)) - case _ => - throw new IllegalArgumentException(argName + " must have either 1 or 2 values specified") - } + def range[T](argName: String)(cnv: String => T)(implicit ord: Ordering[T]): Range[T] = + args.list(argName) match { + case List(v) => + Range(cnv(v), cnv(v)) + case List(v1, v2) => + Range(cnv(v1), cnv(v2)) + case _ => + throw new IllegalArgumentException(argName + " must have either 1 or 2 values specified") + } } diff --git a/scalding-args/src/test/scala/com/twitter/scalding/ArgTest.scala b/scalding-args/src/test/scala/com/twitter/scalding/ArgTest.scala index 367c596be9..15ff4c8dae 100644 --- a/scalding-args/src/test/scala/com/twitter/scalding/ArgTest.scala +++ b/scalding-args/src/test/scala/com/twitter/scalding/ArgTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import org.scalatest.WordSpec @@ -121,38 +121,40 @@ class ArgTest extends WordSpec { "verify that args belong to an accepted key set" in { val a = Args("a --one --two b --three c d --scalding.tool.mode") a.restrictTo(Set("one", "two", "three", "four")) - intercept[RuntimeException] { a.restrictTo(Set("one", "two")) } + intercept[RuntimeException](a.restrictTo(Set("one", "two"))) } "correctly parse numeric args" in { - val map = Args(Array("--anInt", "-1", "--aLong", "21474836470", "--aDecimal", "3.141592654", "--aString", "foo")) + val map = Args( + Array("--anInt", "-1", "--aLong", "21474836470", "--aDecimal", "3.141592654", "--aString", "foo") + ) assert(map.int("anInt") == "-1".toInt) assert(map.int("anInt", 2) == "-1".toInt) assert(map.int("nothing", 2) == 2) - intercept[RuntimeException] { map.int("nothing") } - intercept[RuntimeException] { map.int("aString") } - intercept[RuntimeException] { map.int("aString", 2) } + intercept[RuntimeException](map.int("nothing")) + intercept[RuntimeException](map.int("aString")) + intercept[RuntimeException](map.int("aString", 2)) assert(map.long("aLong") == "21474836470".toLong) assert(map.long("anInt", 2L) == "-1".toLong) assert(map.long("nothing", 2L) == 2L) - intercept[RuntimeException] { map.long("nothing") } - intercept[RuntimeException] { map.long("aString") } - intercept[RuntimeException] { map.long("aString", 2L) } + intercept[RuntimeException](map.long("nothing")) + intercept[RuntimeException](map.long("aString")) + intercept[RuntimeException](map.long("aString", 2L)) assert(map.float("aDecimal") == "3.141592654".toFloat) assert(map.float("aDecimal", 2.71828f) == "3.141592654".toFloat) assert(map.float("nothing", 2.71828f) == 2.71828f) - intercept[RuntimeException] { map.float("nothing") } - intercept[RuntimeException] { map.float("aString") } - intercept[RuntimeException] { map.float("aString", 2.71828f) } + intercept[RuntimeException](map.float("nothing")) + intercept[RuntimeException](map.float("aString")) + intercept[RuntimeException](map.float("aString", 2.71828f)) assert(map.double("aDecimal") == "3.141592654".toDouble) assert(map.double("aDecimal", 2.71828d) == "3.141592654".toDouble) assert(map.double("nothing", 2.71828d) == 2.71828d) - intercept[RuntimeException] { map.double("nothing") } - intercept[RuntimeException] { map.double("aString") } - intercept[RuntimeException] { map.double("aString", 2.71828d) } + intercept[RuntimeException](map.double("nothing")) + intercept[RuntimeException](map.double("aString")) + intercept[RuntimeException](map.double("aString", 2.71828d)) } } } diff --git a/scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala b/scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala index 079384398f..e2b05305d2 100644 --- a/scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala +++ b/scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding @@ -29,19 +29,19 @@ class RangeSpecs extends WordSpec { "throw errors for misordered ranges" in { Range(4, 4) - intercept[AssertionError] { Range(5, 4) } + intercept[AssertionError](Range(5, 4)) } "assert lower bounds" in { testRange.assertLowerBound(3) testRange.assertLowerBound(4) - intercept[AssertionError] { testRange.assertLowerBound(5) } + intercept[AssertionError](testRange.assertLowerBound(5)) } "assert upper bounds" in { testRange.assertUpperBound(6) testRange.assertUpperBound(5) - intercept[AssertionError] { testRange.assertUpperBound(4) } + intercept[AssertionError](testRange.assertUpperBound(4)) } "print nicely with mkString" should { diff --git a/scalding-avro/src/main/scala/com/twitter/scalding/avro/AvroSource.scala b/scalding-avro/src/main/scala/com/twitter/scalding/avro/AvroSource.scala index 366f11869f..708436e54f 100644 --- a/scalding-avro/src/main/scala/com/twitter/scalding/avro/AvroSource.scala +++ b/scalding-avro/src/main/scala/com/twitter/scalding/avro/AvroSource.scala @@ -17,7 +17,7 @@ package com.twitter.scalding.avro import cascading.avro.AvroScheme import cascading.avro.PackedAvroScheme -import cascading.avro.local.{ AvroScheme => LAvroScheme, PackedAvroScheme => LPackedAvroScheme } +import cascading.avro.local.{AvroScheme => LAvroScheme, PackedAvroScheme => LPackedAvroScheme} import com.twitter.scalding._ import org.apache.avro.Schema import cascading.scheme.Scheme @@ -26,15 +26,17 @@ import java.io.OutputStream import java.util.Properties import cascading.tuple.Fields import collection.JavaConverters._ -import org.apache.hadoop.mapred.{ OutputCollector, RecordReader, JobConf } +import org.apache.hadoop.mapred.{JobConf, OutputCollector, RecordReader} trait UnpackedAvroFileScheme extends FileSource { def schema: Option[Schema] // HadoopSchemeInstance gives compile errors in 2.10 for some reason - override def hdfsScheme = (new AvroScheme(schema.getOrElse(null))).asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] + override def hdfsScheme = (new AvroScheme(schema.getOrElse(null))) + .asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] - override def localScheme = (new LAvroScheme(schema.getOrElse(null))).asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] + override def localScheme = (new LAvroScheme(schema.getOrElse(null))) + .asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] } @@ -42,9 +44,11 @@ trait PackedAvroFileScheme[T] extends FileSource { def schema: Schema // HadoopSchemeInstance gives compile errors for this in 2.10 for some reason - override def hdfsScheme = (new PackedAvroScheme[T](schema)).asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] + override def hdfsScheme = (new PackedAvroScheme[T](schema)) + .asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] - override def localScheme = (new LPackedAvroScheme[T](schema)).asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] + override def localScheme = + (new LPackedAvroScheme[T](schema)).asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] } object UnpackedAvroSource { @@ -65,16 +69,20 @@ object UnpackedAvroSource { new UnpackedAvroSource[T](Seq(path), schema) } -case class UnpackedAvroSource[T](paths: Seq[String], schema: Option[Schema])(implicit val conv: TupleConverter[T], tset: TupleSetter[T]) - - extends FixedPathSource(paths: _*) - with UnpackedAvroFileScheme with Mappable[T] with TypedSink[T] { 
+case class UnpackedAvroSource[T](paths: Seq[String], schema: Option[Schema])(implicit + val conv: TupleConverter[T], + tset: TupleSetter[T] +) extends FixedPathSource(paths: _*) + with UnpackedAvroFileScheme + with Mappable[T] + with TypedSink[T] { override def sinkFields: Fields = { - val outFields = schema.map { - schema => - val schemaFields = schema.getFields - schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => cFields.append(new Fields(sField.name()))) + val outFields = schema.map { schema => + val schemaFields = schema.getFields + schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => + cFields.append(new Fields(sField.name())) + ) } outFields.getOrElse(Dsl.intFields(0 until setter.arity)) } @@ -89,8 +97,15 @@ object PackedAvroSource { def apply[T: AvroSchemaType: Manifest: TupleConverter](path: String) = new PackedAvroSource[T](Seq(path)) } -case class PackedAvroSource[T](paths: Seq[String])(implicit val mf: Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T], avroType: AvroSchemaType[T]) - extends FixedPathSource(paths: _*) with PackedAvroFileScheme[T] with Mappable[T] with TypedSink[T] { +case class PackedAvroSource[T](paths: Seq[String])(implicit + val mf: Manifest[T], + conv: TupleConverter[T], + tset: TupleSetter[T], + avroType: AvroSchemaType[T] +) extends FixedPathSource(paths: _*) + with PackedAvroFileScheme[T] + with Mappable[T] + with TypedSink[T] { override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv) override def setter[U <: T] = TupleSetter.asSubSetter[T, U](tset) diff --git a/scalding-avro/src/main/scala/com/twitter/scalding/avro/SchemaType.scala b/scalding-avro/src/main/scala/com/twitter/scalding/avro/SchemaType.scala index e24e71aa51..05f768305d 100644 --- a/scalding-avro/src/main/scala/com/twitter/scalding/avro/SchemaType.scala +++ b/scalding-avro/src/main/scala/com/twitter/scalding/avro/SchemaType.scala @@ -57,22 +57,29 @@ object AvroSchemaType { } // collections - implicit def CollectionSchema[CC[x] <: Iterable[x], T](implicit sch: AvroSchemaType[T]): AvroSchemaType[CC[T]] = new AvroSchemaType[CC[T]] { + implicit def CollectionSchema[CC[x] <: Iterable[x], T](implicit + sch: AvroSchemaType[T] + ): AvroSchemaType[CC[T]] = new AvroSchemaType[CC[T]] { def schema = Schema.createArray(sch.schema) } - implicit def ArraySchema[CC[x] <: Array[x], T](implicit sch: AvroSchemaType[T]): AvroSchemaType[CC[T]] { val schema: Schema } = new AvroSchemaType[CC[T]] { + implicit def ArraySchema[CC[x] <: Array[x], T](implicit + sch: AvroSchemaType[T] + ): AvroSchemaType[CC[T]] { val schema: Schema } = new AvroSchemaType[CC[T]] { val schema = Schema.createArray(sch.schema) } //maps - implicit def MapSchema[CC[String, x] <: Map[String, x], T](implicit sch: AvroSchemaType[T]): AvroSchemaType[CC[String, T]] = new AvroSchemaType[CC[String, T]] { + implicit def MapSchema[CC[String, x] <: Map[String, x], T](implicit + sch: AvroSchemaType[T] + ): AvroSchemaType[CC[String, T]] = new AvroSchemaType[CC[String, T]] { def schema = Schema.createMap(sch.schema) } // Avro SpecificRecord - implicit def SpecificRecordSchema[T <: SpecificRecord](implicit mf: Manifest[T]): AvroSchemaType[T] = new AvroSchemaType[T] { - def schema = mf.runtimeClass.newInstance.asInstanceOf[SpecificRecord].getSchema - } + implicit def SpecificRecordSchema[T <: SpecificRecord](implicit mf: Manifest[T]): AvroSchemaType[T] = + new AvroSchemaType[T] { + def schema = mf.runtimeClass.newInstance.asInstanceOf[SpecificRecord].getSchema + } -} \ No newline at end of file +} diff 
--git a/scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala b/scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala index c5bb868103..f88472575c 100644 --- a/scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala +++ b/scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala @@ -21,27 +21,33 @@ import collection.JavaConverters._ import cascading.tuple.Fields package object avro { - def writePackedAvro[T](pipe: TypedPipe[T], path: String)(implicit mf: Manifest[T], - st: AvroSchemaType[T], - conv: TupleConverter[T], - set: TupleSetter[T], - flow: FlowDef, - mode: Mode): Unit = { + def writePackedAvro[T](pipe: TypedPipe[T], path: String)(implicit + mf: Manifest[T], + st: AvroSchemaType[T], + conv: TupleConverter[T], + set: TupleSetter[T], + flow: FlowDef, + mode: Mode + ): Unit = { val sink = PackedAvroSource[T](path) pipe.write(sink) } - def writeUnpackedAvro[T <: Product](pipe: TypedPipe[T], path: String, schema: Schema)(implicit mf: Manifest[T], - conv: TupleConverter[T], - set: TupleSetter[T], - flow: FlowDef, - mode: Mode): Unit = { + def writeUnpackedAvro[T <: Product](pipe: TypedPipe[T], path: String, schema: Schema)(implicit + mf: Manifest[T], + conv: TupleConverter[T], + set: TupleSetter[T], + flow: FlowDef, + mode: Mode + ): Unit = { import Dsl._ val sink = UnpackedAvroSource[T](path, Some(schema)) val outFields = { val schemaFields = schema.getFields - schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => cFields.append(new Fields(sField.name()))) + schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => + cFields.append(new Fields(sField.name())) + ) } pipe.toPipe(outFields).write(sink) } -} \ No newline at end of file +} diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamBackend.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamBackend.scala index cb6d359367..0d4c05d590 100644 --- a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamBackend.scala +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamBackend.scala @@ -7,16 +7,12 @@ import com.twitter.scalding.Config import com.twitter.scalding.beam_backend.BeamOp.CoGroupedOp import com.twitter.scalding.serialization.KryoHadoop import com.twitter.scalding.typed._ -import com.twitter.scalding.typed.functions.{ - FilterKeysToFilter, - FlatMapValuesToFlatMap, - MapValuesToMap -} +import com.twitter.scalding.typed.functions.{FilterKeysToFilter, FlatMapValuesToFlatMap, MapValuesToMap} object BeamPlanner { def plan( - config: Config, - srcs: Resolver[TypedSource, BeamSource] + config: Config, + srcs: Resolver[TypedSource, BeamSource] ): FunctionK[TypedPipe, BeamOp] = { implicit val kryoCoder: KryoCoder = new KryoCoder(defaultKryoCoderConfiguration(config)) Memoize.functionK(f = new Memoize.RecursiveK[TypedPipe, BeamOp] { @@ -113,23 +109,21 @@ object BeamPlanner { }) } - def defaultKryoCoderConfiguration(config: Config): KryoInstantiator = { + def defaultKryoCoderConfiguration(config: Config): KryoInstantiator = config.getKryo match { case Some(kryoInstantiator) => kryoInstantiator - case None => new KryoHadoop(new ScalaMapConfig(Map.empty)) + case None => new KryoHadoop(new ScalaMapConfig(Map.empty)) } - } def defaultOptimizationRules(config: Config): Seq[Rule[TypedPipe]] = { def std(forceHash: Rule[TypedPipe]) = - (OptimizationRules.standardMapReduceRules ::: + OptimizationRules.standardMapReduceRules ::: List( OptimizationRules.FilterLocally, // after filtering, we may 
have filtered to nothing, lets see OptimizationRules.simplifyEmpty, // add any explicit forces to the optimized graph - Rule.orElse(List( - forceHash, - OptimizationRules.RemoveDuplicateForceFork)))) + Rule.orElse(List(forceHash, OptimizationRules.RemoveDuplicateForceFork)) + ) config.getOptimizationPhases match { case Some(tryPhases) => tryPhases.get.phases @@ -141,4 +135,3 @@ object BeamPlanner { } } } - diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamFunctions.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamFunctions.scala index ce651929e9..76f06fecc7 100644 --- a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamFunctions.scala +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamFunctions.scala @@ -28,13 +28,13 @@ object BeamFunctions { } case class MapSideAggregator[K, V]( - size: Int, semigroup: Semigroup[V] + size: Int, + semigroup: Semigroup[V] ) extends DoFn[(K, V), (K, V)] { var cache: SummingCache[K, V] = _ @StartBundle - def startBundle(): Unit = { + def startBundle(): Unit = cache = new SummingCache[K, V](size)(semigroup) - } @ProcessElement def processElement(c: DoFn[(K, V), (K, V)]#ProcessContext): Unit = { @@ -64,8 +64,8 @@ object BeamFunctions { } case class HashJoinFn[K, V, U, W]( - joiner: (K, V, Iterable[U]) => Iterator[W], - sideInput: PCollectionView[java.util.Map[K, java.lang.Iterable[U]]] + joiner: (K, V, Iterable[U]) => Iterator[W], + sideInput: PCollectionView[java.util.Map[K, java.lang.Iterable[U]]] ) extends DoFn[(K, V), (K, W)] { private[this] var mapRight: java.util.Map[K, java.lang.Iterable[U]] = null private[this] val emptyUs: Iterable[U] = Seq.empty[U] @@ -78,7 +78,7 @@ object BeamFunctions { val key = c.element()._1 val value = c.element()._2 val it = mapRight.get(key) match { - case null => joiner(key, value, emptyUs) + case null => joiner(key, value, emptyUs) case notEmpty => joiner(key, value, notEmpty.asScala) } while (it.hasNext) { @@ -87,9 +87,8 @@ object BeamFunctions { } @FinishBundle - def finishBundle(c: DoFn[(K, V), (K, W)]#FinishBundleContext): Unit = { + def finishBundle(c: DoFn[(K, V), (K, W)]#FinishBundleContext): Unit = mapRight = null - } } def widenPCollection[A, B >: A](p: PCollection[_ <: A]): PCollection[B] = p.asInstanceOf[PCollection[B]] diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamMode.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamMode.scala index 424cb29da6..b567d0f72a 100644 --- a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamMode.scala +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamMode.scala @@ -1,17 +1,18 @@ package com.twitter.scalding.beam_backend import com.twitter.scalding.Execution.Writer -import com.twitter.scalding.typed.{ Resolver, TypedSink, TypedSource } -import com.twitter.scalding.{ Config, Mode, TextLine } +import com.twitter.scalding.typed.{Resolver, TypedSink, TypedSource} +import com.twitter.scalding.{Config, Mode, TextLine} import org.apache.beam.sdk.Pipeline import org.apache.beam.sdk.io.TextIO import org.apache.beam.sdk.options.PipelineOptions import org.apache.beam.sdk.values.PCollection case class BeamMode( - pipelineOptions: PipelineOptions, - sources: Resolver[TypedSource, BeamSource], - sink: Resolver[TypedSink, BeamSink]) extends Mode { + pipelineOptions: PipelineOptions, + sources: Resolver[TypedSource, BeamSource], + sink: Resolver[TypedSink, BeamSink] +) extends Mode { def newWriter(): Writer = 
new BeamWriter(this) } @@ -29,26 +30,22 @@ trait BeamSource[+A] extends Serializable { object BeamSource extends Serializable { val Default: Resolver[TypedSource, BeamSource] = { new Resolver[TypedSource, BeamSource] { - def apply[A](source: TypedSource[A]): Option[BeamSource[A]] = { + def apply[A](source: TypedSource[A]): Option[BeamSource[A]] = source match { case tl: TextLine => tl.localPaths match { case path :: Nil => Some(textLine(path)) - case _ => throw new Exception("Can not accept multiple paths to BeamSource") + case _ => throw new Exception("Can not accept multiple paths to BeamSource") } case _ => None } - } } } def textLine(path: String): BeamSource[String] = new BeamSource[String] { - override def read( - pipeline: Pipeline, - config: Config): PCollection[_ <: String] = { + override def read(pipeline: Pipeline, config: Config): PCollection[_ <: String] = pipeline.apply(TextIO.read().from(path)) - } } } @@ -59,26 +56,21 @@ trait BeamSink[-A] extends Serializable { object BeamSink extends Serializable { val Default: Resolver[TypedSink, BeamSink] = { new Resolver[TypedSink, BeamSink] { - def apply[A](sink: TypedSink[A]): Option[BeamSink[A]] = { + def apply[A](sink: TypedSink[A]): Option[BeamSink[A]] = sink match { case tl: TextLine => tl.localPaths match { case path :: Nil => Some(textLine(path).asInstanceOf[BeamSink[A]]) - case _ => throw new Exception("Can not accept multiple paths to BeamSink") + case _ => throw new Exception("Can not accept multiple paths to BeamSink") } case _ => None } - } } } def textLine(path: String): BeamSink[String] = new BeamSink[String] { - override def write( - pipeline: Pipeline, - config: Config, - pc: PCollection[_ <: String]): Unit = { + override def write(pipeline: Pipeline, config: Config, pc: PCollection[_ <: String]): Unit = pc.asInstanceOf[PCollection[String]].apply(TextIO.write().to(path)) - } } } diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamOp.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamOp.scala index 207cd560d4..b060eef840 100644 --- a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamOp.scala +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamOp.scala @@ -42,7 +42,7 @@ sealed abstract class BeamOp[+A] { applyPTransform(Filter.by[A, ProcessFunction[A, java.lang.Boolean]](ProcessPredicate(f))) def applyPTransform[C >: A, B]( - f: PTransform[PCollection[C], PCollection[B]] + f: PTransform[PCollection[C], PCollection[B]] )(implicit kryoCoder: KryoCoder): BeamOp[B] = TransformBeamOp(this, f, kryoCoder) @@ -54,57 +54,56 @@ object BeamOp extends Serializable { implicit private def fakeClassTag[A]: ClassTag[A] = ClassTag(classOf[AnyRef]).asInstanceOf[ClassTag[A]] def planMapGroup[K, V, U]( - pcoll: PCollection[KV[K, java.lang.Iterable[V]]], - reduceFn: (K, Iterator[V]) => Iterator[U] - )(implicit ordK: Ordering[K], kryoCoder: KryoCoder): PCollection[KV[K, java.lang.Iterable[U]]] = { + pcoll: PCollection[KV[K, java.lang.Iterable[V]]], + reduceFn: (K, Iterator[V]) => Iterator[U] + )(implicit ordK: Ordering[K], kryoCoder: KryoCoder): PCollection[KV[K, java.lang.Iterable[U]]] = reduceFn match { case ComposedMapGroup(f, g) => planMapGroup(planMapGroup(pcoll, f), g) case EmptyGuard(MapValueStream(sa: SumAll[V])) => pcoll - .apply(Combine.groupedValues( - new SerializableBiFunction[V, V, V] { - override def apply(t: V, u: V): V = sa.sg.plus (t, u) - })).setCoder(KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), kryoCoder)) + 
.apply(Combine.groupedValues(new SerializableBiFunction[V, V, V] { + override def apply(t: V, u: V): V = sa.sg.plus(t, u) + })) + .setCoder(KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), kryoCoder)) .apply(MapElements.via(new SimpleFunction[KV[K, V], KV[K, java.lang.Iterable[U]]]() { override def apply(input: KV[K, V]): KV[K, lang.Iterable[U]] = KV.of(input.getKey, Seq(input.getValue.asInstanceOf[U]).toIterable.asJava) - })).setCoder( - KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), IterableCoder.of(kryoCoder)) - ) + })) + .setCoder( + KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), IterableCoder.of(kryoCoder)) + ) case notComposedOrSum => - pcoll.apply(ParDo.of( - MapFn[KV[K, java.lang.Iterable[V]], KV[K, java.lang.Iterable[U]]] { elem => + pcoll + .apply(ParDo.of(MapFn[KV[K, java.lang.Iterable[V]], KV[K, java.lang.Iterable[U]]] { elem => KV.of( elem.getKey, - notComposedOrSum(elem.getKey, elem.getValue.asScala.toIterator).toIterable.asJava) - })).setCoder( - KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), IterableCoder.of(kryoCoder))) + notComposedOrSum(elem.getKey, elem.getValue.asScala.toIterator).toIterable.asJava + ) + })) + .setCoder(KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), IterableCoder.of(kryoCoder))) } - } - final case class Source[A]( - conf: Config, - original: TypedSource[A], - input: Option[BeamSource[A]]) extends BeamOp[A] { + final case class Source[A](conf: Config, original: TypedSource[A], input: Option[BeamSource[A]]) + extends BeamOp[A] { def run(pipeline: Pipeline): PCollection[_ <: A] = input match { - case None => throw new IllegalArgumentException( - s"source $original was not connected to a beam source" - ) + case None => + throw new IllegalArgumentException( + s"source $original was not connected to a beam source" + ) case Some(src) => src.read(pipeline, conf) } } final case class FromIterable[A](iterable: Iterable[A], kryoCoder: KryoCoder) extends BeamOp[A] { - override def run(pipeline: Pipeline): PCollection[_ <: A] = { + override def run(pipeline: Pipeline): PCollection[_ <: A] = pipeline.apply(Create.of(iterable.asJava).withCoder(kryoCoder)) - } } final case class TransformBeamOp[A, B]( - source: BeamOp[A], - f: PTransform[PCollection[A], PCollection[B]], - kryoCoder: KryoCoder + source: BeamOp[A], + f: PTransform[PCollection[A], PCollection[B]], + kryoCoder: KryoCoder ) extends BeamOp[B] { def run(pipeline: Pipeline): PCollection[B] = { val pCollection: PCollection[A] = widenPCollection(source.run(pipeline)) @@ -113,9 +112,11 @@ object BeamOp extends Serializable { } final case class HashJoinOp[K, V, U, W]( - left: BeamOp[(K, V)], - right: BeamOp[(K, U)], joiner: (K, V, Iterable[U]) => Iterator[W] - )(implicit kryoCoder: KryoCoder, ordK: Ordering[K]) extends BeamOp[(K, W)] { + left: BeamOp[(K, V)], + right: BeamOp[(K, U)], + joiner: (K, V, Iterable[U]) => Iterator[W] + )(implicit kryoCoder: KryoCoder, ordK: Ordering[K]) + extends BeamOp[(K, W)] { override def run(pipeline: Pipeline): PCollection[_ <: (K, W)] = { val leftPCollection = left.run(pipeline) val keyCoder: Coder[K] = OrderedSerializationCoder.apply(ordK, kryoCoder) @@ -123,7 +124,8 @@ object BeamOp extends Serializable { val rightPCollectionView = rightPCollection .apply(TupleToKV[K, U](keyCoder, kryoCoder)) - .apply(GroupByKey.create[K, U]()).setCoder(KvCoder.of(keyCoder, kryoCoder)) + .apply(GroupByKey.create[K, U]()) + .setCoder(KvCoder.of(keyCoder, kryoCoder)) .apply(View.asMap[K, java.lang.Iterable[U]]()) leftPCollection @@ -137,9 +139,10 @@ object 
BeamOp extends Serializable { } final case class CoGroupedOp[K, V]( - cg: CoGrouped[K, V], - inputOps: Seq[BeamOp[(K, Any)]] - )(implicit kryoCoder: KryoCoder) extends BeamOp[(K, V)] { + cg: CoGrouped[K, V], + inputOps: Seq[BeamOp[(K, Any)]] + )(implicit kryoCoder: KryoCoder) + extends BeamOp[(K, V)] { override def run(pipeline: Pipeline): PCollection[_ <: (K, V)] = { val inputOpsSize = inputOps.size val keyCoder: Coder[K] = OrderedSerializationCoder.apply(cg.keyOrdering, kryoCoder) @@ -166,7 +169,8 @@ object BeamOp extends Serializable { )((keyed, colWithTag) => keyed.and[Any](colWithTag._2, colWithTag._1)) keyedPCollectionTuple - .apply(CoGroupByKey.create()).setCoder(KvCoder.of(keyCoder, coGroupResultCoder)) + .apply(CoGroupByKey.create()) + .setCoder(KvCoder.of(keyCoder, coGroupResultCoder)) .apply(ParDo.of(new CoGroupDoFn[K, V](cg, tupleTags))) .setCoder(KvCoder.of(keyCoder, kryoCoder)) .apply(KVToTuple[K, V](keyCoder, kryoCoder)) @@ -174,8 +178,8 @@ object BeamOp extends Serializable { } case class CoGroupDoFn[K, V]( - coGrouped: CoGrouped[K, V], - tags: Seq[TupleTag[Any]] + coGrouped: CoGrouped[K, V], + tags: Seq[TupleTag[Any]] ) extends DoFn[KV[K, CoGbkResult], KV[K, V]] { @ProcessElement def processElement(c: DoFn[KV[K, CoGbkResult], KV[K, V]]#ProcessContext): Unit = { @@ -189,7 +193,7 @@ object BeamOp extends Serializable { iteratorSeq.drop(1).map(_.asScala) ) - while(outputIter.hasNext) { + while (outputIter.hasNext) { c.output(KV.of(key, outputIter.next())) } } @@ -197,8 +201,8 @@ object BeamOp extends Serializable { implicit class KVOp[K, V](val op: BeamOp[(K, V)]) extends AnyVal { def mapGroup[U]( - reduceFn: (K, Iterator[V]) => Iterator[U] - )(implicit ordK: Ordering[K], kryoCoder: KryoCoder): BeamOp[(K, U)] = { + reduceFn: (K, Iterator[V]) => Iterator[U] + )(implicit ordK: Ordering[K], kryoCoder: KryoCoder): BeamOp[(K, U)] = TransformBeamOp[(K, V), (K, U)]( op, new PTransform[PCollection[(K, V)], PCollection[(K, U)]]() { @@ -211,18 +215,19 @@ object BeamOp extends Serializable { .setCoder(KvCoder.of(keyCoder, IterableCoder.of(kryoCoder))) planMapGroup[K, V, U](groupedValues, reduceFn) - .apply(ParDo.of(FlatMapFn[KV[K, java.lang.Iterable[U]], KV[K, U]]{ elem => + .apply(ParDo.of(FlatMapFn[KV[K, java.lang.Iterable[U]], KV[K, U]] { elem => elem.getValue.asScala.map(KV.of(elem.getKey, _)) - })).setCoder(KvCoder.of(keyCoder, kryoCoder)) + })) + .setCoder(KvCoder.of(keyCoder, kryoCoder)) .apply(KVToTuple[K, U](keyCoder, kryoCoder)) } }, - kryoCoder) - } + kryoCoder + ) def sortedMapGroup[U]( - reduceFn: (K, Iterator[V]) => Iterator[U] - )(implicit ordK: Ordering[K], ordV: Ordering[V], kryoCoder: KryoCoder): BeamOp[(K, U)] = { + reduceFn: (K, Iterator[V]) => Iterator[U] + )(implicit ordK: Ordering[K], ordV: Ordering[V], kryoCoder: KryoCoder): BeamOp[(K, U)] = TransformBeamOp[(K, V), (K, U)]( op, new PTransform[PCollection[(K, V)], PCollection[(K, U)]]() { @@ -237,16 +242,17 @@ object BeamOp extends Serializable { .apply(SortGroupedValues[K, V]) planMapGroup[K, V, U](groupedSortedValues, reduceFn) - .apply(ParDo.of(FlatMapFn[KV[K, java.lang.Iterable[U]], KV[K, U]]{ elem => + .apply(ParDo.of(FlatMapFn[KV[K, java.lang.Iterable[U]], KV[K, U]] { elem => elem.getValue.asScala.map(KV.of(elem.getKey, _)) - })).setCoder(KvCoder.of(keyCoder, kryoCoder)) + })) + .setCoder(KvCoder.of(keyCoder, kryoCoder)) .apply(KVToTuple[K, U](keyCoder, kryoCoder)) } }, - kryoCoder) - } + kryoCoder + ) - def sorted(implicit ordK: Ordering[K], ordV: Ordering[V], kryoCoder: KryoCoder): BeamOp[(K, V)] = { + def 
sorted(implicit ordK: Ordering[K], ordV: Ordering[V], kryoCoder: KryoCoder): BeamOp[(K, V)] = TransformBeamOp[(K, V), (K, V)]( op, new PTransform[PCollection[(K, V)], PCollection[(K, V)]]() { @@ -265,73 +271,77 @@ object BeamOp extends Serializable { .apply(KVToTuple[K, V](keyCoder, valueCoder)) } }, - kryoCoder) - } + kryoCoder + ) def mapSideAggregator( - size: Int, semigroup: Semigroup[V] - )(implicit kryoCoder: KryoCoder): BeamOp[(K, V)] = { + size: Int, + semigroup: Semigroup[V] + )(implicit kryoCoder: KryoCoder): BeamOp[(K, V)] = TransformBeamOp[(K, V), (K, V)]( op, new PTransform[PCollection[(K, V)], PCollection[(K, V)]]() { override def expand(input: PCollection[(K, V)]): PCollection[(K, V)] = input.apply(ParDo.of(MapSideAggregator[K, V](size, semigroup))).setCoder(kryoCoder) }, - kryoCoder) - } + kryoCoder + ) def hashJoin[U, W]( - right: BeamOp[(K, U)], - fn: (K, V, Iterable[U]) => Iterator[W] - )(implicit kryoCoder: KryoCoder, ord: Ordering[K]): BeamOp[(K, W)] = { + right: BeamOp[(K, U)], + fn: (K, V, Iterable[U]) => Iterator[W] + )(implicit kryoCoder: KryoCoder, ord: Ordering[K]): BeamOp[(K, W)] = HashJoinOp(op, right, fn) - } } /** - * @todo this needs to be changed to some external sorter, current Beam external sorter - * implementation does not provide an option to sort with custom Ordering - * @see [[org.apache.beam.sdk.extensions.sorter.ExternalSorter]] + * @todo + * this needs to be changed to some external sorter, current Beam external sorter implementation does not + * provide an option to sort with custom Ordering + * @see + * [[org.apache.beam.sdk.extensions.sorter.ExternalSorter]] */ - case class SortGroupedValues[K, V]( - implicit ordK: Ordering[K], ordV: Ordering[V], kryoCoder: KryoCoder + case class SortGroupedValues[K, V](implicit + ordK: Ordering[K], + ordV: Ordering[V], + kryoCoder: KryoCoder ) extends PTransform[PCollection[KV[K, java.lang.Iterable[V]]], PCollection[KV[K, java.lang.Iterable[V]]]] { override def expand( - input: PCollection[KV[K, lang.Iterable[V]]] - ): PCollection[KV[K, lang.Iterable[V]]] = { + input: PCollection[KV[K, lang.Iterable[V]]] + ): PCollection[KV[K, lang.Iterable[V]]] = input - .apply(ParDo.of(MapFn[KV[K, java.lang.Iterable[V]], KV[K, java.lang.Iterable[V]]]{ elem => + .apply(ParDo.of(MapFn[KV[K, java.lang.Iterable[V]], KV[K, java.lang.Iterable[V]]] { elem => KV.of(elem.getKey, elem.getValue.asScala.toArray.sorted.toIterable.asJava) })) - .setCoder(KvCoder.of( - OrderedSerializationCoder(ordK, kryoCoder), - IterableCoder.of(OrderedSerializationCoder(ordV, kryoCoder))) + .setCoder( + KvCoder.of( + OrderedSerializationCoder(ordK, kryoCoder), + IterableCoder.of(OrderedSerializationCoder(ordV, kryoCoder)) + ) ) - } } case class TupleToKV[K, V]( - kCoder: Coder[K], - vCoder: Coder[V] + kCoder: Coder[K], + vCoder: Coder[V] ) extends PTransform[PCollection[(K, V)], PCollection[KV[K, V]]] { - override def expand(input: PCollection[(K, V)]): PCollection[KV[K, V]] = { + override def expand(input: PCollection[(K, V)]): PCollection[KV[K, V]] = input .apply(MapElements.via[(K, V), KV[K, V]](new SimpleFunction[(K, V), KV[K, V]]() { override def apply(input: (K, V)): KV[K, V] = KV.of(input._1, input._2) - })).setCoder(KvCoder.of(kCoder, vCoder)) - } + })) + .setCoder(KvCoder.of(kCoder, vCoder)) } case class KVToTuple[K, V]( - coderK: Coder[K], - coderV: Coder[V] + coderK: Coder[K], + coderV: Coder[V] ) extends PTransform[PCollection[KV[K, V]], PCollection[(K, V)]] { - override def expand(input: PCollection[KV[K, V]]): PCollection[(K, V)] = { + 
override def expand(input: PCollection[KV[K, V]]): PCollection[(K, V)] = input .apply(MapElements.via[KV[K, V], (K, V)](new SimpleFunction[KV[K, V], (K, V)]() { override def apply(input: KV[K, V]): (K, V) = (input.getKey, input.getValue) - })).setCoder(TupleCoder(coderK, coderV)) - } + })) + .setCoder(TupleCoder(coderK, coderV)) } } - diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamWriter.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamWriter.scala index cb379e8908..6130a71a2d 100644 --- a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamWriter.scala +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamWriter.scala @@ -1,13 +1,13 @@ package com.twitter.scalding.beam_backend import com.stripe.dagon.Rule -import com.twitter.scalding.Execution.{ ToWrite, Writer } +import com.twitter.scalding.Execution.{ToWrite, Writer} import com.twitter.scalding.typed._ -import com.twitter.scalding.{ CFuture, CancellationHandler, Config, Execution, ExecutionCounters } +import com.twitter.scalding.{CFuture, CancellationHandler, Config, Execution, ExecutionCounters} import java.util.concurrent.atomic.AtomicLong import org.apache.beam.sdk.Pipeline import scala.annotation.tailrec -import scala.concurrent.{ ExecutionContext, Future } +import scala.concurrent.{ExecutionContext, Future} class BeamWriter(val beamMode: BeamMode) extends Writer { private val state = new AtomicLong() @@ -16,13 +16,17 @@ class BeamWriter(val beamMode: BeamMode) extends Writer { override def finished(): Unit = () - def getForced[T](conf: Config, initial: TypedPipe[T])(implicit cec: ExecutionContext): Future[TypedPipe[T]] = ??? + def getForced[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ExecutionContext + ): Future[TypedPipe[T]] = ??? - def getIterable[T](conf: Config, initial: TypedPipe[T])(implicit cec: ExecutionContext): Future[Iterable[T]] = ??? + def getIterable[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ExecutionContext + ): Future[Iterable[T]] = ??? - override def execute( - conf: Config, - writes: List[ToWrite[_]])(implicit cec: ExecutionContext): CFuture[(Long, ExecutionCounters)] = { + override def execute(conf: Config, writes: List[ToWrite[_]])(implicit + cec: ExecutionContext + ): CFuture[(Long, ExecutionCounters)] = { import Execution.ToWrite._ val planner = BeamPlanner.plan(conf, beamMode.sources) val phases: Seq[Rule[TypedPipe]] = BeamPlanner.defaultOptimizationRules(conf) @@ -30,7 +34,7 @@ class BeamWriter(val beamMode: BeamMode) extends Writer { val pipeline = Pipeline.create(beamMode.pipelineOptions) @tailrec - def rec(optimizedWrites: List[OptimizedWrite[TypedPipe, _]]): Unit = { + def rec(optimizedWrites: List[OptimizedWrite[TypedPipe, _]]): Unit = optimizedWrites match { case Nil => () case x :: xs => @@ -48,17 +52,17 @@ class BeamWriter(val beamMode: BeamMode) extends Writer { case _ => ??? 
} } - } rec(optimizedWrites) val result = pipeline.run val runId = state.getAndIncrement() CFuture( - Future{ + Future { result.waitUntilFinish() (runId, ExecutionCounters.empty) }, - CancellationHandler.fromFn{ ec => - Future{ result.cancel(); () }(ec) - }) + CancellationHandler.fromFn { ec => + Future { result.cancel(); () }(ec) + } + ) } } diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/KryoCoder.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/KryoCoder.scala index 1ed513b85b..527376d3b4 100644 --- a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/KryoCoder.scala +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/KryoCoder.scala @@ -1,17 +1,16 @@ package com.twitter.scalding.beam_backend import com.esotericsoftware.kryo.io.Input -import com.twitter.chill.{ KryoInstantiator, KryoPool } -import com.twitter.scalding.serialization.JavaStreamEnrichments.{ RichInputStream, RichOutputStream } +import com.twitter.chill.{KryoInstantiator, KryoPool} +import com.twitter.scalding.serialization.JavaStreamEnrichments.{RichInputStream, RichOutputStream} import com.twitter.scalding.serialization.OrderedSerialization -import java.io.{ InputStream, OutputStream } -import org.apache.beam.sdk.coders.{ AtomicCoder, Coder } +import java.io.{InputStream, OutputStream} +import org.apache.beam.sdk.coders.{AtomicCoder, Coder} import scala.language.implicitConversions final class KryoCoder(kryoInstantiator: KryoInstantiator) extends AtomicCoder[Any] { - @transient private[this] lazy val kryoPool: KryoPool = KryoPool.withByteArrayOutputStream(Runtime - .getRuntime - .availableProcessors, kryoInstantiator) + @transient private[this] lazy val kryoPool: KryoPool = + KryoPool.withByteArrayOutputStream(Runtime.getRuntime.availableProcessors, kryoInstantiator) override def encode(value: Any, os: OutputStream): Unit = { val bytes = kryoPool.toBytesWithClass(value) @@ -41,7 +40,7 @@ object OrderedSerializationCoder { def apply[T](ord: Ordering[T], fallback: Coder[T]): Coder[T] = ord match { case ordSer: OrderedSerialization[T] @unchecked => OrderedSerializationCoder(ordSer) - case _ => fallback + case _ => fallback } } @@ -51,7 +50,6 @@ case class TupleCoder[K, V](coderK: Coder[K], coderV: Coder[V]) extends AtomicCo coderV.encode(value._2, outStream) } - override def decode(inStream: InputStream): (K, V) = { + override def decode(inStream: InputStream): (K, V) = (coderK.decode(inStream), coderV.decode(inStream)) - } } diff --git a/scalding-beam/src/test/scala/com/twitter/scalding/beam_backend/BeamBackendTests.scala b/scalding-beam/src/test/scala/com/twitter/scalding/beam_backend/BeamBackendTests.scala index 5095c0bfde..18203e7f12 100644 --- a/scalding-beam/src/test/scala/com/twitter/scalding/beam_backend/BeamBackendTests.scala +++ b/scalding-beam/src/test/scala/com/twitter/scalding/beam_backend/BeamBackendTests.scala @@ -23,10 +23,7 @@ class BeamBackendTests extends FunSuite with BeforeAndAfter { } before { - testPath = Paths.get( - System.getProperty("java.io.tmpdir"), - "scalding", - "beam_backend").toString + testPath = Paths.get(System.getProperty("java.io.tmpdir"), "scalding", "beam_backend").toString pipelineOptions = PipelineOptionsFactory.create() } @@ -34,64 +31,63 @@ class BeamBackendTests extends FunSuite with BeforeAndAfter { removeDir(testPath) } - def tmpPath(suffix: String): String = { + def tmpPath(suffix: String): String = Paths.get(testPath, suffix).toString - } - test("map"){ + test("map") { beamMatchesSeq( 
TypedPipe.from(0 to 5).map(_ * 2), Seq(0, 2, 4, 6, 8, 10) ) } - test("flatMap"){ + test("flatMap") { beamMatchesSeq( TypedPipe.from(0 to 3).flatMap(x => 0 to x), Seq(0, 0, 1, 0, 1, 2, 0, 1, 2, 3) ) } - test("mapValues"){ + test("mapValues") { beamMatchesSeq( TypedPipe.from(0 to 3).map(x => (x, x)).mapValues(_ * 2), Seq((0, 0), (1, 2), (2, 4), (3, 6)) ) } - test("flatMapValues"){ + test("flatMapValues") { beamMatchesSeq( TypedPipe.from(0 to 2).map(x => (x, x)).flatMapValues(x => 0 to x), Seq((0, 0), (1, 0), (1, 1), (2, 0), (2, 1), (2, 2)) ) } - test("filter"){ + test("filter") { beamMatchesSeq( TypedPipe.from(0 to 10).filter(x => x % 2 == 0), Seq(0, 2, 4, 6, 8, 10) ) } - test("filterKeys"){ + test("filterKeys") { beamMatchesSeq( TypedPipe.from(0 to 10).map(x => (x, x)).filterKeys(x => x % 2 == 1), Seq((1, 1), (3, 3), (5, 5), (7, 7), (9, 9)) ) } - test("mapGroup"){ + test("mapGroup") { beamMatchesSeq( TypedPipe .from(Seq(5, 3, 2, 0, 1, 4)) .map(x => x.toDouble) .groupAll .aggregate(AveragedValue.aggregator), - Seq(((),2.5)) + Seq(((), 2.5)) ) } - test("sortedMapGroup"){ + test("sortedMapGroup") { beamMatchesSeq( TypedPipe .from(Seq(5, 3, 2, 6, 1, 4)) @@ -102,11 +98,13 @@ class BeamBackendTests extends FunSuite with BeforeAndAfter { ) } - test("priorityQueue operations"){ + test("priorityQueue operations") { + /** - * @note we are not extending support for `sortedTake` and `sortedReverseTake`, since both of them uses - * [[com.twitter.algebird.mutable.PriorityQueueMonoid.plus]] which mutates input value in pipeline - * and Beam does not allow mutations to input during transformation + * @note + * we are not extending support for `sortedTake` and `sortedReverseTake`, since both of them uses + * [[com.twitter.algebird.mutable.PriorityQueueMonoid.plus]] which mutates input value in pipeline and + * Beam does not allow mutations to input during transformation */ val test = Try { beamMatchesSeq( @@ -121,7 +119,7 @@ class BeamBackendTests extends FunSuite with BeforeAndAfter { assert(test.isFailure) } - test("SumByLocalKeys"){ + test("SumByLocalKeys") { beamMatchesSeq( TypedPipe .from(0 to 5) @@ -135,22 +133,24 @@ class BeamBackendTests extends FunSuite with BeforeAndAfter { ) } - test("HashJoin"){ - beamMatchesSeq({ - val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) - val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) - leftPipe.hashJoin(rightPipe) - }, + test("HashJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.hashJoin(rightPipe) + }, Seq((0, (0, 0)), (0, (0, 3)), (0, (1, 0)), (0, (1, 3))) ) } - test("HashLeftJoin"){ - beamMatchesSeq({ - val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) - val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) - leftPipe.hashLeftJoin(rightPipe) - }, + test("HashLeftJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.hashLeftJoin(rightPipe) + }, Seq( (0, (0, Some(0))), (0, (0, Some(3))), @@ -162,84 +162,96 @@ class BeamBackendTests extends FunSuite with BeforeAndAfter { ) } - test("InnerJoin"){ - beamMatchesSeq({ - val leftPipe: TypedPipe[(Int, Int)] = 
TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) - val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) - leftPipe.join(rightPipe) - }, + test("InnerJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.join(rightPipe) + }, Seq((0, (0, 0)), (0, (0, 3)), (0, (1, 0)), (0, (1, 3))) ) } - test("LeftJoin"){ - beamMatchesSeq({ - val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) - val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) - leftPipe.leftJoin(rightPipe) - }, Seq( - (0, (0, Some(0))), - (0, (0, Some(3))), - (0, (1, Some(0))), - (0, (1, Some(3))), - (1, (1, None)), - (3, (3, None)) - )) - } - - test("RightJoin"){ - beamMatchesSeq({ - val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) - val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) - leftPipe.rightJoin(rightPipe) - }, Seq( - (0, (Some(0), 0)), - (0, (Some(0), 3)), - (0, (Some(1), 0)), - (0, (Some(1), 3)), - (2, (None, 2)), - (2, (None, 3)) - )) - } - - test("OuterJoin"){ - beamMatchesSeq({ - val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) - val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) - leftPipe.outerJoin(rightPipe) - }, Seq( - (0, (Some(0), Some(0))), - (0, (Some(0), Some(3))), - (0, (Some(1), Some(0))), - (0, (Some(1), Some(3))), - (1, (Some(1), None)), - (3, (Some(3), None)), - (2, (None, Some(2))), - (2, (None, Some(3))) - )) - } - - test("CoGroup"){ - beamMatchesSeq({ - val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) - val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) - leftPipe.cogroup(rightPipe)((_, iter1, iter2) => Seq((iter1 ++ iter2).toSeq.sum).toIterator) - }, Seq( - (0, 4), - (1, 1), - (2, 5), - (3, 3) - )) - } - - private def getContents(path: String, prefix: String): List[String] = { - new File(path).listFiles.flatMap(file => { - if(file.getPath.startsWith(prefix)){ - Source.fromFile(file).getLines().flatMap(line => line.split("\\s+").toList) - }else List.empty[String] - }).toList + test("LeftJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.leftJoin(rightPipe) + }, + Seq( + (0, (0, Some(0))), + (0, (0, Some(3))), + (0, (1, Some(0))), + (0, (1, Some(3))), + (1, (1, None)), + (3, (3, None)) + ) + ) } + test("RightJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.rightJoin(rightPipe) + }, + Seq( + (0, (Some(0), 0)), + (0, (Some(0), 3)), + (0, (Some(1), 0)), + (0, (Some(1), 3)), + (2, (None, 2)), + (2, (None, 3)) + ) + ) + } + + test("OuterJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.outerJoin(rightPipe) + }, + Seq( + (0, (Some(0), Some(0))), + (0, (Some(0), 
Some(3))), + (0, (Some(1), Some(0))), + (0, (Some(1), Some(3))), + (1, (Some(1), None)), + (3, (Some(3), None)), + (2, (None, Some(2))), + (2, (None, Some(3))) + ) + ) + } + + test("CoGroup") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.cogroup(rightPipe)((_, iter1, iter2) => Seq((iter1 ++ iter2).toSeq.sum).toIterator) + }, + Seq( + (0, 4), + (1, 1), + (2, 5), + (3, 3) + ) + ) + } + + private def getContents(path: String, prefix: String): List[String] = + new File(path).listFiles.flatMap { file => + if (file.getPath.startsWith(prefix)) { + Source.fromFile(file).getLines().flatMap(line => line.split("\\s+").toList) + } else List.empty[String] + }.toList + private def removeDir(path: String): Unit = { def deleteRecursively(file: File): Unit = { if (file.isDirectory) file.listFiles.foreach(deleteRecursively) @@ -249,4 +261,3 @@ class BeamBackendTests extends FunSuite with BeforeAndAfter { deleteRecursively(new File(path)) } } - diff --git a/scalding-cats/src/main/scala/com/twitter/scalding/hellcats/HellCats.scala b/scalding-cats/src/main/scala/com/twitter/scalding/hellcats/HellCats.scala index a3f901f209..3c1149ab84 100644 --- a/scalding-cats/src/main/scala/com/twitter/scalding/hellcats/HellCats.scala +++ b/scalding-cats/src/main/scala/com/twitter/scalding/hellcats/HellCats.scala @@ -1,11 +1,11 @@ package com.twitter.scalding.hellcats -import cats.{ Functor, FunctorFilter, MonoidK, Semigroupal, StackSafeMonad } -import cats.effect.{ Async, Effect, ExitCase, SyncIO, IO } -import com.twitter.scalding.{ Config, Mode, TypedPipe, Execution } +import cats.{Functor, FunctorFilter, MonoidK, Semigroupal, StackSafeMonad} +import cats.effect.{Async, Effect, ExitCase, IO, SyncIO} +import com.twitter.scalding.{Config, Execution, Mode, TypedPipe} import com.twitter.scalding.typed.CoGroupable -import com.twitter.scalding.typed.functions.{ Identity, MapOptionToFlatMap } -import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext, Promise } +import com.twitter.scalding.typed.functions.{Identity, MapOptionToFlatMap} +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} /** * Instances for cats types when working with Scalding @@ -41,15 +41,15 @@ object HellCats { } /** - * Async[Execution] includes MonadError[Throwable, Execution] and Defer[Execution] - * which together are the most commonly used typeclasses + * Async[Execution] includes MonadError[Throwable, Execution] and Defer[Execution] which together are the + * most commonly used typeclasses */ implicit val asyncExecution: Async[Execution] with StackSafeMonad[Execution] = new AsyncExecution /** - * To use Execution as an Effect, which is to say, we can run it, we need the Config, Mode - * and ExecutionContext to use + * To use Execution as an Effect, which is to say, we can run it, we need the Config, Mode and + * ExecutionContext to use */ def executionEffect(c: Config, m: Mode)(implicit cec: ConcurrentExecutionContext): Effect[Execution] = new ExecutionEffect(c, m) @@ -82,18 +82,20 @@ object HellCats { def asyncF[A](k: (Either[Throwable, A] => Unit) => Execution[Unit]): Execution[A] = delay(Promise[A]()).flatMap { p => - val asyncEx = Execution.withNewCache(Execution.fromFuture { implicit cec: ConcurrentExecutionContext => - Future { - k { - case Right(a) => - p.success(a) - () - case Left(err) => - p.failure(err) - () + 
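The two scaladoc notes above describe how asyncExecution and executionEffect are meant to be used. A minimal sketch of obtaining the Effect instance and applying ordinary cats syntax through it, assuming Config.empty, the in-memory MemoryMode.empty and the global ExecutionContext purely for illustration:

import cats.effect.Effect
import com.twitter.scalding.{Config, Execution}
import com.twitter.scalding.typed.memory_backend.MemoryMode
import com.twitter.scalding.hellcats.HellCats
import scala.concurrent.ExecutionContext.Implicits.global

object EffectSketch {
  implicit val effect: Effect[Execution] = HellCats.executionEffect(Config.empty, MemoryMode.empty)
  // any cats combinator is now available on Execution values
  val program: Execution[Int] = effect.map(Execution.from(21))(_ * 2)
}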
val asyncEx = Execution + .withNewCache(Execution.fromFuture { implicit cec: ConcurrentExecutionContext => + Future { + k { + case Right(a) => + p.success(a) + () + case Left(err) => + p.failure(err) + () + } } - } - }).flatten + }) + .flatten val result = Execution.fromFuture(_ => p.future) @@ -113,7 +115,9 @@ object HellCats { } // Members declared in cats.effect.Bracket - def bracketCase[A, B](acquire: Execution[A])(use: A => Execution[B])(release: (A, ExitCase[Throwable]) => Execution[Unit]): Execution[B] = + def bracketCase[A, B]( + acquire: Execution[A] + )(use: A => Execution[B])(release: (A, ExitCase[Throwable]) => Execution[Unit]): Execution[B] = acquire.flatMap { a => attempt(use(a)).flatMap { case Right(b) => @@ -147,23 +151,27 @@ object HellCats { def raiseError[A](t: Throwable): Execution[A] = Execution.failed(t) - override def recoverWith[A](ea: Execution[A])(fn: PartialFunction[Throwable, Execution[A]]): Execution[A] = + override def recoverWith[A](ea: Execution[A])( + fn: PartialFunction[Throwable, Execution[A]] + ): Execution[A] = ea.recoverWith(fn) def suspend[A](ea: => Execution[A]): Execution[A] = delay(ea).flatten } - class ExecutionEffect(c: Config, m: Mode)(implicit cec: ConcurrentExecutionContext) extends AsyncExecution with Effect[Execution] { - def runAsync[A](ea: Execution[A])(cb: Either[Throwable, A] => IO[Unit]): SyncIO[Unit] = { + class ExecutionEffect(c: Config, m: Mode)(implicit cec: ConcurrentExecutionContext) + extends AsyncExecution + with Effect[Execution] { + def runAsync[A](ea: Execution[A])(cb: Either[Throwable, A] => IO[Unit]): SyncIO[Unit] = SyncIO { - val funit = ea.run(c, m) - .map { a => Right(a) } + val funit = ea + .run(c, m) + .map(a => Right(a)) .recover { case t => Left(t) } - .map { e => cb(e).unsafeRunSync } + .map(e => cb(e).unsafeRunSync) // we can discard this future, since we have started the work () } - } } } diff --git a/scalding-cats/src/test/scala/com/twitter/scalding/hellcats/HellCatsTests.scala b/scalding-cats/src/test/scala/com/twitter/scalding/hellcats/HellCatsTests.scala index 83f71a6942..ac2bf2f861 100644 --- a/scalding-cats/src/test/scala/com/twitter/scalding/hellcats/HellCatsTests.scala +++ b/scalding-cats/src/test/scala/com/twitter/scalding/hellcats/HellCatsTests.scala @@ -1,17 +1,17 @@ package com.twitter.scalding.hellcats -import cats.{ Eq, MonadError } +import cats.{Eq, MonadError} import cats.laws.discipline.SemigroupalTests.Isomorphisms -import cats.effect.{ Effect, IO } +import cats.effect.{Effect, IO} import cats.effect.laws.discipline.EffectTests import com.twitter.scalding.typed.memory_backend.MemoryMode -import com.twitter.scalding.{ Execution, Config } +import com.twitter.scalding.{Config, Execution} import org.scalatest.FunSuite -import org.scalacheck.{ Arbitrary, Gen } +import org.scalacheck.{Arbitrary, Gen} import org.typelevel.discipline.scalatest.Discipline -import scala.concurrent.{ Await, ExecutionContext } +import scala.concurrent.{Await, ExecutionContext} import scala.concurrent.duration._ -import scala.util.{ Failure, Success, Try } +import scala.util.{Failure, Success, Try} import HellCats._ import cats.implicits._ @@ -35,10 +35,7 @@ object ExecutionGen { aOrB <- Gen.oneOf(a, b) } yield aOrB - Gen.frequency( - (4, g0), - (4, genFlatMap), - (1, zip)) // use zip less because it branches + Gen.frequency((4, g0), (4, genFlatMap), (1, zip)) // use zip less because it branches } } def genExecution[A](depth: Int, g: Gen[A]): Gen[Execution[A]] = @@ -59,19 +56,18 @@ object ExecutionGen { (get(l), get(r)) match 
{ case (Success(a), Success(b)) => Eq[A].eqv(a, b) case (Failure(_), Failure(_)) => true - case _ => false + case _ => false } } implicit def eqIO[A: Eq]: Eq[IO[A]] = new Eq[IO[A]] { def eqv(l: IO[A], r: IO[A]) = - (Try(l.unsafeRunTimed(Duration(10, SECONDS))), - Try(r.unsafeRunTimed(Duration(10, SECONDS)))) match { - case (Success(a), Success(b)) => Eq[Option[A]].eqv(a, b) - case (Failure(_), Failure(_)) => true - case _ => false - } + (Try(l.unsafeRunTimed(Duration(10, SECONDS))), Try(r.unsafeRunTimed(Duration(10, SECONDS)))) match { + case (Success(a), Success(b)) => Eq[Option[A]].eqv(a, b) + case (Failure(_), Failure(_)) => true + case _ => false + } } // We consider all failures the same, we don't care about failure order diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/extensions/Checkpoint.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/extensions/Checkpoint.scala index 2dcdecb870..999fe5e3b0 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/extensions/Checkpoint.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/extensions/Checkpoint.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.extensions @@ -22,65 +22,58 @@ import com.twitter.scalding.Dsl._ import cascading.flow.FlowDef import cascading.pipe.Pipe import cascading.tuple.Fields -import org.slf4j.{ Logger, LoggerFactory => LogManager } +import org.slf4j.{Logger, LoggerFactory => LogManager} /** - * Checkpoint provides a simple mechanism to read and write intermediate results - * from a Scalding flow to HDFS. + * Checkpoint provides a simple mechanism to read and write intermediate results from a Scalding flow to HDFS. * - * Checkpoints are useful for debugging one part of a long flow, when you would - * otherwise have to run many steps to get to the one you care about. To enable - * checkpoints, sprinkle calls to Checkpoint() throughout your flow, ideally - * after expensive steps. + * Checkpoints are useful for debugging one part of a long flow, when you would otherwise have to run many + * steps to get to the one you care about. To enable checkpoints, sprinkle calls to Checkpoint() throughout + * your flow, ideally after expensive steps. * - * When checkpoints are enabled, each Checkpoint() looks for a checkpoint file - * on HDFS. If it exists we read results from the file; otherwise we execute - * the flow and write the results to the file. When checkpoints are disabled, - * the flow is always executed and the results are never stored. + * When checkpoints are enabled, each Checkpoint() looks for a checkpoint file on HDFS. If it exists we read + * results from the file; otherwise we execute the flow and write the results to the file. When checkpoints + * are disabled, the flow is always executed and the results are never stored. * - * Each call to Checkpoint() takes the checkpoint name, as well as the types and - * names of the expected fields. A sample invocation might look like this: - * val pipe = Checkpoint[(Long, String, Long)]( - * "clicks", ('tweetId, 'clickUrl, 'clickCount)) { ... } - * where { ... } contains a flow which computes the result. + * Each call to Checkpoint() takes the checkpoint name, as well as the types and names of the expected fields. 
+ * A sample invocation might look like this: val pipe = Checkpoint[(Long, String, Long)]( "clicks", ('tweetId, + 'clickUrl, 'clickCount)) { ... } where { ... } contains a flow which computes the result. * * Most checkpoint parameters are specified via command-line flags: - * --checkpoint.clobber: if true, recompute and overwrite any existing - * checkpoint files. + * --checkpoint.clobber: if true, recompute and overwrite any existing checkpoint files. * --checkpoint.clobber.<name>: override clobber for the given checkpoint. - * --checkpoint.file: specifies a filename prefix to use for checkpoint files. - * If blank, checkpoints are disabled; otherwise the file for checkpoint - * <name> is <file>_<name>. - * --checkpoint.file.<name>: override --checkpoint.file for the given - * checkpoint; specifies the whole filename, not the prefix. - * --checkpoint.format: specifies a file format, either sequencefile or tsv. - * Default is sequencefile for HDFS, tsv for local. + * --checkpoint.file: specifies a filename prefix to use for checkpoint files. If blank, checkpoints are + * disabled; otherwise the file for checkpoint <name> is <file>_<name>. + * --checkpoint.file.<name>: override --checkpoint.file for the given checkpoint; specifies the whole + * filename, not the prefix. + * --checkpoint.format: specifies a file format, either sequencefile or tsv. Default is sequencefile for HDFS, + * tsv for local. * --checkpoint.format.<name>: specifies file format for the given checkpoint. * - * @author Mike Jahr + * @author + * Mike Jahr */ object Checkpoint { private val LOG: Logger = LogManager.getLogger(this.getClass) /** - * Type parameters: - * A: tuple of result types + * Type parameters: A: tuple of result types * - * Parameters: - * checkpointName: name of the checkpoint - * resultFields: tuple of result field names - * flow: a function to run a flow to compute the result + * Parameters: checkpointName: name of the checkpoint resultFields: tuple of result field names flow: a + * function to run a flow to compute the result * - * Implicit parameters: - * args: provided by com.twitter.pluck.job.TwitterJob - * mode: provided by com.twitter.scalding.Job - * flowDef: provided by com.twitter.scalding.Job - * conv: provided by com.twitter.scalding.TupleConversions - * setter: provided by com.twitter.scalding.TupleConversions + * Implicit parameters: args: provided by com.twitter.pluck.job.TwitterJob mode: provided by + * com.twitter.scalding.Job flowDef: provided by com.twitter.scalding.Job conv: provided by + * com.twitter.scalding.TupleConversions setter: provided by com.twitter.scalding.TupleConversions */ - def apply[A](checkpointName: String, resultFields: Fields)(flow: => Pipe)(implicit args: Args, mode: Mode, flowDef: FlowDef, - conv: TupleConverter[A], setter: TupleSetter[A]): Pipe = { + def apply[A](checkpointName: String, resultFields: Fields)(flow: => Pipe)(implicit + args: Args, + mode: Mode, + flowDef: FlowDef, + conv: TupleConverter[A], + setter: TupleSetter[A] + ): Pipe = { conv.assertArityMatches(resultFields) setter.assertArityMatches(resultFields) @@ -92,9 +85,8 @@ object Checkpoint { filename match { case Some(name) if hasInput(checkpointName, name) => // We have checkpoint input; read the file instead of executing the flow.
- LOG.info(s"""Checkpoint "${checkpointName}": reading ${format} input from "${name}"""") - getSource(format, name) - .read + LOG.info(s"""Checkpoint "$checkpointName": reading $format input from "$name"""") + getSource(format, name).read .mapTo(List.range(0, resultFields.size) -> resultFields)((x: A) => x)(conv, setter) // We don't have checkpoint input; execute the flow and project to the // requested fields. @@ -102,7 +94,7 @@ object Checkpoint { val pipe = flow.project(resultFields) // Write the checkpoint output. - LOG.info(s"""Checkpoint "${checkpointName}": writing ${format} output to "${name}"""") + LOG.info(s"""Checkpoint "$checkpointName": writing $format output to "$name"""") pipe.write(getSource(format, name)) case None => flow.project(resultFields) @@ -110,8 +102,13 @@ object Checkpoint { } // Wrapper for Checkpoint when using a TypedPipe - def apply[A](checkpointName: String)(flow: => TypedPipe[A])(implicit args: Args, mode: Mode, flowDef: FlowDef, - conv: TupleConverter[A], setter: TupleSetter[A]): TypedPipe[A] = { + def apply[A](checkpointName: String)(flow: => TypedPipe[A])(implicit + args: Args, + mode: Mode, + flowDef: FlowDef, + conv: TupleConverter[A], + setter: TupleSetter[A] + ): TypedPipe[A] = { val rPipe = apply(checkpointName, Dsl.intFields(0 until conv.arity)) { flow.toPipe(Dsl.intFields(0 until conv.arity)) } @@ -133,7 +130,7 @@ object Checkpoint { } else { baseValue } - def isTrue: Boolean = value.exists { _.toLowerCase != "false" } + def isTrue: Boolean = value.exists(_.toLowerCase != "false") } // Returns the filename to use for the given checkpoint, or None if this @@ -158,22 +155,20 @@ object Checkpoint { private def getFormat(checkpointName: String)(implicit args: Args, mode: Mode): String = { val defaultFormat = mode match { case Hdfs(_, _) | HadoopTest(_, _) => "sequencefile" - case _ => "tsv" + case _ => "tsv" } CheckpointArg(checkpointName, "format").value.getOrElse(defaultFormat).toLowerCase } // Returns a source for the checkpoint in the given format. - private def getSource(format: String, filename: String)(implicit mode: Mode): Source = { + private def getSource(format: String, filename: String)(implicit mode: Mode): Source = format match { case "sequencefile" => SequenceFile(filename) - case "tsv" => Tsv(filename) - case _ => sys.error("Invalid value for --checkpoint.format: " + format) + case "tsv" => Tsv(filename) + case _ => sys.error("Invalid value for --checkpoint.format: " + format) } - } // Returns true if the given checkpoint file exists and should be read. 
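Putting the pieces above together, here is a hedged sketch of the sample invocation from the scaladoc inside a plain Job; the paths, field names and the wrapped flow are hypothetical, and an implicit Args is supplied explicitly since only TwitterJob provides one automatically.

import com.twitter.scalding._
import com.twitter.scalding.commons.extensions.Checkpoint

class ClickCountJob(jobArgs: Args) extends Job(jobArgs) {
  implicit val checkpointArgs: Args = jobArgs // Checkpoint needs an implicit Args in scope

  val clicks = Checkpoint[(Long, String, Long)]("clicks", ('tweetId, 'clickUrl, 'clickCount)) {
    // an intentionally simple "expensive" flow that the checkpoint caches on first run
    Tsv("/logs/clicks.tsv", ('tweetId, 'clickUrl)).read
      .groupBy('tweetId, 'clickUrl) { _.size('clickCount) }
  }

  clicks.write(Tsv("/output/click_counts.tsv"))
}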
- private def hasInput(checkpointName: String, filename: String)(implicit args: Args, mode: Mode): Boolean = { + private def hasInput(checkpointName: String, filename: String)(implicit args: Args, mode: Mode): Boolean = !CheckpointArg(checkpointName, "clobber").isTrue && CascadingMode.cast(mode).fileExists(filename) - } } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/scheme/CombinedSequenceFileScheme.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/scheme/CombinedSequenceFileScheme.scala index a987ca0e11..6036b561d6 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/scheme/CombinedSequenceFileScheme.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/scheme/CombinedSequenceFileScheme.scala @@ -1,16 +1,20 @@ package com.twitter.scalding.commons.scheme import cascading.scheme.Scheme -import com.twitter.elephantbird.cascading2.scheme.{ CombinedSequenceFile, CombinedWritableSequenceFile } -import com.twitter.scalding.{ HadoopSchemeInstance, SequenceFileScheme, WritableSequenceFileScheme } +import com.twitter.elephantbird.cascading2.scheme.{CombinedSequenceFile, CombinedWritableSequenceFile} +import com.twitter.scalding.{HadoopSchemeInstance, SequenceFileScheme, WritableSequenceFileScheme} trait CombinedSequenceFileScheme extends SequenceFileScheme { // TODO Cascading doesn't support local mode yet - override def hdfsScheme = HadoopSchemeInstance(new CombinedSequenceFile(fields).asInstanceOf[Scheme[_, _, _, _, _]]) + override def hdfsScheme = HadoopSchemeInstance( + new CombinedSequenceFile(fields).asInstanceOf[Scheme[_, _, _, _, _]] + ) } trait CombinedWritableSequenceFileScheme extends WritableSequenceFileScheme { // TODO Cascading doesn't support local mode yet override def hdfsScheme = - HadoopSchemeInstance(new CombinedWritableSequenceFile(fields, keyType, valueType).asInstanceOf[Scheme[_, _, _, _, _]]) -} \ No newline at end of file + HadoopSchemeInstance( + new CombinedWritableSequenceFile(fields, keyType, valueType).asInstanceOf[Scheme[_, _, _, _, _]] + ) +} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/BinaryConverters.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/BinaryConverters.scala index 24beaa7e9f..a8b308c9cd 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/BinaryConverters.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/BinaryConverters.scala @@ -12,12 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.commons.source import com.twitter.elephantbird.mapreduce.io.BinaryConverter -import com.twitter.scrooge.{ BinaryThriftStructSerializer, ThriftStructCodec, ThriftStruct } +import com.twitter.scrooge.{BinaryThriftStructSerializer, ThriftStruct, ThriftStructCodec} import scala.reflect.ClassTag import scala.util.Try @@ -33,7 +33,9 @@ case object IdentityBinaryConverter extends BinaryConverter[Array[Byte]] { object ScroogeBinaryConverter { // codec code borrowed from chill's ScroogeThriftStructSerializer class - private[this] def codecForNormal[T <: ThriftStruct](thriftStructClass: Class[T]): Try[ThriftStructCodec[T]] = + private[this] def codecForNormal[T <: ThriftStruct]( + thriftStructClass: Class[T] + ): Try[ThriftStructCodec[T]] = Try(Class.forName(thriftStructClass.getName + "$").getField("MODULE$").get(null)) .map(_.asInstanceOf[ThriftStructCodec[T]]) @@ -53,4 +55,3 @@ object ScroogeBinaryConverter { } } } - diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/DailySources.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/DailySources.scala index 8bdfb6102a..5f7109f125 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/DailySources.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/DailySources.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source @@ -28,69 +28,94 @@ import org.apache.thrift.TBase // Retrieve implicits -abstract class DailySuffixLzoCodec[T](prefix: String, dateRange: DateRange)(implicit @transient suppliedInjection: Injection[T, Array[Byte]]) - extends DailySuffixSource(prefix, dateRange) with LzoCodec[T] { +abstract class DailySuffixLzoCodec[T](prefix: String, dateRange: DateRange)(implicit + @transient suppliedInjection: Injection[T, Array[Byte]] +) extends DailySuffixSource(prefix, dateRange) + with LzoCodec[T] { val boxed = Externalizer(suppliedInjection) override lazy val injection = boxed.get } abstract class DailySuffixLzoProtobuf[T <: Message: Manifest](prefix: String, dateRange: DateRange) - extends DailySuffixSource(prefix, dateRange) with LzoProtobuf[T] { + extends DailySuffixSource(prefix, dateRange) + with LzoProtobuf[T] { override def column = manifest[T].runtimeClass } abstract class DailySuffixMostRecentLzoProtobuf[T <: Message: Manifest](prefix: String, dateRange: DateRange) - extends DailySuffixMostRecentSource(prefix, dateRange) with LzoProtobuf[T] { + extends DailySuffixMostRecentSource(prefix, dateRange) + with LzoProtobuf[T] { override def column = manifest[T].runtimeClass } abstract class DailySuffixLzoThrift[T <: TBase[_, _]: Manifest](prefix: String, dateRange: DateRange) - extends DailySuffixSource(prefix, dateRange) with LzoThrift[T] { + extends DailySuffixSource(prefix, dateRange) + with LzoThrift[T] { override def column = manifest[T].runtimeClass } -abstract class DailyPrefixSuffixLzoThrift[T <: TBase[_, _]: Manifest](prefix: String, suffix: String, dateRange: DateRange) - extends DailyPrefixSuffixSource(prefix, suffix, dateRange) with LzoThrift[T] { +abstract class DailyPrefixSuffixLzoThrift[T <: TBase[_, _]: Manifest]( + prefix: String, + suffix: String, + dateRange: DateRange +) extends DailyPrefixSuffixSource(prefix, suffix, 
dateRange) + with LzoThrift[T] { override def column = manifest[T].runtimeClass } -abstract class TimePathedLongThriftSequenceFile[V <: TBase[_, _]: Manifest](f: Fields, prefix: String, dateFormat: String, dateRange: DateRange) - extends TimePathedSource(prefix + dateFormat + "/*", dateRange, DateOps.UTC) - with WritableSequenceFileScheme - with Serializable - with Mappable[(Long, V)] - with TypedSink[(Long, V)] - with LongThriftTransformer[V] { +abstract class TimePathedLongThriftSequenceFile[V <: TBase[_, _]: Manifest]( + f: Fields, + prefix: String, + dateFormat: String, + dateRange: DateRange +) extends TimePathedSource(prefix + dateFormat + "/*", dateRange, DateOps.UTC) + with WritableSequenceFileScheme + with Serializable + with Mappable[(Long, V)] + with TypedSink[(Long, V)] + with LongThriftTransformer[V] { override val fields = f override def sinkFields = f override val mt = implicitly[Manifest[V]] - override def converter[U >: (Long, V)] = TupleConverter.asSuperConverter[(Long, V), U](TupleConverter.of[(Long, V)]) + override def converter[U >: (Long, V)] = + TupleConverter.asSuperConverter[(Long, V), U](TupleConverter.of[(Long, V)]) override def setter[U <: (Long, V)] = TupleSetter.asSubSetter[(Long, V), U](TupleSetter.of[(Long, V)]) } -abstract class MostRecentGoodLongThriftSequenceFile[V <: TBase[_, _]: Manifest](f: Fields, pattern: String, dateRange: DateRange) - extends MostRecentGoodSource(pattern, dateRange, DateOps.UTC) - with WritableSequenceFileScheme - with Serializable - with Mappable[(Long, V)] - with TypedSink[(Long, V)] - with LongThriftTransformer[V] { +abstract class MostRecentGoodLongThriftSequenceFile[V <: TBase[_, _]: Manifest]( + f: Fields, + pattern: String, + dateRange: DateRange +) extends MostRecentGoodSource(pattern, dateRange, DateOps.UTC) + with WritableSequenceFileScheme + with Serializable + with Mappable[(Long, V)] + with TypedSink[(Long, V)] + with LongThriftTransformer[V] { override val fields = f override def sinkFields = f override val mt = implicitly[Manifest[V]] - override def converter[U >: (Long, V)] = TupleConverter.asSuperConverter[(Long, V), U](TupleConverter.of[(Long, V)]) + override def converter[U >: (Long, V)] = + TupleConverter.asSuperConverter[(Long, V), U](TupleConverter.of[(Long, V)]) override def setter[U <: (Long, V)] = TupleSetter.asSubSetter[(Long, V), U](TupleSetter.of[(Long, V)]) } -abstract class DailySuffixLongThriftSequenceFile[V <: TBase[_, _]: Manifest](f: Fields, prefix: String, dateRange: DateRange) - extends TimePathedLongThriftSequenceFile[V](f, prefix, TimePathedSource.YEAR_MONTH_DAY, dateRange) +abstract class DailySuffixLongThriftSequenceFile[V <: TBase[_, _]: Manifest]( + f: Fields, + prefix: String, + dateRange: DateRange +) extends TimePathedLongThriftSequenceFile[V](f, prefix, TimePathedSource.YEAR_MONTH_DAY, dateRange) -case class DailySuffixLzoTsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends DailySuffixSource(prefix, dateRange) with LzoTsv { +case class DailySuffixLzoTsv(prefix: String, fs: Fields = Fields.ALL)( + override implicit val dateRange: DateRange +) extends DailySuffixSource(prefix, dateRange) + with LzoTsv { override val fields = fs } -case class DailyPrefixSuffixLzoTsv(prefix: String, suffix: String, fs: Fields = Fields.ALL)(implicit override val dateRange: DateRange) - extends DailyPrefixSuffixSource(prefix, suffix, dateRange) with LzoTsv { +case class DailyPrefixSuffixLzoTsv(prefix: String, suffix: String, fs: Fields = Fields.ALL)(implicit + 
override val dateRange: DateRange +) extends DailyPrefixSuffixSource(prefix, suffix, dateRange) + with LzoTsv { override val fields = fs } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala index eaca1d9863..6c032930fc 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source @@ -21,11 +21,13 @@ import com.twitter.scalding._ import org.apache.thrift.TBase abstract class FixedPathLzoThrift[T <: TBase[_, _]: Manifest](path: String*) - extends FixedPathSource(path: _*) with LzoThrift[T] { + extends FixedPathSource(path: _*) + with LzoThrift[T] { def column = manifest[T].runtimeClass } abstract class FixedPathLzoProtobuf[T <: Message: Manifest](path: String) - extends FixedPathSource(path) with LzoProtobuf[T] { + extends FixedPathSource(path) + with LzoProtobuf[T] { def column = manifest[T].runtimeClass } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala index a6e5a18b99..4f1a55c5dd 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala @@ -13,38 +13,72 @@ trait LzoTypedTsv4[A, B, C, D] extends LzoTypedTsv[Tuple4[A, B, C, D]] with Mapp trait LzoTypedTsv5[A, B, C, D, E] extends LzoTypedTsv[Tuple5[A, B, C, D, E]] with Mappable5[A, B, C, D, E] -trait LzoTypedTsv6[A, B, C, D, E, F] extends LzoTypedTsv[Tuple6[A, B, C, D, E, F]] with Mappable6[A, B, C, D, E, F] +trait LzoTypedTsv6[A, B, C, D, E, F] + extends LzoTypedTsv[Tuple6[A, B, C, D, E, F]] + with Mappable6[A, B, C, D, E, F] -trait LzoTypedTsv7[A, B, C, D, E, F, G] extends LzoTypedTsv[Tuple7[A, B, C, D, E, F, G]] with Mappable7[A, B, C, D, E, F, G] +trait LzoTypedTsv7[A, B, C, D, E, F, G] + extends LzoTypedTsv[Tuple7[A, B, C, D, E, F, G]] + with Mappable7[A, B, C, D, E, F, G] -trait LzoTypedTsv8[A, B, C, D, E, F, G, H] extends LzoTypedTsv[Tuple8[A, B, C, D, E, F, G, H]] with Mappable8[A, B, C, D, E, F, G, H] +trait LzoTypedTsv8[A, B, C, D, E, F, G, H] + extends LzoTypedTsv[Tuple8[A, B, C, D, E, F, G, H]] + with Mappable8[A, B, C, D, E, F, G, H] -trait LzoTypedTsv9[A, B, C, D, E, F, G, H, I] extends LzoTypedTsv[Tuple9[A, B, C, D, E, F, G, H, I]] with Mappable9[A, B, C, D, E, F, G, H, I] +trait LzoTypedTsv9[A, B, C, D, E, F, G, H, I] + extends LzoTypedTsv[Tuple9[A, B, C, D, E, F, G, H, I]] + with Mappable9[A, B, C, D, E, F, G, H, I] -trait LzoTypedTsv10[A, B, C, D, E, F, G, H, I, J] extends LzoTypedTsv[Tuple10[A, B, C, D, E, F, G, H, I, J]] with Mappable10[A, B, C, D, E, F, G, H, I, J] +trait LzoTypedTsv10[A, B, C, D, E, F, G, H, I, J] + extends LzoTypedTsv[Tuple10[A, B, C, D, E, F, G, H, I, J]] + with Mappable10[A, B, C, D, E, F, G, H, I, J] -trait LzoTypedTsv11[A, B, C, D, E, F, G, H, I, J, K] extends LzoTypedTsv[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] 
with Mappable11[A, B, C, D, E, F, G, H, I, J, K] +trait LzoTypedTsv11[A, B, C, D, E, F, G, H, I, J, K] + extends LzoTypedTsv[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] + with Mappable11[A, B, C, D, E, F, G, H, I, J, K] -trait LzoTypedTsv12[A, B, C, D, E, F, G, H, I, J, K, L] extends LzoTypedTsv[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] with Mappable12[A, B, C, D, E, F, G, H, I, J, K, L] +trait LzoTypedTsv12[A, B, C, D, E, F, G, H, I, J, K, L] + extends LzoTypedTsv[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] + with Mappable12[A, B, C, D, E, F, G, H, I, J, K, L] -trait LzoTypedTsv13[A, B, C, D, E, F, G, H, I, J, K, L, M] extends LzoTypedTsv[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] with Mappable13[A, B, C, D, E, F, G, H, I, J, K, L, M] +trait LzoTypedTsv13[A, B, C, D, E, F, G, H, I, J, K, L, M] + extends LzoTypedTsv[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] + with Mappable13[A, B, C, D, E, F, G, H, I, J, K, L, M] -trait LzoTypedTsv14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] extends LzoTypedTsv[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] with Mappable14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] +trait LzoTypedTsv14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + extends LzoTypedTsv[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] + with Mappable14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] -trait LzoTypedTsv15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] extends LzoTypedTsv[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] with Mappable15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] +trait LzoTypedTsv15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + extends LzoTypedTsv[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] + with Mappable15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] -trait LzoTypedTsv16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] extends LzoTypedTsv[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] with Mappable16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] +trait LzoTypedTsv16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + extends LzoTypedTsv[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] + with Mappable16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] -trait LzoTypedTsv17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] extends LzoTypedTsv[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] with Mappable17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] +trait LzoTypedTsv17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + extends LzoTypedTsv[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] + with Mappable17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] -trait LzoTypedTsv18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] extends LzoTypedTsv[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] with Mappable18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] +trait LzoTypedTsv18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + extends LzoTypedTsv[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] + with Mappable18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] -trait LzoTypedTsv19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] extends LzoTypedTsv[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] with Mappable19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] +trait LzoTypedTsv19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + extends LzoTypedTsv[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] + with Mappable19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] -trait 
LzoTypedTsv20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] extends LzoTypedTsv[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] with Mappable20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] +trait LzoTypedTsv20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + extends LzoTypedTsv[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] + with Mappable20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] -trait LzoTypedTsv21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] extends LzoTypedTsv[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] with Mappable21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] +trait LzoTypedTsv21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + extends LzoTypedTsv[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] + with Mappable21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] -trait LzoTypedTsv22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] extends LzoTypedTsv[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] with Mappable22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] +trait LzoTypedTsv22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + extends LzoTypedTsv[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] + with Mappable22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] // end of autogenerated diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/HourlySources.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/HourlySources.scala index 568dce0609..55d51d049f 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/HourlySources.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/HourlySources.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
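As a usage note for the daily sources above, the concrete DailySuffixLzoTsv case class needs only a path prefix and an implicit DateRange. A minimal sketch, with a hypothetical path and with UTC plus the default date parser assumed:

import com.twitter.scalding._
import com.twitter.scalding.commons.source.DailySuffixLzoTsv

object DailySourceSketch {
  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val parser: DateParser = DateParser.default
  implicit val range: DateRange = DateRange(RichDate("2021-09-20"), RichDate("2021-09-23"))

  // materialises one dated path per day in the range under the given prefix
  val dailyClicks = DailySuffixLzoTsv("/logs/clicks")
}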
-*/ + */ package com.twitter.scalding.commons.source @@ -24,26 +24,33 @@ import com.twitter.scalding._ import com.twitter.scalding.source._ import org.apache.thrift.TBase -abstract class HourlySuffixLzoCodec[T](prefix: String, dateRange: DateRange)(implicit @transient suppliedInjection: Injection[T, Array[Byte]]) - extends HourlySuffixSource(prefix, dateRange) with LzoCodec[T] { +abstract class HourlySuffixLzoCodec[T](prefix: String, dateRange: DateRange)(implicit + @transient suppliedInjection: Injection[T, Array[Byte]] +) extends HourlySuffixSource(prefix, dateRange) + with LzoCodec[T] { val boxed = Externalizer(suppliedInjection) override lazy val injection = boxed.get } -case class HourlySuffixLzoTsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with LzoTsv { +case class HourlySuffixLzoTsv(prefix: String, fs: Fields = Fields.ALL)( + override implicit val dateRange: DateRange +) extends HourlySuffixSource(prefix, dateRange) + with LzoTsv { override val fields = fs } abstract class HourlySuffixLzoThrift[T <: TBase[_, _]: Manifest](prefix: String, dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with LzoThrift[T] { + extends HourlySuffixSource(prefix, dateRange) + with LzoThrift[T] { override def column = manifest[T].runtimeClass } abstract class HourlySuffixLzoProtobuf[T <: Message: Manifest](prefix: String, dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with LzoProtobuf[T] { + extends HourlySuffixSource(prefix, dateRange) + with LzoProtobuf[T] { override def column = manifest[T].runtimeClass } abstract class HourlySuffixLzoText(prefix: String, dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with LzoText + extends HourlySuffixSource(prefix, dateRange) + with LzoText diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LongThriftTransformer.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LongThriftTransformer.scala index dda71a26a4..91725e3897 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LongThriftTransformer.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LongThriftTransformer.scala @@ -12,16 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source import cascading.pipe.Pipe import cascading.tuple.Fields import com.twitter.elephantbird.mapreduce.io.ThriftWritable -import com.twitter.elephantbird.util.{ ThriftUtils, TypeRef } +import com.twitter.elephantbird.util.{ThriftUtils, TypeRef} import com.twitter.scalding._ -import org.apache.hadoop.io.{ LongWritable, Writable } +import org.apache.hadoop.io.{LongWritable, Writable} import org.apache.thrift.TBase trait LongThriftTransformer[V <: TBase[_, _]] extends Source { @@ -32,18 +32,16 @@ trait LongThriftTransformer[V <: TBase[_, _]] extends Source { // meant to override fields within WritableSequenceFileScheme. 
val keyType = classOf[LongWritable] val valueType = classOf[ThriftWritable[V]].asInstanceOf[Class[Writable]] - override protected def transformForRead(pipe: Pipe): Pipe = { + override protected def transformForRead(pipe: Pipe): Pipe = new RichPipe(pipe).mapTo(fields -> fields) { v: (LongWritable, ThriftWritable[V]) => v._2.setConverter(mt.runtimeClass.asInstanceOf[Class[V]]) (v._1.get, v._2.get) } - } - override protected def transformForWrite(pipe: Pipe) = { + override protected def transformForWrite(pipe: Pipe) = new RichPipe(pipe).mapTo(fields -> fields) { v: (Long, V) => val key = new LongWritable(v._1) val value = new ThriftWritable(v._2, typeRef) (key, value) } - } lazy val typeRef = ThriftUtils.getTypeRef(mt.runtimeClass).asInstanceOf[TypeRef[TBase[_, _]]] } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala index f7b1e9b0c9..bb3c40e617 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source @@ -20,8 +20,7 @@ import com.twitter.chill.Externalizer import com.twitter.bijection.Injection /** - * Source used to write some type T into an LZO-compressed SequenceFile using a - * codec on T for serialization. + * Source used to write some type T into an LZO-compressed SequenceFile using a codec on T for serialization. */ object LzoCodecSource { diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericScheme.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericScheme.scala index 09ad893905..4e9764ece6 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericScheme.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericScheme.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.commons.source @@ -22,12 +22,12 @@ import com.twitter.bijection._ import com.twitter.chill.Externalizer import com.twitter.elephantbird.cascading2.scheme.LzoBinaryScheme import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat -import com.twitter.elephantbird.mapreduce.io.{ BinaryConverter, GenericWritable } -import com.twitter.elephantbird.mapreduce.input.{ BinaryConverterProvider, MultiInputFormat } +import com.twitter.elephantbird.mapreduce.io.{BinaryConverter, GenericWritable} +import com.twitter.elephantbird.mapreduce.input.{BinaryConverterProvider, MultiInputFormat} import com.twitter.elephantbird.mapreduce.output.LzoGenericBlockOutputFormat import com.twitter.elephantbird.mapred.output.DeprecatedOutputFormatWrapper -import org.apache.hadoop.mapred.{ JobConf, OutputCollector, RecordReader } +import org.apache.hadoop.mapred.{JobConf, OutputCollector, RecordReader} import org.apache.hadoop.conf.Configuration import cascading.tap.Tap @@ -41,7 +41,8 @@ private[source] object ExternalizerSerializer { import com.twitter.bijection.Inversion.attemptWhen import com.twitter.bijection.codec.Base64 - implicit val baseInj: Injection[Externalizer[T], Array[Byte]] = JavaSerializationInjection[Externalizer[T]] + implicit val baseInj: Injection[Externalizer[T], Array[Byte]] = + JavaSerializationInjection[Externalizer[T]] implicit val unwrap: Injection[GZippedBase64String, String] = // this does not catch cases where it's Base64 but not compressed @@ -59,18 +60,19 @@ private[source] object SourceConfigBinaryConverterProvider { val ProviderConfKey = "com.twitter.scalding.lzo.converter.provider.source" } private[source] class SourceConfigBinaryConverterProvider[M] - extends ConfigBinaryConverterProvider[M](SourceConfigBinaryConverterProvider.ProviderConfKey) + extends ConfigBinaryConverterProvider[M](SourceConfigBinaryConverterProvider.ProviderConfKey) private[source] object SinkConfigBinaryConverterProvider { val ProviderConfKey = "com.twitter.scalding.lzo.converter.provider.sink" } private[source] class SinkConfigBinaryConverterProvider[M] - extends ConfigBinaryConverterProvider[M](SinkConfigBinaryConverterProvider.ProviderConfKey) + extends ConfigBinaryConverterProvider[M](SinkConfigBinaryConverterProvider.ProviderConfKey) /** * Provides BinaryConverter serialized in JobConf. 
*/ -private[source] class ConfigBinaryConverterProvider[M](private[this] val confKey: String) extends BinaryConverterProvider[M] { +private[source] class ConfigBinaryConverterProvider[M](private[this] val confKey: String) + extends BinaryConverterProvider[M] { private[this] var cached: Option[(String, BinaryConverter[M])] = None override def getConverter(conf: Configuration): BinaryConverter[M] = { @@ -97,32 +99,40 @@ object LzoGenericScheme { /** * From a Binary Converter passed in configure in the JobConf using of that by ElephantBird */ - def setConverter[M](conv: BinaryConverter[M], conf: JobConf, confKey: String, overrideConf: Boolean = false): Unit = { + def setConverter[M]( + conv: BinaryConverter[M], + conf: JobConf, + confKey: String, + overrideConf: Boolean = false + ): Unit = if ((conf.get(confKey) == null) || overrideConf) { val extern = Externalizer(conv) try { ExternalizerSerializer.inj.invert(ExternalizerSerializer.inj(extern)).get } catch { - case e: Exception => throw new RuntimeException("Unable to roundtrip the BinaryConverter in the Externalizer.", e) + case e: Exception => + throw new RuntimeException("Unable to roundtrip the BinaryConverter in the Externalizer.", e) } conf.set(confKey, ExternalizerSerializer.inj(extern)) } - } } /** - * Generic scheme for data stored as lzo-compressed protobuf messages. - * Serialization is performed using the supplied BinaryConverter. + * Generic scheme for data stored as lzo-compressed protobuf messages. Serialization is performed using the + * supplied BinaryConverter. */ -class LzoGenericScheme[M](@transient conv: BinaryConverter[M], clazz: Class[M]) extends LzoBinaryScheme[M, GenericWritable[M]] { +class LzoGenericScheme[M](@transient conv: BinaryConverter[M], clazz: Class[M]) + extends LzoBinaryScheme[M, GenericWritable[M]] { override protected def prepareBinaryWritable(): GenericWritable[M] = new GenericWritable(conv) - override def sourceConfInit(fp: FlowProcess[JobConf], - tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], - conf: JobConf): Unit = { + override def sourceConfInit( + fp: FlowProcess[JobConf], + tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], + conf: JobConf + ): Unit = { LzoGenericScheme.setConverter(conv, conf, SourceConfigBinaryConverterProvider.ProviderConfKey) MultiInputFormat.setClassConf(clazz, conf) @@ -131,13 +141,17 @@ class LzoGenericScheme[M](@transient conv: BinaryConverter[M], clazz: Class[M]) DelegateCombineFileInputFormat.setDelegateInputFormat(conf, classOf[MultiInputFormat[_]]) } - override def sinkConfInit(fp: FlowProcess[JobConf], - tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], - conf: JobConf): Unit = { + override def sinkConfInit( + fp: FlowProcess[JobConf], + tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], + conf: JobConf + ): Unit = { LzoGenericScheme.setConverter(conv, conf, SinkConfigBinaryConverterProvider.ProviderConfKey) LzoGenericBlockOutputFormat.setClassConf(clazz, conf) - LzoGenericBlockOutputFormat.setGenericConverterClassConf(classOf[SinkConfigBinaryConverterProvider[_]], conf) + LzoGenericBlockOutputFormat.setGenericConverterClassConf( + classOf[SinkConfigBinaryConverterProvider[_]], + conf + ) DeprecatedOutputFormatWrapper.setOutputFormat(classOf[LzoGenericBlockOutputFormat[_]], conf) } } - diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericSource.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericSource.scala index 07eabe5256..72c305fcaa 100644 
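The setConverter helper above is how the source and sink schemes hand their BinaryConverter to the tasks: it is Externalized, round-trip checked, and stored under a provider-specific key in the JobConf. A small sketch using only the public pieces; the conf key here is hypothetical, since the real ProviderConfKey constants are private to the source package:

import com.twitter.scalding.commons.source.{IdentityBinaryConverter, LzoGenericScheme}
import org.apache.hadoop.mapred.JobConf

object ConverterConfSketch {
  def register(conf: JobConf): Unit =
    // serialize the converter into the configuration so tasks can rebuild it later
    LzoGenericScheme.setConverter(IdentityBinaryConverter, conf, "com.example.lzo.converter.provider")
}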
--- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericSource.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericSource.scala @@ -12,11 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source - import com.twitter.elephantbird.mapreduce.io.BinaryConverter import com.twitter.scalding._ @@ -25,11 +24,17 @@ import cascading.scheme.Scheme /** * Generic source with an underlying GenericScheme that uses the supplied BinaryConverter. */ -abstract class LzoGenericSource[T] extends FileSource with SingleMappable[T] with TypedSink[T] with LocalTapSource { +abstract class LzoGenericSource[T] + extends FileSource + with SingleMappable[T] + with TypedSink[T] + with LocalTapSource { def clazz: Class[T] def conv: BinaryConverter[T] override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) - override def hdfsScheme = HadoopSchemeInstance(LzoGenericScheme[T](conv, clazz).asInstanceOf[Scheme[_, _, _, _, _]]) + override def hdfsScheme = HadoopSchemeInstance( + LzoGenericScheme[T](conv, clazz).asInstanceOf[Scheme[_, _, _, _, _]] + ) } object LzoGenericSource { diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTraits.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTraits.scala index 392ee13166..d0e0e68eb2 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTraits.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTraits.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.commons.source @@ -25,7 +25,7 @@ import com.twitter.bijection.Injection import com.twitter.elephantbird.cascading2.scheme._ import com.twitter.scalding._ import com.twitter.scalding.Dsl._ -import com.twitter.scalding.source.{ CheckedInversion, MaxFailuresCheck } +import com.twitter.scalding.source.{CheckedInversion, MaxFailuresCheck} import com.twitter.scalding.typed.TypedSink import scala.collection.JavaConverters._ @@ -34,16 +34,17 @@ trait LzoCodec[T] extends FileSource with SingleMappable[T] with TypedSink[T] wi override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) override def hdfsScheme = HadoopSchemeInstance((new LzoByteArrayScheme).asInstanceOf[Scheme[_, _, _, _, _]]) override def transformForRead(pipe: Pipe) = - pipe.flatMap(0 -> 0) { fromBytes(_: Array[Byte]) } + pipe.flatMap(0 -> 0)(fromBytes(_: Array[Byte])) override def transformForWrite(pipe: Pipe) = - pipe.mapTo(0 -> 0) { injection.apply(_: T) } + pipe.mapTo(0 -> 0)(injection.apply(_: T)) protected def fromBytes(b: Array[Byte]): Option[T] = Some(injection.invert(b).get) override def toIterator(implicit config: Config, mode: Mode): Iterator[T] = { val tap = createTap(Read)(mode) - CascadingMode.cast(mode) + CascadingMode + .cast(mode) .openForRead(config, tap) .asScala .flatMap { te => @@ -67,13 +68,17 @@ trait ErrorThresholdLzoCodec[T] extends ErrorHandlingLzoCodec[T] { trait LzoProtobuf[T <: Message] extends LocalTapSource with SingleMappable[T] with TypedSink[T] { def column: Class[_] override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) - override def hdfsScheme = HadoopSchemeInstance((new LzoProtobufScheme[T](column)).asInstanceOf[Scheme[_, _, _, _, _]]) + override def hdfsScheme = HadoopSchemeInstance( + (new LzoProtobufScheme[T](column)).asInstanceOf[Scheme[_, _, _, _, _]] + ) } trait LzoThrift[T <: TBase[_, _]] extends LocalTapSource with SingleMappable[T] with TypedSink[T] { def column: Class[_] override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) - override def hdfsScheme = HadoopSchemeInstance((new LzoThriftScheme[T](column)).asInstanceOf[Scheme[_, _, _, _, _]]) + override def hdfsScheme = HadoopSchemeInstance( + (new LzoThriftScheme[T](column)).asInstanceOf[Scheme[_, _, _, _, _]] + ) } trait LzoText extends LocalTapSource with SingleMappable[String] with TypedSink[String] { @@ -83,19 +88,25 @@ trait LzoText extends LocalTapSource with SingleMappable[String] with TypedSink[ } trait LzoTsv extends DelimitedScheme with LocalTapSource { - override def hdfsScheme = HadoopSchemeInstance((new LzoTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe)).asInstanceOf[Scheme[_, _, _, _, _]]) + override def hdfsScheme = HadoopSchemeInstance( + (new LzoTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe)) + .asInstanceOf[Scheme[_, _, _, _, _]] + ) } trait LzoTypedTsv[T] extends DelimitedScheme with Mappable[T] with TypedSink[T] with LocalTapSource { override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) - override def hdfsScheme = HadoopSchemeInstance((new LzoTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe)).asInstanceOf[Scheme[_, _, _, _, _]]) + override def hdfsScheme = HadoopSchemeInstance( + (new LzoTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe)) + .asInstanceOf[Scheme[_, _, _, _, _]] + ) def mf: Manifest[T] 
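The LzoCodec trait above leaves the injection abstract, so it can be mixed into any path scheme. A sketch of a fixed-path variant, mirroring the Externalizer pattern used by DailySuffixLzoCodec so the captured Injection stays serializable:

import com.twitter.bijection.Injection
import com.twitter.chill.Externalizer
import com.twitter.scalding.FixedPathSource
import com.twitter.scalding.commons.source.LzoCodec

class LzoInjectionSource[T](paths: String*)(implicit inj: Injection[T, Array[Byte]])
    extends FixedPathSource(paths: _*)
    with LzoCodec[T] {
  private[this] val boxed = Externalizer(inj) // keep the captured injection serializable
  override lazy val injection = boxed.get
}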
override val types: Array[Class[_]] = { if (classOf[scala.Product].isAssignableFrom(mf.runtimeClass)) { //Assume this is a Tuple: - mf.typeArguments.map { _.runtimeClass }.toArray + mf.typeArguments.map(_.runtimeClass).toArray } else { //Assume there is only a single item Array(mf.runtimeClass) diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTypedText.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTypedText.scala index 9ea2aac715..7f32be0259 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTypedText.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTypedText.scala @@ -18,21 +18,30 @@ object LzoTypedText { * to get the implicit TypedDescriptor. * Then use TypedText.lzoTzv[MyCaseClass]("path") */ - def lzoTsv[T: TypeDescriptor](path: String*): TypedTextDelimited[T] = new FixedLzoTypedText[T](TAB, path: _*) - def lzoOsv[T: TypeDescriptor](path: String*): TypedTextDelimited[T] = new FixedLzoTypedText[T](ONE, path: _*) - def lzoCsv[T: TypeDescriptor](path: String*): TypedTextDelimited[T] = new FixedLzoTypedText[T](COMMA, path: _*) - - def hourlyLzoTsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + def lzoTsv[T: TypeDescriptor](path: String*): TypedTextDelimited[T] = + new FixedLzoTypedText[T](TAB, path: _*) + def lzoOsv[T: TypeDescriptor](path: String*): TypedTextDelimited[T] = + new FixedLzoTypedText[T](ONE, path: _*) + def lzoCsv[T: TypeDescriptor](path: String*): TypedTextDelimited[T] = + new FixedLzoTypedText[T](COMMA, path: _*) + + def hourlyLzoTsv[T]( + prefix: String + )(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { require(prefix.last != '/', "prefix should not include trailing /") new TimePathLzoTypedText[T](TAB, prefix + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*") } - def hourlyLzoOsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + def hourlyLzoOsv[T]( + prefix: String + )(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { require(prefix.last != '/', "prefix should not include trailing /") new TimePathLzoTypedText[T](ONE, prefix + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*") } - def hourlyLzoCsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + def hourlyLzoCsv[T]( + prefix: String + )(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { require(prefix.last != '/', "prefix should not include trailing /") new TimePathLzoTypedText[T](COMMA, prefix + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*") } @@ -52,7 +61,10 @@ object LzoTypedText { new TimePathLzoTypedText[T](COMMA, prefix + TimePathedSource.YEAR_MONTH_DAY + "/*") } - def dailyPrefixSuffixLzoOsv[T](prefix: String, suffix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + def dailyPrefixSuffixLzoOsv[T](prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = { require(prefix.last != '/', "prefix should not include trailing /") require(suffix.head == '/', "suffix should include a preceding /") new TimePathLzoTypedText[T](ONE, prefix + TimePathedSource.YEAR_MONTH_DAY + suffix + "/*") @@ -62,26 +74,37 @@ object LzoTypedText { trait LzoTypedTextDelimited[T] extends TypedTextDelimited[T] with LocalTapSource { override def hdfsScheme = - HadoopSchemeInstance(new LzoTextDelimited(typeDescriptor.fields, false, 
false, - separator.str, strict, null /* quote */ , - typeDescriptor.fields.getTypesClasses, safe).asInstanceOf[Scheme[_, _, _, _, _]]) + HadoopSchemeInstance( + new LzoTextDelimited( + typeDescriptor.fields, + false, + false, + separator.str, + strict, + null /* quote */, + typeDescriptor.fields.getTypesClasses, + safe + ).asInstanceOf[Scheme[_, _, _, _, _]] + ) } class TimePathLzoTypedText[T](sep: TypedSep, path: String)(implicit dr: DateRange, td: TypeDescriptor[T]) - extends TimePathedSource(path, dr, DateOps.UTC) with LzoTypedTextDelimited[T] { + extends TimePathedSource(path, dr, DateOps.UTC) + with LzoTypedTextDelimited[T] { override def typeDescriptor = td protected override def separator = sep } class MostRecentLzoTypedText[T](sep: TypedSep, path: String)(implicit dr: DateRange, td: TypeDescriptor[T]) - extends MostRecentGoodSource(path, dr, DateOps.UTC) with LzoTypedTextDelimited[T] { + extends MostRecentGoodSource(path, dr, DateOps.UTC) + with LzoTypedTextDelimited[T] { override def typeDescriptor = td protected override def separator = sep } class FixedLzoTypedText[T](sep: TypedSep, path: String*)(implicit td: TypeDescriptor[T]) - extends FixedPathSource(path: _*) with LzoTypedTextDelimited[T] { + extends FixedPathSource(path: _*) + with LzoTypedTextDelimited[T] { override def typeDescriptor = td protected override def separator = sep } - diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/TsvWithHeader.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/TsvWithHeader.scala index 9f39aa972a..5aefae7f62 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/TsvWithHeader.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/TsvWithHeader.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source @@ -22,7 +22,7 @@ import cascading.tuple.Fields import com.google.common.base.Charsets import com.google.common.io.Files import com.twitter.scalding._ -import java.io.{ BufferedWriter, File, FileOutputStream, IOException, OutputStreamWriter } +import java.io.{BufferedWriter, File, FileOutputStream, IOException, OutputStreamWriter} import org.apache.hadoop.fs.Path /** @@ -31,9 +31,9 @@ import org.apache.hadoop.fs.Path * Header file format: tab separated column names. */ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) - extends FixedPathSource(p) - with DelimitedScheme - with FieldConversions { + extends FixedPathSource(p) + with DelimitedScheme + with FieldConversions { val headerPath = p.replaceAll("/+$", "") + ".HEADER" // make it lazy so as to only do once @@ -51,7 +51,7 @@ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) } // TODO: move this method to make it a util function. - def readFromFile(filename: String)(implicit mode: Mode) = { + def readFromFile(filename: String)(implicit mode: Mode) = mode match { case Hdfs(_, conf) => { try { @@ -75,10 +75,9 @@ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) } } } - } // TODO: move this method to make it a util function. 
- def writeToFile(filename: String, text: String)(implicit mode: Mode): Unit = { + def writeToFile(filename: String, text: String)(implicit mode: Mode): Unit = mode match { case Hdfs(_, conf) => { try { @@ -97,8 +96,7 @@ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) // Local mode case _ => { try { - val br = new BufferedWriter( - new OutputStreamWriter(new FileOutputStream(filename), "utf-8")) + val br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "utf-8")) br.write(text) br.close() @@ -109,7 +107,6 @@ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) } } } - } override def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode) = { val ret = super.writeFrom(pipe)(flowDef, mode) diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/VersionedKeyValSource.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/VersionedKeyValSource.scala index c95f2c746e..85e12eb82f 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/VersionedKeyValSource.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/VersionedKeyValSource.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source @@ -29,34 +29,49 @@ import com.twitter.scalding._ import com.twitter.scalding.commons.scheme.KeyValueByteScheme import com.twitter.scalding.commons.tap.VersionedTap import com.twitter.scalding.commons.tap.VersionedTap.TapMode -import com.twitter.scalding.source.{ CheckedInversion, MaxFailuresCheck } +import com.twitter.scalding.source.{CheckedInversion, MaxFailuresCheck} import com.twitter.scalding.typed.KeyedListLike import com.twitter.scalding.typed.TypedSink import org.apache.hadoop.mapred.JobConf import scala.collection.JavaConverters._ /** - * Source used to write key-value pairs as byte arrays into a versioned store. - * Supports incremental updates via the monoid on V. + * Source used to write key-value pairs as byte arrays into a versioned store. Supports incremental updates + * via the monoid on V. */ object VersionedKeyValSource { val defaultVersionsToKeep = 3 // TODO: have two apply methods here for binary compatibility purpose. Need to clean it up in next release. 
- def apply[K, V](path: String, sourceVersion: Option[Long] = None, sinkVersion: Option[Long] = None, maxFailures: Int = 0)(implicit codec: Injection[(K, V), (Array[Byte], Array[Byte])]) = { + def apply[K, V]( + path: String, + sourceVersion: Option[Long] = None, + sinkVersion: Option[Long] = None, + maxFailures: Int = 0 + )(implicit codec: Injection[(K, V), (Array[Byte], Array[Byte])]) = new VersionedKeyValSource[K, V](path, sourceVersion, sinkVersion, maxFailures, defaultVersionsToKeep) - } - def apply[K, V](path: String, sourceVersion: Option[Long], sinkVersion: Option[Long], maxFailures: Int, versionsToKeep: Int)(implicit codec: Injection[(K, V), (Array[Byte], Array[Byte])]) = + def apply[K, V]( + path: String, + sourceVersion: Option[Long], + sinkVersion: Option[Long], + maxFailures: Int, + versionsToKeep: Int + )(implicit codec: Injection[(K, V), (Array[Byte], Array[Byte])]) = new VersionedKeyValSource[K, V](path, sourceVersion, sinkVersion, maxFailures, versionsToKeep) } -class VersionedKeyValSource[K, V](val path: String, val sourceVersion: Option[Long], val sinkVersion: Option[Long], - val maxFailures: Int, val versionsToKeep: Int)( - implicit @transient codec: Injection[(K, V), (Array[Byte], Array[Byte])]) extends Source - with Mappable[(K, V)] - with TypedSink[(K, V)] { +class VersionedKeyValSource[K, V]( + val path: String, + val sourceVersion: Option[Long], + val sinkVersion: Option[Long], + val maxFailures: Int, + val versionsToKeep: Int +)(implicit @transient codec: Injection[(K, V), (Array[Byte], Array[Byte])]) + extends Source + with Mappable[(K, V)] + with TypedSink[(K, V)] { import Dsl._ @@ -73,7 +88,9 @@ class VersionedKeyValSource[K, V](val path: String, val sourceVersion: Option[Lo HadoopSchemeInstance(new KeyValueByteScheme(fields).asInstanceOf[Scheme[_, _, _, _, _]]) @deprecated("This method is deprecated", "0.1.6") - def this(path: String, sourceVersion: Option[Long], sinkVersion: Option[Long], maxFailures: Int)(implicit @transient codec: Injection[(K, V), (Array[Byte], Array[Byte])]) = + def this(path: String, sourceVersion: Option[Long], sinkVersion: Option[Long], maxFailures: Int)(implicit + @transient codec: Injection[(K, V), (Array[Byte], Array[Byte])] + ) = this(path, sourceVersion, sinkVersion, maxFailures, VersionedKeyValSource.defaultVersionsToKeep)(codec) def getTap(mode: TapMode) = { @@ -91,7 +108,7 @@ class VersionedKeyValSource[K, V](val path: String, val sourceVersion: Option[Lo val source = getTap(TapMode.SOURCE) val sink = getTap(TapMode.SINK) - override def validateTaps(mode: Mode): Unit = { + override def validateTaps(mode: Mode): Unit = // if a version is explicitly supplied, ensure that it exists sourceVersion.foreach { version => mode match { @@ -101,24 +118,26 @@ class VersionedKeyValSource[K, V](val path: String, val sourceVersion: Option[Lo if (!store.hasVersion(version)) { throw new InvalidSourceException( "Version %s does not exist. Currently available versions are: %s" - .format(version, store.getAllVersions)) + .format(version, store.getAllVersions) + ) } } - case _ => throw new IllegalArgumentException( - "VersionedKeyValSource does not support mode %s. Only HadoopMode is supported" - .format(mode)) + case _ => + throw new IllegalArgumentException( + "VersionedKeyValSource does not support mode %s. 
Only HadoopMode is supported" + .format(mode) + ) } } - } def resourceExists(mode: Mode): Boolean = mode match { case Test(buffers) => { - buffers(this) map { !_.isEmpty } getOrElse false + buffers(this).map(!_.isEmpty).getOrElse(false) } case HadoopTest(conf, buffers) => { - buffers(this) map { !_.isEmpty } getOrElse false + buffers(this).map(!_.isEmpty).getOrElse(false) } case _ => { val conf = new JobConf(mode.asInstanceOf[HadoopMode].jobConf) @@ -130,10 +149,10 @@ class VersionedKeyValSource[K, V](val path: String, val sourceVersion: Option[Lo sinkVersion.exists { version => mode match { case Test(buffers) => - buffers(this) map { !_.isEmpty } getOrElse false + buffers(this).map(!_.isEmpty).getOrElse(false) case HadoopTest(conf, buffers) => - buffers(this) map { !_.isEmpty } getOrElse false + buffers(this).map(!_.isEmpty).getOrElse(false) case m: HadoopMode => val conf = new JobConf(m.jobConf) @@ -148,7 +167,7 @@ class VersionedKeyValSource[K, V](val path: String, val sourceVersion: Option[Lo mode match { case Hdfs(_strict, _config) => readOrWrite match { - case Read => CastHfsTap(source) + case Read => CastHfsTap(source) case Write => CastHfsTap(sink) } case _ => @@ -162,21 +181,20 @@ class VersionedKeyValSource[K, V](val path: String, val sourceVersion: Option[Lo override def sinkFields: Fields = fields - override def transformForRead(pipe: Pipe): Pipe = { + override def transformForRead(pipe: Pipe): Pipe = pipe.flatMap((keyField, valField) -> (keyField, valField)) { pair: (Array[Byte], Array[Byte]) => checkedInversion(pair) } - } - override def transformForWrite(pipe: Pipe): Pipe = { + override def transformForWrite(pipe: Pipe): Pipe = pipe.mapTo((0, 1) -> (keyField, valField)) { pair: (K, V) => codecBox.get.apply(pair) } - } override def toIterator(implicit config: Config, mode: Mode): Iterator[(K, V)] = { val tap = createTap(Read)(mode) - CascadingMode.cast(mode) + CascadingMode + .cast(mode) .openForRead(config, tap) .asScala .flatMap { te => @@ -213,7 +231,8 @@ object RichPipeEx extends java.io.Serializable { implicit def typedPipeToRichPipeEx[K: Ordering, V: Monoid](pipe: TypedPipe[(K, V)]): TypedRichPipeEx[K, V] = new TypedRichPipeEx(pipe) implicit def keyedListLikeToRichPipeEx[K: Ordering, V: Monoid, T[K, +V] <: KeyedListLike[K, V, T]]( - kll: KeyedListLike[K, V, T]): TypedRichPipeEx[K, V] = typedPipeToRichPipeEx(kll.toTypedPipe) + kll: KeyedListLike[K, V, T] + ): TypedRichPipeEx[K, V] = typedPipeToRichPipeEx(kll.toTypedPipe) } class TypedRichPipeEx[K: Ordering, V: Monoid](pipe: TypedPipe[(K, V)]) extends java.io.Serializable { @@ -224,7 +243,10 @@ class TypedRichPipeEx[K: Ordering, V: Monoid](pipe: TypedPipe[(K, V)]) extends j // the pipe in using an implicit `Monoid[V]` and sinks all results // into the `sinkVersion` of data (or a new version) specified by // `src`. 
- def writeIncremental(src: VersionedKeyValSource[K, V], reducers: Int = 1)(implicit flowDef: FlowDef, mode: Mode): TypedPipe[(K, V)] = { + def writeIncremental(src: VersionedKeyValSource[K, V], reducers: Int = 1)(implicit + flowDef: FlowDef, + mode: Mode + ): TypedPipe[(K, V)] = { val outPipe = if (!src.resourceExists(mode)) pipe @@ -236,10 +258,10 @@ class TypedRichPipeEx[K: Ordering, V: Monoid](pipe: TypedPipe[(K, V)]) extends j val newPairs = pipe.sumByLocalKeys.map { case (k, v) => (k, v, 1) } (oldPairs ++ newPairs) - .groupBy { _._1 } + .groupBy(_._1) .withReducers(reducers) - .sortBy { _._3 } - .mapValues { _._2 } + .sortBy(_._3) + .mapValues(_._2) .sum .toTypedPipe } @@ -253,9 +275,11 @@ class RichPipeEx(pipe: Pipe) extends java.io.Serializable { // VersionedKeyValSource always merges with the most recent complete // version - def writeIncremental[K, V](src: VersionedKeyValSource[K, V], fields: Fields, reducers: Int = 1)(implicit monoid: Monoid[V], - flowDef: FlowDef, - mode: Mode) = { + def writeIncremental[K, V](src: VersionedKeyValSource[K, V], fields: Fields, reducers: Int = 1)(implicit + monoid: Monoid[V], + flowDef: FlowDef, + mode: Mode + ) = { def appendToken(pipe: Pipe, token: Int) = pipe.mapTo((0, 1) -> ('key, 'value, 'isNew)) { pair: (K, V) => pair :+ token } @@ -267,7 +291,7 @@ class RichPipeEx(pipe: Pipe) extends java.io.Serializable { val newPairs = appendToken(pipe, 1) (oldPairs ++ newPairs) - .groupBy('key) { _.reducers(reducers).sortBy('isNew).sum[V]('value) } + .groupBy('key)(_.reducers(reducers).sortBy('isNew).sum[V]('value)) .project(('key, 'value)) .rename(('key, 'value) -> fields) } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/KMeans.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/KMeans.scala index d7db06d87e..21ba4c8d78 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/examples/KMeans.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/KMeans.scala @@ -9,10 +9,12 @@ object KMeans { * This is the euclidean norm between two vectors */ private def distance(v1: Vector[Double], v2: Vector[Double]): Double = - math.sqrt(v1.iterator - .zip(v2.iterator) - .map { case (l, r) => (l - r) * (l - r) } - .sum) + math.sqrt( + v1.iterator + .zip(v2.iterator) + .map { case (l, r) => (l - r) * (l - r) } + .sum + ) // Just normal vector addition private def add(v1: Vector[Double], v2: Vector[Double]): Vector[Double] = @@ -20,13 +22,13 @@ object KMeans { // normal scalar multiplication private def scale(s: Double, v: Vector[Double]): Vector[Double] = - v.map { x => s * x } + v.map(x => s * x) // Here we return the centroid of some vectors private def centroidOf(vecs: TraversableOnce[Vector[Double]]): Vector[Double] = { val (vec, count) = vecs // add a 1 to each value to count the number of vectors in one pass: - .map { v => (v, 1) } + .map(v => (v, 1)) // Here we add both the count and the vectors: .reduce { (ll, rr) => val (l, lc) = ll @@ -37,8 +39,10 @@ object KMeans { scale(1.0 / count, vec) } - private def closest[Id](from: Vector[Double], - centroids: TraversableOnce[(Id, Vector[Double])]): (Id, Vector[Double]) = + private def closest[Id]( + from: Vector[Double], + centroids: TraversableOnce[(Id, Vector[Double])] + ): (Id, Vector[Double]) = centroids // compute the distance to each center .map { case (id, cent) => (distance(from, cent), (id, cent)) } @@ -50,19 +54,20 @@ object KMeans { type LabeledVector = (Int, Vector[Double]) /** - * This runs one step in a kmeans algorithm - * It 
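A minimal plain-Scala sketch of the per-key merge semantics described in the comment above, with all names invented for illustration (this is not the Scalding API): the previously stored value and the newly written value for the same key are combined through the value monoid, stood in for here by a plain `plus` function.

object IncrementalMergeSketch {
  // Combine an old key/value snapshot with new pairs, merging values per key.
  // `plus` plays the role of Monoid[V].plus; the old value is passed on the left.
  def merge[K, V](old: Map[K, V], incoming: Seq[(K, V)])(plus: (V, V) => V): Map[K, V] =
    incoming.foldLeft(old) { case (acc, (k, v)) =>
      acc.updated(k, acc.get(k).map(plus(_, v)).getOrElse(v))
    }

  def main(args: Array[String]): Unit = {
    val previousVersion = Map("a" -> 1L, "b" -> 2L)
    val newPairs = Seq("b" -> 3L, "c" -> 5L)
    println(merge(previousVersion, newPairs)(_ + _)) // Map(a -> 1, b -> 5, c -> 5)
  }
}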
returns the number of vectors that changed clusters, - * the new clusters - * and the new list of labeled vectors + * This runs one step in a kmeans algorithm It returns the number of vectors that changed clusters, the new + * clusters and the new list of labeled vectors */ - def kmeansStep(k: Int, - s: Stat, - clusters: ValuePipe[List[LabeledVector]], - points: TypedPipe[LabeledVector]): Execution[(ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = { + def kmeansStep( + k: Int, + s: Stat, + clusters: ValuePipe[List[LabeledVector]], + points: TypedPipe[LabeledVector] + ): Execution[(ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = { // Do a cross product to produce all point, cluster pairs // in scalding, the smaller pipe should go on the right. - val next = points.leftCross(clusters) + val next = points + .leftCross(clusters) // now compute the closest cluster for each vector .map { case ((oldId, vector), Some(centroids)) => @@ -75,35 +80,40 @@ object KMeans { // Now update the clusters: next.map { pipe => - (ComputedValue(pipe - .group - // There is no need to use more than k reducers - .withReducers(k) - .mapValueStream { vectors => Iterator(centroidOf(vectors)) } - // Now collect them all into one big - .groupAll - .toList - // discard the "all" key used to group them together - .values), pipe) + ( + ComputedValue( + pipe.group + // There is no need to use more than k reducers + .withReducers(k) + .mapValueStream(vectors => Iterator(centroidOf(vectors))) + // Now collect them all into one big + .groupAll + .toList + // discard the "all" key used to group them together + .values + ), + pipe + ) } } - def initializeClusters(k: Int, points: TypedPipe[Vector[Double]]): (ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector]) = { + def initializeClusters( + k: Int, + points: TypedPipe[Vector[Double]] + ): (ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector]) = { val rng = new java.util.Random(123) // take a random k vectors: - val clusters = points.map { v => (rng.nextDouble, v) } + val clusters = points + .map(v => (rng.nextDouble, v)) .groupAll .sortedTake(k)(Ordering.by(_._1)) .mapValues { randk => - randk.iterator - .zipWithIndex - .map { case ((_, v), id) => (id, v) } - .toList + randk.iterator.zipWithIndex.map { case ((_, v), id) => (id, v) }.toList } .values // attach a random cluster to each vector - val labeled = points.map { v => (rng.nextInt(k), v) } + val labeled = points.map(v => (rng.nextInt(k), v)) (ComputedValue(clusters), labeled) } @@ -112,24 +122,25 @@ object KMeans { * Run the full k-means algorithm by flatMapping the above function into itself * while the number of vectors that changed is not zero */ - def kmeans(k: Int, - clusters: ValuePipe[List[LabeledVector]], - points: TypedPipe[LabeledVector]): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = { + def kmeans( + k: Int, + clusters: ValuePipe[List[LabeledVector]], + points: TypedPipe[LabeledVector] + ): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = { val key = StatKey("changed", "scalding.kmeans") - def go(s: Stat, - c: ValuePipe[List[LabeledVector]], - p: TypedPipe[LabeledVector], - step: Int): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = - - kmeansStep(k, s, c, p) - .getAndResetCounters - .flatMap { - case ((nextC, nextP), counters) => - val changed = counters(key) - if (changed == 0L) Execution.from((step, nextC, nextP)) - else go(s, nextC, nextP, step + 1) + def go( + s: Stat, + c: 
ValuePipe[List[LabeledVector]], + p: TypedPipe[LabeledVector], + step: Int + ): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = + kmeansStep(k, s, c, p).getAndResetCounters + .flatMap { case ((nextC, nextP), counters) => + val changed = counters(key) + if (changed == 0L) Execution.from((step, nextC, nextP)) + else go(s, nextC, nextP, step + 1) } Execution.withId { implicit uid => @@ -137,7 +148,10 @@ object KMeans { } } - def apply(k: Int, points: TypedPipe[Vector[Double]]): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = { + def apply( + k: Int, + points: TypedPipe[Vector[Double]] + ): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = { val (clusters, labeled) = initializeClusters(k, points) kmeans(k, clusters, labeled) } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/MergeTest.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/MergeTest.scala index b02a6311d0..c9e7e71721 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/examples/MergeTest.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/MergeTest.scala @@ -5,17 +5,19 @@ import scala.annotation.tailrec import com.twitter.scalding._ /** - * This example job does not yet work. It is a test for Kyro serialization + * This example job does not yet work. It is a test for Kyro serialization */ class MergeTest(args: Args) extends Job(args) { - TextLine(args("input")).flatMapTo('word) { _.split("""\s+""") } - .groupBy('word) { _.size } + TextLine(args("input")) + .flatMapTo('word)(_.split("""\s+""")) + .groupBy('word)(_.size) //Now, let's get the top 10 words: .groupAll { - _.mapReduceMap(('word, 'size) -> 'list) /* map1 */ { tup: (String, Long) => List(tup) } /* reduce */ { (l1: List[(String, Long)], l2: List[(String, Long)]) => - mergeSort2(l1, l2, 10, cmpTup) - } /* map2 */ { - lout: List[(String, Long)] => lout + _.mapReduceMap(('word, 'size) -> 'list) /* map1 */ { tup: (String, Long) => List(tup) } /* reduce */ { + (l1: List[(String, Long)], l2: List[(String, Long)]) => + mergeSort2(l1, l2, 10, cmpTup) + } /* map2 */ { lout: List[(String, Long)] => + lout } } //Now expand out the list. 
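The iteration that the KMeans.scala hunks earlier in this diff implement on pipes (assign each vector to its nearest centroid, then recompute each centroid as the mean of its members) can be sketched in memory as follows; names are invented and this is a simplified illustration, not the Scalding implementation.

object KMeansStepSketch {
  type Vec = Vector[Double]

  // Euclidean distance, matching the `distance` helper above.
  def distance(a: Vec, b: Vec): Double =
    math.sqrt(a.iterator.zip(b.iterator).map { case (x, y) => (x - y) * (x - y) }.sum)

  // Mean of a non-empty collection of vectors, matching `centroidOf` above.
  def centroidOf(vs: Seq[Vec]): Vec =
    vs.reduce((l, r) => l.zip(r).map { case (x, y) => x + y }).map(_ / vs.size)

  // One assignment + update step: returns the new centroids keyed by cluster id.
  def step(centroids: Map[Int, Vec], points: Seq[Vec]): Map[Int, Vec] =
    points
      .groupBy(p => centroids.minBy { case (_, c) => distance(p, c) }._1)
      .map { case (id, members) => id -> centroidOf(members) }

  def main(args: Array[String]): Unit = {
    val centroids = Map(0 -> Vector(0.0, 0.0), 1 -> Vector(10.0, 10.0))
    val points = Seq(Vector(1.0, 1.0), Vector(2.0, 0.0), Vector(9.0, 11.0))
    println(step(centroids, points)) // expect cluster 0 near (1.5, 0.5) and cluster 1 near (9, 11)
  }
}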
@@ -28,7 +30,7 @@ class MergeTest(args: Args) extends Job(args) { def mergeSort2[T](v1: List[T], v2: List[T], k: Int, cmp: Function2[T, T, Int]) = { @tailrec - def mergeSortR(acc: List[T], list1: List[T], list2: List[T], k: Int): List[T] = { + def mergeSortR(acc: List[T], list1: List[T], list2: List[T], k: Int): List[T] = (list1, list2, k) match { case (_, _, 0) => acc case (x1 :: t1, x2 :: t2, _) => { @@ -40,9 +42,8 @@ class MergeTest(args: Args) extends Job(args) { } case (x1 :: t1, Nil, _) => mergeSortR(x1 :: acc, t1, Nil, k - 1) case (Nil, x2 :: t2, _) => mergeSortR(x2 :: acc, Nil, t2, k - 1) - case (Nil, Nil, _) => acc + case (Nil, Nil, _) => acc } - } mergeSortR(Nil, v1, v2, k).reverse } } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/PageRank.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/PageRank.scala index 66f16598c0..c2ae4827be 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/examples/PageRank.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/PageRank.scala @@ -6,18 +6,17 @@ import com.twitter.scalding._ /** * Options: * --input: the three column TSV with node, comma-sep-out-neighbors, initial pagerank (set to 1.0 first) - * --output: the name for the TSV you want to write to, same as above. - * optional arguments: - * --errorOut: name of where to write the L1 error between the input page-rank and the output - * if this is omitted, we don't compute the error - * --iterations: how many iterations to run inside this job. Default is 1, 10 is about as - * much as cascading can handle. + * --output: the name for the TSV you want to write to, same as above. optional arguments: + * --errorOut: name of where to write the L1 error between the input page-rank and the output if this is + * omitted, we don't compute the error + * --iterations: how many iterations to run inside this job. Default is 1, 10 is about as much as cascading + * can handle. * --jumpprob: probability of a random jump, default is 0.15 - * --convergence: if this is set, after every "--iterations" steps, we check the error and see - * if we should continue. Since the error check is expensive (involving a join), you should - * avoid doing this too frequently. 10 iterations is probably a good number to set. - * --temp: this is the name where we will store a temporary output so we can compare to the previous - * for convergence checking. If convergence is set, this MUST be. + * --convergence: if this is set, after every "--iterations" steps, we check the error and see if we should + * continue. Since the error check is expensive (involving a join), you should avoid doing this too + * frequently. 10 iterations is probably a good number to set. + * --temp: this is the name where we will store a temporary output so we can compare to the previous for + * convergence checking. If convergence is set, this MUST be. */ class PageRank(args: Args) extends Job(args) { @@ -36,22 +35,23 @@ class PageRank(args: Args) extends Job(args) { //initial rank (default to 1.0 if you are starting from nothing) initialize('src, 'dst, 'rank) /* - * This algorithm works by having two types of rows that have the same column structure. - * the node -> list(neighbors), and node -> individual neighbor. - * We distinguish these two types with an id which nodes if this is a NODESET or an EDGE. - * The first step is to append that value. We also need to have a column for the degree. 
- * It doesn't matter what the initial degree is, we recompute below - */ - .map(() -> ('rowtype, 'd_src)) { (u: Unit) => (NODESET, -1) } - .thenDo(doPageRank(STEPS)_) + * This algorithm works by having two types of rows that have the same column structure. + * the node -> list(neighbors), and node -> individual neighbor. + * We distinguish these two types with an id which nodes if this is a NODESET or an EDGE. + * The first step is to append that value. We also need to have a column for the degree. + * It doesn't matter what the initial degree is, we recompute below + */ + .map(() -> ('rowtype, 'd_src))((u: Unit) => (NODESET, -1)) + .thenDo(doPageRank(STEPS) _) .thenDo(computeError _) .thenDo(output _) /** * Here is where we check for convergence and then run the next job if we're not converged */ - override def next: Option[Job] = { - args.optional("convergence") + override def next: Option[Job] = + args + .optional("convergence") .flatMap { convErr => /* * It's easy for this to seem broken, so think about it twice: @@ -76,53 +76,51 @@ class PageRank(args: Args) extends Job(args) { None } } - } + /** - * override this function to change how you generate a pipe of - * (Long, String, Double) - * where the first entry is the nodeid, the second is the list of neighbors, - * as a comma (no spaces) separated string representation of the numeric nodeids, - * the third is the initial page rank (if not starting from a previous run, this - * should be 1.0 + * override this function to change how you generate a pipe of (Long, String, Double) where the first entry + * is the nodeid, the second is the list of neighbors, as a comma (no spaces) separated string + * representation of the numeric nodeids, the third is the initial page rank (if not starting from a + * previous run, this should be 1.0 * - * NOTE: if you want to run until convergence, the initialize method must read the same - * EXACT format as the output method writes. This is your job! + * NOTE: if you want to run until convergence, the initialize method must read the same EXACT format as the + * output method writes. This is your job! */ - def initialize(nodeCol: Symbol, neighCol: Symbol, pageRank: Symbol) = { + def initialize(nodeCol: Symbol, neighCol: Symbol, pageRank: Symbol) = Tsv(args("input")).read //Just to name the columns: - .mapTo((0, 1, 2) -> (nodeCol, neighCol, pageRank)) { - input: (Long, String, Double) => input + .mapTo((0, 1, 2) -> (nodeCol, neighCol, pageRank)) { input: (Long, String, Double) => + input } - } /** - * The basic idea is to groupBy the dst key with BOTH the nodeset and the edge rows. - * the nodeset rows have the old page-rank, the edge rows are reversed, so we can get - * the incoming page-rank from the nodes that point to each destination. + * The basic idea is to groupBy the dst key with BOTH the nodeset and the edge rows. the nodeset rows have + * the old page-rank, the edge rows are reversed, so we can get the incoming page-rank from the nodes that + * point to each destination. 
*/ @tailrec - final def doPageRank(steps: Int)(pagerank: RichPipe): RichPipe = { + final def doPageRank(steps: Int)(pagerank: RichPipe): RichPipe = if (steps <= 0) { pagerank } else { val nodeRows = pagerank //remove any EDGE rows from the previous loop - .filter('rowtype) { (rowtype: Int) => rowtype == NODESET } + .filter('rowtype)((rowtype: Int) => rowtype == NODESET) //compute the incremental rank due to the random jump: - val randomJump = nodeRows.map('rank -> 'rank) { (rank: Double) => ALPHA } + val randomJump = nodeRows.map('rank -> 'rank)((rank: Double) => ALPHA) //expand the neighbor list inte an edge list and out-degree of the src - val edges = nodeRows.flatMap(('dst, 'd_src) -> ('dst, 'd_src)) { args: (String, Long) => - if (args._1.length > 0) { - val dsts = args._1.split(",") - //Ignore the old degree: - val deg = dsts.size - dsts.map { str => (str.toLong, deg) } - } else { - //Here is a node that points to no other nodes (dangling) - Nil + val edges = nodeRows + .flatMap(('dst, 'd_src) -> ('dst, 'd_src)) { args: (String, Long) => + if (args._1.length > 0) { + val dsts = args._1.split(",") + //Ignore the old degree: + val deg = dsts.size + dsts.map(str => (str.toLong, deg)) + } else { + //Here is a node that points to no other nodes (dangling) + Nil + } } - } //Here we make a false row that we use to tell dst how much incoming //Page rank it needs to add to itself: .map(('src, 'd_src, 'dst, 'rank, 'rowtype) -> ('src, 'd_src, 'dst, 'rank, 'rowtype)) { @@ -132,11 +130,11 @@ class PageRank(args: Args) extends Job(args) { //We swap destination into the source position (dst, -1L, "", rank * (1.0 - ALPHA) / d_src, EDGE) } + /** - * Here we do the meat of the algorithm: - * if N = number of nodes, pr(N_i) prob of walking to node i, then: - * N pr(N_i) = (\sum_{j points to i} N pr(N_j) * (1-ALPHA)/d_j) + ALPHA - * N pr(N_i) is the page rank of node i. + * Here we do the meat of the algorithm: if N = number of nodes, pr(N_i) prob of walking to node i, + * then: N pr(N_i) = (\sum_{j points to i} N pr(N_j) * (1-ALPHA)/d_j) + ALPHA N pr(N_i) is the page rank + * of node i. 
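The update rule spelled out just above can be checked on a small in-memory graph; the sketch below uses invented names and a toy edge list, with `rank` holding the N * pr values and `alpha` the random-jump probability (0.15 is the job's default).

object PageRankStepSketch {
  // One synchronous update:
  //   N*pr(i) = alpha + sum over j -> i of N*pr(j) * (1 - alpha) / outDegree(j)
  def step(outEdges: Map[Long, Seq[Long]], rank: Map[Long, Double], alpha: Double): Map[Long, Double] = {
    val contributions = outEdges.toSeq.flatMap { case (src, dsts) =>
      dsts.map(dst => dst -> rank(src) * (1.0 - alpha) / dsts.size)
    }
    val incoming = contributions.groupBy(_._1).map { case (node, cs) => node -> cs.map(_._2).sum }
    rank.keys.map(node => node -> (alpha + incoming.getOrElse(node, 0.0))).toMap
  }

  def main(args: Array[String]): Unit = {
    // Same shape as the graph used in PageRankTest: 1 -> 2, 2 -> {1, 3}, 3 -> 2.
    val outEdges = Map(1L -> Seq(2L), 2L -> Seq(1L, 3L), 3L -> Seq(2L))
    val rank = Map(1L -> 1.0, 2L -> 1.0, 3L -> 1.0)
    println(step(outEdges, rank, alpha = 0.15))
  }
}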
*/ val nextPr = (edges ++ randomJump).groupBy('src) { /* @@ -152,13 +150,11 @@ class PageRank(args: Args) extends Job(args) { //Must call ourselves in the tail position: doPageRank(steps - 1)(nextPr) } - } //This outputs in the same format as the input, so you can run the job //iteratively, subclass to change the final behavior - def output(pipe: RichPipe) = { + def output(pipe: RichPipe) = pipe.project('src, 'dst, 'rank).write(Tsv(args("output"))) - } //Optionally compute the average error: def computeError(pr: RichPipe): RichPipe = { @@ -169,7 +165,7 @@ class PageRank(args: Args) extends Job(args) { .mapTo(('rank0, 'rank) -> 'err) { ranks: (Double, Double) => scala.math.abs(ranks._1 - ranks._2) } - .groupAll { _.average('err) } + .groupAll(_.average('err)) .write(TypedTsv[Double](errOut)) } pr diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala index c32f07d501..31e9612244 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala @@ -3,25 +3,20 @@ package com.twitter.scalding.examples import com.twitter.scalding._ /** - * weighted page rank for the given graph, start from the given pagerank, - * perform one iteartion, test for convergence, if not yet, clone itself - * and start the next page rank job with updated pagerank as input. + * weighted page rank for the given graph, start from the given pagerank, perform one iteartion, test for + * convergence, if not yet, clone itself and start the next page rank job with updated pagerank as input. * * This class is very similar to the PageRank class, main differences are: - * 1. supported weighted pagerank - * 2. the reset pagerank is pregenerated, possibly through a previous job - * 3. dead pagerank is evenly distributed + * 1. supported weighted pagerank 2. the reset pagerank is pregenerated, possibly through a previous job 3. + * dead pagerank is evenly distributed * * Options: - * --pwd: working directory, will read/generate the following files there - * numnodes: total number of nodes - * nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> - * pagerank: the page rank file eg pagerank_0, pagerank_1 etc - * totaldiff: the current max pagerank delta - * Optional arguments: + * --pwd: working directory, will read/generate the following files there numnodes: total number of nodes + * nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> pagerank: the page rank file eg pagerank_0, + * pagerank_1 etc totaldiff: the current max pagerank delta Optional arguments: * --weighted: do weighted pagerank, default false * --curiteration: what is the current iteration, default 0 - * --maxiterations: how many iterations to run. Default is 20 + * --maxiterations: how many iterations to run. 
Default is 20 * --jumpprob: probability of a random jump, default is 0.1 * --threshold: total difference before finishing early, default 0.001 */ @@ -57,7 +52,7 @@ class WeightedPageRank(args: Args) extends Job(args) { .mapTo(('mass_input, 'mass_n) -> 'mass_diff) { args: (Double, Double) => scala.math.abs(args._1 - args._2) } - .groupAll { _.sum[Double]('mass_diff) } + .groupAll(_.sum[Double]('mass_diff)) .write(TypedTsv[Double](PWD + "/totaldiff")) /** @@ -75,17 +70,16 @@ class WeightedPageRank(args: Args) extends Job(args) { } } - def getInputPagerank(fileName: String) = { + def getInputPagerank(fileName: String) = Tsv(fileName).read - .mapTo((0, 1) -> ('src_id_input, 'mass_input)) { - input: (Int, Double) => input + .mapTo((0, 1) -> ('src_id_input, 'mass_input)) { input: (Int, Double) => + input } - } /** * read the pregenerated nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> */ - def getNodes(fileName: String) = { + def getNodes(fileName: String) = mode match { case Hdfs(_, conf) => { SequenceFile(fileName).read @@ -97,59 +91,45 @@ class WeightedPageRank(args: Args) extends Job(args) { Tsv(fileName).read .mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) { input: (Int, String, String, Double) => - { - ( - input._1, - // convert string to int array - if (input._2 != null && input._2.length > 0) { - input._2.split(",").map { _.toInt } - } else { - Array[Int]() - }, - // convert string to float array - if (input._3 != null && input._3.length > 0) { - input._3.split(",").map { _.toFloat } - } else { - Array[Float]() - }, - input._4) - } + ( + input._1, + // convert string to int array + if (input._2 != null && input._2.length > 0) { + input._2.split(",").map(_.toInt) + } else { + Array[Int]() + }, + // convert string to float array + if (input._3 != null && input._3.length > 0) { + input._3.split(",").map(_.toFloat) + } else { + Array[Float]() + }, + input._4 + ) } } } - } /** * the total number of nodes, single line file */ - def getNumNodes(fileName: String) = { + def getNumNodes(fileName: String) = Tsv(fileName).read .mapTo(0 -> 'size) { input: Int => input } - } /** - * one iteration of pagerank - * inputPagerank: <'src_id_input, 'mass_input> - * return <'src_id, 'mass_n, 'mass_input> + * one iteration of pagerank inputPagerank: <'src_id_input, 'mass_input> return <'src_id, 'mass_n, + * 'mass_input> * - * Here is a highlevel view of the unweighted algorithm: - * let - * N: number of nodes - * inputPagerank(N_i): prob of walking to node i, - * d(N_j): N_j's out degree - * then - * pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) / d_j) - * deadPagerank = (1 - \sum_{i} pagerankNext(N_i)) / N - * randomPagerank(N_i) = userMass(N_i) * ALPHA + deadPagerank * (1-ALPHA) - * pagerankOutput(N_i) = randomPagerank(N_i) + pagerankNext(N_i) * (1-ALPHA) + * Here is a highlevel view of the unweighted algorithm: let N: number of nodes inputPagerank(N_i): prob of + * walking to node i, d(N_j): N_j's out degree then pagerankNext(N_i) = (\sum_{j points to i} + * inputPagerank(N_j) / d_j) deadPagerank = (1 - \sum_{i} pagerankNext(N_i)) / N randomPagerank(N_i) = + * userMass(N_i) * ALPHA + deadPagerank * (1-ALPHA) pagerankOutput(N_i) = randomPagerank(N_i) + + * pagerankNext(N_i) * (1-ALPHA) * - * For weighted algorithm: - * let - * w(N_j, N_i): weight from N_j to N_i - * tw(N_j): N_j's total out weights - * then + * For weighted algorithm: let w(N_j, N_i): weight from N_j to N_i tw(N_j): N_j's total out weights then * pagerankNext(N_i) = (\sum_{j points to i} 
inputPagerank(N_j) * w(N_j, N_i) / tw(N_j)) - * */ def doPageRank(nodeRows: RichPipe, inputPagerank: RichPipe): RichPipe = { // 'src_id, 'dst_ids, 'weights, 'mass_prior, 'mass_input @@ -161,23 +141,21 @@ class WeightedPageRank(args: Args) extends Job(args) { val pagerankNext = nodeJoined .flatMapTo(('dst_ids, 'weights, 'mass_input) -> ('src_id, 'mass_n)) { args: (Array[Int], Array[Float], Double) => - { - if (args._1.length > 0) { - if (WEIGHTED) { - // weighted distribution - val total: Double = args._2.sum - (args._1 zip args._2).map { idWeight: (Int, Float) => - (idWeight._1, args._3 * idWeight._2 / total) - } - } else { - // equal distribution - val dist: Double = args._3 / args._1.length - args._1.map { id: Int => (id, dist) } + if (args._1.length > 0) { + if (WEIGHTED) { + // weighted distribution + val total: Double = args._2.sum + args._1.zip(args._2).map { idWeight: (Int, Float) => + (idWeight._1, args._3 * idWeight._2 / total) } } else { - //Here is a node that points to no other nodes (dangling) - Nil + // equal distribution + val dist: Double = args._3 / args._1.length + args._1.map { id: Int => (id, dist) } } + } else { + //Here is a node that points to no other nodes (dangling) + Nil } } .groupBy('src_id) { @@ -185,7 +163,7 @@ class WeightedPageRank(args: Args) extends Job(args) { } // 'sum_mass - val sumPagerankNext = pagerankNext.groupAll { _.sum[Double]('mass_n -> 'sum_mass) } + val sumPagerankNext = pagerankNext.groupAll(_.sum[Double]('mass_n -> 'sum_mass)) // 'deadMass // single row jobs @@ -199,7 +177,8 @@ class WeightedPageRank(args: Args) extends Job(args) { // 'src_id_r, 'mass_n_r // random jump probability plus dead page rank - val randomPagerank = nodeJoined.crossWithTiny(deadPagerank) + val randomPagerank = nodeJoined + .crossWithTiny(deadPagerank) .mapTo(('src_id, 'mass_prior, 'deadMass, 'mass_input) -> ('src_id, 'mass_n, 'mass_input)) { ranks: (Int, Double, Double, Double) => (ranks._1, ranks._2 * ALPHA + ranks._3 * (1 - ALPHA), ranks._4) diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala index 45e82b536d..66789d1b98 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala @@ -1,39 +1,33 @@ package com.twitter.scalding.examples import com.twitter.scalding._ -import com.twitter.scalding.mathematics.{ Matrix, ColVector } +import com.twitter.scalding.mathematics.{ColVector, Matrix} import com.twitter.scalding.mathematics.Matrix._ /** - * A weighted PageRank implementation using the Scalding Matrix API. This - * assumes that all rows and columns are of type {@link Int} and values or egde - * weights are {@link Double}. If you want an unweighted PageRank, simply set - * the weights on the edges to 1. + * A weighted PageRank implementation using the Scalding Matrix API. This assumes that all rows and columns + * are of type {@link Int} and values or egde weights are {@link Double}. If you want an unweighted PageRank, + * simply set the weights on the edges to 1. 
* * Input arguments: * - * d -- damping factor - * n -- number of nodes in the graph - * currentIteration -- start with 0 probably - * maxIterations -- stop after n iterations - * convergenceThreshold -- using the sum of the absolute difference between - * iteration solutions, iterating stops once we reach - * this threshold - * rootDir -- the root directory holding all starting, intermediate and final - * data/output + * d -- damping factor n -- number of nodes in the graph currentIteration -- start with 0 probably + * maxIterations -- stop after n iterations convergenceThreshold -- using the sum of the absolute difference + * between iteration solutions, iterating stops once we reach this threshold rootDir -- the root directory + * holding all starting, intermediate and final data/output * * The expected structure of the rootDir is: * - * rootDir - * |- iterations - * | |- 0 <-- a TSV of (row, value) of size n, value can be 1/n (generate this) - * | |- n <-- holds future iterations/solutions - * |- edges <-- a TSV of (row, column, value) for edges in the graph - * |- onesVector <-- a TSV of (row, 1) of size n (generate this) - * |- diff <-- a single line representing the difference between the last iterations - * |- constants <-- built at iteration 0, these are constant for any given matrix/graph - * |- M_hat - * |- priorVector + * rootDir + * |- iterations + * | |- 0 <-- a TSV of (row, value) of size n, value can be 1/n (generate this) + * | |- n <-- holds future iterations/solutions + * |- edges <-- a TSV of (row, column, value) for edges in the graph + * |- onesVector <-- a TSV of (row, 1) of size n (generate this) + * |- diff <-- a single line representing the difference between the last iterations + * |- constants <-- built at iteration 0, these are constant for any given matrix/graph + * |- M_hat + * |- priorVector * * Don't forget to set the number of reducers for this job: * -D mapred.reduce.tasks=n @@ -68,8 +62,7 @@ class WeightedPageRankFromMatrix(args: Args) extends Job(args) { measureConvergenceAndStore() /** - * Recurse and iterate again iff we are under the max number of iterations and - * vector has not converged. + * Recurse and iterate again iff we are under the max number of iterations and vector has not converged. */ override def next = { val diff = TypedTsv[Double](diffLoc).toIterator.next @@ -83,22 +76,19 @@ class WeightedPageRankFromMatrix(args: Args) extends Job(args) { } /** - * Measure convergence by calculating the total of the absolute difference - * between the previous and next vectors. This stores the result after - * calculation. + * Measure convergence by calculating the total of the absolute difference between the previous and next + * vectors. This stores the result after calculation. */ - def measureConvergenceAndStore(): Unit = { - (previousVector - nextVector). - mapWithIndex { case (value, index) => math.abs(value) }. - sum. - write(TypedTsv[Double](diffLoc)) - } + def measureConvergenceAndStore(): Unit = + (previousVector - nextVector) + .mapWithIndex { case (value, index) => math.abs(value) } + .sum + .write(TypedTsv[Double](diffLoc)) /** * Load or generate on first iteration the matrix M^ given A. 
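A dense in-memory sketch of the iteration this job performs with the Matrix API, assuming (consistently with the Octave `iterate` snippet quoted later in the spec for this job) that one step computes v' = d * M * v + ((1 - d) / n) * ones, where M is the row-L1-normalized adjacency matrix, transposed. Names and the toy graph below are invented for illustration.

object MatrixIterationSketch {
  // One dense iteration v' = d * (m * v) + ((1 - d) / n) * ones,
  // where m is the row-L1-normalized, transposed adjacency matrix (row-major here).
  def iterate(m: Array[Array[Double]], v: Array[Double], d: Double): Array[Double] = {
    val prior = (1.0 - d) / v.length
    m.map(row => d * row.zip(v).map { case (mij, vj) => mij * vj }.sum + prior)
  }

  def main(args: Array[String]): Unit = {
    // Toy 3-node cycle 1 -> 2 -> 3 -> 1; m(i)(j) is the share node i receives from node j.
    val m = Array(
      Array(0.0, 0.0, 1.0),
      Array(1.0, 0.0, 0.0),
      Array(0.0, 1.0, 0.0)
    )
    println(iterate(m, Array(1.0 / 3, 1.0 / 3, 1.0 / 3), d = 0.4).mkString(", "))
  }
}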
*/ - def M_hat: Matrix[Int, Int, Double] = { - + def M_hat: Matrix[Int, Int, Double] = if (currentIteration == 0) { val A = matrixFromTsv(edgesLoc) val M = A.rowL1Normalize.transpose @@ -108,13 +98,11 @@ class WeightedPageRankFromMatrix(args: Args) extends Job(args) { } else { matrixFromTsv(rootDir + "/constants/M_hat") } - } /** * Load or generate on first iteration the prior vector given d and n. */ - def priorVector: ColVector[Int, Double] = { - + def priorVector: ColVector[Int, Double] = if (currentIteration == 0) { val onesVector = colVectorFromTsv(onesVectorLoc) val priorVector = ((1 - d) / n) * onesVector.toMatrix(0) @@ -123,7 +111,6 @@ class WeightedPageRankFromMatrix(args: Args) extends Job(args) { } else { colVectorFromTsv(rootDir + "/constants/priorVector") } - } def matrixFromTsv(input: String): Matrix[Int, Int, Double] = TypedTsv[(Int, Int, Double)](input).toMatrix diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala index 83a8dd0175..64298cd306 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala @@ -3,9 +3,10 @@ package com.twitter.scalding.examples import com.twitter.scalding._ class WordCountJob(args: Args) extends Job(args) { - TypedPipe.from(TextLine(args("input"))) - .flatMap { line => line.split("\\s+") } - .map { word => (word, 1L) } + TypedPipe + .from(TextLine(args("input"))) + .flatMap(line => line.split("\\s+")) + .map(word => (word, 1L)) .sumByKey // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink .write(TypedTsv[(String, Long)](args("output"))) diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/ExecutionKMeansTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/ExecutionKMeansTest.scala index 02acacc8f5..2a820c7f87 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/ExecutionKMeansTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/ExecutionKMeansTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.typed -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ @@ -35,13 +35,15 @@ class ExecutionKMeansTest extends WordSpec with Matchers { // To have the seeds stay sane for kmeans k == vectorCount val vectorCount = k - val vectors = TypedPipe.from((0 until vectorCount).map { i => randVect(i % k) }) + val vectors = TypedPipe.from((0 until vectorCount).map(i => randVect(i % k))) - val labels = KMeans(k, vectors).flatMap { - case (_, _, labeledPipe) => + val labels = KMeans(k, vectors) + .flatMap { case (_, _, labeledPipe) => labeledPipe.toIterableExecution - } - .waitFor(Config.default, Local(false)).get.toList + } + .waitFor(Config.default, Local(false)) + .get + .toList def clusterOf(v: Vector[Double]): Int = v.indexWhere(_ > 0.0) @@ -49,10 +51,9 @@ class ExecutionKMeansTest extends WordSpec with Matchers { // The rule is this: if two vectors share the same prefix, // the should be in the same cluster - byCluster.foreach { - case (clusterId, vs) => - val id = vs.head._1 - vs.foreach { case (thisId, _) => id shouldBe thisId } + byCluster.foreach { case (clusterId, vs) => + val id = vs.head._1 + vs.foreach { case (thisId, _) => id shouldBe thisId } } } } diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/PageRankTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/PageRankTest.scala index 794d960d05..f210fee451 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/PageRankTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/PageRankTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class PageRankTest extends WordSpec with Matchers { "A PageRank job" should { @@ -29,14 +29,14 @@ class PageRankTest extends WordSpec with Matchers { .arg("convergence", "0.05") .source(Tsv("inputFile"), List((1L, "2", 1.0), (2L, "1,3", 1.0), (3L, "2", 1.0))) //Don't check the tempBuffer: - .sink[(Long, String, Double)](Tsv("tempBuffer")) { ob => () } + .sink[(Long, String, Double)](Tsv("tempBuffer"))(ob => ()) .sink[Double](TypedTsv[Double]("error")) { ob => "have low error" in { ob.head should be <= 0.05 } } - .sink[(Long, String, Double)](Tsv("outputFile")){ outputBuffer => - val pageRank = outputBuffer.map { res => (res._1, res._3) }.toMap + .sink[(Long, String, Double)](Tsv("outputFile")) { outputBuffer => + val pageRank = outputBuffer.map(res => (res._1, res._3)).toMap "correctly compute pagerank" in { val d = 0.85 val twoPR = (1.0 + 2 * d) / (1.0 + d) diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankFromMatrixTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankFromMatrixTest.scala index df29092fd8..9dad6b2950 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankFromMatrixTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankFromMatrixTest.scala @@ -12,12 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.examples import scala.collection._ -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ @@ -32,14 +32,8 @@ class WeightedPageRankFromMatrixSpec extends WordSpec with Matchers { // 0.5 0.0 0.0 0.0 0.0 // 0.0 1.0 0.5 0.0 0.0 // 0.0 0.0 0.5 1.0 0.0 - val edges = List( - (0, 4, 1.0), - (1, 0, 0.5), - (2, 0, 0.5), - (3, 1, 1.0), - (3, 2, 0.5), - (4, 2, 0.5), - (4, 3, 1.0)) + val edges = + List((0, 4, 1.0), (1, 0, 0.5), (2, 0, 0.5), (3, 1, 1.0), (3, 2, 0.5), (4, 2, 0.5), (4, 3, 1.0)) val d = 0.4d // damping factor val n = 5 // number of nodes @@ -72,49 +66,38 @@ class WeightedPageRankFromMatrixSpec extends WordSpec with Matchers { .sink[(Int, Double)](Tsv("root/constants/priorVector")) { outputBuffer => outputBuffer should have size 5 val expectedValue = ((1 - d) / 2) * d - assertVectorsEqual( - new Array[Double](5).map { v => expectedValue }, - outputBuffer.map(_._2).toArray) + assertVectorsEqual(new Array[Double](5).map(v => expectedValue), outputBuffer.map(_._2).toArray) } .sink[(Int, Double)](Tsv("root/iterations/1")) { outputBuffer => outputBuffer should have size 5 - assertVectorsEqual( - expectedSolution, - outputBuffer.map(_._2).toArray, - 0.00001) + assertVectorsEqual(expectedSolution, outputBuffer.map(_._2).toArray, 0.00001) } .typedSink(TypedTsv[Double]("root/diff")) { outputBuffer => outputBuffer should have size 1 val expectedDiff = - expectedSolution.zip(iterationZeroVector.map(_._2)). - map { case (a, b) => math.abs(a - b) }. - sum + expectedSolution.zip(iterationZeroVector.map(_._2)).map { case (a, b) => math.abs(a - b) }.sum outputBuffer.head shouldBe expectedDiff +- 0.00001 } .run .finish() } - private def assertVectorsEqual(expected: Array[Double], actual: Array[Double], variance: Double): Unit = { - actual.zipWithIndex.foreach { - case (value, i) => - value shouldBe (expected(i)) +- variance + private def assertVectorsEqual(expected: Array[Double], actual: Array[Double], variance: Double): Unit = + actual.zipWithIndex.foreach { case (value, i) => + value shouldBe (expected(i)) +- variance } - } - private def assertVectorsEqual(expected: Array[Double], actual: Array[Double]): Unit = { - actual.zipWithIndex.foreach { - case (value, i) => - value shouldBe (expected(i)) + private def assertVectorsEqual(expected: Array[Double], actual: Array[Double]): Unit = + actual.zipWithIndex.foreach { case (value, i) => + value shouldBe (expected(i)) } - } } object WeightedPageRankFromMatrixSpec { def toSparseMap[Row, Col, V](iterable: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = - iterable.map { entry => ((entry._1, entry._2), entry._3) }.toMap + iterable.map(entry => ((entry._1, entry._2), entry._3)).toMap def filledColumnVector(value: Double, size: Int): List[(Int, Double)] = { val vector = mutable.ListBuffer[(Int, Double)]() @@ -127,42 +110,30 @@ object WeightedPageRankFromMatrixSpec { } /** - * Octave/Matlab implementations to provide the expected ranks. This comes from - * the Wikipedia page on PageRank: - * http://en.wikipedia.org/wiki/PageRank#Computation + * Octave/Matlab implementations to provide the expected ranks. 
This comes from the Wikipedia page on + * PageRank: http://en.wikipedia.org/wiki/PageRank#Computation * * function [v] = iterate(A, sv, d) * - * N = size(A, 2) - * M = (spdiags(1 ./ sum(A, 2), 0, N, N) * A)'; - * v = (d * M * sv) + (((1 - d) / N) .* ones(N, 1)); + * N = size(A, 2) M = (spdiags(1 ./ sum(A, 2), 0, N, N) * A)'; v = (d * M * sv) + (((1 - d) / N) .* ones(N, + * 1)); * * endfunction * * iterate([0 0 0 0 1; 0.5 0 0 0 0; 0.5 0 0 0 0; 0 1 0.5 0 0; 0 0 0.5 1 0], [0.2; 0.2; 0.2; 0.2; 0.2], 0.4) * - * % Parameter M adjacency matrix where M_i,j represents the link from 'j' to 'i', such that for all 'j' sum(i, M_i,j) = 1 - * % Parameter d damping factor - * % Parameter v_quadratic_error quadratic error for v - * % Return v, a vector of ranks such that v_i is the i-th rank from [0, 1] + * % Parameter M adjacency matrix where M_i,j represents the link from 'j' to 'i', such that for all 'j' + * sum(i, M_i,j) = 1 % Parameter d damping factor % Parameter v_quadratic_error quadratic error for v % Return + * v, a vector of ranks such that v_i is the i-th rank from [0, 1] * * function [v] = rank(M, d, v_quadratic_error) * - * N = size(M, 2); % N is equal to half the size of M - * v = rand(N, 1); - * v = v ./ norm(v, 2); - * last_v = ones(N, 1) * inf; - * M_hat = (d .* M) + (((1 - d) / N) .* ones(N, N)); + * N = size(M, 2); % N is equal to half the size of M v = rand(N, 1); v = v ./ norm(v, 2); last_v = ones(N, 1) + * * inf; M_hat = (d .* M) + (((1 - d) / N) .* ones(N, N)); * - * while(norm(v - last_v, 2) > v_quadratic_error) - * last_v = v; - * v = M_hat * v; - * v = v ./ norm(v, 2); - * end + * while(norm(v - last_v, 2) > v_quadratic_error) last_v = v; v = M_hat * v; v = v ./ norm(v, 2); end * * endfunction * - * M = [0 0 0 0 1 ; 0.5 0 0 0 0 ; 0.5 0 0 0 0 ; 0 1 0.5 0 0 ; 0 0 0.5 1 0]; - * rank(M, 0.4, 0.001) - * + * M = [0 0 0 0 1 ; 0.5 0 0 0 0 ; 0.5 0 0 0 0 ; 0 1 0.5 0 0 ; 0 0 0.5 1 0]; rank(M, 0.4, 0.001) */ diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala index d215b03d77..b55e30ab5f 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class WeightedPageRankSpec extends WordSpec with Matchers { "Weighted PageRank job" should { @@ -26,7 +26,7 @@ class WeightedPageRankSpec extends WordSpec with Matchers { .arg("maxiterations", "1") .arg("jumpprob", "0.1") .source(Tsv("./nodes"), List((1, "2,3", "1,2", 0.26), (2, "3", "1", 0.54), (3, "", "", 0.2))) - .source(Tsv("./numnodes"), List((3))) + .source(Tsv("./numnodes"), List(3)) .source(Tsv("./pagerank_0"), List((1, 0.086), (2, 0.192), (3, 0.722))) .typedSink(TypedTsv[Double]("./totaldiff")) { ob => (idx + ": have low error") in { @@ -34,13 +34,13 @@ class WeightedPageRankSpec extends WordSpec with Matchers { } idx += 1 } - .sink[(Int, Double)](Tsv("./pagerank_1")){ outputBuffer => - val pageRank = outputBuffer.map { res => (res._1, res._2) }.toMap + .sink[(Int, Double)](Tsv("./pagerank_1")) { outputBuffer => + val pageRank = outputBuffer.map(res => (res._1, res._2)).toMap (idx + ": correctly compute pagerank") in { val deadMass = 0.722 / 3 * 0.9 - val userMass = List(0.26, 0.54, 0.2).map { _ * 0.1 } - val massNext = List(0, 0.086 / 3, (0.086 * 2 / 3 + 0.192)).map { _ * 0.9 } - val expected = (userMass zip massNext) map { a: (Double, Double) => a._1 + a._2 + deadMass } + val userMass = List(0.26, 0.54, 0.2).map(_ * 0.1) + val massNext = List(0, 0.086 / 3, (0.086 * 2 / 3 + 0.192)).map(_ * 0.9) + val expected = (userMass.zip(massNext)).map { a: (Double, Double) => a._1 + a._2 + deadMass } println(pageRank) (pageRank(1) + pageRank(2) + pageRank(3)) shouldBe 1.0 +- 0.001 diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/WordCountTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/WordCountTest.scala index 3dd4226248..b25b232cd9 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/WordCountTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/WordCountTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class WordCountTest extends WordSpec with Matchers { "A WordCount job" should { @@ -23,7 +23,7 @@ class WordCountTest extends WordSpec with Matchers { .arg("input", "inputFile") .arg("output", "outputFile") .source(TextLine("inputFile"), List((0, "hack hack hack and hack"))) - .sink[(String, Int)](TypedTsv[(String, Long)]("outputFile")){ outputBuffer => + .sink[(String, Int)](TypedTsv[(String, Long)]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap "count words correctly" in { outMap("hack") shouldBe 4 diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/VersionedKeyValSourceTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/VersionedKeyValSourceTest.scala index 1961fe60ec..dd232c069f 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/commons/VersionedKeyValSourceTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/VersionedKeyValSourceTest.scala @@ -12,16 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ import com.twitter.scalding.commons.datastores.VersionedStore import com.twitter.bijection.Injection import com.google.common.io.Files import org.apache.hadoop.mapred.JobConf -import java.io.{ File, FileWriter } +import java.io.{File, FileWriter} // Use the scalacheck generators import scala.collection.mutable.Buffer @@ -29,10 +29,11 @@ class TypedWriteIncrementalJob(args: Args) extends Job(args) { import RichPipeEx._ val pipe = TypedPipe.from(TypedTsv[Int]("input")) - implicit val inj: Injection[(Int, Int), (Array[Byte], Array[Byte])] = Injection.connect[(Int, Int), (Array[Byte], Array[Byte])] + implicit val inj: Injection[(Int, Int), (Array[Byte], Array[Byte])] = + Injection.connect[(Int, Int), (Array[Byte], Array[Byte])] pipe - .map{ k => (k, k) } + .map(k => (k, k)) .writeIncremental(VersionedKeyValSource[Int, Int]("output")) } @@ -40,10 +41,11 @@ class MoreComplexTypedWriteIncrementalJob(args: Args) extends Job(args) { import RichPipeEx._ val pipe = TypedPipe.from(TypedTsv[Int]("input")) - implicit val inj: Injection[(Int, Int), (Array[Byte], Array[Byte])] = Injection.connect[(Int, Int), (Array[Byte], Array[Byte])] + implicit val inj: Injection[(Int, Int), (Array[Byte], Array[Byte])] = + Injection.connect[(Int, Int), (Array[Byte], Array[Byte])] pipe - .map{ k => (k, k) } + .map(k => (k, k)) .group .sum .writeIncremental(VersionedKeyValSource[Int, Int]("output")) @@ -58,9 +60,7 @@ class ToIteratorJob(args: Args) extends Job(args) { val duplicatedPipe = TypedPipe.from(source) ++ iteratorPipe - duplicatedPipe - .group - .sum + duplicatedPipe.group.sum .writeIncremental(VersionedKeyValSource[Int, Int]("output")) } @@ -70,12 +70,15 @@ class VersionedKeyValSourceTest extends WordSpec with Matchers { "A TypedWriteIncrementalJob" should { JobTest(new TypedWriteIncrementalJob(_)) .source(TypedTsv[Int]("input"), input) - .sink[(Int, Int)](VersionedKeyValSource[Array[Byte], Array[Byte]]("output")) { outputBuffer: Buffer[(Int, Int)] => - "Outputs must be as expected" in { - assert(outputBuffer.size === input.size) - val singleInj = implicitly[Injection[Int, Array[Byte]]] - assert(input.map{ k => (k, k) }.sortBy(_._1).toString === outputBuffer.sortBy(_._1).toList.toString) - } + .sink[(Int, Int)](VersionedKeyValSource[Array[Byte], Array[Byte]]("output")) { + outputBuffer: Buffer[(Int, Int)] => + "Outputs must be as expected" in { + assert(outputBuffer.size === input.size) + val singleInj = implicitly[Injection[Int, Array[Byte]]] + assert( + input.map(k => (k, k)).sortBy(_._1).toString === outputBuffer.sortBy(_._1).toList.toString + ) + } } .run .finish() @@ -84,12 +87,15 @@ class VersionedKeyValSourceTest extends WordSpec with Matchers { "A MoreComplexTypedWriteIncrementalJob" should { JobTest(new MoreComplexTypedWriteIncrementalJob(_)) .source(TypedTsv[Int]("input"), input) - .sink[(Int, Int)](VersionedKeyValSource[Array[Byte], Array[Byte]]("output")) { outputBuffer: Buffer[(Int, Int)] => - "Outputs must be as expected" in { - assert(outputBuffer.size === input.size) - val singleInj = implicitly[Injection[Int, Array[Byte]]] - assert(input.map{ k => (k, k) }.sortBy(_._1).toString === outputBuffer.sortBy(_._1).toList.toString) - } + .sink[(Int, Int)](VersionedKeyValSource[Array[Byte], Array[Byte]]("output")) { + outputBuffer: 
Buffer[(Int, Int)] => + "Outputs must be as expected" in { + assert(outputBuffer.size === input.size) + val singleInj = implicitly[Injection[Int, Array[Byte]]] + assert( + input.map(k => (k, k)).sortBy(_._1).toString === outputBuffer.sortBy(_._1).toList.toString + ) + } } .run .finish() @@ -101,7 +107,7 @@ class VersionedKeyValSourceTest extends WordSpec with Matchers { .source(VersionedKeyValSource[Int, Int]("input"), input.zip(input)) .sink(VersionedKeyValSource[Int, Int]("output")) { outputBuffer: Buffer[(Int, Int)] => val (keys, vals) = outputBuffer.unzip - assert(keys.map { _ * 2 } === vals) + assert(keys.map(_ * 2) === vals) } .run .finish() @@ -113,8 +119,10 @@ class VersionedKeyValSourceTest extends WordSpec with Matchers { val path = setupLocalVersionStore(100L to 102L) val thrown = the[InvalidSourceException] thrownBy { validateVersion(path, Some(103)) } - assert(thrown.getMessage === "Version 103 does not exist. " + - "Currently available versions are: [102, 101, 100]") + assert( + thrown.getMessage === "Version 103 does not exist. " + + "Currently available versions are: [102, 101, 100]" + ) // should not throw validateVersion(path, Some(101)) @@ -127,27 +135,32 @@ class VersionedKeyValSourceTest extends WordSpec with Matchers { val oldContent = "size of old content should be ignored" val content = "Hello World" val contentSize = content.getBytes.length - val path = setupLocalVersionStore(100L to 102L, { - case 102L => Some(content) - case _ => Some(oldContent) - }) + val path = setupLocalVersionStore( + 100L to 102L, + { + case 102L => Some(content) + case _ => Some(oldContent) + } + ) - val keyValueSize = VersionedKeyValSource(path) - .source + val keyValueSize = VersionedKeyValSource(path).source .getSize(new JobConf()) - contentSize should be (keyValueSize) + contentSize should be(keyValueSize) } } /** * Creates a temp dir and then creates the provided versions within it. */ - private def setupLocalVersionStore(versions: Seq[Long], contentFn: Long => Option[String] = _ => None): String = { + private def setupLocalVersionStore( + versions: Seq[Long], + contentFn: Long => Option[String] = _ => None + ): String = { val root = Files.createTempDir() root.deleteOnExit() val store = new VersionedStore(root.getAbsolutePath) - versions foreach { v => + versions.foreach { v => val p = store.createVersion(v) new File(p).mkdirs() @@ -165,8 +178,7 @@ class VersionedKeyValSourceTest extends WordSpec with Matchers { } /** - * Creates a VersionedKeyValSource using the provided version - * and then validates it. + * Creates a VersionedKeyValSource using the provided version and then validates it. */ private def validateVersion(path: String, version: Option[Long] = None) = VersionedKeyValSource(path = path, sourceVersion = version) diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/extensions/CheckpointSpec.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/extensions/CheckpointSpec.scala index 45d9b1888b..a960c26dd1 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/commons/extensions/CheckpointSpec.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/extensions/CheckpointSpec.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.commons.extensions @@ -21,7 +21,8 @@ import org.scalatest.WordSpec import scala.collection.mutable.Buffer /** - * @author Mike Jahr + * @author + * Mike Jahr */ class CheckpointJob(args: Args) extends Job(args) { @@ -37,7 +38,7 @@ class CheckpointJob(args: Args) extends Job(args) { in0 .joinWithSmaller('y0 -> 'y1, in1) .map(('s0, 's1) -> 'score) { v: (Int, Int) => v._1 * v._2 } - .groupBy('x0, 'x1) { _.sum[Double]('score) } + .groupBy('x0, 'x1)(_.sum[Double]('score)) } out.write(Tsv("output")) @@ -50,17 +51,18 @@ class TypedCheckpointJob(args: Args) extends Job(args) { def in0 = Checkpoint[(Int, Int, Int)]("c0") { TypedTsv[(Int, Int, Int)]("input0").map(x => x) } - def in1 = Checkpoint[(Int, Int, Int)]("c1"){ + def in1 = Checkpoint[(Int, Int, Int)]("c1") { TypedTsv[(Int, Int, Int)]("input1").map(x => x) } def out = Checkpoint[(Int, Int, Double)]("c2") { - in0.groupBy(_._2) + in0 + .groupBy(_._2) .join(in1.groupBy(_._2)) - .mapValues{ case (l, r) => ((l._1, r._1), (l._3 * r._3).toDouble) } + .mapValues { case (l, r) => ((l._1, r._1), (l._3 * r._3).toDouble) } .values .group .sum - .map{ tup => (tup._1._1, tup._1._2, tup._2) } // super ugly, don't do this in a real job + .map(tup => (tup._1._1, tup._1._2, tup._2)) // super ugly, don't do this in a real job } out.write(TypedTsv[(Int, Int, Double)]("output")) diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/scheme/ExecutionTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/scheme/ExecutionTest.scala index bf07a31709..157deb1ea9 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/commons/scheme/ExecutionTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/scheme/ExecutionTest.scala @@ -1,23 +1,17 @@ package com.twitter.scalding.commons.scheme import com.twitter.scalding.source.TypedSequenceFile -import com.twitter.scalding.{ - Config, - Execution, - Hdfs, - Local, - TypedPipe -} +import com.twitter.scalding.{Config, Execution, Hdfs, Local, TypedPipe} import org.apache.hadoop.conf.Configuration -import org.scalatest.{ Matchers, WordSpec } -import scala.util.{ Failure, Success } +import org.scalatest.{Matchers, WordSpec} +import scala.util.{Failure, Success} class ExecutionTest extends WordSpec with Matchers { object TestPath { def getCurrentDirectory = new java.io.File(".").getCanonicalPath def prefix = getCurrentDirectory.split("/").last match { case "scalding-commons" => getCurrentDirectory - case _ => getCurrentDirectory + "/scalding-commons" + case _ => getCurrentDirectory + "/scalding-commons" } val testfsPathRoot = prefix + "/src/test/resources/com/twitter/scalding/test_filesystem/" } @@ -53,14 +47,16 @@ class ExecutionTest extends WordSpec with Matchers { } "Execution" should { - class TypedSequenceFileSource[T](override val path: String) extends TypedSequenceFile[T](path) with CombinedSequenceFileScheme + class TypedSequenceFileSource[T](override val path: String) + extends TypedSequenceFile[T](path) + with CombinedSequenceFileScheme "toIterableExecution works correctly on partly empty input (empty part, part with value)" in { val exec = TypedPipe .from(new TypedSequenceFileSource[(Long, Long)](TestPath.testfsPathRoot + "test_data/2013/09")) .toIterableExecution - .map { _.toSet } + .map(_.toSet) val res = exec.shouldSucceedHadoop() @@ -72,7 +68,7 @@ class ExecutionTest extends WordSpec with Matchers { TypedPipe .from(new TypedSequenceFileSource[(Long, Long)](TestPath.testfsPathRoot + "test_data/2013/10")) 
.toIterableExecution - .map { _.toSet } + .map(_.toSet) val res = exec.shouldSucceedHadoop() diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/LzoGenericSourceSpec.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/LzoGenericSourceSpec.scala index 6b87a9edc5..955d9f3e2b 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/LzoGenericSourceSpec.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/LzoGenericSourceSpec.scala @@ -12,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source import com.twitter.bijection.JavaSerializationInjection -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.util.Success class LzoGenericSourceSpec extends WordSpec with Matchers { diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/typedtext/TypedTextTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/typedtext/TypedTextTest.scala index 9cc4ed753f..60c1c513a2 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/typedtext/TypedTextTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/typedtext/TypedTextTest.scala @@ -9,7 +9,8 @@ case class Test2(one: Test1, d: String) class TypedTextTest extends FunSuite { test("Test with a nested tuple: Daily") { - val source = LzoTypedText.dailyLzoTsv[Test2]("myPath")(DateRange(RichDate.now, RichDate.now + Hours(1)), implicitly) + val source = + LzoTypedText.dailyLzoTsv[Test2]("myPath")(DateRange(RichDate.now, RichDate.now + Hours(1)), implicitly) assert(source.sourceFields.size == 4) } } diff --git a/scalding-core/src/main/scala/com/twitter/package.scala b/scalding-core/src/main/scala/com/twitter/package.scala index 18ddfcacac..090a463f95 100644 --- a/scalding-core/src/main/scala/com/twitter/package.scala +++ b/scalding-core/src/main/scala/com/twitter/package.scala @@ -12,15 +12,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter -import org.apache.hadoop.fs.{ Path, PathFilter } +import org.apache.hadoop.fs.{Path, PathFilter} package object scalding { + /** - * The objects for the Typed-API live in the scalding.typed package - * but are aliased here. + * The objects for the Typed-API live in the scalding.typed package but are aliased here. 
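Outside of the test harness, the pattern the ExecutionTest above exercises (read a TypedSequenceFile, collect it with toIterableExecution, run on a backend) looks roughly like the sketch below. The path is a placeholder and running via waitFor on a Local mode is an assumption for illustration; the test's shouldSucceedHadoop helper wraps similar plumbing against a Hadoop test config:

import scala.util.Try
import com.twitter.scalding._
import com.twitter.scalding.source.TypedSequenceFile

val exec: Execution[Set[(Long, Long)]] =
  TypedPipe
    .from(TypedSequenceFile[(Long, Long)]("some/sequence/file/path"))
    .toIterableExecution
    .map(_.toSet)

// Blocks until the Execution completes and returns the collected values in a Try.
val result: Try[Set[(Long, Long)]] = exec.waitFor(Config.unitTestDefault, Local(strictSources = true))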
*/ val TDsl = com.twitter.scalding.typed.TDsl val TypedPipe = com.twitter.scalding.typed.TypedPipe @@ -42,35 +42,29 @@ package object scalding { class RichPathFilter(f: PathFilter) { - def and(filters: PathFilter*): PathFilter = { + def and(filters: PathFilter*): PathFilter = new AndPathFilter(Seq(f) ++ filters) - } - def or(filters: PathFilter*): PathFilter = { + def or(filters: PathFilter*): PathFilter = new OrPathFilter(Seq(f) ++ filters) - } - def not: PathFilter = { + def not: PathFilter = new NotPathFilter(f) - } } private[this] class AndPathFilter(filters: Seq[PathFilter]) extends PathFilter { - override def accept(p: Path): Boolean = { + override def accept(p: Path): Boolean = filters.forall(_.accept(p)) - } } private[this] class OrPathFilter(filters: Seq[PathFilter]) extends PathFilter { - override def accept(p: Path): Boolean = { + override def accept(p: Path): Boolean = filters.exists(_.accept(p)) - } } private[this] class NotPathFilter(filter: PathFilter) extends PathFilter { - override def accept(p: Path): Boolean = { + override def accept(p: Path): Boolean = !filter.accept(p) - } } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ArgHelp.scala b/scalding-core/src/main/scala/com/twitter/scalding/ArgHelp.scala index 94c21a0ae4..6470ff2550 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/ArgHelp.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/ArgHelp.scala @@ -14,24 +14,29 @@ class HelpException extends RuntimeException("User asked for help") class DescriptionValidationException(msg: String) extends RuntimeException(msg) trait ArgHelper { + /** * Similar to describe but validate all args are described * - * @param describedArgs List of Argument Descriptions - * @param ex Input Execution - * @return Output Execution + * @param describedArgs + * List of Argument Descriptions + * @param ex + * Input Execution + * @return + * Output Execution */ - def validatedDescribe[T](describedArgs: Seq[DescribedArg], ex: Execution[T]): Execution[T] = { + def validatedDescribe[T](describedArgs: Seq[DescribedArg], ex: Execution[T]): Execution[T] = Execution.getArgs.flatMap { args => validatedDescribe(describedArgs, args) ex } - } /** * Describe a set of Args given Descriptions and validate all Args are described - * @param describedArgs List of Argument Descriptions - * @param args Job Arguments + * @param describedArgs + * List of Argument Descriptions + * @param args + * Job Arguments */ def validatedDescribe(describedArgs: Seq[DescribedArg], args: Args): Unit = { describe(describedArgs, args) @@ -46,25 +51,29 @@ trait ArgHelper { } /** - * Describe the Arguments of this Execution. By running --help the args will output - * and the execution will end + * Describe the Arguments of this Execution. 
By running --help the args will output and the execution will + * end * - * @param describedArgs List of Argument Descriptions - * @param ex Input Execution - * @return Output Execution + * @param describedArgs + * List of Argument Descriptions + * @param ex + * Input Execution + * @return + * Output Execution */ - def describe[T](describedArgs: Seq[DescribedArg], ex: Execution[T]): Execution[T] = { + def describe[T](describedArgs: Seq[DescribedArg], ex: Execution[T]): Execution[T] = Execution.getArgs.flatMap { args => describe(describedArgs, args) ex } - } /** * Describe a set of Args given Descriptions * - * @param describedArgs List of Argument Descriptions - * @param args Job Arguments + * @param describedArgs + * List of Argument Descriptions + * @param args + * Job Arguments */ def describe(describedArgs: Seq[DescribedArg], args: Args): Unit = if (args.boolean("help")) helpRequest(describedArgs) @@ -83,40 +92,40 @@ trait ArgHelper { /** * Command line arg string given the Described Args * - * @param describedArgs List of Argument Descriptions - * @return Command Line Parameters + * @param describedArgs + * List of Argument Descriptions + * @return + * Command Line Parameters */ - private[this] def argString(describedArgs: Seq[DescribedArg]): String = { - describedArgs.foldLeft("") { - case (str, describedArg) => - val msg = describedArg match { - case RequiredArg(key, _) => s"--$key VALUE " - case OptionalArg(key, _) => s"[--$key VALUE] " - case ListArg(key, _) => s"[--$key VALUE VALUE2] " - case BooleanArg(key, _) => s"[--$key] " - } - str + msg + private[this] def argString(describedArgs: Seq[DescribedArg]): String = + describedArgs.foldLeft("") { case (str, describedArg) => + val msg = describedArg match { + case RequiredArg(key, _) => s"--$key VALUE " + case OptionalArg(key, _) => s"[--$key VALUE] " + case ListArg(key, _) => s"[--$key VALUE VALUE2] " + case BooleanArg(key, _) => s"[--$key] " + } + str + msg } + "[--help]" - } /** * More detailed help command for these described arguments * - * @param describedArgs List of Argument Descriptions - * @return Detailed Help for the Args + * @param describedArgs + * List of Argument Descriptions + * @return + * Detailed Help for the Args */ - private[this] def help(describedArgs: Seq[DescribedArg]): String = { - describedArgs.foldLeft("") { - case (str, describedArg) => - val msg = describedArg match { - case RequiredArg(key, description) => s"--$key(Required) :: $description \n" - case OptionalArg(key, description) => s"--$key(Optional) :: $description \n" - case ListArg(key, description) => s"--$key(List) :: $description \n" - case BooleanArg(key, description) => s"--$key(Boolean) :: $description \n" - } - str + msg + private[this] def help(describedArgs: Seq[DescribedArg]): String = + describedArgs.foldLeft("") { case (str, describedArg) => + val msg = describedArg match { + case RequiredArg(key, description) => s"--$key(Required) :: $description \n" + case OptionalArg(key, description) => s"--$key(Optional) :: $description \n" + case ListArg(key, description) => s"--$key(List) :: $description \n" + case BooleanArg(key, description) => s"--$key(Boolean) :: $description \n" + } + str + msg } + "--help :: Show this help message." 
- } } object ArgHelp extends ArgHelper diff --git a/scalding-core/src/main/scala/com/twitter/scalding/BijectedOrderedSerialization.scala b/scalding-core/src/main/scala/com/twitter/scalding/BijectedOrderedSerialization.scala index b5ad813408..befebc1553 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/BijectedOrderedSerialization.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/BijectedOrderedSerialization.scala @@ -16,13 +16,18 @@ limitations under the License. package com.twitter.scalding import com.twitter.scalding.serialization.OrderedSerialization -import com.twitter.bijection.{ ImplicitBijection, Injection } +import com.twitter.bijection.{ImplicitBijection, Injection} object BijectedOrderedSerialization { - implicit def fromBijection[T, U](implicit bij: ImplicitBijection[T, U], ordSer: OrderedSerialization[U]): OrderedSerialization[T] = + implicit def fromBijection[T, U](implicit + bij: ImplicitBijection[T, U], + ordSer: OrderedSerialization[U] + ): OrderedSerialization[T] = OrderedSerialization.viaTransform[T, U](bij.apply(_), bij.invert(_)) - implicit def fromInjection[T, U](implicit bij: Injection[T, U], ordSer: OrderedSerialization[U]): OrderedSerialization[T] = + implicit def fromInjection[T, U](implicit + bij: Injection[T, U], + ordSer: OrderedSerialization[U] + ): OrderedSerialization[T] = OrderedSerialization.viaTryTransform[T, U](bij.apply(_), bij.invert(_)) } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CFuture.scala b/scalding-core/src/main/scala/com/twitter/scalding/CFuture.scala index 7d3d3421ff..1c1461ec2c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CFuture.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CFuture.scala @@ -1,6 +1,6 @@ package com.twitter.scalding -import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext } +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} /** * Represents a cancellable future. 
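To make the ArgHelp contract above concrete, here is a hypothetical job-side sketch; the argument names and the wrapped execution are invented for illustration:

import com.twitter.scalding._

val described: Seq[DescribedArg] = Seq(
  RequiredArg("input", "Path of the data to read"),
  OptionalArg("output", "Path to write results to"),
  BooleanArg("dry-run", "Validate the inputs without writing")
)

def body: Execution[Unit] = Execution.unit // stand-in for the real job logic

// On --help this prints the usage built by argString/help and stops with HelpException;
// with validatedDescribe, passing an argument that was not described fails with
// DescriptionValidationException instead of being silently ignored.
val guarded: Execution[Unit] = ArgHelp.validatedDescribe(described, body)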
@@ -25,29 +25,27 @@ case class CFuture[+T](future: Future[T], cancellationHandler: CancellationHandl } object CFuture { - def successful[T](result: T): CFuture[T] = { + def successful[T](result: T): CFuture[T] = CFuture(Future.successful(result), CancellationHandler.empty) - } def failed(t: Throwable): CFuture[Nothing] = { val f = Future.failed(t) CFuture(f, CancellationHandler.empty) } - def uncancellable[T](fut: Future[T]): CFuture[T] = { + def uncancellable[T](fut: Future[T]): CFuture[T] = CFuture(fut, CancellationHandler.empty) - } - def fromFuture[T](fut: Future[CFuture[T]])(implicit cec: ConcurrentExecutionContext): CFuture[T] = { + def fromFuture[T](fut: Future[CFuture[T]])(implicit cec: ConcurrentExecutionContext): CFuture[T] = CFuture(fut.flatMap(_.future), CancellationHandler.fromFuture(fut.map(_.cancellationHandler))) - } /** * Use our internal faster failing zip function rather than the standard one due to waiting */ - def failFastSequence[T](t: Iterable[CFuture[T]])(implicit cec: ConcurrentExecutionContext): CFuture[List[T]] = { + def failFastSequence[T]( + t: Iterable[CFuture[T]] + )(implicit cec: ConcurrentExecutionContext): CFuture[List[T]] = t.foldLeft(CFuture.successful(Nil: List[T])) { (f, i) => f.zip(i).map { case (tail, h) => h :: tail } }.map(_.reverse) - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CPromise.scala b/scalding-core/src/main/scala/com/twitter/scalding/CPromise.scala index 451831a3ff..d06f6b04dc 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CPromise.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CPromise.scala @@ -1,17 +1,17 @@ package com.twitter.scalding -import scala.concurrent.{ Future, Promise, ExecutionContext => ConcurrentExecutionContext } +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} /** * Represents a cancellable promise. */ case class CPromise[T](promise: Promise[T], cancellationHandler: Promise[CancellationHandler]) { + /** * Creates a CFuture using the given promises. 
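A small sketch of how the CFuture helpers above compose (purely illustrative; an implicit ExecutionContext is assumed to be in scope):

import scala.concurrent.ExecutionContext.Implicits.global
import com.twitter.scalding.CFuture

val a = CFuture.successful(1)
val b = CFuture.successful(2)

// zip pairs the two underlying Futures; because both sides carry
// CancellationHandler.empty, cancelling the combined CFuture is a no-op.
val sum: CFuture[Int] = a.zip(b).map { case (x, y) => x + y }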
*/ - def cfuture: CFuture[T] = { + def cfuture: CFuture[T] = CFuture(promise.future, CancellationHandler.fromFuture(cancellationHandler.future)) - } def completeWith(other: CFuture[T]): this.type = { // fulfill the main and cancellation handler promises diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CancellationHandler.scala b/scalding-core/src/main/scala/com/twitter/scalding/CancellationHandler.scala index f3c007a6ff..f5967f6217 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CancellationHandler.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CancellationHandler.scala @@ -1,13 +1,12 @@ package com.twitter.scalding -import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext } +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} sealed trait CancellationHandler { outer => def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] def compose(other: CancellationHandler): CancellationHandler = new CancellationHandler { - override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = { + override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = other.stop().zip(outer.stop()).map(_ => ()) - } } } @@ -21,8 +20,7 @@ object CancellationHandler { } def fromFuture(f: Future[CancellationHandler]): CancellationHandler = new CancellationHandler { - override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = { + override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = f.flatMap(_.stop()) - } } -} \ No newline at end of file +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala b/scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala index 58e80e8200..9a1a8f4d7e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala @@ -8,7 +8,7 @@ abstract class CascadeJob(args: Args) extends Job(args) { def jobs: Seq[Job] override def run = { - val flows = jobs.map { _.buildFlow } + val flows = jobs.map(_.buildFlow) val cascade = new CascadeConnector().connect(flows: _*) preProcessCascade(cascade) cascade.complete() @@ -19,9 +19,8 @@ abstract class CascadeJob(args: Args) extends Job(args) { statsData.isSuccessful } - override def validate(): Unit = { - jobs.foreach { _.validate() } - } + override def validate(): Unit = + jobs.foreach(_.validate()) /* * Good for printing a dot file, setting the flow skip strategy, etc diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CascadingMode.scala b/scalding-core/src/main/scala/com/twitter/scalding/CascadingMode.scala index 6227145129..06a6be71f4 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CascadingMode.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CascadingMode.scala @@ -1,23 +1,23 @@ package com.twitter.scalding -import cascading.flow.local.{ LocalFlowConnector, LocalFlowProcess } -import cascading.flow.{ FlowConnector, FlowProcess } +import cascading.flow.local.{LocalFlowConnector, LocalFlowProcess} +import cascading.flow.{FlowConnector, FlowProcess} import cascading.property.AppProps -import cascading.tap.{ CompositeTap, Tap } +import cascading.tap.{CompositeTap, Tap} import cascading.tap.hadoop.Hfs -import cascading.tuple.{ Tuple, TupleEntryIterator } +import cascading.tuple.{Tuple, TupleEntryIterator} import com.twitter.scalding.tap.ScaldingHfs import com.twitter.scalding.typed.cascading_backend.AsyncFlowDefRunner import 
java.io.File -import java.util.{ Properties, UUID } +import java.util.{Properties, UUID} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory import scala.annotation.tailrec import scala.collection.JavaConverters._ -import scala.collection.mutable.{ Buffer, Map => MMap, Set => MSet } -import scala.util.{ Failure, Success } +import scala.collection.mutable.{Buffer, Map => MMap, Set => MSet} +import scala.util.{Failure, Success} /** * Any Mode running on cascading extends CascadingMode @@ -38,13 +38,14 @@ trait CascadingMode extends Mode { // Returns true if the file exists on the current filesystem. def fileExists(filename: String): Boolean + /** Create a new FlowConnector for this cascading planner */ def newFlowConnector(props: Config): FlowConnector /** * Make sure we are using our `ScaldingHfs` for `Hfs` taps. */ - protected def checkTap(tap: Tap[_, _, _], config: Config): Unit = { + protected def checkTap(tap: Tap[_, _, _], config: Config): Unit = if (config.getCheckHfsTaps) { tap match { case hfs: Hfs => @@ -52,24 +53,22 @@ trait CascadingMode extends Mode { hfs.getClass.isAssignableFrom(classOf[ScaldingHfs]), """You are using instance of tap inherited from cascading.tap.hadoop.Hfs in toIterator method, |which is broken in cascading 2.6.1, instead you need to use com.twitter.scalding.tap.ScaldingHfs. - """.stripMargin) + """.stripMargin + ) case composite: CompositeTap[t] => - composite - .getChildTaps - .asScala + composite.getChildTaps.asScala .map(_.asInstanceOf[Tap[_, _, _]]) .foreach(checkTap(_, config)) case _ => } } - } } object CascadingMode { def cast(m: Mode): CascadingMode = m match { case cm: CascadingMode => cm - case other => throw new ModeException(s"mode: $other is not a CascadingMode") + case other => throw new ModeException(s"mode: $other is not a CascadingMode") } } @@ -84,10 +83,13 @@ trait HadoopMode extends CascadingMode { case Some(Success(cls)) => asMap + (jarKey -> cls) case Some(Failure(err)) => // This may or may not cause the job to fail at submission, let's punt till then - LoggerFactory.getLogger(getClass) + LoggerFactory + .getLogger(getClass) .error( - "Could not create class from: %s in config key: %s, Job may fail.".format(conf.get(jarKey), AppProps.APP_JAR_CLASS), - err) + "Could not create class from: %s in config key: %s, Job may fail." 
+ .format(conf.get(jarKey), AppProps.APP_JAR_CLASS), + err + ) // Just delete the key and see if it fails when cascading tries to submit asMap - jarKey case None => asMap @@ -101,7 +103,10 @@ trait HadoopMode extends CascadingMode { ctor.newInstance(finalMap.asJava).asInstanceOf[FlowConnector] } catch { case ncd: ClassNotFoundException => { - throw new ModeLoadException("Failed to load Cascading flow connector class " + flowConnectorClass, ncd) + throw new ModeLoadException( + "Failed to load Cascading flow connector class " + flowConnectorClass, + ncd + ) } } } @@ -112,19 +117,20 @@ trait HadoopMode extends CascadingMode { val htap = tap.asInstanceOf[Tap[JobConf, _, _]] val conf = new JobConf(true) // initialize the default config // copy over Config - config.toMap.foreach{ case (k, v) => conf.set(k, v) } + config.toMap.foreach { case (k, v) => conf.set(k, v) } val flowProcessClass = jobConf.get(Mode.CascadingFlowProcessClassKey, Mode.DefaultHadoopFlowProcess) - val fp = try { - val clazz = Class.forName(flowProcessClass) - val ctor = clazz.getConstructor(classOf[JobConf]) - ctor.newInstance(conf).asInstanceOf[FlowProcess[JobConf]] - } catch { - case ncd: ClassNotFoundException => { - throw new ModeLoadException("Failed to load Cascading flow process class " + flowProcessClass, ncd) + val fp = + try { + val clazz = Class.forName(flowProcessClass) + val ctor = clazz.getConstructor(classOf[JobConf]) + ctor.newInstance(conf).asInstanceOf[FlowProcess[JobConf]] + } catch { + case ncd: ClassNotFoundException => { + throw new ModeLoadException("Failed to load Cascading flow process class " + flowProcessClass, ncd) + } } - } htap.retrieveSourceFields(fp) htap.sourceConfInit(fp, conf) @@ -165,15 +171,16 @@ case class Hdfs(strict: Boolean, @transient conf: Configuration) extends HadoopM } object Hdfs { + /** * Make an Hdfs instance in strict mode with new Configuration */ def default: Hdfs = Hdfs(true, new Configuration) } -case class HadoopTest(@transient conf: Configuration, - @transient buffers: Source => Option[Buffer[Tuple]]) - extends HadoopMode with TestMode { +case class HadoopTest(@transient conf: Configuration, @transient buffers: Source => Option[Buffer[Tuple]]) + extends HadoopMode + with TestMode { // This is a map from source.toString to disk path private val writePaths = MMap[Source, String]() @@ -208,7 +215,9 @@ case class HadoopTest(@transient conf: Configuration, * functions, and those functions have been documented accordingly to * warn about this invariant. */ - @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) // Get the buffer for the given source, and empty it: + @SuppressWarnings( + Array("org.wartremover.warts.OptionPartial") + ) // Get the buffer for the given source, and empty it: val buf = buffers(src).get buf.clear() // Now fill up this buffer with the content of the file diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CascadingTokenUpdater.scala b/scalding-core/src/main/scala/com/twitter/scalding/CascadingTokenUpdater.scala index 227c5864d0..3a70858a8a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CascadingTokenUpdater.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CascadingTokenUpdater.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tuple.hadoop.SerializationToken @@ -33,7 +33,7 @@ object CascadingTokenUpdater { .filter(_.nonEmpty) .map(_.split("=")) .filter(_.length == 2) - .map { ary => (ary(0).toInt, ary(1)) } + .map(ary => (ary(0).toInt, ary(1))) .toMap // does the inverse of the previous function, given a Map of index to class @@ -50,32 +50,37 @@ object CascadingTokenUpdater { // assign each of the class names given to al the subsequent // positions private def assignTokens(first: Int, names: Iterable[String]): Map[Int, String] = - names.foldLeft((first, Map[Int, String]())) { (idMap, clz) => - val (id, m) = idMap - (id + 1, m + (id -> clz)) - }._2 + names + .foldLeft((first, Map[Int, String]())) { (idMap, clz) => + val (id, m) = idMap + (id + 1, m + (id -> clz)) + } + ._2 def update(config: Config, clazzes: Set[Class[_]]): Config = { val toks = config.getCascadingSerializationTokens val serializations = config.get(Config.IoSerializationsKey).getOrElse("") - val fromSerializations: Seq[String] = if (serializations.isEmpty) - Seq.empty - else - for { - serialization <- serializations.split(",") - clazz = Class.forName(serialization) - tokenAnnotation = clazz.getAnnotation(classOf[SerializationToken]) - if tokenAnnotation != null - className <- tokenAnnotation.classNames() - } yield { - className - } + val fromSerializations: Seq[String] = + if (serializations.isEmpty) + Seq.empty + else + for { + serialization <- serializations.split(",") + clazz = Class.forName(serialization) + tokenAnnotation = clazz.getAnnotation(classOf[SerializationToken]) + if tokenAnnotation != null + className <- tokenAnnotation.classNames() + } yield { + className + } // We don't want to assign tokens to classes already in the map - val newClasses: Iterable[String] = clazzes.map { _.getName } -- fromSerializations -- toks.values + val newClasses: Iterable[String] = clazzes.map(_.getName) -- fromSerializations -- toks.values - config + (Config.CascadingSerializationTokens -> toksToString(toks ++ assignTokens(firstAvailableToken(toks), newClasses))) + config + (Config.CascadingSerializationTokens -> toksToString( + toks ++ assignTokens(firstAvailableToken(toks), newClasses) + )) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CoGroupBuilder.scala b/scalding-core/src/main/scala/com/twitter/scalding/CoGroupBuilder.scala index aa8ee5d786..1d454916cc 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CoGroupBuilder.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CoGroupBuilder.scala @@ -12,17 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.pipe.{ CoGroup, Every, Pipe } +import cascading.pipe.{CoGroup, Every, Pipe} import cascading.pipe.joiner.MixedJoin import cascading.tuple.Fields /** - * Builder classes used internally to implement coGroups (joins). - * Can also be used for more generalized joins, e.g., star joins. - * + * Builder classes used internally to implement coGroups (joins). Can also be used for more generalized joins, + * e.g., star joins. 
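A worked illustration of the CascadingTokenUpdater bookkeeping above, with hypothetical class names: the token mapping round-trips through a comma-separated "id=className" string, and classes not yet covered are assigned the next free ids.

// Parsed form of an existing Config.CascadingSerializationTokens value "128=com.example.Foo"
val existing: Map[Int, String] = Map(128 -> "com.example.Foo")

// After update(config, Set(classOf[Bar], classOf[Baz])) the map would grow to something like
val grown: Map[Int, String] = existing ++ Map(129 -> "com.example.Bar", 130 -> "com.example.Baz")

// and toksToString(grown) serializes it back as
// "128=com.example.Foo,129=com.example.Bar,130=com.example.Baz"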
*/ class CoGroupBuilder(groupFields: Fields, joinMode: JoinMode) extends GroupBuilder(groupFields) { protected var coGroups: List[(Fields, Pipe, JoinMode)] = Nil @@ -40,9 +39,9 @@ class CoGroupBuilder(groupFields: Fields, joinMode: JoinMode) extends GroupBuild override def schedule(name: String, pipe: Pipe): Pipe = { assert(!sorting.isDefined, "cannot use a sortBy when doing a coGroup") assert(!coGroups.isEmpty, "coGroupBy requires at least one other pipe to .coGroup") - val fields = (groupFields :: coGroups.map{ _._1 }).toArray - val pipes = (pipe :: coGroups.map{ _._2 }).map{ RichPipe.assignName(_) }.toArray - val joinModes = (joinMode :: coGroups.map{ _._3 }).map{ _.booleanValue }.toArray + val fields = (groupFields :: coGroups.map(_._1)).toArray + val pipes = (pipe :: coGroups.map(_._2)).map(RichPipe.assignName(_)).toArray + val joinModes = (joinMode :: coGroups.map(_._3)).map(_.booleanValue).toArray val mixedJoiner = new MixedJoin(joinModes) val cg: Pipe = new CoGroup(pipes, fields, null, WrappedJoiner(mixedJoiner)) overrideReducers(cg) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Config.scala b/scalding-core/src/main/scala/com/twitter/scalding/Config.scala index f89bc55489..540a02c9d2 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Config.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Config.scala @@ -12,20 +12,20 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapreduce.MRJobConfig -import org.apache.hadoop.io.serializer.{ Serialization => HSerialization } -import com.twitter.chill.{ ExternalizerCodec, ExternalizerInjection, Externalizer, KryoInstantiator } -import com.twitter.chill.config.{ ScalaMapConfig, ConfiguredInstantiator } -import com.twitter.bijection.{ Base64String, Injection } +import org.apache.hadoop.io.serializer.{Serialization => HSerialization} +import com.twitter.chill.{Externalizer, ExternalizerCodec, ExternalizerInjection, KryoInstantiator} +import com.twitter.chill.config.{ConfiguredInstantiator, ScalaMapConfig} +import com.twitter.bijection.{Base64String, Injection} import com.twitter.scalding.filecache.{CachedFile, DistributedCacheFile, HadoopCachedFile} import cascading.pipe.assembly.AggregateBy -import cascading.flow.{ FlowListener, FlowStepListener, FlowProps, FlowStepStrategy } +import cascading.flow.{FlowListener, FlowProps, FlowStepListener, FlowStepStrategy} import cascading.property.AppProps import cascading.tuple.collect.SpillableProps @@ -33,7 +33,7 @@ import java.security.MessageDigest import java.net.URI import scala.collection.JavaConverters._ -import scala.util.{ Failure, Success, Try } +import scala.util.{Failure, Success, Try} import com.twitter.scalding.serialization.RequireOrderedSerializationMode /** @@ -51,7 +51,7 @@ abstract class Config extends Serializable { def update[R](k: String)(fn: Option[String] => (Option[String], R)): (R, Config) = fn(get(k)) match { case (Some(v), r) => (r, this + (k -> v)) - case (None, r) => (r, this - k) + case (None, r) => (r, this - k) } def getBoolean(key: String, orElse: => Boolean): Boolean = @@ -59,28 +59,28 @@ abstract class Config extends Serializable { /** * Add files to be localized to the config. 
Intended to be used by user code. - * @param cachedFiles CachedFiles to be added - * @return new Config with cached files + * @param cachedFiles + * CachedFiles to be added + * @return + * new Config with cached files */ def addDistributedCacheFiles(cachedFiles: CachedFile*): Config = cachedFiles.foldLeft(this) { case (config, file) => - file match { - case hadoopFile: HadoopCachedFile => - Config.addDistributedCacheFile(hadoopFile.sourceUri, config) - case _ => config - } + file match { + case hadoopFile: HadoopCachedFile => + Config.addDistributedCacheFile(hadoopFile.sourceUri, config) + case _ => config + } } /** * Get cached files from config */ - def getDistributedCachedFiles: Seq[CachedFile] = { + def getDistributedCachedFiles: Seq[CachedFile] = Config.getDistributedCacheFile(this) - } /** - * This is a name that if present is passed to flow.setName, - * which should appear in the job tracker. + * This is a name that if present is passed to flow.setName, which should appear in the job tracker. */ def getCascadingAppName: Option[String] = get(CascadingAppName) def setCascadingAppName(name: String): Config = @@ -90,19 +90,15 @@ abstract class Config extends Serializable { this + (CascadingAppId -> id) /** - * Non-fat-jar use cases require this, BUT using it - * with fat jars can cause problems. It is not - * set by default, but if you have problems you - * might need to set the Job class here - * Consider also setting this same class here: - * setScaldingFlowClass + * Non-fat-jar use cases require this, BUT using it with fat jars can cause problems. It is not set by + * default, but if you have problems you might need to set the Job class here Consider also setting this + * same class here: setScaldingFlowClass */ def setCascadingAppJar(clazz: Class[_]): Config = this + (AppProps.APP_JAR_CLASS -> clazz.getName) /** - * Returns None if not set, otherwise reflection - * is used to create the Class.forName + * Returns None if not set, otherwise reflection is used to create the Class.forName */ def getCascadingAppJar: Option[Try[Class[_]]] = getClassForKey(AppProps.APP_JAR_CLASS) @@ -112,11 +108,11 @@ abstract class Config extends Serializable { try { Success( // Make sure we are using the class-loader for the current thread - Class.forName(str, true, Thread.currentThread().getContextClassLoader)) + Class.forName(str, true, Thread.currentThread().getContextClassLoader) + ) } catch { case err: Throwable => Failure(err) } } - /* * Used in joins to determine how much of the "right hand side" of * the join to keep in memory @@ -154,20 +150,20 @@ abstract class Config extends Serializable { getRequireOrderedSerializationMode == Some(RequireOrderedSerializationMode.Fail) /** - * Set this configuration option to require all grouping/cogrouping - * to use OrderedSerialization + * Set this configuration option to require all grouping/cogrouping to use OrderedSerialization */ def setRequireOrderedSerializationMode(r: Option[RequireOrderedSerializationMode]): Config = - r.map { - v => this + (ScaldingRequireOrderedSerialization -> (v.toString)) + r.map { v => + this + (ScaldingRequireOrderedSerialization -> (v.toString)) }.getOrElse(this) def getRequireOrderedSerializationMode: Option[RequireOrderedSerializationMode] = get(ScaldingRequireOrderedSerialization) - .map(_.toLowerCase()).collect { + .map(_.toLowerCase()) + .collect { case "true" => RequireOrderedSerializationMode.Fail // backwards compatibility case "fail" => RequireOrderedSerializationMode.Fail - case "log" => 
RequireOrderedSerializationMode.Log + case "log" => RequireOrderedSerializationMode.Log } def getCascadingSerializationTokens: Map[Int, String] = @@ -176,25 +172,25 @@ abstract class Config extends Serializable { .getOrElse(Map.empty[Int, String]) /** - * This function gets the set of classes that have been registered to Kryo. - * They may or may not be used in this job, but Cascading might want to be made aware - * that these classes exist + * This function gets the set of classes that have been registered to Kryo. They may or may not be used in + * this job, but Cascading might want to be made aware that these classes exist */ - def getKryoRegisteredClasses: Set[Class[_]] = { + def getKryoRegisteredClasses: Set[Class[_]] = // Get an instance of the Kryo serializer (which is populated with registrations) - getKryo.map { kryo => - val cr = kryo.newKryo.getClassResolver - - @annotation.tailrec - def kryoClasses(idx: Int, acc: Set[Class[_]]): Set[Class[_]] = - Option(cr.getRegistration(idx)) match { - case Some(reg) => kryoClasses(idx + 1, acc + reg.getType) - case None => acc // The first null is the end of the line - } + getKryo + .map { kryo => + val cr = kryo.newKryo.getClassResolver + + @annotation.tailrec + def kryoClasses(idx: Int, acc: Set[Class[_]]): Set[Class[_]] = + Option(cr.getRegistration(idx)) match { + case Some(reg) => kryoClasses(idx + 1, acc + reg.getType) + case None => acc // The first null is the end of the line + } - kryoClasses(0, Set[Class[_]]()) - }.getOrElse(Set()) - } + kryoClasses(0, Set[Class[_]]()) + } + .getOrElse(Set()) /* * Hadoop and Cascading serialization needs to be first, and the Kryo serialization @@ -207,27 +203,31 @@ abstract class Config extends Serializable { * with a class to serialize to bootstrap the process: * Left((classOf[serialization.KryoHadoop], myInstance)) */ - def setSerialization(kryo: Either[(Class[_ <: KryoInstantiator], KryoInstantiator), Class[_ <: KryoInstantiator]], - userHadoop: Seq[Class[_ <: HSerialization[_]]] = Nil): Config = { + def setSerialization( + kryo: Either[(Class[_ <: KryoInstantiator], KryoInstantiator), Class[_ <: KryoInstantiator]], + userHadoop: Seq[Class[_ <: HSerialization[_]]] = Nil + ): Config = { // Hadoop and Cascading should come first val first: Seq[Class[_ <: HSerialization[_]]] = - Seq(classOf[org.apache.hadoop.io.serializer.WritableSerialization], + Seq( + classOf[org.apache.hadoop.io.serializer.WritableSerialization], classOf[cascading.tuple.hadoop.TupleSerialization], - classOf[serialization.WrappedSerialization[_]]) + classOf[serialization.WrappedSerialization[_]] + ) // this must come last val last: Seq[Class[_ <: HSerialization[_]]] = Seq(classOf[com.twitter.chill.hadoop.KryoSerialization]) val required = (first ++ last).toSet[AnyRef] // Class is invariant, but we use it as a function // Make sure we keep the order correct and don't add the required fields twice val hadoopSer = first ++ (userHadoop.filterNot(required)) ++ last - val hadoopKV = (Config.IoSerializationsKey -> hadoopSer.map(_.getName).mkString(",")) + val hadoopKV = Config.IoSerializationsKey -> hadoopSer.map(_.getName).mkString(",") // Now handle the Kryo portion which uses another mechanism val chillConf = ScalaMapConfig(toMap) kryo match { case Left((bootstrap, inst)) => ConfiguredInstantiator.setSerialized(chillConf, bootstrap, inst) - case Right(refl) => ConfiguredInstantiator.setReflect(chillConf, refl) + case Right(refl) => ConfiguredInstantiator.setReflect(chillConf, refl) } val withKryo = Config(chillConf.toMap + hadoopKV) 
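To illustrate the ordering contract of setSerialization described above (Hadoop and Cascading serializations first, the Kryo serialization last), a hedged sketch; serialization.KryoHadoop is the instantiator mentioned in the comment, while MyWritableSerialization is a made-up user Hadoop serialization:

import com.twitter.scalding.{serialization, Config}

// Hypothetical user-supplied Hadoop serialization
class MyWritableSerialization extends org.apache.hadoop.io.serializer.WritableSerialization

val conf: Config =
  Config.default.setSerialization(
    Right(classOf[serialization.KryoHadoop]), // Kryo instantiator, resolved via reflection
    Seq(classOf[MyWritableSerialization]) // slotted between the required first and last entries
  )

// io.serializations then lists WritableSerialization, TupleSerialization and
// WrappedSerialization first, MyWritableSerialization in the middle, and
// chill's KryoSerialization last.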
@@ -242,16 +242,21 @@ abstract class Config extends Serializable { * If a ConfiguredInstantiator has been set up, this returns it */ def getKryo: Option[KryoInstantiator] = - if (toMap.contains(ConfiguredInstantiator.KEY)) Some((new ConfiguredInstantiator(ScalaMapConfig(toMap))).getDelegate) + if (toMap.contains(ConfiguredInstantiator.KEY)) + Some((new ConfiguredInstantiator(ScalaMapConfig(toMap))).getDelegate) else None def getArgs: Args = get(Config.ScaldingJobArgsSerialized) match { case None => new Args(Map.empty) - case Some(str) => argsSerializer - .invert(str) - .map(new Args(_)) - .getOrElse(throw new RuntimeException( - s"""Could not deserialize Args from Config. Maybe "$ScaldingJobArgsSerialized" was modified without using Config.setArgs?""")) + case Some(str) => + argsSerializer + .invert(str) + .map(new Args(_)) + .getOrElse( + throw new RuntimeException( + s"""Could not deserialize Args from Config. Maybe "$ScaldingJobArgsSerialized" was modified without using Config.setArgs?""" + ) + ) } def setArgs(args: Args): Config = @@ -277,20 +282,22 @@ abstract class Config extends Serializable { def getScaldingVersion: Option[String] = get(Config.ScaldingVersion) def setScaldingVersion: Config = - (this.+(Config.ScaldingVersion -> scaldingVersion)).+( - // This is setting a property for cascading/driven - (AppProps.APP_FRAMEWORKS -> ("scalding:" + scaldingVersion))) + (this + .+(Config.ScaldingVersion -> scaldingVersion)) + .+( + // This is setting a property for cascading/driven + (AppProps.APP_FRAMEWORKS -> ("scalding:" + scaldingVersion)) + ) def getUniqueIds: Set[UniqueID] = get(UniqueID.UNIQUE_JOB_ID) - .map { str => str.split(",").toSet[String].map(UniqueID(_)) } + .map(str => str.split(",").toSet[String].map(UniqueID(_))) .getOrElse(Set.empty) /** - * The serialization of your data will be smaller if any classes passed between tasks in your job - * are listed here. Without this, strings are used to write the types IN EACH RECORD, which - * compression probably takes care of, but compression acts AFTER the data is serialized into - * buffers and spilling has been triggered. + * The serialization of your data will be smaller if any classes passed between tasks in your job are listed + * here. Without this, strings are used to write the types IN EACH RECORD, which compression probably takes + * care of, but compression acts AFTER the data is serialized into buffers and spilling has been triggered. 
*/ def addCascadingClassSerializationTokens(clazzes: Set[Class[_]]): Config = CascadingTokenUpdater.update(this, clazzes) @@ -301,7 +308,7 @@ abstract class Config extends Serializable { */ def addUniqueId(u: UniqueID): Config = update(UniqueID.UNIQUE_JOB_ID) { - case None => (Some(u.get), ()) + case None => (Some(u.get), ()) case Some(str) => (Some((StringUtility.fastSplit(str, ",").toSet + u.get).mkString(",")), ()) }._2 @@ -330,7 +337,9 @@ abstract class Config extends Serializable { * Add this class name and the md5 hash of it into the config */ def setScaldingFlowClass(clazz: Class[_]): Config = - this.+(ScaldingFlowClassName -> clazz.getName).+(ScaldingFlowClassSignature -> Config.md5Identifier(clazz)) + this + .+(ScaldingFlowClassName -> clazz.getName) + .+(ScaldingFlowClassSignature -> Config.md5Identifier(clazz)) def setScaldingFlowCounterValue(value: Long): Config = this + (ScaldingFlowCounterValue -> value.toString) @@ -339,7 +348,7 @@ abstract class Config extends Serializable { get(ScaldingFlowCounterValue).map(_.toLong) def getSubmittedTimestamp: Option[RichDate] = - get(ScaldingFlowSubmittedTimestamp).map { ts => RichDate(ts.toLong) } + get(ScaldingFlowSubmittedTimestamp).map(ts => RichDate(ts.toLong)) /* * Sets the timestamp only if it was not already set. This is here * to prevent overwriting the submission time if it was set by an @@ -348,23 +357,23 @@ abstract class Config extends Serializable { def maybeSetSubmittedTimestamp(date: RichDate = RichDate.now): (Option[RichDate], Config) = update(ScaldingFlowSubmittedTimestamp) { case s @ Some(ts) => (s, Some(RichDate(ts.toLong))) - case None => (Some(date.timestamp.toString), None) + case None => (Some(date.timestamp.toString), None) } /** - * Prepend an estimator so it will be tried first. If it returns None, - * the previously-set estimators will be tried in order. + * Prepend an estimator so it will be tried first. If it returns None, the previously-set estimators will be + * tried in order. */ def addReducerEstimator[T](cls: Class[T]): Config = addReducerEstimator(cls.getName) /** - * Prepend an estimator so it will be tried first. If it returns None, - * the previously-set estimators will be tried in order. + * Prepend an estimator so it will be tried first. If it returns None, the previously-set estimators will be + * tried in order. 
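Following the comment above about shrinking record serialization by listing the classes a job passes between tasks, a minimal sketch (Click is a hypothetical job-specific type):

// A type that flows between tasks in some job
case class Click(userId: Long, url: String)

// Registering it gives Cascading a numeric token to write per record instead of the
// full class name, which matters before compression and spilling come into play.
val conf: Config = Config.default.addCascadingClassSerializationTokens(Set(classOf[Click]))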
*/ def addReducerEstimator(clsName: String): Config = update(Config.ReducerEstimators) { - case None => (Some(clsName), ()) + case None => (Some(clsName), ()) case Some(lst) => (Some(s"$clsName,$lst"), ()) }._2 @@ -378,35 +387,33 @@ abstract class Config extends Serializable { def addFlowListener(flowListenerProvider: (Mode, Config) => FlowListener): Config = { val serializedListener = flowListenerSerializer(flowListenerProvider) update(Config.FlowListeners) { - case None => (Some(serializedListener), ()) + case None => (Some(serializedListener), ()) case Some(lst) => (Some(s"$serializedListener,$lst"), ()) }._2 } def getFlowListeners: List[Try[(Mode, Config) => FlowListener]] = - get(Config.FlowListeners) - .toList + get(Config.FlowListeners).toList .flatMap(s => StringUtility.fastSplit(s, ",")) .map(flowListenerSerializer.invert(_)) def addFlowStepListener(flowListenerProvider: (Mode, Config) => FlowStepListener): Config = { val serializedListener = flowStepListenerSerializer(flowListenerProvider) update(Config.FlowStepListeners) { - case None => (Some(serializedListener), ()) + case None => (Some(serializedListener), ()) case Some(lst) => (Some(s"$serializedListener,$lst"), ()) }._2 } def getFlowStepListeners: List[Try[(Mode, Config) => FlowStepListener]] = - get(Config.FlowStepListeners) - .toList + get(Config.FlowStepListeners).toList .flatMap(s => StringUtility.fastSplit(s, ",")) .map(flowStepListenerSerializer.invert(_)) def addFlowStepStrategy(flowStrategyProvider: (Mode, Config) => FlowStepStrategy[JobConf]): Config = { val serializedListener = flowStepStrategiesSerializer(flowStrategyProvider) update(Config.FlowStepStrategies) { - case None => (Some(serializedListener), ()) + case None => (Some(serializedListener), ()) case Some(lst) => (Some(s"$serializedListener,$lst"), ()) }._2 } @@ -415,8 +422,7 @@ abstract class Config extends Serializable { this.-(Config.FlowStepStrategies) def getFlowStepStrategies: List[Try[(Mode, Config) => FlowStepStrategy[JobConf]]] = - get(Config.FlowStepStrategies) - .toList + get(Config.FlowStepStrategies).toList .flatMap(s => StringUtility.fastSplit(s, ",")) .map(flowStepStrategiesSerializer.invert(_)) @@ -435,9 +441,8 @@ abstract class Config extends Serializable { getBoolean(HashJoinAutoForceRight, false) /** - * Set to true to enable very verbose logging during FileSource's validation and planning. - * This can help record what files were present / missing at runtime. Should only be enabled - * for debugging. + * Set to true to enable very verbose logging during FileSource's validation and planning. This can help + * record what files were present / missing at runtime. Should only be enabled for debugging. */ def setVerboseFileSourceLogging(b: Boolean): Config = this + (VerboseFileSourceLoggingKey -> b.toString) @@ -446,36 +451,30 @@ abstract class Config extends Serializable { getBoolean(SkipNullCounters, false) /** - * If this is true, on hadoop, when we get a null Counter - * for a given name, we just ignore the counter instead - * of NPE + * If this is true, on hadoop, when we get a null Counter for a given name, we just ignore the counter + * instead of NPE */ def setSkipNullCounters(boolean: Boolean): Config = this + (SkipNullCounters -> boolean.toString) /** - * When this value is true, all temporary output is removed - * when the outer-most execution completes, not on JVM shutdown. + * When this value is true, all temporary output is removed when the outer-most execution completes, not on + * JVM shutdown. 
* - * When you do .forceToDiskExecution or .toIterableExecution - * we need to materialize the data somewhere. We can't be sure - * that when the outer most execution is complete that all reads - * have been done, since they could escape the value of - * the Execution. If you know no such reference escapes, it - * is safe to set to true. + * When you do .forceToDiskExecution or .toIterableExecution we need to materialize the data somewhere. We + * can't be sure that when the outer most execution is complete that all reads have been done, since they + * could escape the value of the Execution. If you know no such reference escapes, it is safe to set to + * true. * - * Note, this is *always* safe for Execution[Unit], a common - * value. + * Note, this is *always* safe for Execution[Unit], a common value. */ def setExecutionCleanupOnFinish(boolean: Boolean): Config = this + (ScaldingExecutionCleanupOnFinish -> boolean.toString) /** - * should we cleanup temporary files when - * the outer most Execution is run. + * should we cleanup temporary files when the outer most Execution is run. * - * Not safe if the outer-most execution returns - * a TypedPipe or Iterable derived from a forceToDiskExecution + * Not safe if the outer-most execution returns a TypedPipe or Iterable derived from a forceToDiskExecution * or a toIterableExecution */ def getExecutionCleanupOnFinish: Boolean = @@ -524,6 +523,7 @@ object Config { val IoSerializationsKey: String = "io.serializations" val ScaldingFlowClassName: String = "scalding.flow.class.name" val ScaldingFlowClassSignature: String = "scalding.flow.class.signature" + /** * This is incremented every time a cascading flow is run as an Execution */ @@ -548,8 +548,8 @@ object Config { val RuntimeFrameworkValueLocal = "local" /** - * Parameter that actually controls the number of reduce tasks. - * Be sure to set this in the JobConf for the *step* not the flow. + * Parameter that actually controls the number of reduce tasks. Be sure to set this in the JobConf for the + * *step* not the flow. */ val HadoopNumReducers = "mapred.reduce.tasks" @@ -582,10 +582,9 @@ object Config { val StepDescriptions = "scalding.step.descriptions" /** - * Parameter that can be used to determine behavior on the rhs of a hashJoin. - * If true, we try to guess when to auto force to disk before a hashJoin - * else (the default) we don't try to infer this and the behavior can be dictated by the user manually - * calling forceToDisk on the rhs or not as they wish. + * Parameter that can be used to determine behavior on the rhs of a hashJoin. If true, we try to guess when + * to auto force to disk before a hashJoin else (the default) we don't try to infer this and the behavior + * can be dictated by the user manually calling forceToDisk on the rhs or not as they wish. 
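A minimal sketch of opting in, using the key defined just below and the plain Option-returning get to read it back:

    val conf: Config = Config.default + (Config.HashJoinAutoForceRight -> "true")
    val autoForce: Boolean = conf.get(Config.HashJoinAutoForceRight).exists(_.toBoolean)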
*/ val HashJoinAutoForceRight: String = "scalding.hashjoin.autoforceright" @@ -608,7 +607,12 @@ object Config { * Extensions to the Default Config to tune it for unit tests */ def unitTestDefault: Config = - Config(Config.default.toMap ++ Map("cascading.update.skip" -> "true", RuntimeFrameworkKey -> RuntimeFrameworkValueLocal)) + Config( + Config.default.toMap ++ Map( + "cascading.update.skip" -> "true", + RuntimeFrameworkKey -> RuntimeFrameworkValueLocal + ) + ) /** * Merge Config.default with Hadoop config from the mode (if in Hadoop mode) @@ -616,7 +620,7 @@ object Config { def defaultFrom(mode: Mode): Config = default ++ (mode match { case m: HadoopMode => Config.fromHadoop(m.jobConf) - IoSerializationsKey - case _ => empty + case _ => empty }) def apply(m: Map[String, String]): Config = new Config { def toMap = m } @@ -636,20 +640,19 @@ object Config { (nonStrings .get(AppProps.APP_JAR_CLASS) match { - case Some(clazz) => - // Again, the _ causes problem with Try - try { - val cls = classOf[Class[_]].cast(clazz) - Success((nonStrings - AppProps.APP_JAR_CLASS, initConf.setCascadingAppJar(cls))) - } catch { - case err: Throwable => Failure(err) - } - case None => Success((nonStrings, initConf)) - }) - .flatMap { - case (unhandled, withJar) => - if (unhandled.isEmpty) Success(withJar) - else Failure(new Exception("unhandled configurations: " + unhandled.toString)) + case Some(clazz) => + // Again, the _ causes problem with Try + try { + val cls = classOf[Class[_]].cast(clazz) + Success((nonStrings - AppProps.APP_JAR_CLASS, initConf.setCascadingAppJar(cls))) + } catch { + case err: Throwable => Failure(err) + } + case None => Success((nonStrings, initConf)) + }) + .flatMap { case (unhandled, withJar) => + if (unhandled.isEmpty) Success(withJar) + else Failure(new Exception("unhandled configurations: " + unhandled.toString)) } } @@ -657,12 +660,11 @@ object Config { * Returns all the non-string keys on the left, the string keys/values on the right */ def stringsFrom[K >: String, V >: String](m: Map[K, V]): (Map[K, V], Map[String, String]) = - m.foldLeft((Map.empty[K, V], Map.empty[String, String])) { - case ((kvs, conf), kv) => - kv match { - case (ks: String, vs: String) => (kvs, conf + (ks -> vs)) - case _ => (kvs + kv, conf) - } + m.foldLeft((Map.empty[K, V], Map.empty[String, String])) { case ((kvs, conf), kv) => + kv match { + case (ks: String, vs: String) => (kvs, conf + (ks -> vs)) + case _ => (kvs + kv, conf) + } } /** @@ -670,10 +672,11 @@ object Config { */ def disjointUnion[K >: String, V >: String](m: Map[K, V], conf: Config): Either[Set[String], Map[K, V]] = { val asMap = conf.toMap.toMap[K, V] // linter:disable:TypeToType // we are upcasting K, V - val duplicateKeys = (m.keySet & asMap.keySet) + val duplicateKeys = m.keySet & asMap.keySet if (duplicateKeys.isEmpty) Right(m ++ asMap) else Left(conf.toMap.keySet.filter(duplicateKeys(_))) // make sure to return Set[String], and not cast } + /** * This overwrites any keys in m that exist in config. */ @@ -688,7 +691,7 @@ object Config { */ def fromHadoop(conf: Configuration): Config = // use `conf.get` to force JobConf to evaluate expressions - Config(conf.asScala.map { e => e.getKey -> conf.get(e.getKey) }.toMap) + Config(conf.asScala.map(e => e.getKey -> conf.get(e.getKey)).toMap) /* * For everything BUT SERIALIZATION, this prefers values in conf, @@ -729,12 +732,16 @@ object Config { /** * Add a file to be localized to the config. Intended to be used by user code. 
* - * @param qualifiedURI The qualified uri of the cache to be localized - * @param config Config to add the cache to + * @param qualifiedURI + * The qualified uri of the cache to be localized + * @param config + * Config to add the cache to * - * @return new Config with cached files + * @return + * new Config with cached files * - * @see basic logic from [[org.apache.hadoop.mapreduce.filecache.DistributedCache.addCacheFile]] + * @see + * basic logic from [[org.apache.hadoop.mapreduce.filecache.DistributedCache.addCacheFile]] */ private def addDistributedCacheFile(qualifiedURI: URI, config: Config): Config = { val newFile = DistributedCacheFile @@ -752,9 +759,10 @@ object Config { /** * Get distributed cache files from config * - * @param config Config with cached files + * @param config + * Config with cached files */ - private def getDistributedCacheFile(config: Config): Seq[CachedFile] = { + private def getDistributedCacheFile(config: Config): Seq[CachedFile] = config .get(MRJobConfig.CACHE_FILES) .toSeq @@ -765,13 +773,14 @@ object Config { val qualifiedUri = new URI(symlinkedUri.getScheme, symlinkedUri.getSchemeSpecificPart, null) HadoopCachedFile(qualifiedUri) } - } private[this] def buildInj[T: ExternalizerInjection: ExternalizerCodec]: Injection[T, String] = Injection.connect[T, Externalizer[T], Array[Byte], Base64String, String] - @transient private[scalding] lazy val flowStepListenerSerializer = buildInj[(Mode, Config) => FlowStepListener] + @transient private[scalding] lazy val flowStepListenerSerializer = + buildInj[(Mode, Config) => FlowStepListener] @transient private[scalding] lazy val flowListenerSerializer = buildInj[(Mode, Config) => FlowListener] - @transient private[scalding] lazy val flowStepStrategiesSerializer = buildInj[(Mode, Config) => FlowStepStrategy[JobConf]] + @transient private[scalding] lazy val flowStepStrategiesSerializer = + buildInj[(Mode, Config) => FlowStepStrategy[JobConf]] @transient private[scalding] lazy val argsSerializer = buildInj[Map[String, List[String]]] } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CumulativeSum.scala b/scalding-core/src/main/scala/com/twitter/scalding/CumulativeSum.scala index b26882c732..d12074b74a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CumulativeSum.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CumulativeSum.scala @@ -3,66 +3,52 @@ package com.twitter.scalding.typed import com.twitter.algebird._ /** - * Extension for TypedPipe to add a cumulativeSum method. - * Given a TypedPipe with T = (GroupField, (SortField, SummableField)) - * cumulaitiveSum will return a SortedGrouped with the SummableField accumulated - * according to the sort field. - * eg: - * ('San Francisco', (100, 100)), - * ('San Francisco', (101, 50)), - * ('San Francisco', (200, 200)), - * ('Vancouver', (100, 50)), - * ('Vancouver', (101, 300)), - * ('Vancouver', (200, 100)) - * becomes - * ('San Francisco', (100, 100)), - * ('San Francisco', (101, 150)), - * ('San Francisco', (200, 300)), - * ('Vancouver', (100, 50)), - * ('Vancouver', (101, 350)), - * ('Vancouver', (200, 450)) + * Extension for TypedPipe to add a cumulativeSum method. Given a TypedPipe with T = (GroupField, (SortField, + * SummableField)) cumulaitiveSum will return a SortedGrouped with the SummableField accumulated according to + * the sort field. 
eg: ('San Francisco', (100, 100)), ('San Francisco', (101, 50)), ('San Francisco', (200, + * 200)), ('Vancouver', (100, 50)), ('Vancouver', (101, 300)), ('Vancouver', (200, 100)) becomes ('San + * Francisco', (100, 100)), ('San Francisco', (101, 150)), ('San Francisco', (200, 300)), ('Vancouver', (100, + * 50)), ('Vancouver', (101, 350)), ('Vancouver', (200, 450)) * - * If you provide cumulativeSum a partition function you get the same result - * but you allow for more than one reducer per group. This is useful for - * when you have a single group that has a very large number of entries. - * For example in the previous example if you gave a partition function of the - * form { _ / 100 } then you would never have any one reducer deal with more - * than 2 entries. + * If you provide cumulativeSum a partition function you get the same result but you allow for more than one + * reducer per group. This is useful for when you have a single group that has a very large number of entries. + * For example in the previous example if you gave a partition function of the form { _ / 100 } then you would + * never have any one reducer deal with more than 2 entries. */ object CumulativeSum { implicit def toCumulativeSum[K, U, V](pipe: TypedPipe[(K, (U, V))]): CumulativeSumExtension[K, U, V] = new CumulativeSumExtension(pipe) - class CumulativeSumExtension[K, U, V]( - val pipe: TypedPipe[(K, (U, V))]) { - /** Takes a sortable field and a monoid and returns the cumulative sum of that monoid **/ - def cumulativeSum( - implicit sg: Semigroup[V], - ordU: Ordering[U], - ordK: Ordering[K]): SortedGrouped[K, (U, V)] = { + class CumulativeSumExtension[K, U, V](val pipe: TypedPipe[(K, (U, V))]) { + + /** Takes a sortable field and a monoid and returns the cumulative sum of that monoid * */ + def cumulativeSum(implicit + sg: Semigroup[V], + ordU: Ordering[U], + ordK: Ordering[K] + ): SortedGrouped[K, (U, V)] = pipe.group .sortBy { case (u, _) => u } - .scanLeft(Nil: List[(U, V)]) { - case (acc, (u, v)) => - acc match { - case List((previousU, previousSum)) => List((u, sg.plus(previousSum, v))) - case _ => List((u, v)) - } + .scanLeft(Nil: List[(U, V)]) { case (acc, (u, v)) => + acc match { + case List((previousU, previousSum)) => List((u, sg.plus(previousSum, v))) + case _ => List((u, v)) + } } .flattenValues - } + /** - * An optimization of cumulativeSum for cases when a particular key has many - * entries. Requires a sortable partitioning of U. - * Accomplishes the optimization by not requiring all the entries for a - * single key to go through a single scan. Instead requires the sums of the - * partitions for a single key to go through a single scan. + * An optimization of cumulativeSum for cases when a particular key has many entries. Requires a sortable + * partitioning of U. Accomplishes the optimization by not requiring all the entries for a single key to + * go through a single scan. Instead requires the sums of the partitions for a single key to go through a + * single scan. 
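A usage sketch of both variants, based on the city example above (the data and the `_ / 100` partition function are illustrative):

    import com.twitter.scalding.typed.TypedPipe
    import com.twitter.scalding.typed.CumulativeSum._

    val visits: TypedPipe[(String, (Int, Long))] = TypedPipe.from(Seq(
      ("San Francisco", (100, 100L)), ("San Francisco", (101, 50L)), ("San Francisco", (200, 200L)),
      ("Vancouver", (100, 50L)), ("Vancouver", (101, 300L)), ("Vancouver", (200, 100L))
    ))

    // One scan per key: values accumulate in sort-field order within each city.
    val summed = visits.cumulativeSum

    // Same answer, but the sort field is partitioned so no single reducer
    // has to scan all of a key's entries.
    val partitioned = visits.cumulativeSum(_ / 100)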
*/ - def cumulativeSum[S](partition: U => S)( - implicit ordS: Ordering[S], - sg: Semigroup[V], - ordU: Ordering[U], - ordK: Ordering[K]): TypedPipe[(K, (U, V))] = { + def cumulativeSum[S](partition: U => S)(implicit + ordS: Ordering[S], + sg: Semigroup[V], + ordU: Ordering[U], + ordK: Ordering[K] + ): TypedPipe[(K, (U, V))] = { val sumPerS = pipe .map { case (k, (u, v)) => (k, partition(u)) -> v } @@ -70,46 +56,42 @@ object CumulativeSum { .map { case ((k, s), v) => (k, (s, v)) } .group .sortBy { case (s, v) => s } - .scanLeft(None: Option[(Option[V], V, S)]) { - case (acc, (s, v)) => - acc match { - case Some((previousPreviousSum, previousSum, previousS)) => { - Some((Some(previousSum), sg.plus(v, previousSum), s)) - } - case _ => Some((None, v, s)) + .scanLeft(None: Option[(Option[V], V, S)]) { case (acc, (s, v)) => + acc match { + case Some((previousPreviousSum, previousSum, previousS)) => { + Some((Some(previousSum), sg.plus(v, previousSum), s)) } + case _ => Some((None, v, s)) + } } - .flatMap{ - case (k, maybeAcc) => - for ( - acc <- maybeAcc; - previousSum <- acc._1 - ) yield { (k, acc._3) -> (None, previousSum) } + .flatMap { case (k, maybeAcc) => + for { + acc <- maybeAcc + previousSum <- acc._1 + } yield { (k, acc._3) -> (None, previousSum) } } val summands = pipe - .map { - case (k, (u, v)) => - (k, partition(u)) -> (Some(u), v) + .map { case (k, (u, v)) => + (k, partition(u)) -> (Some(u), v) } ++ sumPerS - summands - .group + summands.group .sortBy { case (u, _) => u } - .scanLeft(None: Option[(Option[U], V)]) { - case (acc, (maybeU, v)) => - acc match { - case Some((_, previousSum)) => Some((maybeU, sg.plus(v, previousSum))) - case _ => Some((maybeU, v)) - } + .scanLeft(None: Option[(Option[U], V)]) { case (acc, (maybeU, v)) => + acc match { + case Some((_, previousSum)) => Some((maybeU, sg.plus(v, previousSum))) + case _ => Some((maybeU, v)) + } } - .flatMap { - case ((k, s), acc) => - for (uv <- acc; u <- uv._1) yield { - (k, (u, uv._2)) - } + .flatMap { case ((k, s), acc) => + for { + uv <- acc + u <- uv._1 + } yield { + (k, (u, uv._2)) + } } } } } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala b/scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala index 30fa9857b1..59846773fd 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala @@ -12,21 +12,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.pipe.Pipe import cascading.flow.FlowDef /** - * This object has all the implicit functions and values that are used - * to make the scalding DSL, which includes the functions for automatically - * creating cascading.tuple.Fields objects from scala tuples of Strings, Symbols - * or Ints, as well as the cascading.pipe.Pipe enrichment to RichPipe which - * adds the scala.collections-like API to Pipe. + * This object has all the implicit functions and values that are used to make the scalding DSL, which + * includes the functions for automatically creating cascading.tuple.Fields objects from scala tuples of + * Strings, Symbols or Ints, as well as the cascading.pipe.Pipe enrichment to RichPipe which adds the + * scala.collections-like API to Pipe. 
* - * It's useful to import Dsl._ when you are writing scalding code outside - * of a Job. + * It's useful to import Dsl._ when you are writing scalding code outside of a Job. */ object Dsl extends FieldConversions with java.io.Serializable { implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Execution.scala b/scalding-core/src/main/scala/com/twitter/scalding/Execution.scala index a049bb1a7f..7a5b7468ea 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Execution.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Execution.scala @@ -29,54 +29,63 @@ import java.io.Serializable import java.util.UUID import scala.collection.mutable import scala.concurrent.duration.SECONDS -import scala.concurrent.{Await, Future, Promise, blocking, duration, ExecutionContext => ConcurrentExecutionContext} +import scala.concurrent.{ + blocking, + duration, + Await, + ExecutionContext => ConcurrentExecutionContext, + Future, + Promise +} import scala.util.{Failure, Success, Try} import scala.util.hashing.MurmurHash3 /** - * Execution[T] represents and computation that can be run and - * will produce a value T and keep track of counters incremented - * inside of TypedPipes using a Stat. + * Execution[T] represents and computation that can be run and will produce a value T and keep track of + * counters incremented inside of TypedPipes using a Stat. * - * Execution[T] is the recommended way to compose multistep computations - * that involve branching (if/then), intermediate calls to remote - * services, file operations, or looping (e.g. testing for convergence). + * Execution[T] is the recommended way to compose multistep computations that involve branching (if/then), + * intermediate calls to remote services, file operations, or looping (e.g. testing for convergence). * - * Library functions are encouraged to implement functions from - * TypedPipes or ValuePipes to Execution[R] for some result R. - * Refrain from calling run in library code. Let the caller - * of your library call run. + * Library functions are encouraged to implement functions from TypedPipes or ValuePipes to Execution[R] for + * some result R. Refrain from calling run in library code. Let the caller of your library call run. * - * Note this is a Monad, meaning flatMap composes in series as you expect. - * It is also an applicative functor, which means zip (called join - * in some libraries) composes two Executions is parallel. Prefer - * zip to flatMap if you want to run two Executions in parallel. + * Note this is a Monad, meaning flatMap composes in series as you expect. It is also an applicative functor, + * which means zip (called join in some libraries) composes two Executions is parallel. Prefer zip to flatMap + * if you want to run two Executions in parallel. */ sealed trait Execution[+T] extends Serializable { self: Product => - import Execution.{ EvalCache, FlatMapped, GetCounters, ResetCounters, Mapped, OnComplete, RecoverWith, Zipped } + import Execution.{ + EvalCache, + FlatMapped, + GetCounters, + Mapped, + OnComplete, + RecoverWith, + ResetCounters, + Zipped + } /** * Lift an Execution into a Try * - * When this function is called the Execution should never be failed - * instead only the Try. + * When this function is called the Execution should never be failed instead only the Try. 
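For instance (a sketch; riskyStep is a hypothetical failing step), lifting lets a later step branch on the failure without failing the whole Execution:

    import scala.util.{Failure, Success}

    val riskyStep: Execution[Int] = Execution.from(sys.error("boom"))
    val handled: Execution[Int] =
      riskyStep.liftToTry.map {
        case Success(n) => n
        case Failure(_) => -1 // fall back to a default instead of failing the run
      }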
*/ def liftToTry: Execution[Try[T]] = - map(e => Success(e)).recoverWith{ case throwable => Execution.from(Failure(throwable)) } + map(e => Success(e)).recoverWith { case throwable => Execution.from(Failure(throwable)) } /** - * Scala uses the filter method in for syntax for pattern matches that can fail. - * If this filter is false, the result of run will be an exception in the future + * Scala uses the filter method in for syntax for pattern matches that can fail. If this filter is false, + * the result of run will be an exception in the future */ def filter(pred: T => Boolean): Execution[T] = flatMap { case good if pred(good) => Execution.from(good) - case failed => Execution.from(sys.error("Filter failed on: " + failed.toString)) + case failed => Execution.from(sys.error("Filter failed on: " + failed.toString)) } /** - * First run this Execution, then move to the result - * of the function + * First run this Execution, then move to the result of the function */ def flatMap[U](fn: T => Execution[U]): Execution[U] = FlatMapped(this, fn) @@ -88,63 +97,54 @@ sealed trait Execution[+T] extends Serializable { self: Product => flatMap(ev) /** - * Apply a pure function to the result. This may not - * be called if subsequently the result is discarded with .unit - * For side effects see onComplete. + * Apply a pure function to the result. This may not be called if subsequently the result is discarded with + * .unit For side effects see onComplete. */ def map[U](fn: T => U): Execution[U] = Mapped(this, fn) /** - * Reads the counters into the value, but does not reset them. - * You may want .getAndResetCounters. + * Reads the counters into the value, but does not reset them. You may want .getAndResetCounters. */ def getCounters: Execution[(T, ExecutionCounters)] = GetCounters(this) /** - * Reads the counters and resets them to zero. Probably what - * you want in a loop that is using counters to check for - * convergence. + * Reads the counters and resets them to zero. Probably what you want in a loop that is using counters to + * check for convergence. */ def getAndResetCounters: Execution[(T, ExecutionCounters)] = getCounters.resetCounters /** - * This function is called when the current run is completed. This is - * only a side effect (see unit return). + * This function is called when the current run is completed. This is only a side effect (see unit return). * - * ALSO You must .run the result. If - * you throw away the result of this call, your fn will never be - * called. When you run the result, the Future you get will not - * be complete unless fn has completed running. If fn throws, it - * will be handled be the scala.concurrent.ExecutionContext.reportFailure - * NOT by returning a Failure in the Future. + * ALSO You must .run the result. If you throw away the result of this call, your fn will never be called. + * When you run the result, the Future you get will not be complete unless fn has completed running. If fn + * throws, it will be handled be the scala.concurrent.ExecutionContext.reportFailure NOT by returning a + * Failure in the Future. */ def onComplete(fn: Try[T] => Unit): Execution[T] = OnComplete(this, fn) /** - * This allows you to handle a failure by giving a replacement execution - * in some cases. This execution may be a retry if you know that your - * execution can have spurious errors, or it could be a constant or an - * alternate way to compute. 
Be very careful creating looping retries that - * could hammer your cluster when the data is missing or when - * when there is some real problem with your job logic. + * This allows you to handle a failure by giving a replacement execution in some cases. This execution may + * be a retry if you know that your execution can have spurious errors, or it could be a constant or an + * alternate way to compute. Be very careful creating looping retries that could hammer your cluster when + * the data is missing or when when there is some real problem with your job logic. */ def recoverWith[U >: T](rec: PartialFunction[Throwable, Execution[U]]): Execution[U] = RecoverWith(this, rec) /** - * Resets the counters back to zero. This is useful if - * you want to reset before a zip or a call to flatMap + * Resets the counters back to zero. This is useful if you want to reset before a zip or a call to flatMap */ def resetCounters: Execution[T] = ResetCounters(this) /** - * This causes the Execution to occur. The result is not cached, so each call - * to run will result in the computation being re-run. Avoid calling this - * until the last possible moment by using flatMap, zip and recoverWith. + * This causes the Execution to occur. The result is not cached, so each call to run will result in the + * computation being re-run. Avoid calling this until the last possible moment by using flatMap, zip and + * recoverWith. * * Seriously: pro-style is for this to be called only once in a program. */ @@ -172,35 +172,29 @@ sealed trait Execution[+T] extends Serializable { self: Product => } /** - * This is the internal method that must be implemented - * Given a config, mode, and cache of evaluations for this config and mode, - * return the new cache with as much evaluation as possible before the future - * completes, and a future of the result, counters and cache after the future - * is complete + * This is the internal method that must be implemented Given a config, mode, and cache of evaluations for + * this config and mode, return the new cache with as much evaluation as possible before the future + * completes, and a future of the result, counters and cache after the future is complete */ - protected def runStats(conf: Config, - mode: Mode, - cache: EvalCache)(implicit cec: ConcurrentExecutionContext): Trampoline[CFuture[(T, Map[Long, ExecutionCounters])]] + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ): Trampoline[CFuture[(T, Map[Long, ExecutionCounters])]] /** - * This is convenience for when we don't care about the result. - * like .map(_ => ()) + * This is convenience for when we don't care about the result. like .map(_ => ()) */ def unit: Execution[Unit] = map(_ => ()) /** - * This waits synchronously on run, using the global execution context - * Avoid calling this if possible, prefering run or just Execution - * composition. Every time someone calls this, be very suspect. It is - * always code smell. Very seldom should you need to wait on a future. + * This waits synchronously on run, using the global execution context Avoid calling this if possible, + * prefering run or just Execution composition. Every time someone calls this, be very suspect. It is always + * code smell. Very seldom should you need to wait on a future. 
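A sketch of the preferred shape (myExec, config and mode are hypothetical): run returns a Future and composes, while waitFor blocks the calling thread and is best reserved for the very end of a program or for tests.

    import scala.concurrent.ExecutionContext.Implicits.global

    val future = myExec.run(config, mode)     // non-blocking, preferred
    val result = myExec.waitFor(config, mode) // Try[T], blocks until the run finishes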
*/ def waitFor(conf: Config, mode: Mode): Try[T] = - Try(Await.result(run(conf, mode)(ConcurrentExecutionContext.global), - duration.Duration.Inf)) + Try(Await.result(run(conf, mode)(ConcurrentExecutionContext.global), duration.Duration.Inf)) /** - * This is here to silence warnings in for comprehensions, but is - * identical to .filter. + * This is here to silence warnings in for comprehensions, but is identical to .filter. * * Users should never directly call this method, call .filter */ @@ -215,10 +209,10 @@ sealed trait Execution[+T] extends Serializable { self: Product => override val hashCode: Int = MurmurHash3.productHash(self) /** - * since executions, particularly Zips can cause two executions to merge - * we can have exponential cost to computing equals if we are not careful + * since executions, particularly Zips can cause two executions to merge we can have exponential cost to + * computing equals if we are not careful */ - override def equals(other: Any): Boolean = { + override def equals(other: Any): Boolean = other match { case otherEx: Execution[_] => if (otherEx eq this) true @@ -267,12 +261,11 @@ sealed trait Execution[+T] extends Serializable { self: Product => } case _ => false } - } } /** - * Execution has many methods for creating Execution[T] instances, which - * are the preferred way to compose computations in scalding libraries. + * Execution has many methods for creating Execution[T] instances, which are the preferred way to compose + * computations in scalding libraries. */ object Execution { private[Execution] class AsyncSemaphore(initialPermits: Int = 0) { @@ -317,8 +310,7 @@ object Execution { } /** - * This is an instance of Monad for execution so it can be used - * in functions that apply to all Monads + * This is an instance of Monad for execution so it can be used in functions that apply to all Monads */ implicit object ExecutionMonad extends Monad[Execution] { override def apply[T](t: T): Execution[T] = Execution.from(t) @@ -331,28 +323,26 @@ object Execution { TransformedConfig(ex, c) /** - * Distributes the file onto each map/reduce node, - * so you can use it for Scalding source creation and TypedPipe, KeyedList, etc. transformations. - * Using the [[com.twitter.scalding.filecache.CachedFile]] outside of Execution will probably not work. + * Distributes the file onto each map/reduce node, so you can use it for Scalding source creation and + * TypedPipe, KeyedList, etc. transformations. Using the [[com.twitter.scalding.filecache.CachedFile]] + * outside of Execution will probably not work. * - * For multiple files you must nested your execution, see docs of [[com.twitter.scalding.filecache.DistributedCacheFile]] + * For multiple files you must nested your execution, see docs of + * [[com.twitter.scalding.filecache.DistributedCacheFile]] */ - def withCachedFile[T](path: String)(fn: CachedFile => Execution[T]): Execution[T] = { + def withCachedFile[T](path: String)(fn: CachedFile => Execution[T]): Execution[T] = Execution.getMode.flatMap { mode => val cachedFile = DistributedCacheFile.cachedFile(path, mode) withConfig(fn(cachedFile))(_.addDistributedCacheFiles(cachedFile)) } - } /** - * This function allows running the passed execution with its own cache. - * This will mean anything inside won't benefit from Execution's global attempts to avoid - * repeated executions. + * This function allows running the passed execution with its own cache. This will mean anything inside + * won't benefit from Execution's global attempts to avoid repeated executions. 
* - * The main use case here is when generating a lot of Execution results which are large. - * Executions caching in this case can lead to out of memory errors as the cache keeps - * references to many heap objects. + * The main use case here is when generating a lot of Execution results which are large. Executions caching + * in this case can lead to out of memory errors as the cache keeps references to many heap objects. * * Ex. (0 until 1000).map { _ => Execution.withNewCache(myLargeObjectProducingExecution)} */ @@ -365,24 +355,24 @@ object Execution { implicit def semigroup[T: Semigroup]: Semigroup[Execution[T]] = Semigroup.from[Execution[T]] { (a, b) => a.zip(b).map { case (ta, tb) => Semigroup.plus(ta, tb) } } + /** - * This is the standard monoid on an Applicative (zip, then inside the Execution do plus) - * useful to combine unit Executions: - * Monoid.sum(ex1, ex2, ex3, ex4): Execution[Unit] - * where each are exi are Execution[Unit] + * This is the standard monoid on an Applicative (zip, then inside the Execution do plus) useful to combine + * unit Executions: Monoid.sum(ex1, ex2, ex3, ex4): Execution[Unit] where each are exi are Execution[Unit] */ - implicit def monoid[T: Monoid]: Monoid[Execution[T]] = Monoid.from(Execution.from(Monoid.zero[T])) { (a, b) => - a.zip(b).map { case (ta, tb) => Monoid.plus(ta, tb) } + implicit def monoid[T: Monoid]: Monoid[Execution[T]] = Monoid.from(Execution.from(Monoid.zero[T])) { + (a, b) => + a.zip(b).map { case (ta, tb) => Monoid.plus(ta, tb) } } /** - * This is a mutable state that is kept internal to an execution - * as it is evaluating. + * This is a mutable state that is kept internal to an execution as it is evaluating. */ private[scalding] class EvalCache(val writer: Execution.Writer) { type Counters = Map[Long, ExecutionCounters] - private[this] val cache = new FutureCacheGeneric[(Config, Execution[Any]), (Any, Counters), CPromise, CFuture] + private[this] val cache = + new FutureCacheGeneric[(Config, Execution[Any]), (Any, Counters), CPromise, CFuture] private[this] val toWriteCache = new FutureCacheGeneric[(Config, ToWrite[_]), Counters, CPromise, CFuture] // This method with return a 'clean' cache, that shares @@ -392,20 +382,29 @@ object Execution { def getOrLock(cfg: Config, write: ToWrite[_]): Either[CPromise[Counters], CFuture[Counters]] = toWriteCache.getOrPromise((cfg, write)) - def getOrElseInsertWithFeedback[T](cfg: Config, ex: Execution[T], - res: => CFuture[(T, Counters)]): (Boolean, CFuture[(T, Counters)]) = + def getOrElseInsertWithFeedback[T]( + cfg: Config, + ex: Execution[T], + res: => CFuture[(T, Counters)] + ): (Boolean, CFuture[(T, Counters)]) = // This cast is safe because we always insert with match T types - cache.getOrElseUpdateIsNew((cfg, ex), res) + cache + .getOrElseUpdateIsNew((cfg, ex), res) .asInstanceOf[(Boolean, CFuture[(T, Counters)])] - def getOrElseInsert[T](cfg: Config, ex: Execution[T], - res: => CFuture[(T, Counters)] ): CFuture[(T, Counters)] = + def getOrElseInsert[T]( + cfg: Config, + ex: Execution[T], + res: => CFuture[(T, Counters)] + ): CFuture[(T, Counters)] = getOrElseInsertWithFeedback(cfg, ex, res)._2 } - - private[scalding] final case class FutureConst[T](get: ConcurrentExecutionContext => Future[T]) extends Execution[T] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = + private[scalding] final case class FutureConst[T](get: ConcurrentExecutionContext => Future[T]) + extends Execution[T] { + protected def runStats(conf: 
Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = Trampoline { lazy val fut = for { futt <- toFuture(Try(get(cec))) @@ -420,8 +419,11 @@ object Execution { // Note that unit is not optimized away, since Futures are often used with side-effects, so, // we ensure that get is always called in contrast to Mapped, which assumes that fn is pure. } - private[scalding] final case class FlatMapped[S, T](prev: Execution[S], fn: S => Execution[T]) extends Execution[T] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = + private[scalding] final case class FlatMapped[S, T](prev: Execution[S], fn: S => Execution[T]) + extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = Trampoline.call(prev.runStats(conf, mode, cache)).map { case CFuture(fut1, cancelHandler1) => lazy val uncachedCFut = for { (s, st1) <- fut1 @@ -441,109 +443,139 @@ object Execution { } private[scalding] final case class Mapped[S, T](prev: Execution[S], fn: S => T) extends Execution[T] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = Trampoline.call(prev.runStats(conf, mode, cache)).map { cfuture => cache.getOrElseInsert(conf, this, cfuture.map { case (s, stats) => (fn(s), stats) }) } } - private[scalding] final case class GetCounters[T](prev: Execution[T]) extends Execution[(T, ExecutionCounters)] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = + private[scalding] final case class GetCounters[T](prev: Execution[T]) + extends Execution[(T, ExecutionCounters)] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = Trampoline.call(prev.runStats(conf, mode, cache)).map { cfuture => - cache.getOrElseInsert(conf, this, - cfuture.map { - case (t, c) => - val totalCount = Monoid.sum(c.map(_._2)) - ((t, totalCount), c) - }) + cache.getOrElseInsert( + conf, + this, + cfuture.map { case (t, c) => + val totalCount = Monoid.sum(c.map(_._2)) + ((t, totalCount), c) + } + ) } } private[scalding] final case class ResetCounters[T](prev: Execution[T]) extends Execution[T] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = Trampoline.call(prev.runStats(conf, mode, cache)).map { cfuture => - cache.getOrElseInsert(conf, this, - cfuture.map { case (t, _) => (t, Map.empty[Long, ExecutionCounters]) }) + cache + .getOrElseInsert(conf, this, cfuture.map { case (t, _) => (t, Map.empty[Long, ExecutionCounters]) }) } } - private[scalding] final case class TransformedConfig[T](prev: Execution[T], fn: Config => Config) extends Execution[T] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = { + private[scalding] final case class TransformedConfig[T](prev: Execution[T], fn: Config => Config) + extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = { val mutatedConfig = fn(conf) Trampoline.call(prev.runStats(mutatedConfig, mode, cache)) } } /** - * 
This class allows running the passed execution with its own cache. - * This will mean anything inside won't benefit from Execution's global attempts to avoid - * repeated executions. + * This class allows running the passed execution with its own cache. This will mean anything inside won't + * benefit from Execution's global attempts to avoid repeated executions. * - * The main use case here is when generating a lot of Execution results which are large. - * Executions caching in this case can lead to out of memory errors as the cache keeps - * references to many heap objects. + * The main use case here is when generating a lot of Execution results which are large. Executions caching + * in this case can lead to out of memory errors as the cache keeps references to many heap objects. * - * We operate here by getting a copy of the super EvalCache, without its cache's. - * This is so we can share the singleton thread for scheduling jobs against Cascading. + * We operate here by getting a copy of the super EvalCache, without its cache's. This is so we can share + * the singleton thread for scheduling jobs against Cascading. */ private[scalding] final case class WithNewCache[T](prev: Execution[T]) extends Execution[T] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = { val ec = cache.cleanCache Trampoline.call(prev.runStats(conf, mode, ec)) } } - private[scalding] final case class OnComplete[T](prev: Execution[T], fn: Try[T] => Unit) extends Execution[T] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = + private[scalding] final case class OnComplete[T](prev: Execution[T], fn: Try[T] => Unit) + extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = Trampoline.call(prev.runStats(conf, mode, cache)).map { cfuture => - cache.getOrElseInsert(conf, this, cfuture.mapFuture { fut => - /** - * The result we give is only completed AFTER fn is run - * so callers can wait on the result of this OnComplete - */ - val finished = Promise[(T, Map[Long, ExecutionCounters])]() - fut.onComplete { tryT => - try { - fn(tryT.map(_._1)) - } finally { - // Do our best to signal when we are done - finished.complete(tryT) + cache.getOrElseInsert( + conf, + this, + cfuture.mapFuture { fut => + /** + * The result we give is only completed AFTER fn is run so callers can wait on the result of this + * OnComplete + */ + val finished = Promise[(T, Map[Long, ExecutionCounters])]() + fut.onComplete { tryT => + try { + fn(tryT.map(_._1)) + } finally { + // Do our best to signal when we are done + finished.complete(tryT) + } } + finished.future } - finished.future - }) + ) } } - private[scalding] final case class RecoverWith[T](prev: Execution[T], fn: PartialFunction[Throwable, Execution[T]]) extends Execution[T] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = + private[scalding] final case class RecoverWith[T]( + prev: Execution[T], + fn: PartialFunction[Throwable, Execution[T]] + ) extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = Trampoline.call(prev.runStats(conf, mode, cache)).map { case CFuture(fut, cancelHandler) => lazy val uncachedFut = 
{ - fut - .map {v => (v, CancellationHandler.empty) } // map this to the right shape - .recoverWith { - val flowStop: PartialFunction[Throwable, Future[Nothing]] = { - case t: FlowStopException => // do not recover when the flow was stopped - Future.failed(t) - } - - flowStop orElse fn.andThen { ex0 => - // we haven't optimized ex0 yet - val ex = optimize(conf, ex0) - val CFuture(f, c) = ex.runStats(conf, mode, cache).get - f.map { v => (v, c) } - } + fut + .map(v => (v, CancellationHandler.empty)) // map this to the right shape + .recoverWith { + val flowStop: PartialFunction[Throwable, Future[Nothing]] = { + case t: FlowStopException => // do not recover when the flow was stopped + Future.failed(t) } + + flowStop.orElse(fn.andThen { ex0 => + // we haven't optimized ex0 yet + val ex = optimize(conf, ex0) + val CFuture(f, c) = ex.runStats(conf, mode, cache).get + f.map(v => (v, c)) + }) + } } - val recoveredFut = cache.getOrElseInsert(conf, this, CFuture(uncachedFut.map(_._1), CancellationHandler.fromFuture(uncachedFut.map(_._2)))) + val recoveredFut = cache.getOrElseInsert( + conf, + this, + CFuture(uncachedFut.map(_._1), CancellationHandler.fromFuture(uncachedFut.map(_._2))) + ) CFuture(recoveredFut.future, cancelHandler.compose(recoveredFut.cancellationHandler)) } } /** - * Standard scala zip waits forever on the left side, even if the right side fails - */ - def failFastZip[T, U](ft: Future[T], fu: Future[U])(implicit cec: ConcurrentExecutionContext): Future[(T, U)] = { + * Standard scala zip waits forever on the left side, even if the right side fails + */ + def failFastZip[T, U](ft: Future[T], fu: Future[U])(implicit + cec: ConcurrentExecutionContext + ): Future[(T, U)] = { type State = Either[(T, Promise[U]), (U, Promise[T])] val middleState = Promise[State]() @@ -593,48 +625,68 @@ object Execution { } middleState.future.flatMap { - case Left((t, pu)) => pu.future.map((t, _)) + case Left((t, pu)) => pu.future.map((t, _)) case Right((u, pt)) => pt.future.map((_, u)) } } - private[scalding] final case class Zipped[S, T](one: Execution[S], two: Execution[T]) extends Execution[(S, T)] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = + private[scalding] final case class Zipped[S, T](one: Execution[S], two: Execution[T]) + extends Execution[(S, T)] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = for { futCancel1 <- Trampoline.call(one.runStats(conf, mode, cache)) futCancel2 <- Trampoline.call(two.runStats(conf, mode, cache)) } yield { - cache.getOrElseInsert(conf, this, - futCancel1.zip(futCancel2).map { case ((s, ss), (t, st)) => ((s, t), ss ++ st) }) + cache.getOrElseInsert( + conf, + this, + futCancel1.zip(futCancel2).map { case ((s, ss), (t, st)) => ((s, t), ss ++ st) } + ) } } private[scalding] final case class UniqueIdExecution[T](fn: UniqueID => Execution[T]) extends Execution[T] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = { - Trampoline(cache.getOrElseInsert(conf, this, { - val (uid, nextConf) = conf.ensureUniqueId - val next0 = fn(uid) - // next0 has not been optimized yet, we need to try - val next = optimize(conf, next0) - next.runStats(nextConf, mode, cache).get - })) - } + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline( + cache.getOrElseInsert( + conf, + this, { + val (uid, nextConf) = 
conf.ensureUniqueId + val next0 = fn(uid) + // next0 has not been optimized yet, we need to try + val next = optimize(conf, next0) + next.runStats(nextConf, mode, cache).get + } + ) + ) } /* * This allows you to run any cascading flowDef as an Execution. */ - private[scalding] final case class FlowDefExecution(result: (Config, Mode) => FlowDef) extends Execution[Unit] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = { - Trampoline(cache.getOrElseInsert(conf, this, { - cache.writer match { - case ar: AsyncFlowDefRunner => - ar.validateAndRun(conf)(result(_, mode)).map { m => ((), Map(m)) } - case other => - CFuture.failed( - new IllegalArgumentException( - s"requires cascading Mode producing AsyncFlowDefRunner, found mode: $mode and writer ${other.getClass}: $other")) - } - })) - } + private[scalding] final case class FlowDefExecution(result: (Config, Mode) => FlowDef) + extends Execution[Unit] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline( + cache.getOrElseInsert( + conf, + this, + cache.writer match { + case ar: AsyncFlowDefRunner => + ar.validateAndRun(conf)(result(_, mode)).map(m => ((), Map(m))) + case other => + CFuture.failed( + new IllegalArgumentException( + s"requires cascading Mode producing AsyncFlowDefRunner, found mode: $mode and writer ${other.getClass}: $other" + ) + ) + } + ) + ) } /* @@ -649,51 +701,53 @@ object Execution { def pipe: TypedPipe[T] def replacePipe(p: TypedPipe[T]): ToWrite[T] = this match { - case ToWrite.Force(_) => ToWrite.Force(p) - case ToWrite.ToIterable(_) => ToWrite.ToIterable(p) + case ToWrite.Force(_) => ToWrite.Force(p) + case ToWrite.ToIterable(_) => ToWrite.ToIterable(p) case ToWrite.SimpleWrite(_, sink) => ToWrite.SimpleWrite(p, sink) } } object ToWrite extends Serializable { final case class Force[T](@transient pipe: TypedPipe[T]) extends ToWrite[T] final case class ToIterable[T](@transient pipe: TypedPipe[T]) extends ToWrite[T] - final case class SimpleWrite[T](@transient pipe: TypedPipe[T], @transient sink: TypedSink[T]) extends ToWrite[T] + final case class SimpleWrite[T](@transient pipe: TypedPipe[T], @transient sink: TypedSink[T]) + extends ToWrite[T] final case class OptimizedWrite[F[_], T](@transient original: F[T], toWrite: ToWrite[T]) /** - * Optimize these writes into new writes and provide a mapping from - * the original TypedPipe to the new TypedPipe + * Optimize these writes into new writes and provide a mapping from the original TypedPipe to the new + * TypedPipe */ - def optimizeWriteBatch(writes: List[ToWrite[_]], rules: Seq[Rule[TypedPipe]]): List[OptimizedWrite[TypedPipe, _]] = { + def optimizeWriteBatch( + writes: List[ToWrite[_]], + rules: Seq[Rule[TypedPipe]] + ): List[OptimizedWrite[TypedPipe, _]] = { val dag = Dag.empty(typed.OptimizationRules.toLiteral) - val (d1, ws) = writes.foldLeft((dag, List.empty[OptimizedWrite[Id, _]])) { - case ((dag, ws), toWrite) => - val (d1, id) = dag.addRoot(toWrite.pipe) - (d1, OptimizedWrite(id, toWrite) :: ws) + val (d1, ws) = writes.foldLeft((dag, List.empty[OptimizedWrite[Id, _]])) { case ((dag, ws), toWrite) => + val (d1, id) = dag.addRoot(toWrite.pipe) + (d1, OptimizedWrite(id, toWrite) :: ws) } // now we optimize the graph val d2 = d1.applySeq(rules) // convert back to TypedPipe: - ws.foldLeft(List.empty[OptimizedWrite[TypedPipe, _]]) { - case (tail, optWrite) => - def go[A](optWriteId: OptimizedWrite[Id, A]): 
OptimizedWrite[TypedPipe, A] = { - val idA = optWriteId.original - val origPipe = d1.evaluate(idA) - val optPipe = d2.evaluate(idA) - OptimizedWrite(original = origPipe, - toWrite = optWriteId.toWrite.replacePipe(optPipe)) - } - go(optWrite) :: tail + ws.foldLeft(List.empty[OptimizedWrite[TypedPipe, _]]) { case (tail, optWrite) => + def go[A](optWriteId: OptimizedWrite[Id, A]): OptimizedWrite[TypedPipe, A] = { + val idA = optWriteId.original + val origPipe = d1.evaluate(idA) + val optPipe = d2.evaluate(idA) + OptimizedWrite(original = origPipe, toWrite = optWriteId.toWrite.replacePipe(optPipe)) + } + go(optWrite) :: tail } } } /** - * Something that can handle a batch of writes that may be optimized - * before running. Return a unique Long for each run and Counters + * Something that can handle a batch of writes that may be optimized before running. Return a unique Long + * for each run and Counters */ trait Writer { + /** * This is called by an Execution to begin processing */ @@ -705,80 +759,80 @@ object Execution { def finished(): Unit /** - * do a batch of writes, possibly optimizing, and return a new unique - * Long. + * do a batch of writes, possibly optimizing, and return a new unique Long. * * empty writes are legitmate and should still return a Long */ - def execute( - conf: Config, - writes: List[ToWrite[_]])(implicit cec: ConcurrentExecutionContext): CFuture[(Long, ExecutionCounters)] + def execute(conf: Config, writes: List[ToWrite[_]])(implicit + cec: ConcurrentExecutionContext + ): CFuture[(Long, ExecutionCounters)] /** * This should only be called after a call to execute */ private[Execution] def getForced[T]( - conf: Config, - initial: TypedPipe[T] - )(implicit cec: ConcurrentExecutionContext): Future[TypedPipe[T]] + conf: Config, + initial: TypedPipe[T] + )(implicit cec: ConcurrentExecutionContext): Future[TypedPipe[T]] /** * This should only be called after a call to execute */ private[Execution] def getIterable[T]( - conf: Config, - initial: TypedPipe[T] - )(implicit cec: ConcurrentExecutionContext): Future[Iterable[T]] + conf: Config, + initial: TypedPipe[T] + )(implicit cec: ConcurrentExecutionContext): Future[Iterable[T]] } /** - * This is the fundamental execution that actually happens in TypedPipes, all the rest - * are based on on this one. By keeping the Pipe and the Sink, can inspect the Execution - * DAG and optimize it later (a goal, but not done yet). + * This is the fundamental execution that actually happens in TypedPipes, all the rest are based on on this + * one. By keeping the Pipe and the Sink, can inspect the Execution DAG and optimize it later (a goal, but + * not done yet). */ private[scalding] final case class WriteExecution[T]( - head: ToWrite[_], - tail: List[ToWrite[_]], - result: ((Config, Mode, Writer, ConcurrentExecutionContext)) => Future[T]) extends Execution[T] { + head: ToWrite[_], + tail: List[ToWrite[_]], + result: ((Config, Mode, Writer, ConcurrentExecutionContext)) => Future[T] + ) extends Execution[T] { /** - * We override this here to enable inlining the zip optimization - * below. + * We override this here to enable inlining the zip optimization below. * - * This is such an important optimization, that we apply it locally. - * It is a bit ugly to have it here and in ExecutionOptimizationRules - * but since this is so important, we do so anyway. - * - * Note Execution optimizations are not always applied, they are something - * users can disable, which they may since in some cases giant Execution - * graphs have seen stack overflows. 
It doesn't hurt to apply this optimization - * here, but it doesn't cover all cases since it only combines adjacent - * writes. + * This is such an important optimization, that we apply it locally. It is a bit ugly to have it here and + * in ExecutionOptimizationRules but since this is so important, we do so anyway. * + * Note Execution optimizations are not always applied, they are something users can disable, which they + * may since in some cases giant Execution graphs have seen stack overflows. It doesn't hurt to apply this + * optimization here, but it doesn't cover all cases since it only combines adjacent writes. */ override def map[U](mapFn: T => U): Execution[U] = - - WriteExecution(head, - tail, - ExecutionOptimizationRules.MapWrite.ComposeMap(result, mapFn)) - - private def unwrapListEither[A, B, C](it: List[(A, Either[B, C])]): (List[(A, B)], List[(A, C)]) = it match { - case (a, Left(b)) :: tail => - val (l, r) = unwrapListEither(tail) - ((a, b) :: l, r) - case (a, Right(c)) :: tail => - val (l, r) = unwrapListEither(tail) - (l, (a, c) :: r) - case Nil => (Nil, Nil) - } + WriteExecution(head, tail, ExecutionOptimizationRules.MapWrite.ComposeMap(result, mapFn)) + + private def unwrapListEither[A, B, C](it: List[(A, Either[B, C])]): (List[(A, B)], List[(A, C)]) = + it match { + case (a, Left(b)) :: tail => + val (l, r) = unwrapListEither(tail) + ((a, b) :: l, r) + case (a, Right(c)) :: tail => + val (l, r) = unwrapListEither(tail) + (l, (a, c) :: r) + case Nil => (Nil, Nil) + } // We look up to see if any of our ToWrite elements have already been ran // if so we remove them from the cache. // Anything not already ran we run as part of a single flow def, using their combined counters for the others - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = { lazy val uncachedFutureCancel = { - val cacheLookup: List[(ToWrite[_], (Either[CPromise[Map[Long, ExecutionCounters]], CFuture[Map[Long, ExecutionCounters]]]))] = - (head :: tail).map{ tw => (tw, cache.getOrLock(conf, tw)) } + val cacheLookup: List[ + ( + ToWrite[_], + (Either[CPromise[Map[Long, ExecutionCounters]], CFuture[Map[Long, ExecutionCounters]]]) + ) + ] = + (head :: tail).map(tw => (tw, cache.getOrLock(conf, tw))) val (weDoOperation, someoneElseDoesOperation) = unwrapListEither(cacheLookup) val otherResult = CFuture.failFastSequence(someoneElseDoesOperation.map(_._2)) @@ -797,9 +851,8 @@ object Execution { // Complete all of the promises we put into the cache // with this future counters set - all.foreach { - case (toWrite, cpromise) => - cpromise.completeWith(CFuture(futCounters, cancelHandler)) + all.foreach { case (toWrite, cpromise) => + cpromise.completeWith(CFuture(futCounters, cancelHandler)) } CFuture(futCounters, cancelHandler) case Nil => @@ -810,7 +863,9 @@ object Execution { val fut = for { (lCounters, fdCounters) <- bothFutures.future - t <- result((conf, mode, cache.writer, cec)) // TODO do i need to do something here to make this cancellable? + t <- result( + (conf, mode, cache.writer, cec) + ) // TODO do i need to do something here to make this cancellable? summedCounters = (fdCounters :: lCounters).reduce(_ ++ _) } yield (t, summedCounters) @@ -821,30 +876,25 @@ object Execution { Trampoline(cache.getOrElseInsert(conf, this, uncachedFutureCancel)) } - /** - * This is such an important optimization, that we apply it locally. 
- * It is a bit ugly to have it here and in ExecutionOptimizationRules - * but since this is so important, we do so anyway. + * This is such an important optimization, that we apply it locally. It is a bit ugly to have it here and + * in ExecutionOptimizationRules but since this is so important, we do so anyway. * - * Note Execution optimizations are not always applied, they are something - * users can disable, which they may since in some cases giant Execution - * graphs have seen stack overflows. It doesn't hurt to apply this optimization - * here, but it doesn't cover all cases since it only combines adjacent - * writes. + * Note Execution optimizations are not always applied, they are something users can disable, which they + * may since in some cases giant Execution graphs have seen stack overflows. It doesn't hurt to apply this + * optimization here, but it doesn't cover all cases since it only combines adjacent writes. * - * Note, each Write is individually cached so it won't happen twice, - * but it is usually better to compose into the biggest set of writes - * so the planner can optimize the largest graph possible. + * Note, each Write is individually cached so it won't happen twice, but it is usually better to compose + * into the biggest set of writes so the planner can optimize the largest graph possible. * - * run this and that in parallel, without any dependency. This will - * be done in a single cascading flow if possible. + * run this and that in parallel, without any dependency. This will be done in a single cascading flow if + * possible. * * If both sides are write executions then merge them */ override def zip[U](that: Execution[U]): Execution[(T, U)] = that match { - case w1@WriteExecution(_, _, _) => + case w1 @ WriteExecution(_, _, _) => ExecutionOptimizationRules.ZipWrite.mergeWrite(this, w1) case that => Zipped(this, that) } @@ -854,20 +904,22 @@ object Execution { * This is called Reader, because it just returns its input to run as the output */ private[scalding] case object ReaderExecution extends Execution[(Config, Mode)] { - protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit cec: ConcurrentExecutionContext) = + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = Trampoline(CFuture.successful(((conf, mode), Map.empty))) override def equals(that: Any): Boolean = that match { // this has to be here or we get an infinite loop in the default equals case _: ReaderExecution.type => true - case _ => false + case _ => false } } private def toFuture[R](t: Try[R]): Future[R] = t match { - case Success(s) => Future.successful(s) + case Success(s) => Future.successful(s) case Failure(err) => Future.failed(err) } @@ -877,26 +929,22 @@ object Execution { def failed(t: Throwable): Execution[Nothing] = fromTry(Failure(t)) /** - * This makes a constant execution that runs no job. - * Note this is a lazy parameter that is evaluated every - * time run is called and does so in the ExecutionContext - * given to run + * This makes a constant execution that runs no job. 
Note this is a lazy parameter that is evaluated every + * time run is called and does so in the ExecutionContext given to run */ - def from[T](t: => T): Execution[T] = fromFuture { implicit ec => Future(t) } + def from[T](t: => T): Execution[T] = fromFuture(implicit ec => Future(t)) + /** - * This evaluates the argument every time run is called, and does - * so in the ExecutionContext given to run + * This evaluates the argument every time run is called, and does so in the ExecutionContext given to run */ def fromTry[T](t: => Try[T]): Execution[T] = fromFuture { implicit ec => Future(t).flatMap(toFuture) } /** - * The call to fn will happen when the run method on the result is called. - * The ConcurrentExecutionContext will be the same one used on run. - * This is intended for cases where you need to make asynchronous calls - * in the middle or end of execution. Presumably this is used with flatMap - * either before or after + * The call to fn will happen when the run method on the result is called. The ConcurrentExecutionContext + * will be the same one used on run. This is intended for cases where you need to make asynchronous calls in + * the middle or end of execution. Presumably this is used with flatMap either before or after */ def fromFuture[T](fn: ConcurrentExecutionContext => Future[T]): Execution[T] = FutureConst(fn) @@ -904,23 +952,16 @@ object Execution { val unit: Execution[Unit] = from(()) /** - * This converts a function into an Execution monad. The flowDef returned - * is never mutated. + * This converts a function into an Execution monad. The flowDef returned is never mutated. */ def fromFn(fn: (Config, Mode) => FlowDef): Execution[Unit] = FlowDefExecution(fn) def forceToDisk[T](t: TypedPipe[T]): Execution[TypedPipe[T]] = - WriteExecution( - ToWrite.Force(t), - Nil, - { case (conf, _, w, cec) => w.getForced(conf, t)(cec) }) + WriteExecution(ToWrite.Force(t), Nil, { case (conf, _, w, cec) => w.getForced(conf, t)(cec) }) def toIterable[T](t: TypedPipe[T]): Execution[Iterable[T]] = - WriteExecution( - ToWrite.ToIterable(t), - Nil, - { case (conf, _, w, cec) => w.getIterable(conf, t)(cec) }) + WriteExecution(ToWrite.ToIterable(t), Nil, { case (conf, _, w, cec) => w.getIterable(conf, t)(cec) }) /** * The simplest form, just sink the typed pipe into the sink and get a unit execution back @@ -929,17 +970,16 @@ object Execution { write(pipe, sink, ()) private[scalding] def write[T, U](pipe: TypedPipe[T], sink: TypedSink[T], presentType: => U): Execution[U] = - WriteExecution(ToWrite.SimpleWrite(pipe, sink), - Nil, - { tup => Future(presentType)(tup._4) }) + WriteExecution(ToWrite.SimpleWrite(pipe, sink), Nil, tup => Future(presentType)(tup._4)) /** * Convenience method to get the Args */ def getArgs: Execution[Args] = ReaderExecution.map(_._1.getArgs) + /** - * Use this to read the configuration, which may contain Args or options - * which describe input on which to run + * Use this to read the configuration, which may contain Args or options which describe input on which to + * run */ def getConfig: Execution[Config] = ReaderExecution.map(_._1) @@ -950,22 +990,15 @@ object Execution { def getConfigMode: Execution[(Config, Mode)] = ReaderExecution /** - * This is convenience method only here to make it slightly cleaner - * to get Args, which are in the Config + * This is convenience method only here to make it slightly cleaner to get Args, which are in the Config */ def withArgs[T](fn: Args => Execution[T]): Execution[T] = - getConfig.flatMap { conf => fn(conf.getArgs) } + 
getConfig.flatMap(conf => fn(conf.getArgs)) /** - * Use this to use counters/stats with Execution. You do this: - * Execution.withId { implicit uid => - * val myStat = Stat("myStat") // uid is implicitly pulled in - * pipe.map { t => - * if(someCase(t)) myStat.inc - * fn(t) - * } - * .writeExecution(mySink) - * } + * Use this to use counters/stats with Execution. You do this: Execution.withId { implicit uid => val myStat + * = Stat("myStat") // uid is implicitly pulled in pipe.map { t => if(someCase(t)) myStat.inc fn(t) } + * .writeExecution(mySink) } */ def withId[T](fn: UniqueID => Execution[T]): Execution[T] = UniqueIdExecution(fn) @@ -1005,20 +1038,24 @@ object Execution { /** * combine several executions and run them in parallel when .run is called */ - def zip[A, B, C, D](ax: Execution[A], - bx: Execution[B], - cx: Execution[C], - dx: Execution[D]): Execution[(A, B, C, D)] = + def zip[A, B, C, D]( + ax: Execution[A], + bx: Execution[B], + cx: Execution[C], + dx: Execution[D] + ): Execution[(A, B, C, D)] = ax.zip(bx).zip(cx).zip(dx).map { case (((a, b), c), d) => (a, b, c, d) } /** * combine several executions and run them in parallel when .run is called */ - def zip[A, B, C, D, E](ax: Execution[A], - bx: Execution[B], - cx: Execution[C], - dx: Execution[D], - ex: Execution[E]): Execution[(A, B, C, D, E)] = + def zip[A, B, C, D, E]( + ax: Execution[A], + bx: Execution[B], + cx: Execution[C], + dx: Execution[D], + ex: Execution[E] + ): Execution[(A, B, C, D, E)] = ax.zip(bx).zip(cx).zip(dx).zip(ex).map { case ((((a, b), c), d), e) => (a, b, c, d, e) } // Avoid recreating the empty Execution each time @@ -1036,7 +1073,7 @@ object Execution { def sequence[T](exs: Seq[Execution[T]]): Execution[Seq[T]] = { @annotation.tailrec def go(xs: List[Execution[T]], acc: Execution[List[T]]): Execution[List[T]] = xs match { - case Nil => acc + case Nil => acc case h :: tail => go(tail, h.zip(acc).map(ConsList())) } // This pushes all of them onto a list, and then reverse to keep order @@ -1044,60 +1081,63 @@ object Execution { } /** - * Run a sequence of executions but only permitting parallelism amount to run at the - * same time. + * Run a sequence of executions but only permitting parallelism amount to run at the same time. * - * @param executions List of executions to run - * @param parallelism Number to run in parallel - * @return Execution Seq + * @param executions + * List of executions to run + * @param parallelism + * Number to run in parallel + * @return + * Execution Seq */ def withParallelism[T](executions: Seq[Execution[T]], parallelism: Int): Execution[Seq[T]] = { require(parallelism > 0, s"Parallelism must be > 0: $parallelism") val sem = new AsyncSemaphore(parallelism) - def waitRun(e: Execution[T]): Execution[T] = { - Execution.fromFuture(_ => sem.acquire()) + def waitRun(e: Execution[T]): Execution[T] = + Execution + .fromFuture(_ => sem.acquire()) .flatMap(p => e.liftToTry.map((_, p))) .onComplete { case Success((_, p)) => p.release() - case Failure(ex) => throw ex // should never happen or there is a logic bug + case Failure(ex) => throw ex // should never happen or there is a logic bug } - .flatMap{ case (ex, _) => fromTry(ex) } - } + .flatMap { case (ex, _) => fromTry(ex) } Execution.sequence(executions.map(waitRun)) } } /** - * This represents the counters portion of the JobStats that are returned. - * Counters are just a vector of longs with counter name, group keys. + * This represents the counters portion of the JobStats that are returned. 
Counters are just a vector of longs + * with counter name, group keys. */ trait ExecutionCounters { + /** * immutable set of the keys. */ def keys: Set[StatKey] + /** - * Same as get(key).getOrElse(0L) - * Note if a counter is never incremented, get returns None. - * But you can't tell 0L that comes from None vs. a counter - * that was incremented then decremented. + * Same as get(key).getOrElse(0L) Note if a counter is never incremented, get returns None. But you can't + * tell 0L that comes from None vs. a counter that was incremented then decremented. */ def apply(key: StatKey): Long = get(key).getOrElse(0L) + /** * If the counter is present, return it. */ def get(key: StatKey): Option[Long] - def toMap: Map[StatKey, Long] = keys.map { k => (k, get(k).getOrElse(0L)) }.toMap + def toMap: Map[StatKey, Long] = keys.map(k => (k, get(k).getOrElse(0L))).toMap } /** - * The companion gives several ways to create ExecutionCounters from - * other CascadingStats, JobStats, or Maps + * The companion gives several ways to create ExecutionCounters from other CascadingStats, JobStats, or Maps */ object ExecutionCounters { + /** * This is the zero of the ExecutionCounter Monoid */ @@ -1108,8 +1148,7 @@ object ExecutionCounters { } /** - * Just gets the counters from the CascadingStats and ignores - * all the other fields present + * Just gets the counters from the CascadingStats and ignores all the other fields present */ def fromCascading(cs: cascading.stats.CascadingStats): ExecutionCounters = new ExecutionCounters { import scala.collection.JavaConverters._ @@ -1140,6 +1179,7 @@ object ExecutionCounters { def get(k: StatKey) = counters.get(k.group).flatMap(_.get(k.counter)) } } + /** * A Simple wrapper over a Map[StatKey, Long] */ @@ -1151,16 +1191,12 @@ object ExecutionCounters { } /** - * This allows us to merge the results of two computations. It just - * does pointwise addition. + * This allows us to merge the results of two computations. It just does pointwise addition. */ implicit def monoid: Monoid[ExecutionCounters] = new Monoid[ExecutionCounters] { override def isNonZero(that: ExecutionCounters) = that.keys.nonEmpty def zero = ExecutionCounters.empty - def plus(left: ExecutionCounters, right: ExecutionCounters) = { - fromMap((left.keys ++ right.keys) - .map { k => (k, left(k) + right(k)) } - .toMap) - } + def plus(left: ExecutionCounters, right: ExecutionCounters) = + fromMap((left.keys ++ right.keys).map(k => (k, left(k) + right(k))).toMap) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionApp.scala b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionApp.scala index 56b88843c9..dd579b8938 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionApp.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionApp.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding /* @@ -66,7 +66,7 @@ object ExecutionApp { // We can have something left in the last bucket, so extract it. 
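A short sketch of the two pieces described above (the `dailyWrites` sequence and the parallelism value are illustrative): Execution.withParallelism bounds how many of the supplied executions run concurrently, and Execution's getCounters exposes the accumulated ExecutionCounters as a plain map.

// Hedged sketch: `dailyWrites: Seq[Execution[Unit]]` is assumed to exist.
val bounded: Execution[Seq[Unit]] =
  Execution.withParallelism(dailyWrites, parallelism = 4) // at most 4 in flight

val countersOut: Execution[Map[StatKey, Long]] =
  bounded.getCounters.map { case (_, counters) => counters.toMap }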
val nonHadoop = finalLast match { case Some(x) => tmpNonHadoop :+ x - case None => tmpNonHadoop + case None => tmpNonHadoop } // Throwaway hadoop config @@ -83,8 +83,7 @@ trait ExecutionApp extends java.io.Serializable { /** * The first argument should be the mode name (hdfs or local) * - * The default for this is to parse all hadoop arguments - * and put them into the config. Any unparsed hadoop + * The default for this is to parse all hadoop arguments and put them into the config. Any unparsed hadoop * arguments are put into the Args. */ def config(inputArgs: Array[String]): (Config, Mode) = { @@ -102,7 +101,9 @@ trait ExecutionApp extends java.io.Serializable { Config .hadoopWithDefaults(hconf) .setArgs(args) - .setExecutionCleanupOnFinish(true) // since ExecutionApp returns Execution[Unit], temp paths can't escape + .setExecutionCleanupOnFinish( + true + ) // since ExecutionApp returns Execution[Unit], temp paths can't escape /* * Make sure the hadoop config is set in sync with the config * which should not matter for execution, but especially legacy @@ -114,9 +115,8 @@ trait ExecutionApp extends java.io.Serializable { (config, mode) } - def main(args: Array[String]): Unit = { + def main(args: Array[String]): Unit = config(args) match { case (conf, mode) => job.waitFor(conf, mode).get } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionContext.scala b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionContext.scala index 8c64118dc7..5e379f59a4 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionContext.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionContext.scala @@ -12,22 +12,22 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.flow.hadoop.HadoopFlow import cascading.flow.planner.BaseFlowStep -import cascading.flow.{ Flow, FlowDef, FlowStepStrategy } +import cascading.flow.{Flow, FlowDef, FlowStepStrategy} import cascading.pipe.Pipe import com.twitter.scalding.estimation.memory.MemoryEstimatorStepStrategy import com.twitter.scalding.reducer_estimation.ReducerEstimatorStepStrategy import com.twitter.scalding.serialization.CascadingBinaryComparator import com.twitter.scalding.typed.cascading_backend.CascadingBackend import org.apache.hadoop.mapred.JobConf -import org.slf4j.{ Logger, LoggerFactory } +import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ import scala.concurrent.Future -import scala.util.{ Failure, Success, Try } +import scala.util.{Failure, Success, Try} /* * This has all the state needed to build a single flow @@ -39,22 +39,21 @@ trait ExecutionContext { def flowDef: FlowDef def mode: CascadingMode - private def getIdentifierOpt(descriptions: Seq[String]): Option[String] = { + private def getIdentifierOpt(descriptions: Seq[String]): Option[String] = if (descriptions.nonEmpty) Some(descriptions.distinct.mkString(", ")) else None - } private def updateStepConfigWithDescriptions(step: BaseFlowStep[JobConf]): Unit = { val conf = step.getConfig - getIdentifierOpt(ExecutionContext.getDesc(step)).foreach(descriptionString => { + getIdentifierOpt(ExecutionContext.getDesc(step)).foreach { descriptionString => conf.set(Config.StepDescriptions, descriptionString) - }) + } } /** * @return - * Success(Some(flow)) -- when everything is right and we can build a flow from flowDef - * Success(None) -- when flowDef doesn't have sinks, even after we applied pending writes - * Failure(exception) -- when it’s impossible to build a flow + * Success(Some(flow)) -- when everything is right and we can build a flow from flowDef Success(None) -- + * when flowDef doesn't have sinks, even after we applied pending writes Failure(exception) -- when it’s + * impossible to build a flow */ final def buildFlow: Try[Option[Flow[_]]] = // For some horrible reason, using Try( ) instead of the below gets me stuck: @@ -103,9 +102,8 @@ trait ExecutionContext { flow match { case hadoopFlow: HadoopFlow => val flowSteps = hadoopFlow.getFlowSteps.asScala - flowSteps.foreach { - case baseFlowStep: BaseFlowStep[JobConf] => - updateStepConfigWithDescriptions(baseFlowStep) + flowSteps.foreach { case baseFlowStep: BaseFlowStep[JobConf] => + updateStepConfigWithDescriptions(baseFlowStep) } case _ => // descriptions not yet supported in other modes } @@ -115,13 +113,18 @@ trait ExecutionContext { mode match { case _: HadoopMode => val reducerEstimatorStrategy: Seq[FlowStepStrategy[JobConf]] = config - .get(Config.ReducerEstimators).toList.map(_ => ReducerEstimatorStepStrategy) + .get(Config.ReducerEstimators) + .toList + .map(_ => ReducerEstimatorStepStrategy) val memoryEstimatorStrategy: Seq[FlowStepStrategy[JobConf]] = config - .get(Config.MemoryEstimators).toList.map(_ => MemoryEstimatorStepStrategy) + .get(Config.MemoryEstimators) + .toList + .map(_ => MemoryEstimatorStepStrategy) val otherStrategies: Seq[FlowStepStrategy[JobConf]] = config.getFlowStepStrategies.map { case Success(fn) => fn(mode, configWithId) - case Failure(e) => throw new Exception("Failed to decode flow step strategy when submitting job", e) + case Failure(e) => + throw new Exception("Failed to decode flow step strategy when submitting job", e) } val optionalFinalStrategy = FlowStepStrategies() 
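The three-way contract documented on buildFlow can be consumed directly by callers; a minimal sketch (the `ec` value and the summary strings are illustrative, and Success/Failure come from scala.util):

// Hedged sketch: `ec` is some ExecutionContext already in scope.
def planSummary(ec: ExecutionContext): String =
  ec.buildFlow match {
    case Success(Some(flow)) => s"flow planned with ${flow.getFlowSteps.size} step(s)"
    case Success(None)       => "no sinks after applying pending writes; nothing to run"
    case Failure(err)        => s"could not build a flow: ${err.getMessage}"
  }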
@@ -133,27 +136,28 @@ trait ExecutionContext { config.getFlowListeners.foreach { case Success(fn) => flow.addListener(fn(mode, configWithId)) - case Failure(e) => throw new Exception("Failed to decode flow listener", e) + case Failure(e) => throw new Exception("Failed to decode flow listener", e) } config.getFlowStepListeners.foreach { case Success(fn) => flow.addStepListener(fn(mode, configWithId)) - case Failure(e) => new Exception("Failed to decode flow step listener when submitting job", e) + case Failure(e) => new Exception("Failed to decode flow step listener when submitting job", e) } case _: CascadingLocal => config.getFlowStepStrategies.foreach { case Success(fn) => flow.setFlowStepStrategy(fn(mode, configWithId)) - case Failure(e) => throw new Exception("Failed to decode flow step strategy when submitting job", e) + case Failure(e) => + throw new Exception("Failed to decode flow step strategy when submitting job", e) } config.getFlowListeners.foreach { case Success(fn) => flow.addListener(fn(mode, configWithId)) - case Failure(e) => throw new Exception("Failed to decode flow listener", e) + case Failure(e) => throw new Exception("Failed to decode flow listener", e) } config.getFlowStepListeners.foreach { case Success(fn) => flow.addStepListener(fn(mode, configWithId)) - case Failure(e) => new Exception("Failed to decode flow step listener when submitting job", e) + case Failure(e) => new Exception("Failed to decode flow step listener when submitting job", e) } case _ => () @@ -165,14 +169,13 @@ trait ExecutionContext { } /** - * Asynchronously execute the plan currently - * contained in the FlowDef + * Asynchronously execute the plan currently contained in the FlowDef */ final def run: Future[JobStats] = buildFlow match { case Success(Some(flow)) => Execution.run(flow) - case Success(None) => Future.successful(JobStats.empty) - case Failure(err) => Future.failed(err) + case Success(None) => Future.successful(JobStats.empty) + case Failure(err) => Future.failed(err) } /** @@ -181,7 +184,7 @@ trait ExecutionContext { final def waitFor: Try[JobStats] = buildFlow.flatMap { case Some(flow) => Execution.waitFor(flow) - case None => Success(JobStats.empty) + case None => Success(JobStats.empty) } } @@ -195,12 +198,11 @@ trait ExecutionContext { object ExecutionContext { private val LOG: Logger = LoggerFactory.getLogger(ExecutionContext.getClass) - private[scalding] def getDesc[T](baseFlowStep: BaseFlowStep[T]): Seq[String] = { + private[scalding] def getDesc[T](baseFlowStep: BaseFlowStep[T]): Seq[String] = baseFlowStep.getGraph.vertexSet.asScala.flatMap { case pipe: Pipe => RichPipe.getPipeDescriptions(pipe) - case _ => List() // no descriptions + case _ => List() // no descriptions }(collection.breakOut) - } /* * implicit val ec = ExecutionContext.newContext(config) * can be used inside of a Job to get an ExecutionContext if you want @@ -216,4 +218,3 @@ object ExecutionContext { implicit def modeFromContext(implicit ec: ExecutionContext): Mode = ec.mode implicit def flowDefFromContext(implicit ec: ExecutionContext): FlowDef = ec.flowDef } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionOptimizationRules.scala b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionOptimizationRules.scala index 072715aa37..763c31ce0a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionOptimizationRules.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionOptimizationRules.scala @@ -5,16 +5,16 @@ import 
com.twitter.scalding.ExecutionOptimizationRules.ZipMap.{MapLeft, MapRight import com.twitter.scalding.typed.functions.ComposedFunctions.ComposedMapFn import com.twitter.scalding.typed.functions.{ComposedFunctions, Identity, Swap} import scala.annotation.tailrec -import scala.concurrent.{Future, ExecutionContext => ConcurrentExecutionContext} +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} object ExecutionOptimizationRules { type LiteralExecution[T] = Literal[Execution, T] /** - * Since our Execution is covariant, but the Literal is not - * this is actually safe in this context, but not in general + * Since our Execution is covariant, but the Literal is not this is actually safe in this context, but not + * in general */ - def widen[T](l: LiteralExecution[_ <: T]): LiteralExecution[T] = { + def widen[T](l: LiteralExecution[_ <: T]): LiteralExecution[T] = // to prove this is safe, see that if you have // LiteralExecution[_ <: T] we can call .evaluate to get // Execution[_ <: T] which due to covariance is @@ -24,13 +24,12 @@ object ExecutionOptimizationRules { // that would be wasteful to apply since the final // result is identity. l.asInstanceOf[LiteralExecution[T]] - } def toLiteral: FunctionK[Execution, LiteralExecution] = Memoize.functionK[Execution, LiteralExecution]( new Memoize.RecursiveK[Execution, LiteralExecution] { override def toFunction[A] = { - case (e@Execution.ReaderExecution, _) => + case (e @ Execution.ReaderExecution, _) => Literal.Const(e) case (e: Execution.FutureConst[a], _) => Literal.Const(e) @@ -41,7 +40,12 @@ object ExecutionOptimizationRules { case (e: Execution.WriteExecution[a], _) => Literal.Const(e) case (e: Execution.GetCounters[a], f) => - widen(Literal.Unary[Execution, a, (a, ExecutionCounters)](f(e.prev), Execution.GetCounters(_: Execution[a]))) + widen( + Literal.Unary[Execution, a, (a, ExecutionCounters)]( + f(e.prev), + Execution.GetCounters(_: Execution[a]) + ) + ) case (e: Execution.ResetCounters[a], f) => Literal.Unary(f(e.prev), Execution.ResetCounters(_: Execution[a])) case (e: Execution.WithNewCache[a], f) => @@ -63,13 +67,11 @@ object ExecutionOptimizationRules { ) /** - * If `Execution` is `FlowDefExecution` or `WriteExecution`, - * we are considering those executions as slow, since they will schedule some expensive work, - * like Hadoop or Spark Job. + * If `Execution` is `FlowDefExecution` or `WriteExecution`, we are considering those executions as slow, + * since they will schedule some expensive work, like Hadoop or Spark Job. * - * If `Execution` is `FlatMapped` or `UniqueIdExecution`, - * we are considering those executions as slow, - * since we don't know which execution they can produce. + * If `Execution` is `FlatMapped` or `UniqueIdExecution`, we are considering those executions as slow, since + * we don't know which execution they can produce. * * Everything else we are considering as fast execution compare to `FlowDefExecution` and `WriteExecution`. */ @@ -77,13 +79,11 @@ object ExecutionOptimizationRules { areFastExecution(e :: Nil) /** - * If `Execution` is `FlowDefExecution` or `WriteExecution`, - * we are considering those executions as slow, since they will schedule some expensive work, - * like Hadoop or Spark Job. + * If `Execution` is `FlowDefExecution` or `WriteExecution`, we are considering those executions as slow, + * since they will schedule some expensive work, like Hadoop or Spark Job. 
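A tiny sketch of the fast/slow split these rules rely on (the pipe and sink are hypothetical stand-ins): a pure constant built with Execution.from is a FutureConst and counts as fast, while anything that schedules cluster work counts as slow, so rules that would re-nest or duplicate executions only fire when the pieces involved are cheap.

// Hedged sketch: `somePipe: TypedPipe[Int]` and `sink: TypedSink[Int]` are stand-ins.
val cheap: Execution[Int]   = Execution.from(1 + 1)         // FutureConst: fast
val costly: Execution[Unit] = somePipe.writeExecution(sink) // WriteExecution: slow

ExecutionOptimizationRules.isFastExecution(cheap)  // true
ExecutionOptimizationRules.isFastExecution(costly) // false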
* - * If `Execution` is `FlatMapped` or `UniqueIdExecution`, - * we are considering those executions as slow, - * since we don't know which execution they can produce. + * If `Execution` is `FlatMapped` or `UniqueIdExecution`, we are considering those executions as slow, since + * we don't know which execution they can produce. * * Everything else we are considering as fast execution compare to `FlowDefExecution` and `WriteExecution`. */ @@ -93,34 +93,31 @@ object ExecutionOptimizationRules { case Nil => true case h :: tail => h match { - case Execution.UniqueIdExecution(_) => false - case Execution.FlowDefExecution(_) => false + case Execution.UniqueIdExecution(_) => false + case Execution.FlowDefExecution(_) => false case Execution.WriteExecution(_, _, _) => false - case Execution.FlatMapped(_, _) => false + case Execution.FlatMapped(_, _) => false - case Execution.ReaderExecution => areFastExecution(tail) - case Execution.FutureConst(_) => areFastExecution(tail) - case Execution.GetCounters(e) => areFastExecution(e :: tail) - case Execution.ResetCounters(e) => areFastExecution(e :: tail) - case Execution.WithNewCache(e) => areFastExecution(e :: tail) + case Execution.ReaderExecution => areFastExecution(tail) + case Execution.FutureConst(_) => areFastExecution(tail) + case Execution.GetCounters(e) => areFastExecution(e :: tail) + case Execution.ResetCounters(e) => areFastExecution(e :: tail) + case Execution.WithNewCache(e) => areFastExecution(e :: tail) case Execution.TransformedConfig(e, _) => areFastExecution(e :: tail) - case Execution.OnComplete(e, _) => areFastExecution(e :: tail) - case Execution.RecoverWith(e, _) => areFastExecution(e :: tail) - case Execution.Mapped(e, _) => areFastExecution(e :: tail) - case Execution.Zipped(one, two) => areFastExecution(one :: two :: tail) + case Execution.OnComplete(e, _) => areFastExecution(e :: tail) + case Execution.RecoverWith(e, _) => areFastExecution(e :: tail) + case Execution.Mapped(e, _) => areFastExecution(e :: tail) + case Execution.Zipped(one, two) => areFastExecution(one :: two :: tail) } } /** - * This is a rather complex optimization rule, but also very important. - * After this runs, there will only be 1 WriteExecution in a graph, - * other than within recoverWith/flatMap/uniqueId nodes. + * This is a rather complex optimization rule, but also very important. After this runs, there will only be + * 1 WriteExecution in a graph, other than within recoverWith/flatMap/uniqueId nodes. * - * This is the best we can do without running those functions. - * The motivation for this is to allow the user to write Executions - * as is convenient in code, but still have full access to a TypedPipe - * graph when planning a stage. Without this, we can wind up recomputing - * work that we don't need to do. + * This is the best we can do without running those functions. The motivation for this is to allow the user + * to write Executions as is convenient in code, but still have full access to a TypedPipe graph when + * planning a stage. Without this, we can wind up recomputing work that we don't need to do. 
*/ case object ZipWrite extends Rule[Execution] { import Execution._ @@ -142,18 +139,20 @@ object ExecutionOptimizationRules { (in._2._1, (in._1, in._2._2)) } case class ComposedFn[A1, A2, A, B1, B2, B]( - fn1: Function1[(A1, A2), A], - fn2: Function1[(B1, B2), B] + fn1: Function1[(A1, A2), A], + fn2: Function1[(B1, B2), B] ) extends Function1[((A1, B1), (A2, B2)), (A, B)] { - override def apply(v1: ((A1, B1), (A2, B2))): (A, B) = (fn1(v1._1._1, v1._2._1), fn2(v1._1._2, v1._2._2)) + override def apply(v1: ((A1, B1), (A2, B2))): (A, B) = + (fn1(v1._1._1, v1._2._1), fn2(v1._1._2, v1._2._2)) } case class ComposeWriteFn[A, B, C, D, E]( - fn1: ((A, B, C, ConcurrentExecutionContext)) => Future[D], - fn2: ((A, B, C, ConcurrentExecutionContext)) => Future[E]) extends Function1[(A, B, C, ConcurrentExecutionContext), Future[(D, E)]] { + fn1: ((A, B, C, ConcurrentExecutionContext)) => Future[D], + fn2: ((A, B, C, ConcurrentExecutionContext)) => Future[E] + ) extends Function1[(A, B, C, ConcurrentExecutionContext), Future[(D, E)]] { def apply(tup: (A, B, C, ConcurrentExecutionContext)): Future[(D, E)] = - (Execution.failFastZip(fn1(tup), fn2(tup))(tup._4)) + Execution.failFastZip(fn1(tup), fn2(tup))(tup._4) } def mergeWrite[A, B](w1: WriteExecution[A], w2: WriteExecution[B]): WriteExecution[(A, B)] = { @@ -162,26 +161,29 @@ object ExecutionOptimizationRules { } /** - * This is the fundamental type we use to optimize zips, basically we - * expand graphs of WriteExecution, Zipped, Mapped. - * Our goal to optimize any `Execution`'s DAG to have at most one write. + * This is the fundamental type we use to optimize zips, basically we expand graphs of WriteExecution, + * Zipped, Mapped. Our goal to optimize any `Execution`'s DAG to have at most one write. * * This is achieved by optimizing any `Execution` to either: - * - `NonWrite` execution - * - `Write` execution - * - composed execution which has both write and non write. + * - `NonWrite` execution + * - `Write` execution + * - composed execution which has both write and non write. 
*/ private sealed trait FlattenedZip[+A] private object FlattenedZip { final case class NonWrite[T](nonWrite: Execution[T]) extends FlattenedZip[T] final case class Write[T](write: WriteExecution[T]) extends FlattenedZip[T] - final case class Composed[T1, T2, T](write: WriteExecution[T1], nonWrite: Execution[T2], compose: Function1[(T1, T2), T]) extends FlattenedZip[T] + final case class Composed[T1, T2, T]( + write: WriteExecution[T1], + nonWrite: Execution[T2], + compose: Function1[(T1, T2), T] + ) extends FlattenedZip[T] def toExecution[A](ex: FlattenedZip[A]): Execution[A] = ex match { - case NonWrite(nonWrite) => nonWrite - case Write(write) => write - case c@Composed(_, _, _) => c.write.zip(c.nonWrite).map(c.compose) + case NonWrite(nonWrite) => nonWrite + case Write(write) => write + case c @ Composed(_, _, _) => c.write.zip(c.nonWrite).map(c.compose) } def map[A, B](ex: FlattenedZip[A], fn: A => B): FlattenedZip[B] = ex match { @@ -193,65 +195,82 @@ object ExecutionOptimizationRules { Composed(write, nonWrite, ComposedMapFn(compose, fn)) } - def zip[A, B](left: FlattenedZip[A], right: FlattenedZip[B]): FlattenedZip[(A, B)] = (left, right) match { - case (left@NonWrite(_), right@NonWrite(_)) => - NonWrite(left.nonWrite.zip(right.nonWrite)) - case (left@NonWrite(_), right@Write(_)) => - Composed(right.write, left.nonWrite, Swap[B, A]()) - case (left@NonWrite(_), right@Composed(_, _, _)) => - zipNonWriteComposed(left, right) - - case (left@Write(_), right@NonWrite(_)) => - Composed(left.write, right.nonWrite, Identity[(A, B)]()) - case (left@Write(_), right@Write(_)) => - Write(mergeWrite(left.write, right.write)) - case (left@Write(_), right@Composed(_, _, _)) => - zipWriteComposed(left, right) - - case (left@Composed(_, _, _), right@NonWrite(_)) => - map(zipNonWriteComposed(right, left), Swap[B, A]()) - case (left@Composed(_, _, _), right@Write(_)) => - map(zipWriteComposed(right, left), Swap[B, A]()) - case (left@Composed(_, _, _), right@Composed(_, _, _)) => - Composed(mergeWrite(left.write, right.write), left.nonWrite.zip(right.nonWrite), - ComposedFn(left.compose, right.compose)) - } - - private def zipNonWriteComposed[A, B1, B2, B](left: NonWrite[A], right: Composed[B1, B2, B]): Composed[B1, (B2, A), (A, B)] = - Composed(right.write, right.nonWrite.zip(left.nonWrite), - ComposedMapFn(ComposedMapFn(UnTwist(), MapLeft[(B1, B2), A, B](right.compose)), Swap[B, A]())) + def zip[A, B](left: FlattenedZip[A], right: FlattenedZip[B]): FlattenedZip[(A, B)] = + (left, right) match { + case (left @ NonWrite(_), right @ NonWrite(_)) => + NonWrite(left.nonWrite.zip(right.nonWrite)) + case (left @ NonWrite(_), right @ Write(_)) => + Composed(right.write, left.nonWrite, Swap[B, A]()) + case (left @ NonWrite(_), right @ Composed(_, _, _)) => + zipNonWriteComposed(left, right) + + case (left @ Write(_), right @ NonWrite(_)) => + Composed(left.write, right.nonWrite, Identity[(A, B)]()) + case (left @ Write(_), right @ Write(_)) => + Write(mergeWrite(left.write, right.write)) + case (left @ Write(_), right @ Composed(_, _, _)) => + zipWriteComposed(left, right) + + case (left @ Composed(_, _, _), right @ NonWrite(_)) => + map(zipNonWriteComposed(right, left), Swap[B, A]()) + case (left @ Composed(_, _, _), right @ Write(_)) => + map(zipWriteComposed(right, left), Swap[B, A]()) + case (left @ Composed(_, _, _), right @ Composed(_, _, _)) => + Composed( + mergeWrite(left.write, right.write), + left.nonWrite.zip(right.nonWrite), + ComposedFn(left.compose, right.compose) + ) + } - private def 
zipWriteComposed[A, B1, B2, B](left: Write[A], right: Composed[B1, B2, B]): Composed[(A, B1), B2, (A, B)] = - Composed(mergeWrite(left.write, right.write), right.nonWrite, - ComposedMapFn(Twist(), MapRight[A, (B1, B2), B](right.compose))) + private def zipNonWriteComposed[A, B1, B2, B]( + left: NonWrite[A], + right: Composed[B1, B2, B] + ): Composed[B1, (B2, A), (A, B)] = + Composed( + right.write, + right.nonWrite.zip(left.nonWrite), + ComposedMapFn(ComposedMapFn(UnTwist(), MapLeft[(B1, B2), A, B](right.compose)), Swap[B, A]()) + ) + + private def zipWriteComposed[A, B1, B2, B]( + left: Write[A], + right: Composed[B1, B2, B] + ): Composed[(A, B1), B2, (A, B)] = + Composed( + mergeWrite(left.write, right.write), + right.nonWrite, + ComposedMapFn(Twist(), MapRight[A, (B1, B2), B](right.compose)) + ) /** * Convert an Execution to the Flattened (tuple-ized) representation */ def apply[A](ex: Execution[A]): FlattenedZip[A] = ex match { - case Zipped(left, right) => zip(apply(left), apply(right)) - case Mapped(that, fn) => map(apply(that), fn) - case write@WriteExecution(_, _, _) => FlattenedZip.Write(write) - case notZipMap => FlattenedZip.NonWrite(notZipMap) + case Zipped(left, right) => zip(apply(left), apply(right)) + case Mapped(that, fn) => map(apply(that), fn) + case write @ WriteExecution(_, _, _) => FlattenedZip.Write(write) + case notZipMap => FlattenedZip.NonWrite(notZipMap) } } /** - * Apply the optimization of merging all zipped/mapped WriteExecution - * into a single value. If ex is already optimal (0 or 1 write) return None + * Apply the optimization of merging all zipped/mapped WriteExecution into a single value. If ex is + * already optimal (0 or 1 write) return None */ def optimize[A](ex: Execution[A]): Option[Execution[A]] = { def writes(execution: Execution[_]): Int = { @tailrec def loop(executions: List[Execution[_]], acc: Int): Int = executions match { case Nil => acc - case head :: tail => head match { - case Zipped(left, right) => loop(left :: right :: tail, acc) - case Mapped(that, _) => loop(that :: tail, acc) - case WriteExecution(_, _, _) => loop(tail, acc + 1) - case _ => loop(tail, acc) - } + case head :: tail => + head match { + case Zipped(left, right) => loop(left :: right :: tail, acc) + case Mapped(that, _) => loop(that :: tail, acc) + case WriteExecution(_, _, _) => loop(tail, acc + 1) + case _ => loop(tail, acc) + } } loop(execution :: Nil, 0) } @@ -263,8 +282,8 @@ object ExecutionOptimizationRules { } def apply[A](on: Dag[Execution]) = { - case z@Zipped(_, _) => optimize(z) - case _ => + case z @ Zipped(_, _) => optimize(z) + case _ => // since this optimization only applies to zips, there // is no need to check on nodes that aren't zips. 
None @@ -289,19 +308,22 @@ object ExecutionOptimizationRules { } object ZipFlatMap extends PartialRule[Execution] { - case class LeftZipRight[S, T, B](left: Execution[B], fn: S => Execution[T]) extends (S => Execution[(B, T)]) { + case class LeftZipRight[S, T, B](left: Execution[B], fn: S => Execution[T]) + extends (S => Execution[(B, T)]) { private val fun = fn.andThen(left.zip) override def apply(s: S): Execution[(B, T)] = fun(s) } - case class RightZipLeft[S, T, B](right: Execution[B], fn: S => Execution[T]) extends (S => Execution[(T, B)]) { + case class RightZipLeft[S, T, B](right: Execution[B], fn: S => Execution[T]) + extends (S => Execution[(T, B)]) { private val fun = fn.andThen(_.zip(right)) override def apply(s: S): Execution[(T, B)] = fun(s) } - case class NestedZip[S, T, B, A](right: Execution[B], lfn: S => Execution[T], rfn: B => Execution[A]) extends (S => Execution[(T, A)]) { + case class NestedZip[S, T, B, A](right: Execution[B], lfn: S => Execution[T], rfn: B => Execution[A]) + extends (S => Execution[(T, A)]) { private val fun = lfn.andThen { lr => Execution.FlatMapped(right, rfn.andThen(lr.zip)) } @@ -310,7 +332,8 @@ object ExecutionOptimizationRules { } override def applyWhere[T](on: Dag[Execution]) = { - case Execution.Zipped(Execution.FlatMapped(left, lfn), Execution.FlatMapped(right, rfn)) if isFastExecution(left) && isFastExecution(right) => + case Execution.Zipped(Execution.FlatMapped(left, lfn), Execution.FlatMapped(right, rfn)) + if isFastExecution(left) && isFastExecution(right) => Execution.FlatMapped(left, NestedZip(right, lfn, rfn)) case Execution.Zipped(Execution.FlatMapped(left, fn), right) if isFastExecution(left) => Execution.FlatMapped(left, RightZipLeft(right, fn)) @@ -321,8 +344,9 @@ object ExecutionOptimizationRules { object MapWrite extends PartialRule[Execution] { case class ComposeMap[A, B, C, D, E]( - fn1: ((A, B, C, ConcurrentExecutionContext)) => Future[D], - fn2: D => E) extends Function1[(A, B, C, ConcurrentExecutionContext), Future[E]] { + fn1: ((A, B, C, ConcurrentExecutionContext)) => Future[D], + fn2: D => E + ) extends Function1[(A, B, C, ConcurrentExecutionContext), Future[E]] { def apply(tup: (A, B, C, ConcurrentExecutionContext)): Future[E] = fn1(tup).map(fn2)(tup._4) @@ -336,13 +360,11 @@ object ExecutionOptimizationRules { case object FuseMaps extends PartialRule[Execution] { import Execution._ - def applyWhere[A](on: Dag[Execution]) = { - case Mapped(Mapped(ex, fn0), fn1) => - Mapped(ex, ComposedFunctions.ComposedMapFn(fn0, fn1)) + def applyWhere[A](on: Dag[Execution]) = { case Mapped(Mapped(ex, fn0), fn1) => + Mapped(ex, ComposedFunctions.ComposedMapFn(fn0, fn1)) } } - val std: Rule[Execution] = Rule.orElse( List( @@ -354,13 +376,12 @@ object ExecutionOptimizationRules { ) ) - def apply[A](e: Execution[A], r: Rule[Execution]): Execution[A] = { + def apply[A](e: Execution[A], r: Rule[Execution]): Execution[A] = try { Dag.applyRule(e, toLiteral, r) } catch { case _: StackOverflowError => e } - } def stdOptimizations[A](e: Execution[A]): Execution[A] = apply(e, std) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionUtil.scala b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionUtil.scala index 2ce63102a4..4f79353175 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionUtil.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionUtil.scala @@ -3,25 +3,37 @@ package com.twitter.scalding import com.twitter.algebird.Semigroup object ExecutionUtil { + /** * Generate a list 
of executions from a date range * - * @param duration Duration to split daterange - * @param fn Function to run a execution given a date range - * @return Sequence of Executions per Day + * @param duration + * Duration to split daterange + * @param fn + * Function to run a execution given a date range + * @return + * Sequence of Executions per Day */ - def executionsFromDates[T](duration: Duration)(fn: DateRange => Execution[T])(implicit dr: DateRange): Seq[Execution[T]] = + def executionsFromDates[T](duration: Duration)(fn: DateRange => Execution[T])(implicit + dr: DateRange + ): Seq[Execution[T]] = dr.each(duration).map(fn).toSeq /** * Split a DateRange and allow for max parallel running of executions * - * @param duration Duration to split daterange - * @param parallelism How many jobs to run in parallel - * @param fn Function to run a execution given a date range - * @return Seq of Dates split by Duration with corresponding execution result + * @param duration + * Duration to split daterange + * @param parallelism + * How many jobs to run in parallel + * @param fn + * Function to run a execution given a date range + * @return + * Seq of Dates split by Duration with corresponding execution result */ - def runDatesWithParallelism[T](duration: Duration, parallelism: Int = 1)(fn: DateRange => Execution[T])(implicit dr: DateRange): Execution[Seq[(DateRange, T)]] = { + def runDatesWithParallelism[T](duration: Duration, parallelism: Int = 1)( + fn: DateRange => Execution[T] + )(implicit dr: DateRange): Execution[Seq[(DateRange, T)]] = { val dates = dr.each(duration).toSeq Execution.withParallelism(dates.map(fn), parallelism).map(e => dates.zip(e)) @@ -30,34 +42,37 @@ object ExecutionUtil { /** * Split a DateRange and allow for max parallel running of executions * - * @param duration Duration to split daterange - * @param parallelism How many jobs to run in parallel - * @param fn Function to run a execution given a date range - * @return Execution of Sequences + * @param duration + * Duration to split daterange + * @param parallelism + * How many jobs to run in parallel + * @param fn + * Function to run a execution given a date range + * @return + * Execution of Sequences */ - def runDateRangeWithParallelism[T](duration: Duration, parallelism: Int = 1)(fn: DateRange => Execution[T])(implicit dr: DateRange): Execution[Seq[T]] = - runDatesWithParallelism(duration, parallelism)(fn).map(_.map{ case (_, t) => t }) + def runDateRangeWithParallelism[T](duration: Duration, parallelism: Int = 1)(fn: DateRange => Execution[T])( + implicit dr: DateRange + ): Execution[Seq[T]] = + runDatesWithParallelism(duration, parallelism)(fn).map(_.map { case (_, t) => t }) /** - * Same as runDateRangeWithParallelism, but sums the sequence - * of values after running. This is useful when you want to do a - * calculation in parallel over many durations and join the results - * together. + * Same as runDateRangeWithParallelism, but sums the sequence of values after running. This is useful when + * you want to do a calculation in parallel over many durations and join the results together. * - * For example, a common use case is when T is - * a TypedPipe[U] and you want to independently compute - * the pipes on each day and union them into a - * single TypedPipe at the end. + * For example, a common use case is when T is a TypedPipe[U] and you want to independently compute the + * pipes on each day and union them into a single TypedPipe at the end. 
* - * Another possible use case would be if the executions were created by - * summing intermediate monoids (e.g. T was a Map[String,HLL] since - * algebird supports monoids for maps and hll) and you wanted to do a - * final aggregation of the Monoids computed for each duration. + * Another possible use case would be if the executions were created by summing intermediate monoids (e.g. T + * was a Map[String,HLL] since algebird supports monoids for maps and hll) and you wanted to do a final + * aggregation of the Monoids computed for each duration. */ - def runDateRangeWithParallelismSum[T](duration: Duration, parallelism: Int = 1)(fn: DateRange => Execution[T])(implicit dr: DateRange, semigroup: Semigroup[T]): Execution[T] = { + def runDateRangeWithParallelismSum[T](duration: Duration, parallelism: Int = 1)( + fn: DateRange => Execution[T] + )(implicit dr: DateRange, semigroup: Semigroup[T]): Execution[T] = { require(dr.each(duration).nonEmpty, "Date Range can not be empty") runDateRangeWithParallelism(duration, parallelism)(fn)(dr) - .map(_.reduceLeft[T]{ case (l, r) => Semigroup.plus(l, r) }) + .map(_.reduceLeft[T] { case (l, r) => Semigroup.plus(l, r) }) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FieldConversions.scala b/scalding-core/src/main/scala/com/twitter/scalding/FieldConversions.scala index bcc02e4ce1..201c8fea1a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FieldConversions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FieldConversions.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields @@ -26,11 +26,11 @@ import scala.collection.JavaConverters._ trait LowPriorityFieldConversions { protected def anyToFieldArg(f: Any): Comparable[_] = f match { - case x: Symbol => x.name - case y: String => y + case x: Symbol => x.name + case y: String => y case z: java.lang.Integer => z case v: Enumeration#Value => v.toString - case fld: Field[_] => fld.id + case fld: Field[_] => fld.id case flds: Fields => { if (flds.size == 1) { flds.get(0) @@ -42,18 +42,15 @@ trait LowPriorityFieldConversions { } /** - * Handles treating any TupleN as a Fields object. - * This is low priority because List is also a Product, but this method - * will not work for List (because List is Product2(head, tail) and so - * productIterator won't work as expected. - * Lists are handled by an implicit in FieldConversions, which have - * higher priority. + * Handles treating any TupleN as a Fields object. This is low priority because List is also a Product, but + * this method will not work for List (because List is Product2(head, tail) and so productIterator won't + * work as expected. Lists are handled by an implicit in FieldConversions, which have higher priority. 
*/ implicit def productToFields(f: Product): Fields = { - val fields = new Fields(f.productIterator.map { anyToFieldArg }.toSeq: _*) + val fields = new Fields(f.productIterator.map(anyToFieldArg).toSeq: _*) f.productIterator.foreach { case field: Field[_] => fields.setComparator(field.id, field.ord) - case _ => + case _ => } fields } @@ -62,27 +59,23 @@ trait LowPriorityFieldConversions { trait FieldConversions extends LowPriorityFieldConversions { // Cascading Fields are either java.lang.String or java.lang.Integer, both are comparable. - def asList(f: Fields): List[Comparable[_]] = { + def asList(f: Fields): List[Comparable[_]] = f.iterator.asScala.toList.asInstanceOf[List[Comparable[_]]] - } // Cascading Fields are either java.lang.String or java.lang.Integer, both are comparable. def asSet(f: Fields): Set[Comparable[_]] = asList(f).toSet // TODO get the comparator also - def getField(f: Fields, idx: Int): Fields = { new Fields(f.get(idx)) } + def getField(f: Fields, idx: Int): Fields = new Fields(f.get(idx)) - def hasInts(f: Fields): Boolean = f.iterator.asScala.exists { _.isInstanceOf[java.lang.Integer] } + def hasInts(f: Fields): Boolean = f.iterator.asScala.exists(_.isInstanceOf[java.lang.Integer]) /** - * Rather than give the full power of cascading's selectors, we have - * a simpler set of rules encoded below: - * 1) if the input is non-definite (ALL, GROUP, ARGS, etc...) ALL is the output. - * Perhaps only fromFields=ALL will make sense - * 2) If one of from or to is a strict super set of the other, SWAP is used. - * 3) If they are equal, REPLACE is used. - * 4) Otherwise, ALL is used. + * Rather than give the full power of cascading's selectors, we have a simpler set of rules encoded below: + * 1) if the input is non-definite (ALL, GROUP, ARGS, etc...) ALL is the output. Perhaps only fromFields=ALL + * will make sense 2) If one of from or to is a strict super set of the other, SWAP is used. 3) If they are + * equal, REPLACE is used. 4) Otherwise, ALL is used. */ - def defaultMode(fromFields: Fields, toFields: Fields): Fields = { + def defaultMode(fromFields: Fields, toFields: Fields): Fields = if (toFields.isArguments || (fromFields.isAll && toFields.isAll)) { // 1. In this case we replace the input with the output or: // 2. if you go from all to all, you must mean replace (ALL would fail at the cascading layer) @@ -94,19 +87,18 @@ trait FieldConversions extends LowPriorityFieldConversions { val fromSet = asSet(fromFields) val toSet = asSet(toFields) (fromSet.subsetOf(toSet), toSet.subsetOf(fromSet)) match { - case (true, true) => Fields.REPLACE //equal + case (true, true) => Fields.REPLACE //equal case (true, false) => Fields.SWAP //output super set, replaces input case (false, true) => Fields.SWAP //throw away some input /* - * the next case is that they are disjoint or have some nontrivial intersection - * if disjoint, everything is fine. - * if they intersect, it is ill-defined and cascading is going to throw an error BEFORE - * starting the flow. - */ + * the next case is that they are disjoint or have some nontrivial intersection + * if disjoint, everything is fine. + * if they intersect, it is ill-defined and cascading is going to throw an error BEFORE + * starting the flow. 
+ */ case (false, false) => Fields.ALL } } - } //Single entry fields: implicit def unitToFields(u: Unit): Fields = Fields.NONE // linter:ignore @@ -114,20 +106,20 @@ trait FieldConversions extends LowPriorityFieldConversions { implicit def integerToFields(x: java.lang.Integer): Fields = new Fields(x) implicit def stringToFields(x: String): Fields = new Fields(x) implicit def enumValueToFields(x: Enumeration#Value): Fields = new Fields(x.toString) + /** * '* means Fields.ALL, otherwise we take the .name */ - implicit def symbolToFields(x: Symbol): Fields = { + implicit def symbolToFields(x: Symbol): Fields = if (x == '*) { Fields.ALL } else { new Fields(x.name) } - } implicit def fieldToFields(f: Field[_]): RichFields = RichFields(f) @tailrec - final def newSymbol(avoid: Set[Symbol], guess: Symbol, trial: Int = 0): Symbol = { + final def newSymbol(avoid: Set[Symbol], guess: Symbol, trial: Int = 0): Symbol = if (!avoid(guess)) { //We are good: guess @@ -141,7 +133,6 @@ trait FieldConversions extends LowPriorityFieldConversions { newSymbol(avoid, guess, trial + 1) } } - } final def ensureUniqueFields(left: Fields, right: Fields, rightPipe: Pipe): (Fields, Pipe) = { val leftSet = asSet(left) @@ -150,8 +141,9 @@ trait FieldConversions extends LowPriorityFieldConversions { (right, rightPipe) } else { // Rename the collisions with random integer names: - val leftSetSyms = leftSet.map { f => Symbol(f.toString) } - val (_, reversedRename) = asList(right).map { f => Symbol(f.toString) } + val leftSetSyms = leftSet.map(f => Symbol(f.toString)) + val (_, reversedRename) = asList(right) + .map(f => Symbol(f.toString)) .foldLeft((leftSetSyms, List[Symbol]())) { (takenRename, name) => val (taken, renames) = takenRename val newName = newSymbol(taken, name) @@ -163,41 +155,44 @@ trait FieldConversions extends LowPriorityFieldConversions { } /** - * Multi-entry fields. This are higher priority than Product conversions so - * that List will not conflict with Product. + * Multi-entry fields. This are higher priority than Product conversions so that List will not conflict with + * Product. */ implicit def fromEnum[T <: Enumeration](enumeration: T): Fields = - new Fields(enumeration.values.toList.map { _.toString }: _*) + new Fields(enumeration.values.toList.map(_.toString): _*) implicit def fields[T <: TraversableOnce[Symbol]](f: T): Fields = new Fields(f.toSeq.map(_.name): _*) implicit def strFields[T <: TraversableOnce[String]](f: T): Fields = new Fields(f.toSeq: _*) - implicit def intFields[T <: TraversableOnce[Int]](f: T): Fields = { - new Fields(f.toSeq.map { new java.lang.Integer(_) }: _*) - } + implicit def intFields[T <: TraversableOnce[Int]](f: T): Fields = + new Fields(f.toSeq.map(new java.lang.Integer(_)): _*) implicit def fieldFields[T <: TraversableOnce[Field[_]]](f: T): RichFields = RichFields(f.toSeq) + /** - * Useful to convert f : Any* to Fields. This handles mixed cases ("hey", 'you). - * Not sure we should be this flexible, but given that Cascading will throw an - * exception before scheduling the job, I guess this is okay. + * Useful to convert f : Any* to Fields. This handles mixed cases ("hey", 'you). Not sure we should be this + * flexible, but given that Cascading will throw an exception before scheduling the job, I guess this is + * okay. 
*/ implicit def parseAnySeqToFields[T <: TraversableOnce[Any]](anyf: T): Fields = { - val fields = new Fields(anyf.toSeq.map { anyToFieldArg }: _*) + val fields = new Fields(anyf.toSeq.map(anyToFieldArg): _*) anyf.foreach { case field: Field[_] => fields.setComparator(field.id, field.ord) - case _ => + case _ => } fields } //Handle a pair generally: - implicit def tuple2ToFieldsPair[T, U](pair: (T, U))(implicit tf: T => Fields, uf: U => Fields): (Fields, Fields) = { + implicit def tuple2ToFieldsPair[T, U]( + pair: (T, U) + )(implicit tf: T => Fields, uf: U => Fields): (Fields, Fields) = { val f1 = tf(pair._1) val f2 = uf(pair._2) (f1, f2) } + /** - * We can't set the field Manifests because cascading doesn't (yet) expose field type information - * in the Fields API. + * We can't set the field Manifests because cascading doesn't (yet) expose field type information in the + * Fields API. */ implicit def fieldsToRichFields(fields: Fields): RichFields = { if (!fields.isDefined) { @@ -217,13 +212,15 @@ trait FieldConversions extends LowPriorityFieldConversions { // "one at a time" by querying for a specific index, while the Comparators are only // available "all at once" by calling getComparators.) - new RichFields(asList(fields).zip(fields.getComparators).map { - case (id: Comparable[_], comparator: Comparator[_]) => id match { - case x: java.lang.Integer => IntField(x)(Ordering.comparatorToOrdering(comparator), None) - case y: String => StringField(y)(Ordering.comparatorToOrdering(comparator), None) - case z => sys.error("not expecting object of type " + z.getClass + " as field name") + new RichFields( + asList(fields).zip(fields.getComparators).map { case (id: Comparable[_], comparator: Comparator[_]) => + id match { + case x: java.lang.Integer => IntField(x)(Ordering.comparatorToOrdering(comparator), None) + case y: String => StringField(y)(Ordering.comparatorToOrdering(comparator), None) + case z => sys.error("not expecting object of type " + z.getClass + " as field name") + } } - }) + ) } } @@ -234,7 +231,7 @@ trait FieldConversions extends LowPriorityFieldConversions { // val myFields: Fields = ... 
// myFields.toFieldList -case class RichFields(val toFieldList: List[Field[_]]) extends Fields(toFieldList.map { _.id }: _*) { +case class RichFields(val toFieldList: List[Field[_]]) extends Fields(toFieldList.map(_.id): _*) { toFieldList.foreach { field: Field[_] => setComparator(field.id, field.ord) } } @@ -251,15 +248,22 @@ sealed trait Field[T] extends java.io.Serializable { } @DefaultSerializer(classOf[serialization.IntFieldSerializer]) -final case class IntField[T](override val id: java.lang.Integer)(implicit override val ord: Ordering[T], override val mf: Option[Manifest[T]]) extends Field[T] +final case class IntField[T](override val id: java.lang.Integer)(implicit + override val ord: Ordering[T], + override val mf: Option[Manifest[T]] +) extends Field[T] @DefaultSerializer(classOf[serialization.StringFieldSerializer]) -final case class StringField[T](override val id: String)(implicit override val ord: Ordering[T], override val mf: Option[Manifest[T]]) extends Field[T] +final case class StringField[T](override val id: String)(implicit + override val ord: Ordering[T], + override val mf: Option[Manifest[T]] +) extends Field[T] object Field { def apply[T](index: Int)(implicit ord: Ordering[T], mf: Manifest[T]) = IntField[T](index)(ord, Some(mf)) def apply[T](name: String)(implicit ord: Ordering[T], mf: Manifest[T]) = StringField[T](name)(ord, Some(mf)) - def apply[T](symbol: Symbol)(implicit ord: Ordering[T], mf: Manifest[T]) = StringField[T](symbol.name)(ord, Some(mf)) + def apply[T](symbol: Symbol)(implicit ord: Ordering[T], mf: Manifest[T]) = + StringField[T](symbol.name)(ord, Some(mf)) def singleOrdered[T](name: String)(implicit ord: Ordering[T]): Fields = { val f = new Fields(name) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FileSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/FileSource.scala index 9fa03781d3..305e5cae86 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FileSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FileSource.scala @@ -12,13 +12,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.io.{InputStream, OutputStream} import java.util.{Properties, UUID} import cascading.scheme.Scheme -import cascading.scheme.hadoop.{SequenceFile => CHSequenceFile, TextDelimited => CHTextDelimited, TextLine => CHTextLine} +import cascading.scheme.hadoop.{ + SequenceFile => CHSequenceFile, + TextDelimited => CHTextDelimited, + TextLine => CHTextLine +} import cascading.scheme.local.{TextDelimited => CLTextDelimited, TextLine => CLTextLine} import cascading.tap.hadoop.Hfs import cascading.tap.{MultiSourceTap, SinkMode, Tap} @@ -51,9 +55,11 @@ abstract class SchemedSource extends Source { } trait HfsTapProvider { - def createHfsTap(scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _], - path: String, - sinkMode: SinkMode): Hfs = + def createHfsTap( + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _], + path: String, + sinkMode: SinkMode + ): Hfs = new ScaldingHfs(scheme, path, sinkMode) } @@ -67,6 +73,7 @@ private[scalding] object CastFileTap { * A trait which provides a method to create a local tap. */ trait LocalSourceOverride extends SchemedSource { + /** A path to use for the local tap. 
*/ def localPaths: Iterable[String] @@ -76,19 +83,20 @@ trait LocalSourceOverride extends SchemedSource { /** * Creates a local tap. * - * @param sinkMode The mode for handling output conflicts. - * @return A tap. + * @param sinkMode + * The mode for handling output conflicts. + * @return + * A tap. */ def createLocalTap(sinkMode: SinkMode): Tap[JobConf, _, _] = { - val taps = localPaths.map { - p: String => - CastFileTap(new FileTap(localScheme, p, sinkMode)) + val taps = localPaths.map { p: String => + CastFileTap(new FileTap(localScheme, p, sinkMode)) }.toList taps match { - case Nil => throw new InvalidSourceException("LocalPaths is empty") + case Nil => throw new InvalidSourceException("LocalPaths is empty") case oneTap :: Nil => oneTap - case many => new ScaldingMultiSourceTap(many) + case many => new ScaldingMultiSourceTap(many) } } } @@ -101,7 +109,7 @@ object HiddenFileFilter extends PathFilter { } object SuccessFileFilter extends PathFilter { - def accept(p: Path) = { p.getName == "_SUCCESS" } + def accept(p: Path) = p.getName == "_SUCCESS" } object AcceptAllPathFilter extends PathFilter { @@ -114,38 +122,41 @@ object FileSource { private[this] def verboseLogEnabled(conf: Configuration): Boolean = conf.getBoolean(Config.VerboseFileSourceLoggingKey, false) - private[this] def ifVerboseLog(conf: Configuration)(msgFn: => String): Unit = { + private[this] def ifVerboseLog(conf: Configuration)(msgFn: => String): Unit = if (verboseLogEnabled(conf)) { - val stack = Thread.currentThread - .getStackTrace - .iterator + val stack = Thread.currentThread.getStackTrace.iterator .drop(2) // skip getStackTrace and ifVerboseLog .mkString("\n") // evaluate call by name param once val msg = msgFn - LOG.info( - s""" + LOG.info(s""" |***FileSource Verbose Log*** |$stack | |$msg """.stripMargin) } - } - def glob(glob: String, conf: Configuration, filter: PathFilter = AcceptAllPathFilter): Iterable[FileStatus] = { + def glob( + glob: String, + conf: Configuration, + filter: PathFilter = AcceptAllPathFilter + ): Iterable[FileStatus] = { val path = new Path(glob) - Option(path.getFileSystem(conf).globStatus(path, filter)).map { - _.toIterable // convert java Array to scala Iterable - }.getOrElse { - Iterable.empty - } + Option(path.getFileSystem(conf).globStatus(path, filter)) + .map { + _.toIterable // convert java Array to scala Iterable + } + .getOrElse { + Iterable.empty + } } /** - * @return whether globPath contains non hidden files + * @return + * whether globPath contains non hidden files */ def globHasNonHiddenPaths(globPath: String, conf: Configuration): Boolean = { val res = glob(globPath, conf, HiddenFileFilter) @@ -167,16 +178,22 @@ object FileSource { } /** - * @return whether globPath contains a _SUCCESS file + * @return + * whether globPath contains a _SUCCESS file */ - def globHasSuccessFile(globPath: String, conf: Configuration): Boolean = allGlobFilesWithSuccess(globPath, conf, hiddenFilter = false) + def globHasSuccessFile(globPath: String, conf: Configuration): Boolean = + allGlobFilesWithSuccess(globPath, conf, hiddenFilter = false) /** * Determines whether each file in the glob has a _SUCCESS sibling file in the same directory - * @param globPath path to check - * @param conf Hadoop Configuration to create FileSystem - * @param hiddenFilter true, if only non-hidden files are checked - * @return true if the directory has files after filters are applied + * @param globPath + * path to check + * @param conf + * Hadoop Configuration to create FileSystem + * @param hiddenFilter + * 
true, if only non-hidden files are checked + * @return + * true if the directory has files after filters are applied */ def allGlobFilesWithSuccess(globPath: String, conf: Configuration, hiddenFilter: Boolean): Boolean = { // Produce tuples (dirName, hasSuccess, hasNonHidden) keyed by dir @@ -200,13 +217,14 @@ object FileSource { } // OR by key - val uniqueUsedDirs = MapAlgebra.sumByKey(usedDirs) + val uniqueUsedDirs = MapAlgebra + .sumByKey(usedDirs) .filter { case (_, (_, hasNonHidden)) => (!hiddenFilter || hasNonHidden.get) } // there is at least one valid path, and all paths have success // - uniqueUsedDirs.nonEmpty && uniqueUsedDirs.forall { - case (_, (hasSuccess, _)) => hasSuccess.get + uniqueUsedDirs.nonEmpty && uniqueUsedDirs.forall { case (_, (hasSuccess, _)) => + hasSuccess.get } } } @@ -217,67 +235,71 @@ object FileSource { abstract class FileSource extends SchemedSource with LocalSourceOverride with HfsTapProvider { /** - * Determines if a path is 'valid' for this source. In strict mode all paths must be valid. - * In non-strict mode, all invalid paths will be filtered out. + * Determines if a path is 'valid' for this source. In strict mode all paths must be valid. In non-strict + * mode, all invalid paths will be filtered out. * * Subclasses can override this to validate paths. * - * The default implementation is a quick sanity check to look for missing or empty directories. - * It is necessary but not sufficient -- there are cases where this will return true but there is - * in fact missing data. + * The default implementation is a quick sanity check to look for missing or empty directories. It is + * necessary but not sufficient -- there are cases where this will return true but there is in fact missing + * data. * * TODO: consider writing a more in-depth version of this method in [[TimePathedSource]] that looks for * TODO: missing days / hours etc. 
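[Editor's aside, not part of the patch] The glob helpers reformatted above are what the default pathIsGood shown just below dispatches to, depending on whether scalding.require_success_file is set. A minimal sketch of calling them directly; the glob, paths and Configuration here are invented for illustration:

    import org.apache.hadoop.conf.Configuration
    import com.twitter.scalding.FileSource

    object GlobCheckSketch {
      def main(args: Array[String]): Unit = {
        val conf = new Configuration()
        // true when at least one non-hidden file matches the glob
        val hasData = FileSource.globHasNonHiddenPaths("/logs/2021/09/23/*", conf)
        // true when every matched (non-hidden) file has a _SUCCESS sibling in its directory
        val hasSuccess = FileSource.allGlobFilesWithSuccess("/logs/2021/09/23/*", conf, hiddenFilter = true)
        println(s"hasData=$hasData hasSuccess=$hasSuccess")
      }
    }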
*/ - protected def pathIsGood(globPattern: String, conf: Configuration) = { + protected def pathIsGood(globPattern: String, conf: Configuration) = if (conf.getBoolean("scalding.require_success_file", false)) { FileSource.allGlobFilesWithSuccess(globPattern, conf, true) } else { FileSource.globHasNonHiddenPaths(globPattern, conf) } - } def hdfsPaths: Iterable[String] // By default, we write to the LAST path returned by hdfsPaths def hdfsWritePath: String = hdfsPaths.last - override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = mode match { // TODO support strict in Local case Local(_) => { readOrWrite match { - case Read => createLocalTap(sinkMode) + case Read => createLocalTap(sinkMode) case Write => new FileTap(localScheme, localWritePath, sinkMode) } } - case hdfsMode @ Hdfs(_, _) => readOrWrite match { - case Read => createHdfsReadTap(hdfsMode) - case Write => CastHfsTap(createHfsTap(hdfsScheme, hdfsWritePath, sinkMode)) - } + case hdfsMode @ Hdfs(_, _) => + readOrWrite match { + case Read => createHdfsReadTap(hdfsMode) + case Write => CastHfsTap(createHfsTap(hdfsScheme, hdfsWritePath, sinkMode)) + } case _ => { - val tryTtp = Try(TestTapFactory(this, hdfsScheme, sinkMode)).map { - // these java types are invariant, so we cast here - _.createTap(readOrWrite) - .asInstanceOf[Tap[Any, Any, Any]] - }.orElse { - Try(TestTapFactory(this, localScheme.getSourceFields, sinkMode)).map { + val tryTtp = Try(TestTapFactory(this, hdfsScheme, sinkMode)) + .map { + // these java types are invariant, so we cast here _.createTap(readOrWrite) .asInstanceOf[Tap[Any, Any, Any]] } - } + .orElse { + Try(TestTapFactory(this, localScheme.getSourceFields, sinkMode)).map { + _.createTap(readOrWrite) + .asInstanceOf[Tap[Any, Any, Any]] + } + } tryTtp match { case Success(s) => s - case Failure(e) => throw new java.lang.IllegalArgumentException(s"Failed to create tap for: $toString, with error: ${e.getMessage}", e) + case Failure(e) => + throw new java.lang.IllegalArgumentException( + s"Failed to create tap for: $toString, with error: ${e.getMessage}", + e + ) } } } - } // This is only called when Mode.sourceStrictness is true - protected def hdfsReadPathsAreGood(conf: Configuration) = { - hdfsPaths.forall { pathIsGood(_, conf) } - } + protected def hdfsReadPathsAreGood(conf: Configuration) = + hdfsPaths.forall(pathIsGood(_, conf)) /* * This throws InvalidSourceException if: @@ -285,50 +307,46 @@ abstract class FileSource extends SchemedSource with LocalSourceOverride with Hf * 2) we are not in the above, but some source has no input whatsoever * TODO this only does something for HDFS now. 
Maybe we should do the same for LocalMode */ - override def validateTaps(mode: Mode): Unit = { + override def validateTaps(mode: Mode): Unit = mode match { case Hdfs(strict, conf) => { if (strict && (!hdfsReadPathsAreGood(conf))) { throw new InvalidSourceException( "[" + this.toString + "] Data is missing from one or more paths in: " + - hdfsPaths.toString) - } else if (!hdfsPaths.exists { pathIsGood(_, conf) }) { + hdfsPaths.toString + ) + } else if (!hdfsPaths.exists(pathIsGood(_, conf))) { //Check that there is at least one good path: - throw new InvalidSourceException( - "[" + this.toString + "] No good paths in: " + hdfsPaths.toString) + throw new InvalidSourceException("[" + this.toString + "] No good paths in: " + hdfsPaths.toString) } } case Local(strict) => { - val files = localPaths.map{ p => new java.io.File(p) } + val files = localPaths.map(p => new java.io.File(p)) if (strict && !files.forall(_.exists)) { - throw new InvalidSourceException( - "[" + this.toString + s"] Data is missing from: ${localPaths.filterNot { p => new java.io.File(p).exists }}") + throw new InvalidSourceException("[" + this.toString + s"] Data is missing from: ${localPaths + .filterNot(p => new java.io.File(p).exists)}") } else if (!files.exists(_.exists)) { - throw new InvalidSourceException( - "[" + this.toString + "] No good paths in: " + hdfsPaths.toString) + throw new InvalidSourceException("[" + this.toString + "] No good paths in: " + hdfsPaths.toString) } } case _ => () } - } /* * Get all the set of valid paths based on source strictness. */ - protected def goodHdfsPaths(hdfsMode: Hdfs): Iterable[String] = { + protected def goodHdfsPaths(hdfsMode: Hdfs): Iterable[String] = hdfsMode match { //we check later that all the paths are good case Hdfs(true, _) => hdfsPaths // If there are no matching paths, this is still an error, we need at least something: - case Hdfs(false, conf) => hdfsPaths.filter{ pathIsGood(_, conf) } + case Hdfs(false, conf) => hdfsPaths.filter(pathIsGood(_, conf)) } - } protected def createHdfsReadTap(hdfsMode: Hdfs): Tap[JobConf, _, _] = { val taps: List[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]]] = - goodHdfsPaths(hdfsMode) - .toList.map { path => CastHfsTap(createHfsTap(hdfsScheme, path, sinkMode)) } + goodHdfsPaths(hdfsMode).toList.map(path => CastHfsTap(createHfsTap(hdfsScheme, path, sinkMode))) taps.size match { case 0 => { // This case is going to result in an error, but we don't want to throw until @@ -344,7 +362,11 @@ abstract class FileSource extends SchemedSource with LocalSourceOverride with Hf } class ScaldingMultiSourceTap(taps: Seq[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]]]) - extends MultiSourceTap[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], JobConf, RecordReader[_, _]](taps: _*) { + extends MultiSourceTap[ + Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], + JobConf, + RecordReader[_, _] + ](taps: _*) { private final val randomId = UUID.randomUUID.toString override def getIdentifier() = randomId override def hashCode: Int = randomId.hashCode @@ -359,7 +381,9 @@ trait TextSourceScheme extends SchemedSource { val textEncoding: String = CHTextLine.DEFAULT_CHARSET override def localScheme = new CLTextLine(new Fields("offset", "line"), Fields.ALL, textEncoding) - override def hdfsScheme = HadoopSchemeInstance(new CHTextLine(CHTextLine.DEFAULT_SOURCE_FIELDS, textEncoding)) + override def hdfsScheme = HadoopSchemeInstance( + new CHTextLine(CHTextLine.DEFAULT_SOURCE_FIELDS, textEncoding) + ) } trait TextLineScheme extends 
TextSourceScheme with SingleMappable[String] { @@ -368,8 +392,7 @@ trait TextLineScheme extends TextSourceScheme with SingleMappable[String] { } /** - * Mix this in for delimited schemes such as TSV or one-separated values - * By default, TSV is given + * Mix this in for delimited schemes such as TSV or one-separated values By default, TSV is given */ trait DelimitedScheme extends SchemedSource { //override these as needed: @@ -390,13 +413,17 @@ trait DelimitedScheme extends SchemedSource { val safe = true //These should not be changed: - override def localScheme = new CLTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe) + override def localScheme = + new CLTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe) override def hdfsScheme = { assert( types == null || fields.size == types.size, - "Fields [" + fields + "] of different size than types array [" + types.mkString(",") + "]") - HadoopSchemeInstance(new CHTextDelimited(fields, null, skipHeader, writeHeader, separator, strict, quote, types, safe)) + "Fields [" + fields + "] of different size than types array [" + types.mkString(",") + "]" + ) + HadoopSchemeInstance( + new CHTextDelimited(fields, null, skipHeader, writeHeader, separator, strict, quote, types, safe) + ) } } @@ -408,26 +435,15 @@ trait SequenceFileScheme extends SchemedSource { } /** - * Ensures that a _SUCCESS file is present in every directory included by a glob, - * as well as the requirements of [[FileSource.pathIsGood]]. The set of directories to check for - * _SUCCESS - * is determined by examining the list of all paths returned by globPaths and adding parent - * directories of the non-hidden files encountered. - * pathIsGood should still be considered just a best-effort test. As an illustration the following - * layout with an in-flight job is accepted for the glob dir*/*: - *
- *   dir1/_temporary
- *   dir2/file1
- *   dir2/_SUCCESS
- * 
+ * Ensures that a _SUCCESS file is present in every directory included by a glob, as well as the requirements + * of [[FileSource.pathIsGood]]. The set of directories to check for _SUCCESS is determined by examining the + * list of all paths returned by globPaths and adding parent directories of the non-hidden files encountered. + * pathIsGood should still be considered just a best-effort test. As an illustration the following layout with + * an in-flight job is accepted for the glob dir*/*:
 dir1/_temporary dir2/file1 dir2/_SUCCESS 
* * Similarly if dir1 is physically empty pathIsGood is still true for dir*/* above * - * On the other hand it will reject an empty output directory of a finished job: - *
- *   dir1/_SUCCESS
- * 
- * + * On the other hand it will reject an empty output directory of a finished job:
 dir1/_SUCCESS 
*/ trait SuccessFileSource extends FileSource { override protected def pathIsGood(p: String, conf: Configuration) = @@ -435,19 +451,20 @@ trait SuccessFileSource extends FileSource { } /** - * Use this class to add support for Cascading local mode via the Hadoop tap. - * Put another way, this runs a Hadoop tap outside of Hadoop in the Cascading local mode + * Use this class to add support for Cascading local mode via the Hadoop tap. Put another way, this runs a + * Hadoop tap outside of Hadoop in the Cascading local mode */ trait LocalTapSource extends LocalSourceOverride { override def createLocalTap(sinkMode: SinkMode): Tap[JobConf, _, _] = { val taps = localPaths.map { p => - new LocalTap(p, hdfsScheme, sinkMode).asInstanceOf[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]]] + new LocalTap(p, hdfsScheme, sinkMode) + .asInstanceOf[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]]] }.toSeq taps match { - case Nil => throw new InvalidSourceException("LocalPaths is empty") + case Nil => throw new InvalidSourceException("LocalPaths is empty") case oneTap :: Nil => oneTap - case many => new ScaldingMultiSourceTap(many) + case many => new ScaldingMultiSourceTap(many) } } } @@ -465,8 +482,7 @@ abstract class FixedPathSource(path: String*) extends FileSource { override def equals(that: Any): Boolean = (that != null) && (that.toString == toString) /** - * Similar in behavior to {@link TimePathedSource.writePathFor}. - * Strip out the trailing slash star. + * Similar in behavior to {@link TimePathedSource.writePathFor}. Strip out the trailing slash star. */ protected def stripTrailing(path: String): String = { assert(path != "*", "Path must not be *") @@ -483,39 +499,48 @@ abstract class FixedPathSource(path: String*) extends FileSource { * Tab separated value source */ -case class Tsv(p: String, override val fields: Fields = Fields.ALL, - override val skipHeader: Boolean = false, override val writeHeader: Boolean = false, - override val sinkMode: SinkMode = SinkMode.REPLACE) extends FixedPathSource(p) with DelimitedScheme +case class Tsv( + p: String, + override val fields: Fields = Fields.ALL, + override val skipHeader: Boolean = false, + override val writeHeader: Boolean = false, + override val sinkMode: SinkMode = SinkMode.REPLACE +) extends FixedPathSource(p) + with DelimitedScheme /** - * Allows the use of multiple Tsv input paths. The Tsv files will - * be process through your flow as if they are a single pipe. Tsv - * files must have the same schema. - * For more details on how multiple files are handled check the - * cascading docs. + * Allows the use of multiple Tsv input paths. The Tsv files will be process through your flow as if they are + * a single pipe. Tsv files must have the same schema. For more details on how multiple files are handled + * check the cascading docs. 
*/ -case class MultipleTsvFiles(p: Seq[String], override val fields: Fields = Fields.ALL, - override val skipHeader: Boolean = false, override val writeHeader: Boolean = false) extends FixedPathSource(p: _*) - with DelimitedScheme +case class MultipleTsvFiles( + p: Seq[String], + override val fields: Fields = Fields.ALL, + override val skipHeader: Boolean = false, + override val writeHeader: Boolean = false +) extends FixedPathSource(p: _*) + with DelimitedScheme /** - * Csv value source - * separated by commas and quotes wrapping all fields + * Csv value source separated by commas and quotes wrapping all fields */ -case class Csv(p: String, - override val separator: String = ",", - override val fields: Fields = Fields.ALL, - override val skipHeader: Boolean = false, - override val writeHeader: Boolean = false, - override val quote: String = "\"", - override val sinkMode: SinkMode = SinkMode.REPLACE) extends FixedPathSource(p) with DelimitedScheme +case class Csv( + p: String, + override val separator: String = ",", + override val fields: Fields = Fields.ALL, + override val skipHeader: Boolean = false, + override val writeHeader: Boolean = false, + override val quote: String = "\"", + override val sinkMode: SinkMode = SinkMode.REPLACE +) extends FixedPathSource(p) + with DelimitedScheme /** * One separated value (commonly used by Pig) */ -case class Osv(p: String, f: Fields = Fields.ALL, - override val sinkMode: SinkMode = SinkMode.REPLACE) extends FixedPathSource(p) - with DelimitedScheme { +case class Osv(p: String, f: Fields = Fields.ALL, override val sinkMode: SinkMode = SinkMode.REPLACE) + extends FixedPathSource(p) + with DelimitedScheme { override val fields = f override val separator = "\u0001" } @@ -529,7 +554,10 @@ object TextLine { new TextLine(p, sm, textEncoding) } -class TextLine(p: String, override val sinkMode: SinkMode, override val textEncoding: String) extends FixedPathSource(p) with TextLineScheme with TypedSink[String] { +class TextLine(p: String, override val sinkMode: SinkMode, override val textEncoding: String) + extends FixedPathSource(p) + with TextLineScheme + with TypedSink[String] { // For some Java interop def this(p: String) = this(p, TextLine.defaultSinkMode, TextLine.defaultTextEncoding) @@ -540,10 +568,10 @@ class TextLine(p: String, override val sinkMode: SinkMode, override val textEnco /** * Alternate typed TextLine source that keeps both 'offset and 'line fields. 
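[Editor's aside, not part of the patch] For orientation while reading the delimited and text sources reformatted above, a minimal Fields-API sketch wiring a Tsv input to a Csv output; the job name, paths and field names are invented for illustration:

    import com.twitter.scalding._

    // Hypothetical job: reads (user, score) pairs from a TSV, keeps positive scores,
    // writes the result as CSV.
    class PositiveScoresJob(args: Args) extends Job(args) {
      Tsv("input/users.tsv", ('user, 'score))
        .read
        .filter('score) { score: Double => score > 0.0 }
        .write(Csv("output/positive_scores.csv"))
    }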
*/ -class OffsetTextLine(filepath: String, - override val sinkMode: SinkMode, - override val textEncoding: String) - extends FixedPathSource(filepath) with Mappable[(Long, String)] with TextSourceScheme { +class OffsetTextLine(filepath: String, override val sinkMode: SinkMode, override val textEncoding: String) + extends FixedPathSource(filepath) + with Mappable[(Long, String)] + with TextSourceScheme { override def converter[U >: (Long, String)] = TupleConverter.asSuperConverter[(Long, String), U](TupleConverter.of[(Long, String)]) @@ -557,28 +585,39 @@ object OffsetTextLine { val defaultTextEncoding: String = CHTextLine.DEFAULT_CHARSET val defaultSinkMode: SinkMode = SinkMode.REPLACE - def apply(p: String, sm: SinkMode = defaultSinkMode, textEncoding: String = defaultTextEncoding): OffsetTextLine = + def apply( + p: String, + sm: SinkMode = defaultSinkMode, + textEncoding: String = defaultTextEncoding + ): OffsetTextLine = new OffsetTextLine(p, sm, textEncoding) } case class SequenceFile(p: String, f: Fields = Fields.ALL, override val sinkMode: SinkMode = SinkMode.REPLACE) - extends FixedPathSource(p) with SequenceFileScheme with LocalTapSource { + extends FixedPathSource(p) + with SequenceFileScheme + with LocalTapSource { override val fields = f } -case class MultipleSequenceFiles(p: String*) extends FixedPathSource(p: _*) with SequenceFileScheme with LocalTapSource +case class MultipleSequenceFiles(p: String*) + extends FixedPathSource(p: _*) + with SequenceFileScheme + with LocalTapSource case class MultipleTextLineFiles(p: String*) extends FixedPathSource(p: _*) with TextLineScheme /** - * Delimited files source - * allowing to override separator and quotation characters and header configuration + * Delimited files source allowing to override separator and quotation characters and header configuration */ -case class MultipleDelimitedFiles(f: Fields, - override val separator: String, - override val quote: String, - override val skipHeader: Boolean, - override val writeHeader: Boolean, - p: String*) extends FixedPathSource(p: _*) with DelimitedScheme { +case class MultipleDelimitedFiles( + f: Fields, + override val separator: String, + override val quote: String, + override val skipHeader: Boolean, + override val writeHeader: Boolean, + p: String* +) extends FixedPathSource(p: _*) + with DelimitedScheme { override val fields = f } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FlowState.scala b/scalding-core/src/main/scala/com/twitter/scalding/FlowState.scala index f7773d049b..d787eed9f3 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FlowState.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FlowState.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding @@ -25,20 +25,20 @@ import java.util.WeakHashMap * * There are three kinds of things we want to attach to FlowDefs: * - * 1) which scalding Sources are being read (sourceMap), so we can - * call validateTaps on each of them before we run (see validateSources) + * 1) which scalding Sources are being read (sourceMap), so we can call validateTaps on each of them before we + * run (see validateSources) * - * 2) the configuration updates that need to be applied to the Pipe - * instances in the Typed API (this could be removed by better plumbing in CascadingBackend) + * 2) the configuration updates that need to be applied to the Pipe instances in the Typed API (this could be + * removed by better plumbing in CascadingBackend) * - * 3) The list of TypedPipe writes that have not yet been planned. We want - * to defer planning as long as possible so the optimizer can see as much - * as possible of the graph to make the best decisions. + * 3) The list of TypedPipe writes that have not yet been planned. We want to defer planning as long as + * possible so the optimizer can see as much as possible of the graph to make the best decisions. */ case class FlowState( - sourceMap: Map[String, Source], - flowConfigUpdates: Set[(String, String)], - pendingTypedWrites: List[FlowStateMap.TypedWrite[_]]) { + sourceMap: Map[String, Source], + flowConfigUpdates: Set[(String, String)], + pendingTypedWrites: List[FlowStateMap.TypedWrite[_]] +) { def getSourceNamed(name: String): Option[Source] = sourceMap.get(name) @@ -48,9 +48,11 @@ case class FlowState( sourceMap.values.toSet[Source].foreach(_.validateTaps(mode)) def merge(that: FlowState): FlowState = - FlowState(sourceMap = sourceMap ++ that.sourceMap, + FlowState( + sourceMap = sourceMap ++ that.sourceMap, flowConfigUpdates = flowConfigUpdates ++ that.flowConfigUpdates, - pendingTypedWrites = pendingTypedWrites ::: that.pendingTypedWrites) + pendingTypedWrites = pendingTypedWrites ::: that.pendingTypedWrites + ) } object FlowState { @@ -70,33 +72,29 @@ object FlowState { } /** - * This is a mutable threadsafe store for attaching scalding - * information to the mutable flowDef + * This is a mutable threadsafe store for attaching scalding information to the mutable flowDef * - * NOTE: there is a subtle bug in scala regarding case classes - * with multiple sets of arguments, and their equality. - * For this reason, we use Source.sourceId as the key in this map + * NOTE: there is a subtle bug in scala regarding case classes with multiple sets of arguments, and their + * equality. For this reason, we use Source.sourceId as the key in this map */ object FlowStateMap { // Make sure we don't hold FlowState after the FlowDef is gone @transient private val flowMap = new WeakHashMap[FlowDef, FlowState]() case class TypedWrite[T](pipe: TypedPipe[T], sink: TypedSink[T], mode: Mode) + /** * Function to update a state. * - * note if fn mutates the FlowStateMap, this can easily - * be incorrect (you can lose a write), any mutation - * that itself mutates the FlowState is responsible - * for returning the correct value from fn. + * note if fn mutates the FlowStateMap, this can easily be incorrect (you can lose a write), any mutation + * that itself mutates the FlowState is responsible for returning the correct value from fn. 
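[Editor's aside, not part of the patch] The FlowState / FlowStateMap pairing described above can be exercised directly, assuming the public access shown in this diff; the FlowDef and config key below are invented for illustration:

    import cascading.flow.FlowDef
    import com.twitter.scalding.{FlowState, FlowStateMap}

    object FlowStateSketch {
      def main(args: Array[String]): Unit = {
        val fd = new FlowDef
        // attach a per-flow config update; merge returns the state as it was before the merge
        FlowStateMap.merge(fd, FlowState(Map.empty, Set("mapreduce.job.reduces" -> "1"), Nil))
        // the merged update is now visible on the FlowDef
        println(FlowStateMap(fd).flowConfigUpdates) // Set((mapreduce.job.reduces,1))
      }
    }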
*/ - private def mutate[T](fd: FlowDef)(fn: FlowState => (FlowState, T)): T = { + private def mutate[T](fd: FlowDef)(fn: FlowState => (FlowState, T)): T = flowMap.synchronized { val (newState, t) = fn(apply(fd)) flowMap.put(fd, newState) t } - } /** * Get the FlowState or return FlowState.empty @@ -105,15 +103,13 @@ object FlowStateMap { get(fd).getOrElse(FlowState.empty) def get(fd: FlowDef): Option[FlowState] = - flowMap.synchronized { Option(flowMap.get(fd)) } + flowMap.synchronized(Option(flowMap.get(fd))) def clear(fd: FlowDef): Unit = - flowMap.synchronized { flowMap.remove(fd) } + flowMap.synchronized(flowMap.remove(fd)) /** - * Merge a FlowState into the current one for - * this FlowDef and return the value before - * the merge + * Merge a FlowState into the current one for this FlowDef and return the value before the merge */ def merge(fd: FlowDef, state: FlowState): FlowState = mutate(fd) { fs => @@ -127,7 +123,7 @@ object FlowStateMap { * returns the original */ def removeWrites(fd: FlowDef): FlowState = - mutate(fd) { fs => (fs.copy(pendingTypedWrites = Nil), fs) } + mutate(fd)(fs => (fs.copy(pendingTypedWrites = Nil), fs)) def validateSources(flowDef: FlowDef, mode: Mode): Unit = /* @@ -140,4 +136,3 @@ object FlowStateMap { .validateSources(mode) } else () } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FoldOperations.scala b/scalding-core/src/main/scala/com/twitter/scalding/FoldOperations.scala index a1379c87fb..6b0516781b 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FoldOperations.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FoldOperations.scala @@ -12,18 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields /** - * Implements reductions on top of a simple abstraction for the Fields-API - * We use the f-bounded polymorphism trick to return the type called Self - * in each operation. + * Implements reductions on top of a simple abstraction for the Fields-API We use the f-bounded polymorphism + * trick to return the type called Self in each operation. */ -trait FoldOperations[+Self <: FoldOperations[Self]] extends ReduceOperations[Self] - with Sortable[Self] { +trait FoldOperations[+Self <: FoldOperations[Self]] extends ReduceOperations[Self] with Sortable[Self] { /* * prefer reduce or mapReduceMap. foldLeft will force all work to be * done on the reducers. If your function is not associative and @@ -32,16 +30,19 @@ trait FoldOperations[+Self <: FoldOperations[Self]] extends ReduceOperations[Sel * NOTE: init needs to be serializable with Kryo (because we copy it for each * grouping to avoid possible errors using a mutable init object). 
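[Editor's aside, not part of the patch] The foldLeft contract in the comment above is easiest to read next to a concrete grouping. A minimal Fields-API sketch; the job name, paths and field names are invented for illustration:

    import com.twitter.scalding._

    // Hypothetical job: collects the set of pages seen per user.
    class PagesPerUserJob(args: Args) extends Job(args) {
      Tsv("input/views.tsv", ('user, 'page))
        .read
        .groupBy('user) {
          // foldLeft runs entirely on the reducers, as the comment above notes;
          // the Set[String] init is copied per group and must be Kryo-serializable
          _.foldLeft[Set[String], String]('page -> 'pages)(Set.empty[String])(_ + _)
        }
        .write(Tsv("output/pages_by_user.tsv"))
    }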
*/ - def foldLeft[X, T](fieldDef: (Fields, Fields))(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): Self + def foldLeft[X, T](fieldDef: (Fields, Fields))(init: X)( + fn: (X, T) => X + )(implicit setter: TupleSetter[X], conv: TupleConverter[T]): Self //If there is an ordering, we need to reverse the list - override def mapList[T, R](fieldDef: (Fields, Fields))(fn: (List[T]) => R)(implicit conv: TupleConverter[T], setter: TupleSetter[R]): Self = { + override def mapList[T, R]( + fieldDef: (Fields, Fields) + )(fn: (List[T]) => R)(implicit conv: TupleConverter[T], setter: TupleSetter[R]): Self = if (sorting.isDefined) { //the list is built in reverse order so we need to reverse it here - super.mapList[T, R](fieldDef) { l => fn(l.reverse) }(conv, setter) + super.mapList[T, R](fieldDef)(l => fn(l.reverse))(conv, setter) } else { // Ordering doesn't matter, so skip the reversal super.mapList[T, R](fieldDef)(fn)(conv, setter) } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FunctionImplicits.scala b/scalding-core/src/main/scala/com/twitter/scalding/FunctionImplicits.scala index feaf7377c2..0d7b2415ef 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FunctionImplicits.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FunctionImplicits.scala @@ -3,47 +3,332 @@ package com.twitter.scalding object FunctionImplicits { - implicit def function2ToTupledFunction1[T1, T2, R](f: Function2[T1, T2, R]): Function1[(T1, T2), R] = f.tupled + implicit def function2ToTupledFunction1[T1, T2, R](f: Function2[T1, T2, R]): Function1[(T1, T2), R] = + f.tupled - implicit def function3ToTupledFunction1[T1, T2, T3, R](f: Function3[T1, T2, T3, R]): Function1[(T1, T2, T3), R] = f.tupled + implicit def function3ToTupledFunction1[T1, T2, T3, R]( + f: Function3[T1, T2, T3, R] + ): Function1[(T1, T2, T3), R] = f.tupled - implicit def function4ToTupledFunction1[T1, T2, T3, T4, R](f: Function4[T1, T2, T3, T4, R]): Function1[(T1, T2, T3, T4), R] = f.tupled + implicit def function4ToTupledFunction1[T1, T2, T3, T4, R]( + f: Function4[T1, T2, T3, T4, R] + ): Function1[(T1, T2, T3, T4), R] = f.tupled - implicit def function5ToTupledFunction1[T1, T2, T3, T4, T5, R](f: Function5[T1, T2, T3, T4, T5, R]): Function1[(T1, T2, T3, T4, T5), R] = f.tupled + implicit def function5ToTupledFunction1[T1, T2, T3, T4, T5, R]( + f: Function5[T1, T2, T3, T4, T5, R] + ): Function1[(T1, T2, T3, T4, T5), R] = f.tupled - implicit def function6ToTupledFunction1[T1, T2, T3, T4, T5, T6, R](f: Function6[T1, T2, T3, T4, T5, T6, R]): Function1[(T1, T2, T3, T4, T5, T6), R] = f.tupled + implicit def function6ToTupledFunction1[T1, T2, T3, T4, T5, T6, R]( + f: Function6[T1, T2, T3, T4, T5, T6, R] + ): Function1[(T1, T2, T3, T4, T5, T6), R] = f.tupled - implicit def function7ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, R](f: Function7[T1, T2, T3, T4, T5, T6, T7, R]): Function1[(T1, T2, T3, T4, T5, T6, T7), R] = f.tupled + implicit def function7ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, R]( + f: Function7[T1, T2, T3, T4, T5, T6, T7, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7), R] = f.tupled - implicit def function8ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, R](f: Function8[T1, T2, T3, T4, T5, T6, T7, T8, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8), R] = f.tupled + implicit def function8ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, R]( + f: Function8[T1, T2, T3, T4, T5, T6, T7, T8, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8), R] = f.tupled - implicit def 
function9ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, R](f: Function9[T1, T2, T3, T4, T5, T6, T7, T8, T9, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9), R] = f.tupled + implicit def function9ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, R]( + f: Function9[T1, T2, T3, T4, T5, T6, T7, T8, T9, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9), R] = f.tupled - implicit def function10ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, R](f: Function10[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10), R] = f.tupled + implicit def function10ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, R]( + f: Function10[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10), R] = f.tupled - implicit def function11ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, R](f: Function11[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11), R] = f.tupled + implicit def function11ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, R]( + f: Function11[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11), R] = f.tupled - implicit def function12ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, R](f: Function12[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12), R] = f.tupled + implicit def function12ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, R]( + f: Function12[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12), R] = f.tupled - implicit def function13ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, R](f: Function13[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13), R] = f.tupled + implicit def function13ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, R]( + f: Function13[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13), R] = f.tupled - implicit def function14ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, R](f: Function14[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14), R] = f.tupled + implicit def function14ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, R]( + f: Function14[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14), R] = f.tupled - implicit def function15ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, R](f: Function15[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15), R] = f.tupled + implicit def function15ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + R + ]( + f: Function15[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15), R] = f.tupled - implicit def function16ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, 
T10, T11, T12, T13, T14, T15, T16, R](f: Function16[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16), R] = f.tupled + implicit def function16ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + R + ]( + f: Function16[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16), R] = f.tupled - implicit def function17ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, R](f: Function17[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17), R] = f.tupled + implicit def function17ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + R + ]( + f: Function17[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17), R] = f.tupled - implicit def function18ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, R](f: Function18[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18), R] = f.tupled + implicit def function18ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + R + ]( + f: Function18[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18), R] = + f.tupled - implicit def function19ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, R](f: Function19[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19), R] = f.tupled + implicit def function19ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + R + ]( + f: Function19[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19), R] = + f.tupled - implicit def function20ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, R](f: Function20[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20), R] = f.tupled + implicit def function20ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + R + ]( + f: Function20[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + R + ] + ): Function1[ + (T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20), + R + ] = 
f.tupled - implicit def function21ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, R](f: Function21[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21), R] = f.tupled + implicit def function21ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + R + ]( + f: Function21[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + R + ] + ): Function1[ + (T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21), + R + ] = f.tupled - implicit def function22ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, R](f: Function22[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22), R] = f.tupled + implicit def function22ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + R + ]( + f: Function22[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + R + ] + ): Function1[ + (T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22), + R + ] = f.tupled } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FutureCache.scala b/scalding-core/src/main/scala/com/twitter/scalding/FutureCache.scala index a78d56e646..8c4f6d257e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FutureCache.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FutureCache.scala @@ -1,7 +1,7 @@ package com.twitter.scalding import java.util.concurrent.ConcurrentHashMap -import scala.concurrent.{ Future, Promise, ExecutionContext => ConcurrentExecutionContext } +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} trait PromiseLike[P[_], F[_]] { def apply[T](): P[T] @@ -24,8 +24,7 @@ object PromiseLike { } /** - * This is a map for values that are produced in futures - * as is common in Execution + * This is a map for values that are produced in futures as is common in Execution */ class FutureCacheGeneric[-K, V, P[_], F[_]](implicit pl: PromiseLike[P, F]) { private[this] val cache = new ConcurrentHashMap[K, F[V]]() @@ -48,8 +47,7 @@ class FutureCacheGeneric[-K, V, P[_], F[_]](implicit pl: PromiseLike[P, F]) { } /** - * If you get a Left value as a result you MUST complete that Promise - * or you may deadlock other callers + * If you get a Left value as a result you MUST complete that Promise or you may deadlock other callers */ def getOrPromise(k: K): Either[P[V], F[V]] = { /* @@ -60,7 +58,7 @@ class FutureCacheGeneric[-K, V, P[_], F[_]](implicit pl: PromiseLike[P, F]) { val cancelFut = pl.future(cpromise) cache.putIfAbsent(k, cancelFut) match { - case null => Left(cpromise) + case null => Left(cpromise) case existsFut => Right(existsFut) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedConversions.scala 
b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedConversions.scala index 1470b93a20..59a50bc573 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedConversions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedConversions.scala @@ -5,918 +5,1007 @@ import cascading.tuple.TupleEntry trait GeneratedTupleConverters extends LowPriorityTupleConverters { - case class TupleConverter1[A]( gA : TupleGetter[A]) extends TupleConverter[Tuple1[A]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple1(gA.get(tup, 0)) - } - def arity = 1 + case class TupleConverter1[A](gA: TupleGetter[A]) extends TupleConverter[Tuple1[A]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple1(gA.get(tup, 0)) + } + def arity = 1 } - implicit def tuple1Converter[A](implicit - gA : TupleGetter[A]): TupleConverter[Tuple1[A]] = TupleConverter1(gA) + implicit def tuple1Converter[A](implicit gA: TupleGetter[A]): TupleConverter[Tuple1[A]] = TupleConverter1( + gA + ) - case class TupleConverter2[A,B]( gA : TupleGetter[A], - gB : TupleGetter[B]) extends TupleConverter[Tuple2[A,B]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple2(gA.get(tup, 0), - gB.get(tup, 1)) - } - def arity = 2 + case class TupleConverter2[A, B](gA: TupleGetter[A], gB: TupleGetter[B]) + extends TupleConverter[Tuple2[A, B]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple2(gA.get(tup, 0), gB.get(tup, 1)) + } + def arity = 2 } - implicit def tuple2Converter[A,B](implicit - gA : TupleGetter[A], - gB : TupleGetter[B]): TupleConverter[Tuple2[A,B]] = TupleConverter2(gA, gB) + implicit def tuple2Converter[A, B](implicit + gA: TupleGetter[A], + gB: TupleGetter[B] + ): TupleConverter[Tuple2[A, B]] = TupleConverter2(gA, gB) - case class TupleConverter3[A,B,C]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C]) extends TupleConverter[Tuple3[A,B,C]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple3(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2)) - } - def arity = 3 + case class TupleConverter3[A, B, C](gA: TupleGetter[A], gB: TupleGetter[B], gC: TupleGetter[C]) + extends TupleConverter[Tuple3[A, B, C]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple3(gA.get(tup, 0), gB.get(tup, 1), gC.get(tup, 2)) + } + def arity = 3 } - implicit def tuple3Converter[A,B,C](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C]): TupleConverter[Tuple3[A,B,C]] = TupleConverter3(gA, gB, gC) + implicit def tuple3Converter[A, B, C](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C] + ): TupleConverter[Tuple3[A, B, C]] = TupleConverter3(gA, gB, gC) - case class TupleConverter4[A,B,C,D]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D]) extends TupleConverter[Tuple4[A,B,C,D]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple4(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3)) - } - def arity = 4 + case class TupleConverter4[A, B, C, D]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D] + ) extends TupleConverter[Tuple4[A, B, C, D]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple4(gA.get(tup, 0), gB.get(tup, 1), gC.get(tup, 2), gD.get(tup, 3)) + } + def arity = 4 } - implicit def tuple4Converter[A,B,C,D](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D]): TupleConverter[Tuple4[A,B,C,D]] = 
TupleConverter4(gA, gB, gC, gD) + implicit def tuple4Converter[A, B, C, D](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D] + ): TupleConverter[Tuple4[A, B, C, D]] = TupleConverter4(gA, gB, gC, gD) - case class TupleConverter5[A,B,C,D,E]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E]) extends TupleConverter[Tuple5[A,B,C,D,E]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple5(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4)) - } - def arity = 5 + case class TupleConverter5[A, B, C, D, E]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E] + ) extends TupleConverter[Tuple5[A, B, C, D, E]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple5(gA.get(tup, 0), gB.get(tup, 1), gC.get(tup, 2), gD.get(tup, 3), gE.get(tup, 4)) + } + def arity = 5 } - implicit def tuple5Converter[A,B,C,D,E](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E]): TupleConverter[Tuple5[A,B,C,D,E]] = TupleConverter5(gA, gB, gC, gD, gE) + implicit def tuple5Converter[A, B, C, D, E](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E] + ): TupleConverter[Tuple5[A, B, C, D, E]] = TupleConverter5(gA, gB, gC, gD, gE) - case class TupleConverter6[A,B,C,D,E,F]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F]) extends TupleConverter[Tuple6[A,B,C,D,E,F]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple6(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5)) - } - def arity = 6 + case class TupleConverter6[A, B, C, D, E, F]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F] + ) extends TupleConverter[Tuple6[A, B, C, D, E, F]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple6(gA.get(tup, 0), gB.get(tup, 1), gC.get(tup, 2), gD.get(tup, 3), gE.get(tup, 4), gF.get(tup, 5)) + } + def arity = 6 } - implicit def tuple6Converter[A,B,C,D,E,F](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F]): TupleConverter[Tuple6[A,B,C,D,E,F]] = TupleConverter6(gA, gB, gC, gD, gE, gF) + implicit def tuple6Converter[A, B, C, D, E, F](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F] + ): TupleConverter[Tuple6[A, B, C, D, E, F]] = TupleConverter6(gA, gB, gC, gD, gE, gF) - case class TupleConverter7[A,B,C,D,E,F,G]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G]) extends TupleConverter[Tuple7[A,B,C,D,E,F,G]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple7(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6)) - } - def arity = 7 + case class TupleConverter7[A, B, C, D, E, F, G]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G] + ) extends 
TupleConverter[Tuple7[A, B, C, D, E, F, G]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple7( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6) + ) + } + def arity = 7 } - implicit def tuple7Converter[A,B,C,D,E,F,G](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G]): TupleConverter[Tuple7[A,B,C,D,E,F,G]] = TupleConverter7(gA, gB, gC, gD, gE, gF, gG) + implicit def tuple7Converter[A, B, C, D, E, F, G](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G] + ): TupleConverter[Tuple7[A, B, C, D, E, F, G]] = TupleConverter7(gA, gB, gC, gD, gE, gF, gG) - case class TupleConverter8[A,B,C,D,E,F,G,H]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H]) extends TupleConverter[Tuple8[A,B,C,D,E,F,G,H]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple8(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7)) - } - def arity = 8 + case class TupleConverter8[A, B, C, D, E, F, G, H]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H] + ) extends TupleConverter[Tuple8[A, B, C, D, E, F, G, H]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple8( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7) + ) + } + def arity = 8 } - implicit def tuple8Converter[A,B,C,D,E,F,G,H](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H]): TupleConverter[Tuple8[A,B,C,D,E,F,G,H]] = TupleConverter8(gA, gB, gC, gD, gE, gF, gG, gH) + implicit def tuple8Converter[A, B, C, D, E, F, G, H](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H] + ): TupleConverter[Tuple8[A, B, C, D, E, F, G, H]] = TupleConverter8(gA, gB, gC, gD, gE, gF, gG, gH) - case class TupleConverter9[A,B,C,D,E,F,G,H,I]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I]) extends TupleConverter[Tuple9[A,B,C,D,E,F,G,H,I]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple9(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8)) - } - def arity = 9 + case class TupleConverter9[A, B, C, D, E, F, G, H, I]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I] + ) extends TupleConverter[Tuple9[A, B, C, D, E, F, G, H, I]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple9( + gA.get(tup, 0), + gB.get(tup, 1), + 
gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8) + ) + } + def arity = 9 } - implicit def tuple9Converter[A,B,C,D,E,F,G,H,I](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I]): TupleConverter[Tuple9[A,B,C,D,E,F,G,H,I]] = TupleConverter9(gA, gB, gC, gD, gE, gF, gG, gH, gI) + implicit def tuple9Converter[A, B, C, D, E, F, G, H, I](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I] + ): TupleConverter[Tuple9[A, B, C, D, E, F, G, H, I]] = TupleConverter9(gA, gB, gC, gD, gE, gF, gG, gH, gI) - case class TupleConverter10[A,B,C,D,E,F,G,H,I,J]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J]) extends TupleConverter[Tuple10[A,B,C,D,E,F,G,H,I,J]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple10(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9)) - } - def arity = 10 + case class TupleConverter10[A, B, C, D, E, F, G, H, I, J]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J] + ) extends TupleConverter[Tuple10[A, B, C, D, E, F, G, H, I, J]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple10( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9) + ) + } + def arity = 10 } - implicit def tuple10Converter[A,B,C,D,E,F,G,H,I,J](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J]): TupleConverter[Tuple10[A,B,C,D,E,F,G,H,I,J]] = TupleConverter10(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ) + implicit def tuple10Converter[A, B, C, D, E, F, G, H, I, J](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J] + ): TupleConverter[Tuple10[A, B, C, D, E, F, G, H, I, J]] = + TupleConverter10(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ) - case class TupleConverter11[A,B,C,D,E,F,G,H,I,J,K]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K]) extends TupleConverter[Tuple11[A,B,C,D,E,F,G,H,I,J,K]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple11(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10)) - } - def 
arity = 11 + case class TupleConverter11[A, B, C, D, E, F, G, H, I, J, K]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K] + ) extends TupleConverter[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple11( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10) + ) + } + def arity = 11 } - implicit def tuple11Converter[A,B,C,D,E,F,G,H,I,J,K](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K]): TupleConverter[Tuple11[A,B,C,D,E,F,G,H,I,J,K]] = TupleConverter11(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK) + implicit def tuple11Converter[A, B, C, D, E, F, G, H, I, J, K](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K] + ): TupleConverter[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = + TupleConverter11(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK) - case class TupleConverter12[A,B,C,D,E,F,G,H,I,J,K,L]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L]) extends TupleConverter[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple12(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11)) - } - def arity = 12 + case class TupleConverter12[A, B, C, D, E, F, G, H, I, J, K, L]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L] + ) extends TupleConverter[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple12( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11) + ) + } + def arity = 12 } - implicit def tuple12Converter[A,B,C,D,E,F,G,H,I,J,K,L](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L]): TupleConverter[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] = TupleConverter12(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL) + implicit def tuple12Converter[A, B, C, D, E, F, G, H, I, J, K, L](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: 
TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L] + ): TupleConverter[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = + TupleConverter12(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL) - case class TupleConverter13[A,B,C,D,E,F,G,H,I,J,K,L,M]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M]) extends TupleConverter[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple13(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12)) - } - def arity = 13 + case class TupleConverter13[A, B, C, D, E, F, G, H, I, J, K, L, M]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M] + ) extends TupleConverter[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple13( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12) + ) + } + def arity = 13 } - implicit def tuple13Converter[A,B,C,D,E,F,G,H,I,J,K,L,M](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M]): TupleConverter[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] = TupleConverter13(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM) + implicit def tuple13Converter[A, B, C, D, E, F, G, H, I, J, K, L, M](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M] + ): TupleConverter[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = + TupleConverter13(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM) - case class TupleConverter14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N]) extends TupleConverter[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple14(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - 
gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13)) - } - def arity = 14 + case class TupleConverter14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N] + ) extends TupleConverter[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple14( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13) + ) + } + def arity = 14 } - implicit def tuple14Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N]): TupleConverter[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] = TupleConverter14(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN) + implicit def tuple14Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N] + ): TupleConverter[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = + TupleConverter14(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN) - case class TupleConverter15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O]) extends TupleConverter[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple15(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14)) - } - def arity = 15 + case class TupleConverter15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O] + ) extends TupleConverter[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple15( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + 
gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14) + ) + } + def arity = 15 } - implicit def tuple15Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O]): TupleConverter[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] = TupleConverter15(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO) + implicit def tuple15Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O] + ): TupleConverter[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = + TupleConverter15(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO) - case class TupleConverter16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P]) extends TupleConverter[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple16(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15)) - } - def arity = 16 + case class TupleConverter16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P] + ) extends TupleConverter[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple16( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15) + ) + } + def arity = 16 } - implicit def tuple16Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P]): TupleConverter[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] = TupleConverter16(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, 
gN, gO, gP) + implicit def tuple16Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P] + ): TupleConverter[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = + TupleConverter16(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP) - case class TupleConverter17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q]) extends TupleConverter[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple17(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16)) - } - def arity = 17 + case class TupleConverter17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q] + ) extends TupleConverter[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple17( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16) + ) + } + def arity = 17 } - implicit def tuple17Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q]): TupleConverter[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] = TupleConverter17(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ) + implicit def tuple17Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: 
TupleGetter[Q] + ): TupleConverter[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = + TupleConverter17(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ) - case class TupleConverter18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R]) extends TupleConverter[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple18(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17)) - } - def arity = 18 + case class TupleConverter18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R] + ) extends TupleConverter[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple18( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17) + ) + } + def arity = 18 } - implicit def tuple18Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R]): TupleConverter[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] = TupleConverter18(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR) + implicit def tuple18Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R] + ): TupleConverter[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = + TupleConverter18(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR) - case class TupleConverter19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]( gA : TupleGetter[A], - 
gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S]) extends TupleConverter[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple19(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17), - gS.get(tup, 18)) - } - def arity = 19 + case class TupleConverter19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S] + ) extends TupleConverter[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple19( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17), + gS.get(tup, 18) + ) + } + def arity = 19 } - implicit def tuple19Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S]): TupleConverter[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] = TupleConverter19(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS) + implicit def tuple19Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S] + ): TupleConverter[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] = + TupleConverter19(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS) - case class TupleConverter20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : 
TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S], - gT : TupleGetter[T]) extends TupleConverter[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple20(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17), - gS.get(tup, 18), - gT.get(tup, 19)) - } - def arity = 20 + case class TupleConverter20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T] + ) extends TupleConverter[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple20( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17), + gS.get(tup, 18), + gT.get(tup, 19) + ) + } + def arity = 20 } - implicit def tuple20Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S], - gT : TupleGetter[T]): TupleConverter[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] = TupleConverter20(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS, gT) + implicit def tuple20Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T] + ): TupleConverter[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = + TupleConverter20(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS, gT) - case class TupleConverter21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : 
TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S], - gT : TupleGetter[T], - gU : TupleGetter[U]) extends TupleConverter[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple21(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17), - gS.get(tup, 18), - gT.get(tup, 19), - gU.get(tup, 20)) - } - def arity = 21 + case class TupleConverter21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T], + gU: TupleGetter[U] + ) extends TupleConverter[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple21( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17), + gS.get(tup, 18), + gT.get(tup, 19), + gU.get(tup, 20) + ) + } + def arity = 21 } - implicit def tuple21Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S], - gT : TupleGetter[T], - gU : TupleGetter[U]): TupleConverter[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] = TupleConverter21(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS, gT, gU) + implicit def tuple21Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T], + gU: TupleGetter[U] + ): TupleConverter[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = + TupleConverter21(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, 
gQ, gR, gS, gT, gU) - case class TupleConverter22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]( gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S], - gT : TupleGetter[T], - gU : TupleGetter[U], - gV : TupleGetter[V]) extends TupleConverter[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] { - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple22(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17), - gS.get(tup, 18), - gT.get(tup, 19), - gU.get(tup, 20), - gV.get(tup, 21)) - } - def arity = 22 + case class TupleConverter22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T], + gU: TupleGetter[U], + gV: TupleGetter[V] + ) extends TupleConverter[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple22( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17), + gS.get(tup, 18), + gT.get(tup, 19), + gU.get(tup, 20), + gV.get(tup, 21) + ) + } + def arity = 22 } - implicit def tuple22Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S], - gT : TupleGetter[T], - gU : TupleGetter[U], - gV : TupleGetter[V]): TupleConverter[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] = TupleConverter22(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS, gT, gU, gV) + implicit def tuple22Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: 
TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T], + gU: TupleGetter[U], + gV: TupleGetter[V] + ): TupleConverter[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = + TupleConverter22(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS, gT, gU, gV) } trait GeneratedTupleSetters extends LowPriorityTupleSetters { @@ -930,7 +1019,7 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } implicit def tup1Setter[Z <: Tuple1[_]]: TupleSetter[Z] = TupleSetter1[Z]() - case class TupleSetter2[Z <: Tuple2[_,_]]() extends TupleSetter[Z] { + case class TupleSetter2[Z <: Tuple2[_, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(2) tup.set(0, arg._1) @@ -939,9 +1028,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 2 } - implicit def tup2Setter[Z <: Tuple2[_,_]]: TupleSetter[Z] = TupleSetter2[Z]() + implicit def tup2Setter[Z <: Tuple2[_, _]]: TupleSetter[Z] = TupleSetter2[Z]() - case class TupleSetter3[Z <: Tuple3[_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter3[Z <: Tuple3[_, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(3) tup.set(0, arg._1) @@ -951,9 +1040,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 3 } - implicit def tup3Setter[Z <: Tuple3[_,_,_]]: TupleSetter[Z] = TupleSetter3[Z]() + implicit def tup3Setter[Z <: Tuple3[_, _, _]]: TupleSetter[Z] = TupleSetter3[Z]() - case class TupleSetter4[Z <: Tuple4[_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter4[Z <: Tuple4[_, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(4) tup.set(0, arg._1) @@ -964,9 +1053,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 4 } - implicit def tup4Setter[Z <: Tuple4[_,_,_,_]]: TupleSetter[Z] = TupleSetter4[Z]() + implicit def tup4Setter[Z <: Tuple4[_, _, _, _]]: TupleSetter[Z] = TupleSetter4[Z]() - case class TupleSetter5[Z <: Tuple5[_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter5[Z <: Tuple5[_, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(5) tup.set(0, arg._1) @@ -978,9 +1067,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 5 } - implicit def tup5Setter[Z <: Tuple5[_,_,_,_,_]]: TupleSetter[Z] = TupleSetter5[Z]() + implicit def tup5Setter[Z <: Tuple5[_, _, _, _, _]]: TupleSetter[Z] = TupleSetter5[Z]() - case class TupleSetter6[Z <: Tuple6[_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter6[Z <: Tuple6[_, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(6) tup.set(0, arg._1) @@ -993,9 +1082,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 6 } - implicit def tup6Setter[Z <: Tuple6[_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter6[Z]() + implicit def tup6Setter[Z <: Tuple6[_, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter6[Z]() - case class TupleSetter7[Z <: Tuple7[_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter7[Z <: Tuple7[_, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(7) tup.set(0, arg._1) @@ -1009,9 +1098,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 7 } - implicit def tup7Setter[Z <: Tuple7[_,_,_,_,_,_,_]]: 
TupleSetter[Z] = TupleSetter7[Z]() + implicit def tup7Setter[Z <: Tuple7[_, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter7[Z]() - case class TupleSetter8[Z <: Tuple8[_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter8[Z <: Tuple8[_, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(8) tup.set(0, arg._1) @@ -1026,9 +1115,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 8 } - implicit def tup8Setter[Z <: Tuple8[_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter8[Z]() + implicit def tup8Setter[Z <: Tuple8[_, _, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter8[Z]() - case class TupleSetter9[Z <: Tuple9[_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter9[Z <: Tuple9[_, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(9) tup.set(0, arg._1) @@ -1044,9 +1133,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 9 } - implicit def tup9Setter[Z <: Tuple9[_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter9[Z]() + implicit def tup9Setter[Z <: Tuple9[_, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter9[Z]() - case class TupleSetter10[Z <: Tuple10[_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter10[Z <: Tuple10[_, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(10) tup.set(0, arg._1) @@ -1063,9 +1152,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 10 } - implicit def tup10Setter[Z <: Tuple10[_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter10[Z]() + implicit def tup10Setter[Z <: Tuple10[_, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter10[Z]() - case class TupleSetter11[Z <: Tuple11[_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter11[Z <: Tuple11[_, _, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(11) tup.set(0, arg._1) @@ -1083,9 +1172,9 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 11 } - implicit def tup11Setter[Z <: Tuple11[_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter11[Z]() + implicit def tup11Setter[Z <: Tuple11[_, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter11[Z]() - case class TupleSetter12[Z <: Tuple12[_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter12[Z <: Tuple12[_, _, _, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(12) tup.set(0, arg._1) @@ -1104,9 +1193,10 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 12 } - implicit def tup12Setter[Z <: Tuple12[_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter12[Z]() + implicit def tup12Setter[Z <: Tuple12[_, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter12[Z]() - case class TupleSetter13[Z <: Tuple13[_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter13[Z <: Tuple13[_, _, _, _, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(13) tup.set(0, arg._1) @@ -1126,9 +1216,10 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 13 } - implicit def tup13Setter[Z <: Tuple13[_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter13[Z]() + implicit def tup13Setter[Z <: Tuple13[_, 
_, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter13[Z]() - case class TupleSetter14[Z <: Tuple14[_,_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter14[Z <: Tuple14[_, _, _, _, _, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(14) tup.set(0, arg._1) @@ -1149,9 +1240,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 14 } - implicit def tup14Setter[Z <: Tuple14[_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter14[Z]() + implicit def tup14Setter[Z <: Tuple14[_, _, _, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter14[Z]() - case class TupleSetter15[Z <: Tuple15[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter15[Z <: Tuple15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(15) tup.set(0, arg._1) @@ -1173,9 +1266,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 15 } - implicit def tup15Setter[Z <: Tuple15[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter15[Z]() + implicit def tup15Setter[Z <: Tuple15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter15[Z]() - case class TupleSetter16[Z <: Tuple16[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter16[Z <: Tuple16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(16) tup.set(0, arg._1) @@ -1198,9 +1293,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 16 } - implicit def tup16Setter[Z <: Tuple16[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter16[Z]() + implicit def tup16Setter[Z <: Tuple16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter16[Z]() - case class TupleSetter17[Z <: Tuple17[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter17[Z <: Tuple17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(17) tup.set(0, arg._1) @@ -1224,9 +1321,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 17 } - implicit def tup17Setter[Z <: Tuple17[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter17[Z]() + implicit def tup17Setter[Z <: Tuple17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter17[Z]() - case class TupleSetter18[Z <: Tuple18[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter18[Z <: Tuple18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(18) tup.set(0, arg._1) @@ -1251,9 +1350,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 18 } - implicit def tup18Setter[Z <: Tuple18[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter18[Z]() + implicit def tup18Setter[Z <: Tuple18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter18[Z]() - case class TupleSetter19[Z <: Tuple19[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter19[Z <: Tuple19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def 
apply(arg: Z) = { val tup = Tuple.size(19) tup.set(0, arg._1) @@ -1279,9 +1380,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 19 } - implicit def tup19Setter[Z <: Tuple19[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter19[Z]() + implicit def tup19Setter[Z <: Tuple19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter19[Z]() - case class TupleSetter20[Z <: Tuple20[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter20[Z <: Tuple20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(20) tup.set(0, arg._1) @@ -1308,9 +1411,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 20 } - implicit def tup20Setter[Z <: Tuple20[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter20[Z]() + implicit def tup20Setter[Z <: Tuple20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter20[Z]() - case class TupleSetter21[Z <: Tuple21[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter21[Z <: Tuple21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(21) tup.set(0, arg._1) @@ -1338,9 +1443,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 21 } - implicit def tup21Setter[Z <: Tuple21[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter21[Z]() + implicit def tup21Setter[Z <: Tuple21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter21[Z]() - case class TupleSetter22[Z <: Tuple22[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]() extends TupleSetter[Z] { + case class TupleSetter22[Z <: Tuple22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(22) tup.set(0, arg._1) @@ -1369,6 +1476,7 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { } override def arity = 22 } - implicit def tup22Setter[Z <: Tuple22[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = TupleSetter22[Z]() + implicit def tup22Setter[Z <: Tuple22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter22[Z]() } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedMappable.scala b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedMappable.scala index b769fced9e..76308c2d6f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedMappable.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedMappable.scala @@ -14,79 +14,123 @@ trait Mappable3[A, B, C] extends Mappable[Tuple3[A, B, C]] { } trait Mappable4[A, B, C, D] extends Mappable[Tuple4[A, B, C, D]] { - def converter[Z >: Tuple4[A, B, C, D]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple4[A, B, C, D]]) + def converter[Z >: Tuple4[A, B, C, D]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple4[A, B, C, D]]) } trait Mappable5[A, B, C, D, E] extends Mappable[Tuple5[A, B, C, D, E]] { - def converter[Z >: Tuple5[A, B, C, D, E]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple5[A, B, C, D, E]]) + def converter[Z >: Tuple5[A, B, C, D, E]] = + 
TupleConverter.asSuperConverter(TupleConverter.of[Tuple5[A, B, C, D, E]]) } trait Mappable6[A, B, C, D, E, F] extends Mappable[Tuple6[A, B, C, D, E, F]] { - def converter[Z >: Tuple6[A, B, C, D, E, F]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple6[A, B, C, D, E, F]]) + def converter[Z >: Tuple6[A, B, C, D, E, F]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple6[A, B, C, D, E, F]]) } trait Mappable7[A, B, C, D, E, F, G] extends Mappable[Tuple7[A, B, C, D, E, F, G]] { - def converter[Z >: Tuple7[A, B, C, D, E, F, G]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple7[A, B, C, D, E, F, G]]) + def converter[Z >: Tuple7[A, B, C, D, E, F, G]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple7[A, B, C, D, E, F, G]]) } trait Mappable8[A, B, C, D, E, F, G, H] extends Mappable[Tuple8[A, B, C, D, E, F, G, H]] { - def converter[Z >: Tuple8[A, B, C, D, E, F, G, H]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple8[A, B, C, D, E, F, G, H]]) + def converter[Z >: Tuple8[A, B, C, D, E, F, G, H]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple8[A, B, C, D, E, F, G, H]]) } trait Mappable9[A, B, C, D, E, F, G, H, I] extends Mappable[Tuple9[A, B, C, D, E, F, G, H, I]] { - def converter[Z >: Tuple9[A, B, C, D, E, F, G, H, I]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple9[A, B, C, D, E, F, G, H, I]]) + def converter[Z >: Tuple9[A, B, C, D, E, F, G, H, I]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple9[A, B, C, D, E, F, G, H, I]]) } trait Mappable10[A, B, C, D, E, F, G, H, I, J] extends Mappable[Tuple10[A, B, C, D, E, F, G, H, I, J]] { - def converter[Z >: Tuple10[A, B, C, D, E, F, G, H, I, J]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple10[A, B, C, D, E, F, G, H, I, J]]) + def converter[Z >: Tuple10[A, B, C, D, E, F, G, H, I, J]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple10[A, B, C, D, E, F, G, H, I, J]]) } trait Mappable11[A, B, C, D, E, F, G, H, I, J, K] extends Mappable[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { - def converter[Z >: Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple11[A, B, C, D, E, F, G, H, I, J, K]]) + def converter[Z >: Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple11[A, B, C, D, E, F, G, H, I, J, K]]) } -trait Mappable12[A, B, C, D, E, F, G, H, I, J, K, L] extends Mappable[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { - def converter[Z >: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]]) +trait Mappable12[A, B, C, D, E, F, G, H, I, J, K, L] + extends Mappable[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { + def converter[Z >: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]]) } -trait Mappable13[A, B, C, D, E, F, G, H, I, J, K, L, M] extends Mappable[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { - def converter[Z >: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]]) +trait Mappable13[A, B, C, D, E, F, G, H, I, J, K, L, M] + extends Mappable[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { + def converter[Z >: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]]) } -trait Mappable14[A, 
B, C, D, E, F, G, H, I, J, K, L, M, N] extends Mappable[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { - def converter[Z >: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]]) +trait Mappable14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + extends Mappable[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { + def converter[Z >: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]]) } -trait Mappable15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] extends Mappable[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { - def converter[Z >: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]]) +trait Mappable15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + extends Mappable[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { + def converter[Z >: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]]) } -trait Mappable16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] extends Mappable[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { - def converter[Z >: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]]) +trait Mappable16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + extends Mappable[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { + def converter[Z >: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] + ) } -trait Mappable17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] extends Mappable[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { - def converter[Z >: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]]) +trait Mappable17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + extends Mappable[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { + def converter[Z >: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] + ) } -trait Mappable18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] extends Mappable[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { - def converter[Z >: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]]) +trait Mappable18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + extends Mappable[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { + def converter[Z >: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] + ) } -trait Mappable19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] extends Mappable[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { - def converter[Z >: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, 
N, O, P, Q, R, S]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]]) +trait Mappable19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + extends Mappable[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { + def converter[Z >: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] + ) } -trait Mappable20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] extends Mappable[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { - def converter[Z >: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]]) +trait Mappable20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + extends Mappable[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { + def converter[Z >: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] + ) } -trait Mappable21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] extends Mappable[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { - def converter[Z >: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]]) +trait Mappable21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + extends Mappable[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { + def converter[Z >: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] + ) } -trait Mappable22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] extends Mappable[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { - def converter[Z >: Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]]) +trait Mappable22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + extends Mappable[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { + def converter[Z >: Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] + ) } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala index a3353666aa..8c9a866e46 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala @@ -3,1159 +3,4630 @@ package com.twitter.scalding trait GeneratedTupleAdders { class Tuple1Adder[A](tup: Tuple1[A]) { - def :+[B](other: B) = { + def :+[B](other: B) = (tup._1, other) - } - def +:[B](other: B) = { + def +:[B](other: B) = (other, tup._1) - } - def 
++[B](other: Tuple1[B]) = { + def ++[B](other: Tuple1[B]) = (tup._1, other._1) - } - def ++[B, C](other: Tuple2[B, C]) = { + def ++[B, C](other: Tuple2[B, C]) = (tup._1, other._1, other._2) - } - def ++[B, C, D](other: Tuple3[B, C, D]) = { + def ++[B, C, D](other: Tuple3[B, C, D]) = (tup._1, other._1, other._2, other._3) - } - def ++[B, C, D, E](other: Tuple4[B, C, D, E]) = { + def ++[B, C, D, E](other: Tuple4[B, C, D, E]) = (tup._1, other._1, other._2, other._3, other._4) - } - def ++[B, C, D, E, F](other: Tuple5[B, C, D, E, F]) = { + def ++[B, C, D, E, F](other: Tuple5[B, C, D, E, F]) = (tup._1, other._1, other._2, other._3, other._4, other._5) - } - def ++[B, C, D, E, F, G](other: Tuple6[B, C, D, E, F, G]) = { + def ++[B, C, D, E, F, G](other: Tuple6[B, C, D, E, F, G]) = (tup._1, other._1, other._2, other._3, other._4, other._5, other._6) - } - def ++[B, C, D, E, F, G, H](other: Tuple7[B, C, D, E, F, G, H]) = { + def ++[B, C, D, E, F, G, H](other: Tuple7[B, C, D, E, F, G, H]) = (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - def ++[B, C, D, E, F, G, H, I](other: Tuple8[B, C, D, E, F, G, H, I]) = { + def ++[B, C, D, E, F, G, H, I](other: Tuple8[B, C, D, E, F, G, H, I]) = (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - def ++[B, C, D, E, F, G, H, I, J](other: Tuple9[B, C, D, E, F, G, H, I, J]) = { + def ++[B, C, D, E, F, G, H, I, J](other: Tuple9[B, C, D, E, F, G, H, I, J]) = (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - def ++[B, C, D, E, F, G, H, I, J, K](other: Tuple10[B, C, D, E, F, G, H, I, J, K]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L](other: Tuple11[B, C, D, E, F, G, H, I, J, K, L]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M](other: Tuple12[B, C, D, E, F, G, H, I, J, K, L, M]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M, N](other: Tuple13[B, C, D, E, F, G, H, I, J, K, L, M, N]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O](other: Tuple14[B, C, D, E, F, G, H, I, J, K, L, M, N, O]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](other: Tuple15[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](other: Tuple16[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple17[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]) = { - (tup._1, other._1, other._2, 
other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple18[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple19[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18, other._19) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple20[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18, other._19, other._20) - } - - def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple21[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18, other._19, other._20, other._21) - } + def ++[B, C, D, E, F, G, H, I, J, K](other: Tuple10[B, C, D, E, F, G, H, I, J, K]) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L](other: Tuple11[B, C, D, E, F, G, H, I, J, K, L]) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M](other: Tuple12[B, C, D, E, F, G, H, I, J, K, L, M]) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N](other: Tuple13[B, C, D, E, F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + other: Tuple14[B, C, D, E, F, G, H, I, J, K, L, M, N, O] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + other: Tuple15[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + other: Tuple16[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + 
other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + other: Tuple17[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple18[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple19[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple20[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19, + other._20 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple21[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19, + other._20, + other._21 + ) } implicit def tup1ToAdder[A](tup: Tuple1[A]): Tuple1Adder[A] = new Tuple1Adder(tup) class Tuple2Adder[A, B](tup: Tuple2[A, B]) { - def :+[C](other: C) = { + def :+[C](other: C) = (tup._1, tup._2, other) - } - def +:[C](other: C) = { + def +:[C](other: C) = (other, tup._1, tup._2) - } - def ++[C](other: Tuple1[C]) = { + def ++[C](other: Tuple1[C]) = (tup._1, tup._2, other._1) - } - def ++[C, D](other: Tuple2[C, D]) = { + def ++[C, D](other: Tuple2[C, D]) = (tup._1, tup._2, other._1, other._2) - } - def ++[C, D, E](other: Tuple3[C, D, E]) = { + def ++[C, D, E](other: Tuple3[C, D, E]) = (tup._1, tup._2, other._1, other._2, other._3) - } - def ++[C, D, E, F](other: Tuple4[C, D, E, F]) = { + def ++[C, D, E, F](other: Tuple4[C, D, E, F]) = (tup._1, tup._2, other._1, other._2, other._3, other._4) - } - def ++[C, D, E, F, G](other: Tuple5[C, D, E, F, G]) = { + def ++[C, D, E, F, G](other: Tuple5[C, D, E, F, G]) = (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5) - } - def ++[C, D, E, F, G, H](other: Tuple6[C, D, E, F, G, H]) = { + def ++[C, D, E, F, G, H](other: Tuple6[C, D, E, F, G, H]) = (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6) - } - def ++[C, D, E, F, G, H, I](other: Tuple7[C, D, E, F, G, H, I]) = { + def ++[C, D, E, F, G, H, I](other: Tuple7[C, D, E, F, G, H, I]) = (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - def ++[C, D, 
E, F, G, H, I, J](other: Tuple8[C, D, E, F, G, H, I, J]) = { + def ++[C, D, E, F, G, H, I, J](other: Tuple8[C, D, E, F, G, H, I, J]) = (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[C, D, E, F, G, H, I, J, K](other: Tuple9[C, D, E, F, G, H, I, J, K]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[C, D, E, F, G, H, I, J, K, L](other: Tuple10[C, D, E, F, G, H, I, J, K, L]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[C, D, E, F, G, H, I, J, K, L, M](other: Tuple11[C, D, E, F, G, H, I, J, K, L, M]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[C, D, E, F, G, H, I, J, K, L, M, N](other: Tuple12[C, D, E, F, G, H, I, J, K, L, M, N]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } - - def ++[C, D, E, F, G, H, I, J, K, L, M, N, O](other: Tuple13[C, D, E, F, G, H, I, J, K, L, M, N, O]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13) - } - - def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P](other: Tuple14[C, D, E, F, G, H, I, J, K, L, M, N, O, P]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14) - } - - def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](other: Tuple15[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15) - } - - def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple16[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16) - } - - def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple17[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17) - } - - def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple18[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18) - } - def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple19[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18, other._19) - } - - def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple20[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, other._1, other._2, other._3, 
other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18, other._19, other._20) - } + def ++[C, D, E, F, G, H, I, J, K](other: Tuple9[C, D, E, F, G, H, I, J, K]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[C, D, E, F, G, H, I, J, K, L](other: Tuple10[C, D, E, F, G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M](other: Tuple11[C, D, E, F, G, H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N](other: Tuple12[C, D, E, F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O](other: Tuple13[C, D, E, F, G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + other: Tuple14[C, D, E, F, G, H, I, J, K, L, M, N, O, P] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + other: Tuple15[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + other: Tuple16[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple17[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple18[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple19[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, 
+ other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple20[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19, + other._20 + ) } implicit def tup2ToAdder[A, B](tup: Tuple2[A, B]): Tuple2Adder[A, B] = new Tuple2Adder(tup) class Tuple3Adder[A, B, C](tup: Tuple3[A, B, C]) { - def :+[D](other: D) = { + def :+[D](other: D) = (tup._1, tup._2, tup._3, other) - } - def +:[D](other: D) = { + def +:[D](other: D) = (other, tup._1, tup._2, tup._3) - } - def ++[D](other: Tuple1[D]) = { + def ++[D](other: Tuple1[D]) = (tup._1, tup._2, tup._3, other._1) - } - def ++[D, E](other: Tuple2[D, E]) = { + def ++[D, E](other: Tuple2[D, E]) = (tup._1, tup._2, tup._3, other._1, other._2) - } - def ++[D, E, F](other: Tuple3[D, E, F]) = { + def ++[D, E, F](other: Tuple3[D, E, F]) = (tup._1, tup._2, tup._3, other._1, other._2, other._3) - } - def ++[D, E, F, G](other: Tuple4[D, E, F, G]) = { + def ++[D, E, F, G](other: Tuple4[D, E, F, G]) = (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4) - } - def ++[D, E, F, G, H](other: Tuple5[D, E, F, G, H]) = { + def ++[D, E, F, G, H](other: Tuple5[D, E, F, G, H]) = (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5) - } - def ++[D, E, F, G, H, I](other: Tuple6[D, E, F, G, H, I]) = { + def ++[D, E, F, G, H, I](other: Tuple6[D, E, F, G, H, I]) = (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6) - } - def ++[D, E, F, G, H, I, J](other: Tuple7[D, E, F, G, H, I, J]) = { + def ++[D, E, F, G, H, I, J](other: Tuple7[D, E, F, G, H, I, J]) = (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - def ++[D, E, F, G, H, I, J, K](other: Tuple8[D, E, F, G, H, I, J, K]) = { + def ++[D, E, F, G, H, I, J, K](other: Tuple8[D, E, F, G, H, I, J, K]) = (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[D, E, F, G, H, I, J, K, L](other: Tuple9[D, E, F, G, H, I, J, K, L]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[D, E, F, G, H, I, J, K, L, M](other: Tuple10[D, E, F, G, H, I, J, K, L, M]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[D, E, F, G, H, I, J, K, L, M, N](other: Tuple11[D, E, F, G, H, I, J, K, L, M, N]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[D, E, F, G, H, I, J, K, L, M, N, O](other: Tuple12[D, E, F, G, H, I, J, K, L, M, N, O]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } - - def ++[D, E, F, G, H, I, J, K, L, M, N, O, P](other: Tuple13[D, E, F, G, H, I, J, K, L, M, N, O, P]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13) - } - - def ++[D, E, F, G, H, I, J, K, L, M, 
N, O, P, Q](other: Tuple14[D, E, F, G, H, I, J, K, L, M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14) - } - - def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple15[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15) - } - - def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple16[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16) - } - - def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple17[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17) - } - def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple18[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18) - } - - def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple19[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18, other._19) - } + def ++[D, E, F, G, H, I, J, K, L](other: Tuple9[D, E, F, G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[D, E, F, G, H, I, J, K, L, M](other: Tuple10[D, E, F, G, H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N](other: Tuple11[D, E, F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O](other: Tuple12[D, E, F, G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P](other: Tuple13[D, E, F, G, H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + other: Tuple14[D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + 
other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + other: Tuple15[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple16[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple17[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple18[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple19[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19 + ) } implicit def tup3ToAdder[A, B, C](tup: Tuple3[A, B, C]): Tuple3Adder[A, B, C] = new Tuple3Adder(tup) class Tuple4Adder[A, B, C, D](tup: Tuple4[A, B, C, D]) { - def :+[E](other: E) = { + def :+[E](other: E) = (tup._1, tup._2, tup._3, tup._4, other) - } - def +:[E](other: E) = { + def +:[E](other: E) = (other, tup._1, tup._2, tup._3, tup._4) - } - def ++[E](other: Tuple1[E]) = { + def ++[E](other: Tuple1[E]) = (tup._1, tup._2, tup._3, tup._4, other._1) - } - def ++[E, F](other: Tuple2[E, F]) = { + def ++[E, F](other: Tuple2[E, F]) = (tup._1, tup._2, tup._3, tup._4, other._1, other._2) - } - def ++[E, F, G](other: Tuple3[E, F, G]) = { + def ++[E, F, G](other: Tuple3[E, F, G]) = (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3) - } - def ++[E, F, G, H](other: Tuple4[E, F, G, H]) = { + def ++[E, F, G, H](other: Tuple4[E, F, G, H]) = (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4) - } - def ++[E, F, G, H, I](other: Tuple5[E, F, G, H, I]) = { + def ++[E, F, G, H, I](other: Tuple5[E, F, G, H, I]) = (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5) - } - def ++[E, F, G, H, I, J](other: Tuple6[E, F, G, H, I, J]) = { + def ++[E, F, G, H, I, J](other: Tuple6[E, F, G, H, I, J]) = (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6) - } - def ++[E, F, G, H, I, J, K](other: Tuple7[E, F, G, H, I, J, K]) = { + def ++[E, F, G, H, I, J, K](other: Tuple7[E, F, G, H, I, J, K]) = (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, 
other._4, other._5, other._6, other._7) - } - - def ++[E, F, G, H, I, J, K, L](other: Tuple8[E, F, G, H, I, J, K, L]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[E, F, G, H, I, J, K, L, M](other: Tuple9[E, F, G, H, I, J, K, L, M]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[E, F, G, H, I, J, K, L, M, N](other: Tuple10[E, F, G, H, I, J, K, L, M, N]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[E, F, G, H, I, J, K, L, M, N, O](other: Tuple11[E, F, G, H, I, J, K, L, M, N, O]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[E, F, G, H, I, J, K, L, M, N, O, P](other: Tuple12[E, F, G, H, I, J, K, L, M, N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } - - def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q](other: Tuple13[E, F, G, H, I, J, K, L, M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13) - } - - def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple14[E, F, G, H, I, J, K, L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14) - } - - def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple15[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15) - } - - def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple16[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16) - } - def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple17[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17) - } - - def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple18[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17, other._18) - } + def ++[E, F, G, H, I, J, K, L](other: Tuple8[E, F, G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[E, F, G, H, I, J, K, L, M](other: Tuple9[E, F, G, H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + 
other._6, + other._7, + other._8, + other._9 + ) + + def ++[E, F, G, H, I, J, K, L, M, N](other: Tuple10[E, F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O](other: Tuple11[E, F, G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P](other: Tuple12[E, F, G, H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q](other: Tuple13[E, F, G, H, I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + other: Tuple14[E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple15[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple16[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple17[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple18[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18 + ) } - implicit def tup4ToAdder[A, B, C, D](tup: Tuple4[A, B, C, D]): Tuple4Adder[A, B, C, D] = new Tuple4Adder(tup) + implicit def tup4ToAdder[A, B, C, D](tup: Tuple4[A, B, C, D]): Tuple4Adder[A, B, C, D] = new Tuple4Adder( + tup + ) class Tuple5Adder[A, B, C, D, E](tup: Tuple5[A, B, C, D, E]) { - def :+[F](other: F) = { + def :+[F](other: F) = (tup._1, tup._2, tup._3, tup._4, tup._5, other) - } - def +:[F](other: F) = { + def +:[F](other: F) = (other, tup._1, tup._2, tup._3, 
tup._4, tup._5) - } - def ++[F](other: Tuple1[F]) = { + def ++[F](other: Tuple1[F]) = (tup._1, tup._2, tup._3, tup._4, tup._5, other._1) - } - def ++[F, G](other: Tuple2[F, G]) = { + def ++[F, G](other: Tuple2[F, G]) = (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2) - } - def ++[F, G, H](other: Tuple3[F, G, H]) = { + def ++[F, G, H](other: Tuple3[F, G, H]) = (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3) - } - def ++[F, G, H, I](other: Tuple4[F, G, H, I]) = { + def ++[F, G, H, I](other: Tuple4[F, G, H, I]) = (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4) - } - def ++[F, G, H, I, J](other: Tuple5[F, G, H, I, J]) = { + def ++[F, G, H, I, J](other: Tuple5[F, G, H, I, J]) = (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5) - } - def ++[F, G, H, I, J, K](other: Tuple6[F, G, H, I, J, K]) = { + def ++[F, G, H, I, J, K](other: Tuple6[F, G, H, I, J, K]) = (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[F, G, H, I, J, K, L](other: Tuple7[F, G, H, I, J, K, L]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[F, G, H, I, J, K, L, M](other: Tuple8[F, G, H, I, J, K, L, M]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[F, G, H, I, J, K, L, M, N](other: Tuple9[F, G, H, I, J, K, L, M, N]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[F, G, H, I, J, K, L, M, N, O](other: Tuple10[F, G, H, I, J, K, L, M, N, O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[F, G, H, I, J, K, L, M, N, O, P](other: Tuple11[F, G, H, I, J, K, L, M, N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[F, G, H, I, J, K, L, M, N, O, P, Q](other: Tuple12[F, G, H, I, J, K, L, M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } - - def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple13[F, G, H, I, J, K, L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13) - } - - def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple14[F, G, H, I, J, K, L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14) - } - - def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple15[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15) - } - def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple16[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, 
other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16) - } - - def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple17[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16, other._17) - } + def ++[F, G, H, I, J, K, L](other: Tuple7[F, G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[F, G, H, I, J, K, L, M](other: Tuple8[F, G, H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[F, G, H, I, J, K, L, M, N](other: Tuple9[F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[F, G, H, I, J, K, L, M, N, O](other: Tuple10[F, G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P](other: Tuple11[F, G, H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q](other: Tuple12[F, G, H, I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple13[F, G, H, I, J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple14[F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple15[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple16[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, 
T, U, V]( + other: Tuple17[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) } - implicit def tup5ToAdder[A, B, C, D, E](tup: Tuple5[A, B, C, D, E]): Tuple5Adder[A, B, C, D, E] = new Tuple5Adder(tup) + implicit def tup5ToAdder[A, B, C, D, E](tup: Tuple5[A, B, C, D, E]): Tuple5Adder[A, B, C, D, E] = + new Tuple5Adder(tup) class Tuple6Adder[A, B, C, D, E, F](tup: Tuple6[A, B, C, D, E, F]) { - def :+[G](other: G) = { + def :+[G](other: G) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other) - } - def +:[G](other: G) = { + def +:[G](other: G) = (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6) - } - def ++[G](other: Tuple1[G]) = { + def ++[G](other: Tuple1[G]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1) - } - def ++[G, H](other: Tuple2[G, H]) = { + def ++[G, H](other: Tuple2[G, H]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2) - } - def ++[G, H, I](other: Tuple3[G, H, I]) = { + def ++[G, H, I](other: Tuple3[G, H, I]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3) - } - def ++[G, H, I, J](other: Tuple4[G, H, I, J]) = { + def ++[G, H, I, J](other: Tuple4[G, H, I, J]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4) - } - def ++[G, H, I, J, K](other: Tuple5[G, H, I, J, K]) = { + def ++[G, H, I, J, K](other: Tuple5[G, H, I, J, K]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5) - } - - def ++[G, H, I, J, K, L](other: Tuple6[G, H, I, J, K, L]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[G, H, I, J, K, L, M](other: Tuple7[G, H, I, J, K, L, M]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[G, H, I, J, K, L, M, N](other: Tuple8[G, H, I, J, K, L, M, N]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[G, H, I, J, K, L, M, N, O](other: Tuple9[G, H, I, J, K, L, M, N, O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[G, H, I, J, K, L, M, N, O, P](other: Tuple10[G, H, I, J, K, L, M, N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[G, H, I, J, K, L, M, N, O, P, Q](other: Tuple11[G, H, I, J, K, L, M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple12[G, H, I, J, K, L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } - - def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple13[G, H, I, J, K, L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, 
other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13) - } - - def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple14[G, H, I, J, K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14) - } - def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple15[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15) - } - - def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple16[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15, other._16) - } + def ++[G, H, I, J, K, L](other: Tuple6[G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[G, H, I, J, K, L, M](other: Tuple7[G, H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[G, H, I, J, K, L, M, N](other: Tuple8[G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[G, H, I, J, K, L, M, N, O](other: Tuple9[G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[G, H, I, J, K, L, M, N, O, P](other: Tuple10[G, H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q](other: Tuple11[G, H, I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple12[G, H, I, J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple13[G, H, I, J, K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple14[G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + 
other._11, + other._12, + other._13, + other._14 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple15[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple16[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) } - implicit def tup6ToAdder[A, B, C, D, E, F](tup: Tuple6[A, B, C, D, E, F]): Tuple6Adder[A, B, C, D, E, F] = new Tuple6Adder(tup) + implicit def tup6ToAdder[A, B, C, D, E, F](tup: Tuple6[A, B, C, D, E, F]): Tuple6Adder[A, B, C, D, E, F] = + new Tuple6Adder(tup) class Tuple7Adder[A, B, C, D, E, F, G](tup: Tuple7[A, B, C, D, E, F, G]) { - def :+[H](other: H) = { + def :+[H](other: H) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other) - } - def +:[H](other: H) = { + def +:[H](other: H) = (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7) - } - def ++[H](other: Tuple1[H]) = { + def ++[H](other: Tuple1[H]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1) - } - def ++[H, I](other: Tuple2[H, I]) = { + def ++[H, I](other: Tuple2[H, I]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2) - } - def ++[H, I, J](other: Tuple3[H, I, J]) = { + def ++[H, I, J](other: Tuple3[H, I, J]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3) - } - def ++[H, I, J, K](other: Tuple4[H, I, J, K]) = { + def ++[H, I, J, K](other: Tuple4[H, I, J, K]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4) - } - - def ++[H, I, J, K, L](other: Tuple5[H, I, J, K, L]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5) - } - - def ++[H, I, J, K, L, M](other: Tuple6[H, I, J, K, L, M]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[H, I, J, K, L, M, N](other: Tuple7[H, I, J, K, L, M, N]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[H, I, J, K, L, M, N, O](other: Tuple8[H, I, J, K, L, M, N, O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[H, I, J, K, L, M, N, O, P](other: Tuple9[H, I, J, K, L, M, N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[H, I, J, K, L, M, N, O, P, Q](other: Tuple10[H, I, J, K, L, M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[H, I, J, K, L, M, N, O, P, Q, R](other: Tuple11[H, I, J, K, L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, 
other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple12[H, I, J, K, L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } - - def ++[H, I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple13[H, I, J, K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13) - } - def ++[H, I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple14[H, I, J, K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14) - } - - def ++[H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple15[H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14, other._15) - } + def ++[H, I, J, K, L](other: Tuple5[H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[H, I, J, K, L, M](other: Tuple6[H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[H, I, J, K, L, M, N](other: Tuple7[H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[H, I, J, K, L, M, N, O](other: Tuple8[H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[H, I, J, K, L, M, N, O, P](other: Tuple9[H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q](other: Tuple10[H, I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R](other: Tuple11[H, I, J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple12[H, I, J, K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple13[H, I, J, K, L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, 
+ tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple14[H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple15[H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) } - implicit def tup7ToAdder[A, B, C, D, E, F, G](tup: Tuple7[A, B, C, D, E, F, G]): Tuple7Adder[A, B, C, D, E, F, G] = new Tuple7Adder(tup) + implicit def tup7ToAdder[A, B, C, D, E, F, G]( + tup: Tuple7[A, B, C, D, E, F, G] + ): Tuple7Adder[A, B, C, D, E, F, G] = new Tuple7Adder(tup) class Tuple8Adder[A, B, C, D, E, F, G, H](tup: Tuple8[A, B, C, D, E, F, G, H]) { - def :+[I](other: I) = { + def :+[I](other: I) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other) - } - def +:[I](other: I) = { + def +:[I](other: I) = (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8) - } - def ++[I](other: Tuple1[I]) = { + def ++[I](other: Tuple1[I]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1) - } - def ++[I, J](other: Tuple2[I, J]) = { + def ++[I, J](other: Tuple2[I, J]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2) - } - def ++[I, J, K](other: Tuple3[I, J, K]) = { + def ++[I, J, K](other: Tuple3[I, J, K]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3) - } - def ++[I, J, K, L](other: Tuple4[I, J, K, L]) = { + def ++[I, J, K, L](other: Tuple4[I, J, K, L]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4) - } - - def ++[I, J, K, L, M](other: Tuple5[I, J, K, L, M]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, other._5) - } - - def ++[I, J, K, L, M, N](other: Tuple6[I, J, K, L, M, N]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[I, J, K, L, M, N, O](other: Tuple7[I, J, K, L, M, N, O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[I, J, K, L, M, N, O, P](other: Tuple8[I, J, K, L, M, N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[I, J, K, L, M, N, O, P, Q](other: Tuple9[I, J, K, L, M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[I, J, K, L, M, N, O, P, Q, R](other: Tuple10[I, J, K, L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, 
other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[I, J, K, L, M, N, O, P, Q, R, S](other: Tuple11[I, J, K, L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple12[I, J, K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } - - def ++[I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple13[I, J, K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13) - } - def ++[I, J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple14[I, J, K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13, other._14) - } + def ++[I, J, K, L, M](other: Tuple5[I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[I, J, K, L, M, N](other: Tuple6[I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[I, J, K, L, M, N, O](other: Tuple7[I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[I, J, K, L, M, N, O, P](other: Tuple8[I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[I, J, K, L, M, N, O, P, Q](other: Tuple9[I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R](other: Tuple10[I, J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R, S](other: Tuple11[I, J, K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple12[I, J, K, L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple13[I, J, K, L, M, N, O, P, Q, R, S, T, U]) = + ( + 
tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple14[I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) } - implicit def tup8ToAdder[A, B, C, D, E, F, G, H](tup: Tuple8[A, B, C, D, E, F, G, H]): Tuple8Adder[A, B, C, D, E, F, G, H] = new Tuple8Adder(tup) + implicit def tup8ToAdder[A, B, C, D, E, F, G, H]( + tup: Tuple8[A, B, C, D, E, F, G, H] + ): Tuple8Adder[A, B, C, D, E, F, G, H] = new Tuple8Adder(tup) class Tuple9Adder[A, B, C, D, E, F, G, H, I](tup: Tuple9[A, B, C, D, E, F, G, H, I]) { - def :+[J](other: J) = { + def :+[J](other: J) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other) - } - def +:[J](other: J) = { + def +:[J](other: J) = (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9) - } - def ++[J](other: Tuple1[J]) = { + def ++[J](other: Tuple1[J]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1) - } - def ++[J, K](other: Tuple2[J, K]) = { + def ++[J, K](other: Tuple2[J, K]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2) - } - def ++[J, K, L](other: Tuple3[J, K, L]) = { + def ++[J, K, L](other: Tuple3[J, K, L]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3) - } - def ++[J, K, L, M](other: Tuple4[J, K, L, M]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, other._4) - } - - def ++[J, K, L, M, N](other: Tuple5[J, K, L, M, N]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, other._4, other._5) - } - - def ++[J, K, L, M, N, O](other: Tuple6[J, K, L, M, N, O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[J, K, L, M, N, O, P](other: Tuple7[J, K, L, M, N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[J, K, L, M, N, O, P, Q](other: Tuple8[J, K, L, M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[J, K, L, M, N, O, P, Q, R](other: Tuple9[J, K, L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[J, K, L, M, N, O, P, Q, R, S](other: Tuple10[J, K, L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[J, K, L, M, N, O, P, Q, R, S, T](other: Tuple11[J, K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, 
other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple12[J, K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } - - def ++[J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple13[J, K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12, other._13) - } + def ++[J, K, L, M](other: Tuple4[J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[J, K, L, M, N](other: Tuple5[J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[J, K, L, M, N, O](other: Tuple6[J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[J, K, L, M, N, O, P](other: Tuple7[J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[J, K, L, M, N, O, P, Q](other: Tuple8[J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[J, K, L, M, N, O, P, Q, R](other: Tuple9[J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[J, K, L, M, N, O, P, Q, R, S](other: Tuple10[J, K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[J, K, L, M, N, O, P, Q, R, S, T](other: Tuple11[J, K, L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple12[J, K, L, M, N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple13[J, K, L, M, N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) } - implicit def tup9ToAdder[A, B, C, D, E, F, G, 
H, I](tup: Tuple9[A, B, C, D, E, F, G, H, I]): Tuple9Adder[A, B, C, D, E, F, G, H, I] = new Tuple9Adder(tup) + implicit def tup9ToAdder[A, B, C, D, E, F, G, H, I]( + tup: Tuple9[A, B, C, D, E, F, G, H, I] + ): Tuple9Adder[A, B, C, D, E, F, G, H, I] = new Tuple9Adder(tup) class Tuple10Adder[A, B, C, D, E, F, G, H, I, J](tup: Tuple10[A, B, C, D, E, F, G, H, I, J]) { - def :+[K](other: K) = { + def :+[K](other: K) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other) - } - def +:[K](other: K) = { + def +:[K](other: K) = (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10) - } - def ++[K](other: Tuple1[K]) = { + def ++[K](other: Tuple1[K]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1) - } - def ++[K, L](other: Tuple2[K, L]) = { + def ++[K, L](other: Tuple2[K, L]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2) - } - - def ++[K, L, M](other: Tuple3[K, L, M]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3) - } - - def ++[K, L, M, N](other: Tuple4[K, L, M, N]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3, other._4) - } - def ++[K, L, M, N, O](other: Tuple5[K, L, M, N, O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3, other._4, other._5) - } - - def ++[K, L, M, N, O, P](other: Tuple6[K, L, M, N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[K, L, M, N, O, P, Q](other: Tuple7[K, L, M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[K, L, M, N, O, P, Q, R](other: Tuple8[K, L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[K, L, M, N, O, P, Q, R, S](other: Tuple9[K, L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[K, L, M, N, O, P, Q, R, S, T](other: Tuple10[K, L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[K, L, M, N, O, P, Q, R, S, T, U](other: Tuple11[K, L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } - - def ++[K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple12[K, L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11, other._12) - } + def ++[K, L, M](other: Tuple3[K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + 
other._2, + other._3 + ) + + def ++[K, L, M, N](other: Tuple4[K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[K, L, M, N, O](other: Tuple5[K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[K, L, M, N, O, P](other: Tuple6[K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[K, L, M, N, O, P, Q](other: Tuple7[K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[K, L, M, N, O, P, Q, R](other: Tuple8[K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[K, L, M, N, O, P, Q, R, S](other: Tuple9[K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[K, L, M, N, O, P, Q, R, S, T](other: Tuple10[K, L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[K, L, M, N, O, P, Q, R, S, T, U](other: Tuple11[K, L, M, N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple12[K, L, M, N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) } - implicit def tup10ToAdder[A, B, C, D, E, F, G, H, I, J](tup: Tuple10[A, B, C, D, E, F, G, H, I, J]): Tuple10Adder[A, B, C, D, E, F, G, H, I, J] = new Tuple10Adder(tup) + implicit def tup10ToAdder[A, B, C, D, E, F, G, H, I, J]( + tup: Tuple10[A, B, C, D, E, F, G, H, I, J] + ): Tuple10Adder[A, B, C, D, E, F, G, H, I, J] = new Tuple10Adder(tup) class Tuple11Adder[A, B, C, D, E, F, G, H, I, J, K](tup: Tuple11[A, B, C, D, E, F, G, H, I, J, K]) { - def :+[L](other: L) = { + def :+[L](other: L) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other) - } - def +:[L](other: L) = { + def +:[L](other: L) = (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11) - } - def ++[L](other: Tuple1[L]) = { + def ++[L](other: Tuple1[L]) = (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1) - } - - def ++[L, M](other: Tuple2[L, M]) = { - (tup._1, tup._2, tup._3, tup._4, 
tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2) - } - - def ++[L, M, N](other: Tuple3[L, M, N]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2, other._3) - } - - def ++[L, M, N, O](other: Tuple4[L, M, N, O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2, other._3, other._4) - } - def ++[L, M, N, O, P](other: Tuple5[L, M, N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2, other._3, other._4, other._5) - } - - def ++[L, M, N, O, P, Q](other: Tuple6[L, M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[L, M, N, O, P, Q, R](other: Tuple7[L, M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[L, M, N, O, P, Q, R, S](other: Tuple8[L, M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[L, M, N, O, P, Q, R, S, T](other: Tuple9[L, M, N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[L, M, N, O, P, Q, R, S, T, U](other: Tuple10[L, M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } - - def ++[L, M, N, O, P, Q, R, S, T, U, V](other: Tuple11[L, M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10, other._11) - } + def ++[L, M](other: Tuple2[L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2 + ) + + def ++[L, M, N](other: Tuple3[L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3 + ) + + def ++[L, M, N, O](other: Tuple4[L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[L, M, N, O, P](other: Tuple5[L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[L, M, N, O, P, Q](other: Tuple6[L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[L, M, N, O, P, Q, R](other: Tuple7[L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + 
other._4, + other._5, + other._6, + other._7 + ) + + def ++[L, M, N, O, P, Q, R, S](other: Tuple8[L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[L, M, N, O, P, Q, R, S, T](other: Tuple9[L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[L, M, N, O, P, Q, R, S, T, U](other: Tuple10[L, M, N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[L, M, N, O, P, Q, R, S, T, U, V](other: Tuple11[L, M, N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) } - implicit def tup11ToAdder[A, B, C, D, E, F, G, H, I, J, K](tup: Tuple11[A, B, C, D, E, F, G, H, I, J, K]): Tuple11Adder[A, B, C, D, E, F, G, H, I, J, K] = new Tuple11Adder(tup) + implicit def tup11ToAdder[A, B, C, D, E, F, G, H, I, J, K]( + tup: Tuple11[A, B, C, D, E, F, G, H, I, J, K] + ): Tuple11Adder[A, B, C, D, E, F, G, H, I, J, K] = new Tuple11Adder(tup) class Tuple12Adder[A, B, C, D, E, F, G, H, I, J, K, L](tup: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]) { - def :+[M](other: M) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other) - } - def +:[M](other: M) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12) - } - - def ++[M](other: Tuple1[M]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1) - } - - def ++[M, N](other: Tuple2[M, N]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1, other._2) - } - - def ++[M, N, O](other: Tuple3[M, N, O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1, other._2, other._3) - } - - def ++[M, N, O, P](other: Tuple4[M, N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1, other._2, other._3, other._4) - } - - def ++[M, N, O, P, Q](other: Tuple5[M, N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1, other._2, other._3, other._4, other._5) - } - - def ++[M, N, O, P, Q, R](other: Tuple6[M, N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[M, N, O, P, Q, R, S](other: Tuple7[M, N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[M, N, O, P, Q, R, S, T](other: Tuple8[M, N, O, P, Q, R, S, T]) = { 
- (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[M, N, O, P, Q, R, S, T, U](other: Tuple9[M, N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } - - def ++[M, N, O, P, Q, R, S, T, U, V](other: Tuple10[M, N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9, other._10) - } + def :+[M](other: M) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other + ) + def +:[M](other: M) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12 + ) + + def ++[M](other: Tuple1[M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1 + ) + + def ++[M, N](other: Tuple2[M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2 + ) + + def ++[M, N, O](other: Tuple3[M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3 + ) + + def ++[M, N, O, P](other: Tuple4[M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[M, N, O, P, Q](other: Tuple5[M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[M, N, O, P, Q, R](other: Tuple6[M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[M, N, O, P, Q, R, S](other: Tuple7[M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[M, N, O, P, Q, R, S, T](other: Tuple8[M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[M, N, O, P, Q, R, S, T, U](other: Tuple9[M, N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[M, N, O, P, Q, R, S, T, U, V](other: Tuple10[M, N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + 
other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) } - implicit def tup12ToAdder[A, B, C, D, E, F, G, H, I, J, K, L](tup: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]): Tuple12Adder[A, B, C, D, E, F, G, H, I, J, K, L] = new Tuple12Adder(tup) - - class Tuple13Adder[A, B, C, D, E, F, G, H, I, J, K, L, M](tup: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]) { - def :+[N](other: N) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other) - } - def +:[N](other: N) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13) - } - - def ++[N](other: Tuple1[N]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other._1) - } - - def ++[N, O](other: Tuple2[N, O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other._1, other._2) - } - - def ++[N, O, P](other: Tuple3[N, O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other._1, other._2, other._3) - } - - def ++[N, O, P, Q](other: Tuple4[N, O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other._1, other._2, other._3, other._4) - } - - def ++[N, O, P, Q, R](other: Tuple5[N, O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other._1, other._2, other._3, other._4, other._5) - } - - def ++[N, O, P, Q, R, S](other: Tuple6[N, O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[N, O, P, Q, R, S, T](other: Tuple7[N, O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[N, O, P, Q, R, S, T, U](other: Tuple8[N, O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } - - def ++[N, O, P, Q, R, S, T, U, V](other: Tuple9[N, O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) - } + implicit def tup12ToAdder[A, B, C, D, E, F, G, H, I, J, K, L]( + tup: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L] + ): Tuple12Adder[A, B, C, D, E, F, G, H, I, J, K, L] = new Tuple12Adder(tup) + + class Tuple13Adder[A, B, C, D, E, F, G, H, I, J, K, L, M]( + tup: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M] + ) { + def :+[N](other: N) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other + ) + def +:[N](other: N) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13 + ) + + def ++[N](other: Tuple1[N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + 
tup._12, + tup._13, + other._1 + ) + + def ++[N, O](other: Tuple2[N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2 + ) + + def ++[N, O, P](other: Tuple3[N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3 + ) + + def ++[N, O, P, Q](other: Tuple4[N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[N, O, P, Q, R](other: Tuple5[N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[N, O, P, Q, R, S](other: Tuple6[N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[N, O, P, Q, R, S, T](other: Tuple7[N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[N, O, P, Q, R, S, T, U](other: Tuple8[N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[N, O, P, Q, R, S, T, U, V](other: Tuple9[N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) } - implicit def tup13ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M](tup: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]): Tuple13Adder[A, B, C, D, E, F, G, H, I, J, K, L, M] = new Tuple13Adder(tup) - - class Tuple14Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N](tup: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]) { - def :+[O](other: O) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, other) - } - def +:[O](other: O) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14) - } - - def ++[O](other: Tuple1[O]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, other._1) - } - - def ++[O, P](other: Tuple2[O, P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, other._1, other._2) - } - - def ++[O, P, Q](other: Tuple3[O, P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, other._1, other._2, other._3) - } - - def ++[O, P, Q, R](other: Tuple4[O, P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, 
other._1, other._2, other._3, other._4) - } - - def ++[O, P, Q, R, S](other: Tuple5[O, P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, other._1, other._2, other._3, other._4, other._5) - } - - def ++[O, P, Q, R, S, T](other: Tuple6[O, P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[O, P, Q, R, S, T, U](other: Tuple7[O, P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } - - def ++[O, P, Q, R, S, T, U, V](other: Tuple8[O, P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) - } + implicit def tup13ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M]( + tup: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M] + ): Tuple13Adder[A, B, C, D, E, F, G, H, I, J, K, L, M] = new Tuple13Adder(tup) + + class Tuple14Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + tup: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + ) { + def :+[O](other: O) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other + ) + def +:[O](other: O) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14 + ) + + def ++[O](other: Tuple1[O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1 + ) + + def ++[O, P](other: Tuple2[O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2 + ) + + def ++[O, P, Q](other: Tuple3[O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3 + ) + + def ++[O, P, Q, R](other: Tuple4[O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[O, P, Q, R, S](other: Tuple5[O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[O, P, Q, R, S, T](other: Tuple6[O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[O, P, Q, R, S, T, U](other: Tuple7[O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def 
++[O, P, Q, R, S, T, U, V](other: Tuple8[O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) } - implicit def tup14ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N](tup: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]): Tuple14Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N] = new Tuple14Adder(tup) - - class Tuple15Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O](tup: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]) { - def :+[P](other: P) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, other) - } - def +:[P](other: P) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15) - } - - def ++[P](other: Tuple1[P]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, other._1) - } - - def ++[P, Q](other: Tuple2[P, Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, other._1, other._2) - } - - def ++[P, Q, R](other: Tuple3[P, Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, other._1, other._2, other._3) - } - - def ++[P, Q, R, S](other: Tuple4[P, Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, other._1, other._2, other._3, other._4) - } - - def ++[P, Q, R, S, T](other: Tuple5[P, Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, other._1, other._2, other._3, other._4, other._5) - } - - def ++[P, Q, R, S, T, U](other: Tuple6[P, Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, other._1, other._2, other._3, other._4, other._5, other._6) - } - - def ++[P, Q, R, S, T, U, V](other: Tuple7[P, Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, other._1, other._2, other._3, other._4, other._5, other._6, other._7) - } + implicit def tup14ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + tup: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + ): Tuple14Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N] = new Tuple14Adder(tup) + + class Tuple15Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + tup: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + ) { + def :+[P](other: P) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other + ) + def +:[P](other: P) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15 + ) + + def ++[P](other: Tuple1[P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1 + ) + + def ++[P, Q](other: 
Tuple2[P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2 + ) + + def ++[P, Q, R](other: Tuple3[P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3 + ) + + def ++[P, Q, R, S](other: Tuple4[P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[P, Q, R, S, T](other: Tuple5[P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[P, Q, R, S, T, U](other: Tuple6[P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[P, Q, R, S, T, U, V](other: Tuple7[P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) } - implicit def tup15ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O](tup: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]): Tuple15Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] = new Tuple15Adder(tup) - - class Tuple16Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](tup: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]) { - def :+[Q](other: Q) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, other) - } - def +:[Q](other: Q) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16) - } - - def ++[Q](other: Tuple1[Q]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, other._1) - } - - def ++[Q, R](other: Tuple2[Q, R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, other._1, other._2) - } - - def ++[Q, R, S](other: Tuple3[Q, R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, other._1, other._2, other._3) - } - - def ++[Q, R, S, T](other: Tuple4[Q, R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, other._1, other._2, other._3, other._4) - } - - def ++[Q, R, S, T, U](other: Tuple5[Q, R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, other._1, other._2, other._3, other._4, other._5) - } - - def ++[Q, R, S, T, U, V](other: Tuple6[Q, R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, 
tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, other._1, other._2, other._3, other._4, other._5, other._6) - } + implicit def tup15ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + tup: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + ): Tuple15Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] = new Tuple15Adder(tup) + + class Tuple16Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + tup: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + ) { + def :+[Q](other: Q) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other + ) + def +:[Q](other: Q) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16 + ) + + def ++[Q](other: Tuple1[Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1 + ) + + def ++[Q, R](other: Tuple2[Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2 + ) + + def ++[Q, R, S](other: Tuple3[Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2, + other._3 + ) + + def ++[Q, R, S, T](other: Tuple4[Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[Q, R, S, T, U](other: Tuple5[Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[Q, R, S, T, U, V](other: Tuple6[Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) } - implicit def tup16ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](tup: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]): Tuple16Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] = new Tuple16Adder(tup) - - class Tuple17Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](tup: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]) { - def :+[R](other: R) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, other) - } - def +:[R](other: R) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17) - } - - def ++[R](other: Tuple1[R]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, other._1) - } - - def ++[R, S](other: Tuple2[R, S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, 
tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, other._1, other._2) - } - - def ++[R, S, T](other: Tuple3[R, S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, other._1, other._2, other._3) - } - - def ++[R, S, T, U](other: Tuple4[R, S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, other._1, other._2, other._3, other._4) - } - - def ++[R, S, T, U, V](other: Tuple5[R, S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, other._1, other._2, other._3, other._4, other._5) - } + implicit def tup16ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + tup: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + ): Tuple16Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] = new Tuple16Adder(tup) + + class Tuple17Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + tup: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ) { + def :+[R](other: R) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other + ) + def +:[R](other: R) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17 + ) + + def ++[R](other: Tuple1[R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1 + ) + + def ++[R, S](other: Tuple2[R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1, + other._2 + ) + + def ++[R, S, T](other: Tuple3[R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1, + other._2, + other._3 + ) + + def ++[R, S, T, U](other: Tuple4[R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[R, S, T, U, V](other: Tuple5[R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1, + other._2, + other._3, + other._4, + other._5 + ) } - implicit def tup17ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](tup: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]): Tuple17Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] = new Tuple17Adder(tup) - - class Tuple18Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](tup: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]) { - def :+[S](other: S) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, other) - } - def 
+:[S](other: S) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18) - } - - def ++[S](other: Tuple1[S]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, other._1) - } - - def ++[S, T](other: Tuple2[S, T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, other._1, other._2) - } - - def ++[S, T, U](other: Tuple3[S, T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, other._1, other._2, other._3) - } - - def ++[S, T, U, V](other: Tuple4[S, T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, other._1, other._2, other._3, other._4) - } + implicit def tup17ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + tup: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ): Tuple17Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] = new Tuple17Adder(tup) + + class Tuple18Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + tup: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) { + def :+[S](other: S) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + other + ) + def +:[S](other: S) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18 + ) + + def ++[S](other: Tuple1[S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + other._1 + ) + + def ++[S, T](other: Tuple2[S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + other._1, + other._2 + ) + + def ++[S, T, U](other: Tuple3[S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + other._1, + other._2, + other._3 + ) + + def ++[S, T, U, V](other: Tuple4[S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + other._1, + other._2, + other._3, + other._4 + ) } - implicit def tup18ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](tup: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]): Tuple18Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] = new Tuple18Adder(tup) - - class Tuple19Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](tup: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]) { - def :+[T](other: T) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, 
tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, other) - } - def +:[T](other: T) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19) - } - - def ++[T](other: Tuple1[T]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, other._1) - } - - def ++[T, U](other: Tuple2[T, U]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, other._1, other._2) - } - - def ++[T, U, V](other: Tuple3[T, U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, other._1, other._2, other._3) - } + implicit def tup18ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + tup: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ): Tuple18Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] = new Tuple18Adder(tup) + + class Tuple19Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + tup: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) { + def :+[T](other: T) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + other + ) + def +:[T](other: T) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19 + ) + + def ++[T](other: Tuple1[T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + other._1 + ) + + def ++[T, U](other: Tuple2[T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + other._1, + other._2 + ) + + def ++[T, U, V](other: Tuple3[T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + other._1, + other._2, + other._3 + ) } - implicit def tup19ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](tup: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]): Tuple19Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] = new Tuple19Adder(tup) - - class Tuple20Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](tup: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]) { - def :+[U](other: U) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, tup._20, other) - } - def +:[U](other: U) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, tup._20) - } - - def ++[U](other: Tuple1[U]) = { - 
(tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, tup._20, other._1) - } - - def ++[U, V](other: Tuple2[U, V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, tup._20, other._1, other._2) - } + implicit def tup19ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + tup: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ): Tuple19Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] = new Tuple19Adder(tup) + + class Tuple20Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + tup: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) { + def :+[U](other: U) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + other + ) + def +:[U](other: U) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20 + ) + + def ++[U](other: Tuple1[U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + other._1 + ) + + def ++[U, V](other: Tuple2[U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + other._1, + other._2 + ) } - implicit def tup20ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](tup: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]): Tuple20Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] = new Tuple20Adder(tup) - - class Tuple21Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](tup: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]) { - def :+[V](other: V) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, tup._20, tup._21, other) - } - def +:[V](other: V) = { - (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, tup._20, tup._21) - } - - def ++[V](other: Tuple1[V]) = { - (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, tup._12, tup._13, tup._14, tup._15, tup._16, tup._17, tup._18, tup._19, tup._20, tup._21, other._1) - } + implicit def tup20ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + tup: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ): Tuple20Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] = new Tuple20Adder(tup) + + class Tuple21Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + tup: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) { + def :+[V](other: V) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + 
tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + tup._21, + other + ) + def +:[V](other: V) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + tup._21 + ) + + def ++[V](other: Tuple1[V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + tup._21, + other._1 + ) } - implicit def tup21ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](tup: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]): Tuple21Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] = new Tuple21Adder(tup) + implicit def tup21ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + tup: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ): Tuple21Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] = new Tuple21Adder(tup) } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/GroupBuilder.scala b/scalding-core/src/main/scala/com/twitter/scalding/GroupBuilder.scala index 9677cad7c7..8bcb24345f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/GroupBuilder.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/GroupBuilder.scala @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.pipe._ @@ -20,16 +20,16 @@ import cascading.operation._ import cascading.tuple.Fields import cascading.tuple.TupleEntry -import scala.{ Range => ScalaRange } +import scala.{Range => ScalaRange} /** - * This controls the sequence of reductions that happen inside a - * particular grouping operation. Not all elements can be combined, - * for instance, a scanLeft/foldLeft generally requires a sorting - * but such sorts are (at least for now) incompatible with doing a combine - * which includes some map-side reductions. + * This controls the sequence of reductions that happen inside a particular grouping operation. Not all + * elements can be combined, for instance, a scanLeft/foldLeft generally requires a sorting but such sorts are + * (at least for now) incompatible with doing a combine which includes some map-side reductions. 
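These generated adders follow the same pattern at every arity. As a small, hypothetical sketch of how they read at arity 2, assuming the tupNToAdder implicits above are brought into scope (for example via the usual scalding imports):

    val pair = (1, "a")
    val appended: (Int, String, Boolean) = pair :+ true    // append on the right
    val prepended: (Boolean, Int, String) = true +: pair   // prepend on the left
    val suffix = (true, 2L)
    val concatenated: (Int, String, Boolean, Long) = pair ++ suffix  // concatenate two tuples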
*/ -class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] with StreamOperations[GroupBuilder] { +class GroupBuilder(val groupFields: Fields) + extends FoldOperations[GroupBuilder] + with StreamOperations[GroupBuilder] { // We need the implicit conversions from symbols to Fields import Dsl._ @@ -47,8 +47,8 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] protected var sortF: Option[Fields] = None def sorting = sortF /* - * maxMF is the maximum index of a "middle field" allocated for mapReduceMap operations - */ + * maxMF is the maximum index of a "middle field" allocated for mapReduceMap operations + */ private var maxMF: Int = 0 private def getNextMiddlefield: String = { @@ -65,8 +65,8 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] } /** - * Holds the number of reducers to use in the reduce stage of the groupBy/aggregateBy. - * By default uses whatever value is set in the jobConf. + * Holds the number of reducers to use in the reduce stage of the groupBy/aggregateBy. By default uses + * whatever value is set in the jobConf. */ private var numReducers: Option[Int] = None @@ -112,30 +112,25 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] } /** - * This cancels map side aggregation - * and forces everything to the reducers + * This cancels map side aggregation and forces everything to the reducers */ def forceToReducers = { reds = None this } - protected def overrideReducers(p: Pipe): Pipe = { - numReducers.map { r => RichPipe.setReducers(p, r) }.getOrElse(p) - } + protected def overrideReducers(p: Pipe): Pipe = + numReducers.map(r => RichPipe.setReducers(p, r)).getOrElse(p) - protected def overrideDescription(p: Pipe): Pipe = { + protected def overrideDescription(p: Pipe): Pipe = RichPipe.setPipeDescriptions(p, descriptions) - } /** - * == Warning == - * This may significantly reduce performance of your job. - * It kills the ability to do map-side aggregation. + * ==Warning== + * This may significantly reduce performance of your job. It kills the ability to do map-side aggregation. */ - def buffer(args: Fields)(b: Buffer[_]): GroupBuilder = { + def buffer(args: Fields)(b: Buffer[_]): GroupBuilder = every(pipe => new Every(pipe, args, b)) - } /** * Prefer aggregateBy operations! @@ -148,18 +143,19 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] } /** - * Prefer reduce or mapReduceMap. foldLeft will force all work to be - * done on the reducers. If your function is not associative and - * commutative, foldLeft may be required. + * Prefer reduce or mapReduceMap. foldLeft will force all work to be done on the reducers. If your function + * is not associative and commutative, foldLeft may be required. * - * == Best Practice == + * ==Best Practice== * Make sure init is an immutable object. * - * == Note == - * Init needs to be serializable with Kryo (because we copy it for each - * grouping to avoid possible errors using a mutable init object). + * ==Note== + * Init needs to be serializable with Kryo (because we copy it for each grouping to avoid possible errors + * using a mutable init object). 
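A hedged sketch of the foldLeft described here (the method itself is defined just below); the job, source fields, and accumulator are illustrative:

    class EventHistoryJob(args: Args) extends Job(args) {
      // collect each user's events into an immutable list; foldLeft runs entirely on the reducers
      Tsv(args("input"), ('userId, 'event)).read
        .groupBy('userId) {
          _.foldLeft('event -> 'events)(List.empty[String]) { (acc: List[String], ev: String) =>
            ev :: acc
          }
        }
        .write(Tsv(args("output")))
    }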
*/ - def foldLeft[X, T](fieldDef: (Fields, Fields))(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): GroupBuilder = { + def foldLeft[X, T]( + fieldDef: (Fields, Fields) + )(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): GroupBuilder = { val (inFields, outFields) = fieldDef conv.assertArityMatches(inFields) setter.assertArityMatches(outFields) @@ -167,25 +163,28 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] val beforePF = projectFields every(pipe => new Every(pipe, inFields, ag)) // Update projectFields, which makes sense in a fold, but invalidated on every - projectFields = beforePF.map { Fields.merge(_, inFields) } + projectFields = beforePF.map(Fields.merge(_, inFields)) this } /** * Type `T` is the type of the input field `(input to map, T => X)` * - * Type `X` is the intermediate type, which your reduce function operates on - * `(reduce is (X,X) => X)` + * Type `X` is the intermediate type, which your reduce function operates on `(reduce is (X,X) => X)` * * Type `U` is the final result type, `(final map is: X => U)` * - * The previous output goes into the reduce function on the left, like foldLeft, - * so if your operation is faster for the accumulator to be on one side, be aware. + * The previous output goes into the reduce function on the left, like foldLeft, so if your operation is + * faster for the accumulator to be on one side, be aware. */ - def mapReduceMap[T, X, U](fieldDef: (Fields, Fields))(mapfn: T => X)(redfn: (X, X) => X)(mapfn2: X => U)(implicit startConv: TupleConverter[T], - middleSetter: TupleSetter[X], - middleConv: TupleConverter[X], - endSetter: TupleSetter[U]): GroupBuilder = { + def mapReduceMap[T, X, U]( + fieldDef: (Fields, Fields) + )(mapfn: T => X)(redfn: (X, X) => X)(mapfn2: X => U)(implicit + startConv: TupleConverter[T], + middleSetter: TupleSetter[X], + middleConv: TupleConverter[X], + endSetter: TupleSetter[U] + ): GroupBuilder = { val (maybeSortedFromFields, maybeSortedToFields) = fieldDef //Check for arity safety: // To fields CANNOT have a sorting, or cascading gets unhappy: @@ -195,44 +194,54 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] startConv.assertArityMatches(fromFields) endSetter.assertArityMatches(toFields) // Update projectFields - projectFields = projectFields.map { Fields.merge(_, fromFields) } + projectFields = projectFields.map(Fields.merge(_, fromFields)) val ag = new MRMAggregator[T, X, U](mapfn, redfn, mapfn2, toFields, startConv, endSetter) val ev = (pipe => new Every(pipe, fromFields, ag)): Pipe => Every - assert(middleSetter.arity > 0, - "The middle arity must have definite size, try wrapping in scala.Tuple1 if you need a hack") + assert( + middleSetter.arity > 0, + "The middle arity must have definite size, try wrapping in scala.Tuple1 if you need a hack" + ) // Create the required number of middlefields based on the arity of middleSetter - val middleFields = strFields(ScalaRange(0, middleSetter.arity).map { i => getNextMiddlefield }) - val mrmBy = new MRMBy[T, X, U](fromFields, middleFields, toFields, - mapfn, redfn, mapfn2, startConv, middleSetter, middleConv, endSetter) + val middleFields = strFields(ScalaRange(0, middleSetter.arity).map(i => getNextMiddlefield)) + val mrmBy = new MRMBy[T, X, U]( + fromFields, + middleFields, + toFields, + mapfn, + redfn, + mapfn2, + startConv, + middleSetter, + middleConv, + endSetter + ) tryAggregateBy(mrmBy, ev) this } /** - * Corresponds to a 
Cascading Buffer - * which allows you to stream through the data, keeping some, dropping, scanning, etc... - * The iterator you are passed is lazy, and mapping will not trigger the - * entire evaluation. If you convert to a list (i.e. to reverse), you need to be aware - * that memory constraints may become an issue. + * Corresponds to a Cascading Buffer which allows you to stream through the data, keeping some, dropping, + * scanning, etc... The iterator you are passed is lazy, and mapping will not trigger the entire evaluation. + * If you convert to a list (i.e. to reverse), you need to be aware that memory constraints may become an + * issue. * - * == Warning == - * Any fields not referenced by the input fields will be aligned to the first output, - * and the final hadoop stream will have a length of the maximum of the output of this, and - * the input stream. So, if you change the length of your inputs, the other fields won't - * be aligned. YOU NEED TO INCLUDE ALL THE FIELDS YOU WANT TO KEEP ALIGNED IN THIS MAPPING! - * POB: This appears to be a Cascading design decision. + * ==Warning== + * Any fields not referenced by the input fields will be aligned to the first output, and the final hadoop + * stream will have a length of the maximum of the output of this, and the input stream. So, if you change + * the length of your inputs, the other fields won't be aligned. YOU NEED TO INCLUDE ALL THE FIELDS YOU WANT + * TO KEEP ALIGNED IN THIS MAPPING! POB: This appears to be a Cascading design decision. * - * == Warning == - * mapfn needs to be stateless. Multiple calls needs to be safe (no mutable - * state captured) + * ==Warning== + * mapfn needs to be stateless. Multiple calls needs to be safe (no mutable state captured) */ - def mapStream[T, X](fieldDef: (Fields, Fields))(mapfn: (Iterator[T]) => TraversableOnce[X])(implicit conv: TupleConverter[T], setter: TupleSetter[X]) = { + def mapStream[T, X]( + fieldDef: (Fields, Fields) + )(mapfn: (Iterator[T]) => TraversableOnce[X])(implicit conv: TupleConverter[T], setter: TupleSetter[X]) = { val (inFields, outFields) = fieldDef //Check arity conv.assertArityMatches(inFields) setter.assertArityMatches(outFields) - val b = new BufferOp[Unit, T, X]((), - (u: Unit, it: Iterator[T]) => mapfn(it), outFields, conv, setter) + val b = new BufferOp[Unit, T, X]((), (u: Unit, it: Iterator[T]) => mapfn(it), outFields, conv, setter) every(pipe => new Every(pipe, inFields, b, defaultMode(inFields, outFields))) } @@ -244,44 +253,48 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] } /** - * Analog of standard scanLeft (@see scala.collection.Iterable.scanLeft ) - * This invalidates map-side aggregation, forces all data to be transferred - * to reducers. Use only if you REALLY have to. + * Analog of standard scanLeft (@see scala.collection.Iterable.scanLeft ) This invalidates map-side + * aggregation, forces all data to be transferred to reducers. Use only if you REALLY have to. * - * == Best Practice == + * ==Best Practice== * Make sure init is an immutable object. * - * == Note == - * init needs to be serializable with Kryo (because we copy it for each - * grouping to avoid possible errors using a mutable init object). - * We override the default implementation here to use Kryo to serialize - * the initial value, for immutable serializable inits, this is not needed + * ==Note== + * init needs to be serializable with Kryo (because we copy it for each grouping to avoid possible errors + * using a mutable init object). 
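A sketch of mapStream under the semantics described above; the field names and the cap of 100 are illustrative, and only the mapped field is carried through, per the alignment warning:

    // inside a Job, with pipe holding ('userId, 'event) tuples
    val trimmed = pipe.groupBy('userId) {
      _.mapStream('event -> 'keptEvent) { it: Iterator[String] =>
        it.take(100) // lazily keep at most 100 events per user
      }
    }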
We override the default implementation here to use Kryo to serialize the + * initial value, for immutable serializable inits, this is not needed */ - override def scanLeft[X, T](fieldDef: (Fields, Fields))(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): GroupBuilder = { + override def scanLeft[X, T]( + fieldDef: (Fields, Fields) + )(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): GroupBuilder = { val (inFields, outFields) = fieldDef //Check arity conv.assertArityMatches(inFields) setter.assertArityMatches(outFields) - val b = new BufferOp[X, T, X](init, + val b = new BufferOp[X, T, X]( + init, // On scala 2.8, there is no scanLeft // On scala 2.9, their implementation creates an off-by-one bug with the unused fields (i: X, it: Iterator[T]) => new ScanLeftIterator(it, i, fn), - outFields, conv, setter) + outFields, + conv, + setter + ) every(pipe => new Every(pipe, inFields, b, defaultMode(inFields, outFields))) } def groupMode: GroupMode = (reds, evs, sortF) match { - case (None, Nil, Some(_)) => IdentityMode // no reducers or everys, just a sort - case (Some(Nil), Nil, _) => IdentityMode // no sort, just identity. used to shuffle data - case (None, _, _) => GroupByMode + case (None, Nil, Some(_)) => IdentityMode // no reducers or everys, just a sort + case (Some(Nil), Nil, _) => IdentityMode // no sort, just identity. used to shuffle data + case (None, _, _) => GroupByMode case (Some(redList), _, None) => AggregateByMode // use map-side aggregation case _ => sys.error("Invalid GroupBuilder state: %s, %s, %s".format(reds, evs, sortF)) } protected def groupedPipeOf(name: String, in: Pipe): GroupBy = { val gb: GroupBy = sortF match { - case None => new GroupBy(name, in, groupFields) + case None => new GroupBy(name, in, groupFields) case Some(sf) => new GroupBy(name, in, groupFields, sf, isReversed) } overrideReducers(gb) @@ -291,13 +304,13 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) def schedule(name: String, pipe: Pipe): Pipe = { - val maybeProjectedPipe = projectFields.map { pipe.project(_) }.getOrElse(pipe) + val maybeProjectedPipe = projectFields.map(pipe.project(_)).getOrElse(pipe) groupMode match { case GroupByMode => //In this case we cannot aggregate, so group: val start: Pipe = groupedPipeOf(name, maybeProjectedPipe) // Time to schedule the Every operations - evs.foldRight(start) { (op: (Pipe => Every), p) => op(p) } + evs.foldRight(start)((op: (Pipe => Every), p) => op(p)) case IdentityMode => //This is the case where the group function is identity: { g => g } @@ -306,11 +319,13 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] case AggregateByMode => //There is some non-empty AggregateBy to do: val redlist = reds.get - val ag = new AggregateBy(name, + val ag = new AggregateBy( + name, maybeProjectedPipe, groupFields, spillThreshold.getOrElse(0), // cascading considers 0 to be the default - redlist.reverse.toArray: _*) + redlist.reverse.toArray: _* + ) overrideReducers(ag.getGroupBy()) overrideDescription(ag.getGroupBy()) @@ -332,42 +347,46 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] } sortF = Some(sort) // Update projectFields - projectFields = projectFields.map { Fields.merge(_, sort) } + projectFields = projectFields.map(Fields.merge(_, sort)) this } /** - * This is convenience method to allow plugging in blocks - * of group 
operations similar to `RichPipe.thenDo` + * This is convenience method to allow plugging in blocks of group operations similar to `RichPipe.thenDo` */ def thenDo(fn: (GroupBuilder) => GroupBuilder) = fn(this) /** - * An identity function that keeps all the tuples. A hack to implement - * groupAll and groupRandomly. + * An identity function that keeps all the tuples. A hack to implement groupAll and groupRandomly. */ - def pass: GroupBuilder = takeWhile(0) { (t: TupleEntry) => true } + def pass: GroupBuilder = takeWhile(0)((t: TupleEntry) => true) /** - * beginning of block with access to expensive nonserializable state. The state object should - * contain a function release() for resource management purpose. + * beginning of block with access to expensive nonserializable state. The state object should contain a + * function release() for resource management purpose. */ def using[C <: { def release(): Unit }](bf: => C) = new { /** * mapStream with state. */ - def mapStream[T, X](fieldDef: (Fields, Fields))(mapfn: (C, Iterator[T]) => TraversableOnce[X])(implicit conv: TupleConverter[T], setter: TupleSetter[X]) = { + def mapStream[T, X](fieldDef: (Fields, Fields))( + mapfn: (C, Iterator[T]) => TraversableOnce[X] + )(implicit conv: TupleConverter[T], setter: TupleSetter[X]) = { val (inFields, outFields) = fieldDef //Check arity conv.assertArityMatches(inFields) setter.assertArityMatches(outFields) val b = new SideEffectBufferOp[Unit, T, C, X]( - (), bf, + (), + bf, (u: Unit, c: C, it: Iterator[T]) => mapfn(c, it), - new Function1[C, Unit] with java.io.Serializable { def apply(c: C): Unit = { c.release() } }, - outFields, conv, setter) + new Function1[C, Unit] with java.io.Serializable { def apply(c: C): Unit = c.release() }, + outFields, + conv, + setter + ) every(pipe => new Every(pipe, inFields, b, defaultMode(inFields, outFields))) } } @@ -375,16 +394,19 @@ class GroupBuilder(val groupFields: Fields) extends FoldOperations[GroupBuilder] } /** - * Scala 2.8 Iterators don't support scanLeft so we have to reimplement - * The Scala 2.9 implementation creates an off-by-one bug with the unused fields in the Fields API + * Scala 2.8 Iterators don't support scanLeft so we have to reimplement The Scala 2.9 implementation creates + * an off-by-one bug with the unused fields in the Fields API */ -class ScanLeftIterator[T, U](it: Iterator[T], init: U, fn: (U, T) => U) extends Iterator[U] with java.io.Serializable { +class ScanLeftIterator[T, U](it: Iterator[T], init: U, fn: (U, T) => U) + extends Iterator[U] + with java.io.Serializable { protected var prev: Option[U] = None - def hasNext: Boolean = { prev.isEmpty || it.hasNext } + def hasNext: Boolean = prev.isEmpty || it.hasNext // Don't use pattern matching in a performance-critical section @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) def next = { - prev = prev.map { fn(_, it.next) } + prev = prev + .map(fn(_, it.next)) .orElse(Some(init)) prev.get } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/HfsConfPropertySetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/HfsConfPropertySetter.scala index 19c2ef3fc2..98882187b6 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/HfsConfPropertySetter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/HfsConfPropertySetter.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
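A sketch of the using block above for expensive, non-serializable per-task state; Normalizer is a hypothetical helper, and only its release() method is required by the structural type:

    class Normalizer {
      def clean(s: String): String = s.trim.toLowerCase // stand-in for costly, non-serializable work
      def release(): Unit = ()                          // invoked by scalding to clean up the resource
    }

    // inside a Job, with pipe holding ('key, 'rawText) tuples
    val cleaned = pipe.groupBy('key) {
      _.using(new Normalizer)
        .mapStream('rawText -> 'cleanText) { (n: Normalizer, it: Iterator[String]) =>
          it.map(n.clean)
        }
    }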
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tap.SinkMode @@ -25,23 +25,22 @@ import cascading.tap.hadoop.Hfs import com.twitter.scalding.tap.ScaldingHfs private[scalding] class ConfPropertiesHfsTap( - sourceConfig: Config, - sinkConfig: Config, - scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _], - stringPath: String, - sinkMode: SinkMode) extends ScaldingHfs(scheme, stringPath, sinkMode) { + sourceConfig: Config, + sinkConfig: Config, + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _], + stringPath: String, + sinkMode: SinkMode +) extends ScaldingHfs(scheme, stringPath, sinkMode) { override def sourceConfInit(process: FlowProcess[JobConf], conf: JobConf): Unit = { - sourceConfig.toMap.foreach { - case (k, v) => - conf.set(k, v) + sourceConfig.toMap.foreach { case (k, v) => + conf.set(k, v) } super.sourceConfInit(process, conf) } override def sinkConfInit(process: FlowProcess[JobConf], conf: JobConf): Unit = { - sinkConfig.toMap.foreach { - case (k, v) => - conf.set(k, v) + sinkConfig.toMap.foreach { case (k, v) => + conf.set(k, v) } super.sinkConfInit(process, conf) } @@ -55,16 +54,20 @@ private[scalding] class ConfPropertiesHfsTap( * Changes here however will not show up in the hadoop UI */ trait HfsConfPropertySetter extends HfsTapProvider { - @deprecated("Tap config is deprecated, use sourceConfig or sinkConfig directly. In cascading configs applied to sinks can leak to sources in the step writing to the sink.", "0.17.0") + @deprecated( + "Tap config is deprecated, use sourceConfig or sinkConfig directly. In cascading configs applied to sinks can leak to sources in the step writing to the sink.", + "0.17.0" + ) def tapConfig: Config = Config.empty def sourceConfig: Config = Config.empty def sinkConfig: Config = Config.empty override def createHfsTap( - scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _], - path: String, - sinkMode: SinkMode): Hfs = { + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _], + path: String, + sinkMode: SinkMode + ): Hfs = { // Deprecation handling val (srcCfg, sinkCfg) = if (sourceConfig == Config.empty && sinkConfig == Config.empty) { (tapConfig, tapConfig) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/IntegralComparator.scala b/scalding-core/src/main/scala/com/twitter/scalding/IntegralComparator.scala index 9d1844b773..f5360b58d9 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/IntegralComparator.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/IntegralComparator.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
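A hypothetical source mixing in the HfsConfPropertySetter trait above to attach read-side Hadoop properties; the split-size property is illustrative, and Config's + is assumed to add a key/value pair as in stock scalding:

    class BigSplitTsv(path: String) extends Tsv(path) with HfsConfPropertySetter {
      // applied when this tap is read; sinkConfig stays empty so nothing leaks into writes
      override def sourceConfig: Config =
        Config.empty + ("mapreduce.input.fileinputformat.split.minsize" -> (512L * 1024 * 1024).toString)
    }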
-*/ + */ package com.twitter.scalding @@ -26,10 +26,12 @@ import java.util.Comparator */ class IntegralComparator extends Comparator[AnyRef] with Hasher[AnyRef] with Serializable { - val integralTypes: Set[Class[_]] = Set(classOf[java.lang.Long], + val integralTypes: Set[Class[_]] = Set( + classOf[java.lang.Long], classOf[java.lang.Integer], classOf[java.lang.Short], - classOf[java.lang.Byte]) + classOf[java.lang.Byte] + ) def isIntegral(boxed: AnyRef) = integralTypes(boxed.getClass) @@ -52,16 +54,13 @@ class IntegralComparator extends Comparator[AnyRef] with Hasher[AnyRef] with Ser a1.asInstanceOf[Comparable[AnyRef]].compareTo(a2) } - override def hashCode(obj: AnyRef): Int = { + override def hashCode(obj: AnyRef): Int = if (obj == null) { 0 } else if (isIntegral(obj)) { - obj.asInstanceOf[Number] - .longValue - .hashCode + obj.asInstanceOf[Number].longValue.hashCode } else { //Use the default: obj.hashCode } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/IterableSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/IterableSource.scala index 9a575bd9dd..39b5142a96 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/IterableSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/IterableSource.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import com.twitter.maple.tap.MemorySourceTap @@ -22,31 +22,33 @@ import cascading.tuple.Tuple import cascading.tuple.Fields import cascading.scheme.NullScheme -import java.io.{ InputStream, OutputStream } +import java.io.{InputStream, OutputStream} import scala.collection.mutable.Buffer import scala.collection.JavaConverters._ /** - * Allows working with an iterable object defined in the job (on the submitter) - * to be used within a Job as you would a Pipe/RichPipe + * Allows working with an iterable object defined in the job (on the submitter) to be used within a Job as you + * would a Pipe/RichPipe * - * These lists should probably be very tiny by Hadoop standards. If they are - * getting large, you should probably dump them to HDFS and use the normal - * mechanisms to address the data (a FileSource). + * These lists should probably be very tiny by Hadoop standards. If they are getting large, you should + * probably dump them to HDFS and use the normal mechanisms to address the data (a FileSource). 
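A sketch of IterableSource for a tiny, submitter-side lookup table; the fields, values, and the events pipe are illustrative, and joinWithTiny is the small-side join helper referenced later in the join documentation:

    // a handful of rows defined directly in the job; only sensible for tiny data
    val countryCodes = IterableSource(List(("us", 1), ("ca", 2), ("mx", 52)), ('country, 'code))

    // inside a Job: read it like any other source and join it against a larger pipe
    val withCodes = events.joinWithTiny('cc -> 'country, countryCodes.read)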
*/ -case class IterableSource[+T](@transient iter: Iterable[T], inFields: Fields = Fields.NONE)(implicit set: TupleSetter[T], conv: TupleConverter[T]) extends Source with Mappable[T] { +case class IterableSource[+T](@transient iter: Iterable[T], inFields: Fields = Fields.NONE)(implicit + set: TupleSetter[T], + conv: TupleConverter[T] +) extends Source + with Mappable[T] { - def fields = { + def fields = if (inFields.isNone && set.arity > 0) { Dsl.intFields(0 until set.arity) } else inFields - } override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv) @transient - private val asBuffer: Buffer[Tuple] = iter.map { set(_) }.toBuffer + private val asBuffer: Buffer[Tuple] = iter.map(set(_)).toBuffer private lazy val hdfsTap: Tap[_, _, _] = new MemorySourceTap(asBuffer.asJava, fields) @@ -55,17 +57,17 @@ case class IterableSource[+T](@transient iter: Iterable[T], inFields: Fields = F sys.error("IterableSource is a Read-only Source") } mode match { - case Local(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), asBuffer) - case Test(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), asBuffer) + case Local(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), asBuffer) + case Test(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), asBuffer) case Hdfs(_, _) => hdfsTap case HadoopTest(_, _) => hdfsTap - case _ => throw ModeException("Unsupported mode for IterableSource: " + mode.toString) + case _ => throw ModeException("Unsupported mode for IterableSource: " + mode.toString) } } /** - * Don't use the whole string of the iterable, which can be huge. - * We take the first 10 items + the identityHashCode of the iter. + * Don't use the whole string of the iterable, which can be huge. We take the first 10 items + the + * identityHashCode of the iter. */ override val sourceId: String = "IterableSource(%s)-%d".format(iter.take(10).toString, System.identityHashCode(iter)) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Job.scala b/scalding-core/src/main/scala/com/twitter/scalding/Job.scala index 203278991a..4d013144c6 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Job.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Job.scala @@ -12,10 +12,18 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import cascading.flow.{ Flow, FlowDef, FlowListener, FlowStep, FlowStepListener, FlowSkipStrategy, FlowStepStrategy } +import cascading.flow.{ + Flow, + FlowDef, + FlowListener, + FlowSkipStrategy, + FlowStep, + FlowStepListener, + FlowStepStrategy +} import cascading.pipe.Pipe import cascading.property.AppProps import cascading.stats.CascadingStats @@ -23,47 +31,47 @@ import cascading.stats.CascadingStats import com.twitter.algebird.Semigroup import com.twitter.scalding.typed.cascading_backend.CascadingBackend -import org.apache.hadoop.io.serializer.{ Serialization => HSerialization } +import org.apache.hadoop.io.serializer.{Serialization => HSerialization} -import scala.concurrent.{ Future, Promise } -import scala.util.{Try, Success, Failure} +import scala.concurrent.{Future, Promise} +import scala.util.{Failure, Success, Try} -import java.io.{ BufferedWriter, FileOutputStream, OutputStreamWriter } -import java.util.{ List => JList } +import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter} +import java.util.{List => JList} -import java.util.concurrent.{ Executors, TimeUnit, ThreadFactory, Callable, TimeoutException } +import java.util.concurrent.{Callable, Executors, ThreadFactory, TimeUnit, TimeoutException} import java.util.concurrent.atomic.AtomicInteger object Job { + /** - * Use reflection to create the job by name. We use the thread's - * context classloader so that classes in the submitted jar and any - * jars included via -libjar can be found. + * Use reflection to create the job by name. We use the thread's context classloader so that classes in the + * submitted jar and any jars included via -libjar can be found. */ - def apply(jobName: String, args: Args): Job = { - Class.forName(jobName, true, Thread.currentThread().getContextClassLoader) + def apply(jobName: String, args: Args): Job = + Class + .forName(jobName, true, Thread.currentThread().getContextClassLoader) .getConstructor(classOf[Args]) .newInstance(args) .asInstanceOf[Job] - } /** - * Make a job reflectively from the given class - * and the Args contained in the Config. + * Make a job reflectively from the given class and the Args contained in the Config. */ def makeJob[J <: Job](cls: Class[J]): Execution[J] = Execution.getConfigMode.flatMap { case (conf, mode) => // Now we need to allocate the job Execution.from { val argsWithMode = Mode.putMode(mode, conf.getArgs) - cls.getConstructor(classOf[Args]) + cls + .getConstructor(classOf[Args]) .newInstance(argsWithMode) } } /** - * Create a job reflectively from a class, which handles threading - * through the Args and Mode correctly in the way Job subclasses expect + * Create a job reflectively from a class, which handles threading through the Args and Mode correctly in + * the way Job subclasses expect */ def toExecutionFromClass[J <: Job](cls: Class[J], onEmpty: Execution[Unit]): Execution[Unit] = makeJob(cls).flatMap(toExecution(_, onEmpty)) @@ -71,12 +79,10 @@ object Job { /** * Convert Jobs that only use the TypedPipe API to an Execution * - * This can fail for some exotic jobs, but for standard subclasses - * of Job (that don't override existing methods in Job except config) - * it should work + * This can fail for some exotic jobs, but for standard subclasses of Job (that don't override existing + * methods in Job except config) it should work * - * onEmpty is the execution to run if you have an empty job. Common - * values might be Execution.unit or + * onEmpty is the execution to run if you have an empty job. 
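For instance, a TypedPipe-only job class could be lifted into an Execution roughly like this (MyTypedWordCount is a hypothetical Job subclass):

    val asExecution: Execution[Unit] =
      Job.toExecutionFromClass(
        classOf[MyTypedWordCount],
        Execution.failed(new Exception("unexpectedly empty job")) // the onEmpty fallback
      )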
Common values might be Execution.unit or * Execution.failed(new Exeception("unexpected empty execution")) */ def toExecution(job: Job, onEmpty: Execution[Unit]): Execution[Unit] = @@ -89,7 +95,7 @@ object Job { // evaluate this *after* the current Execution val nextJobEx: Execution[Unit] = Execution.from(job.next).flatMap { // putting inside Execution.from memoizes this call - case None => Execution.unit + case None => Execution.unit case Some(nextJob) => toExecution(nextJob, onEmpty) } @@ -99,20 +105,18 @@ object Job { _ <- Execution.withConfig(ex)(_ => conf.setExecutionCleanupOnFinish(true)) _ <- nextJobEx } yield () - } + } } /** - * Job is a convenience class to make using Scalding easier. - * Subclasses of Job automatically have a number of nice implicits to enable more concise - * syntax, including: - * conversion from Pipe, Source or Iterable to RichPipe - * conversion from Source or Iterable to Pipe - * conversion to collections or Tuple[1-22] to cascading.tuple.Fields + * Job is a convenience class to make using Scalding easier. Subclasses of Job automatically have a number of + * nice implicits to enable more concise syntax, including: conversion from Pipe, Source or Iterable to + * RichPipe conversion from Source or Iterable to Pipe conversion to collections or Tuple[1-22] to + * cascading.tuple.Fields * - * Additionally, the job provides an implicit Mode and FlowDef so that functions that - * register starts or ends of a flow graph, specifically anything that reads or writes data - * on Hadoop, has the needed implicits available. + * Additionally, the job provides an implicit Mode and FlowDef so that functions that register starts or ends + * of a flow graph, specifically anything that reads or writes data on Hadoop, has the needed implicits + * available. * * If you want to write code outside of a Job, you will want to either: * @@ -120,8 +124,8 @@ object Job { * * OR: * - * write code that rather than returning values, it returns a (FlowDef, Mode) => T, - * these functions can be combined Monadically using algebird.monad.Reader. + * write code that rather than returning values, it returns a (FlowDef, Mode) => T, these functions can be + * combined Monadically using algebird.monad.Reader. */ class Job(val args: Args) extends FieldConversions with java.io.Serializable { Tracing.init() @@ -138,21 +142,18 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { } /** - * you should never call this directly, it is here to make - * the DSL work. Just know, you can treat a Pipe as a RichPipe - * within a Job + * you should never call this directly, it is here to make the DSL work. Just know, you can treat a Pipe as + * a RichPipe within a Job */ implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe) + /** - * This implicit is to enable RichPipe methods directly on Source - * objects, such as map/flatMap, etc... + * This implicit is to enable RichPipe methods directly on Source objects, such as map/flatMap, etc... * - * Note that Mappable is a subclass of Source, and Mappable already - * has mapTo and flatMapTo BUT WITHOUT incoming fields used (see - * the Mappable trait). This creates some confusion when using these methods - * (this is an unfortunate mistake in our design that was not noticed until later). - * To remove ambiguity, explicitly call .read on any Source that you begin - * operating with a mapTo/flatMapTo. 
+ * Note that Mappable is a subclass of Source, and Mappable already has mapTo and flatMapTo BUT WITHOUT + * incoming fields used (see the Mappable trait). This creates some confusion when using these methods (this + * is an unfortunate mistake in our design that was not noticed until later). To remove ambiguity, + * explicitly call .read on any Source that you begin operating with a mapTo/flatMapTo. */ implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read) @@ -160,7 +161,9 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { implicit def toPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): Pipe = IterableSource[T](iter)(set, conv).read - implicit def iterableToRichPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): RichPipe = + implicit def iterableToRichPipe[T]( + iter: Iterable[T] + )(implicit set: TupleSetter[T], conv: TupleConverter[T]): RichPipe = RichPipe(toPipe(iter)(set, conv)) // Provide args as an implicit val for extensions such as the Checkpoint extension. @@ -181,8 +184,7 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { private[this] val uniqueId = UniqueID.getIDFor(flowDef) /** - * Copy this job - * By default, this uses reflection and the single argument Args constructor + * Copy this job By default, this uses reflection and the single argument Args constructor */ def clone(nextargs: Args): Job = this.getClass @@ -191,14 +193,14 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { .asInstanceOf[Job] /** - * Implement this method if you want some other jobs to run after the current - * job. These will not execute until the current job has run successfully. + * Implement this method if you want some other jobs to run after the current job. These will not execute + * until the current job has run successfully. */ def next: Option[Job] = None /** - * Keep 100k tuples in memory by default before spilling - * Turn this up as high as you can without getting OOM. + * Keep 100k tuples in memory by default before spilling Turn this up as high as you can without getting + * OOM. * * This is ignored if there is a value set in the incoming jobConf on Hadoop */ @@ -211,15 +213,12 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { def classIdentifier: String = Config.md5Identifier(getClass) /** - * This is the exact config that is passed to the Cascading FlowConnector. - * By default: - * if there are no spill thresholds in mode.config, we replace with defaultSpillThreshold - * we overwrite io.serializations with ioSerializations - * we overwrite cascading.tuple.element.comparator.default to defaultComparator - * we add some scalding keys for debugging/logging + * This is the exact config that is passed to the Cascading FlowConnector. 
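A sketch of the next hook shown above, chaining a hypothetical follow-up job that runs only after this one succeeds:

    class ExtractJob(args: Args) extends Job(args) {
      Tsv(args("input")).read
        .project('userId)
        .write(Tsv(args("staging")))

      // AggregateJob is hypothetical; it is scheduled only after ExtractJob completes successfully
      override def next: Option[Job] = Some(new AggregateJob(args))
    }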
By default: if there are no spill + * thresholds in mode.config, we replace with defaultSpillThreshold we overwrite io.serializations with + * ioSerializations we overwrite cascading.tuple.element.comparator.default to defaultComparator we add some + * scalding keys for debugging/logging * - * Tip: override this method, call super, and ++ your additional - * map to add or overwrite more options + * Tip: override this method, call super, and ++ your additional map to add or overwrite more options * * This returns Map[AnyRef, AnyRef] for compatibility with older code */ @@ -230,18 +229,18 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { .setMapSideAggregationThreshold(defaultSpillThreshold) // This is setting a property for cascading/driven - AppProps.addApplicationFramework(null, - String.format("scalding:%s", scaldingVersion)) + AppProps.addApplicationFramework(null, String.format("scalding:%s", scaldingVersion)) val modeConf = mode match { - case h: HadoopMode => Config.fromHadoop(h.jobConf) + case h: HadoopMode => Config.fromHadoop(h.jobConf) case _: CascadingLocal => Config.unitTestDefault - case _ => Config.empty + case _ => Config.empty } val init = base ++ modeConf - defaultComparator.map(init.setDefaultComparator) + defaultComparator + .map(init.setDefaultComparator) .getOrElse(init) .setSerialization(Right(classOf[serialization.KryoHadoop]), ioSerializations) .addCascadingClassSerializationTokens(reflectedClasses) @@ -250,8 +249,10 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { .setCascadingAppId(name) .setScaldingFlowClass(getClass) .setArgs(args) - .maybeSetSubmittedTimestamp()._2 - .toMap.toMap[AnyRef, AnyRef] // linter:disable:TypeToType // the second one is to lift from String -> AnyRef + .maybeSetSubmittedTimestamp() + ._2 + .toMap + .toMap[AnyRef, AnyRef] // linter:disable:TypeToType // the second one is to lift from String -> AnyRef } private def reflectedClasses: Set[Class[_]] = @@ -259,7 +260,6 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { ReferencedClassFinder.findReferencedClasses(getClass) } else Set.empty - /** * This is here so that Mappable.toIterator can find an implicit config */ @@ -271,7 +271,8 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { * Specify a callback to run before the start of each flow step. * * Defaults to what Config.getReducerEstimator specifies. 
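The tip above amounts to roughly this pattern; the queue property is illustrative:

    class QueuedJob(args: Args) extends Job(args) {
      // call super, then ++ the extra settings you want to add or overwrite
      override def config: Map[AnyRef, AnyRef] =
        super.config ++ Map("mapreduce.job.queuename" -> "adhoc")
    }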
- * @see ExecutionContext.buildFlow + * @see + * ExecutionContext.buildFlow */ def stepStrategy: Option[FlowStepStrategy[_]] = None @@ -290,9 +291,9 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { case None => Failure(new IllegalStateException("sink taps are required")) case Some(flow) => - listeners.foreach { flow.addListener(_) } - stepListeners.foreach { flow.addStepListener(_) } - skipStrategy.foreach { flow.setFlowSkipStrategy(_) } + listeners.foreach(flow.addListener(_)) + stepListeners.foreach(flow.addStepListener(_)) + skipStrategy.foreach(flow.setFlowSkipStrategy(_)) stepStrategy.foreach { strategy => val existing = flow.getFlowStepStrategy val composed = @@ -301,7 +302,8 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { else FlowStepStrategies[Any].plus( existing.asInstanceOf[FlowStepStrategy[Any]], - strategy.asInstanceOf[FlowStepStrategy[Any]]) + strategy.asInstanceOf[FlowStepStrategy[Any]] + ) flow.setFlowStepStrategy(composed) } Success(flow) @@ -317,9 +319,8 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { // called after successful run // only override if you do not use flowDef - def clear(): Unit = { + def clear(): Unit = FlowStateMap.clear(flowDef) - } protected def handleStats(statsData: CascadingStats): Unit = { scaldingCascadingStats = Some(statsData) @@ -335,9 +336,8 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { val jobStats = Stats.getAllCustomCounters()(statsData) if (!jobStats.isEmpty) { println("Dumping custom counters:") - jobStats.foreach { - case (counter, value) => - println("%s\t%s".format(counter, value)) + jobStats.foreach { case (counter, value) => + println("%s\t%s".format(counter, value)) } } } @@ -351,7 +351,8 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { /** * Save the Flow object after a run to allow clients to inspect the job. - * @see HadoopPlatformJobTest + * @see + * HadoopPlatformJobTest */ @transient private[scalding] var completedFlow: Option[Flow[_]] = None @@ -371,31 +372,30 @@ class Job(val args: Args) extends FieldConversions with java.io.Serializable { def stepListeners: List[FlowStepListener] = Nil /** - * These are user-defined serializations IN-ADDITION to (but deduped) - * with the required serializations + * These are user-defined serializations IN-ADDITION to (but deduped) with the required serializations */ def ioSerializations: List[Class[_ <: HSerialization[_]]] = Nil + /** - * Override this if you want to customize comparisons/hashing for your job - * the config method overwrites using this before sending to cascading - * The one we use by default is needed used to make Joins in the - * Fields-API more robust to Long vs Int differences. - * If you only use the Typed-API, consider changing this to return None + * Override this if you want to customize comparisons/hashing for your job the config method overwrites + * using this before sending to cascading The one we use by default is needed used to make Joins in the + * Fields-API more robust to Long vs Int differences. If you only use the Typed-API, consider changing this + * to return None */ def defaultComparator: Option[Class[_ <: java.util.Comparator[_]]] = Some(classOf[IntegralComparator]) /** - * This is implicit so that a Source can be used as the argument - * to a join or other method that accepts Pipe. 
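Following the note above, a job that only uses the typed API might drop the integral comparator:

    class TypedOnlyJob(args: Args) extends Job(args) {
      // no Fields-API joins here, so the Long-vs-Int comparator is unnecessary
      override def defaultComparator: Option[Class[_ <: java.util.Comparator[_]]] = None
    }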
+ * This is implicit so that a Source can be used as the argument to a join or other method that accepts + * Pipe. */ implicit def read(src: Source): Pipe = src.read + /** - * This is only here for Java jobs which cannot automatically - * access the implicit Pipe => RichPipe which makes: pipe.write( ) - * convenient + * This is only here for Java jobs which cannot automatically access the implicit Pipe => RichPipe which + * makes: pipe.write( ) convenient */ - def write(pipe: Pipe, src: Source): Unit = { src.writeFrom(pipe) } + def write(pipe: Pipe, src: Source): Unit = src.writeFrom(pipe) /* * Need to be lazy to be used within pipes. @@ -443,10 +443,8 @@ class NamedPoolThreadFactory(name: String, makeDaemons: Boolean) extends ThreadF } /** - * Sets up an implicit dateRange to use in your sources and an implicit - * timezone. - * Example args: --date 2011-10-02 2011-10-04 --tz UTC - * If no timezone is given, Pacific is assumed. + * Sets up an implicit dateRange to use in your sources and an implicit timezone. Example args: --date + * 2011-10-02 2011-10-04 --tz UTC If no timezone is given, Pacific is assumed. */ trait DefaultDateRangeJob extends Job { //Get date implicits and PACIFIC and UTC vals. @@ -457,7 +455,7 @@ trait DefaultDateRangeJob extends Job { def defaultTimeZone = PACIFIC implicit lazy val tz: java.util.TimeZone = args.optional("tz") match { case Some(tzn) => java.util.TimeZone.getTimeZone(tzn) - case None => defaultTimeZone + case None => defaultTimeZone } // Optionally take a --period, which determines how many days each job runs over (rather @@ -476,7 +474,8 @@ trait DefaultDateRangeJob extends Job { (s, e) } - implicit lazy val dateRange: DateRange = DateRange(startDate, if (period > 0) startDate + Days(period) - Millisecs(1) else endDate) + implicit lazy val dateRange: DateRange = + DateRange(startDate, if (period > 0) startDate + Days(period) - Millisecs(1) else endDate) override def next: Option[Job] = if (period > 0) { @@ -484,7 +483,9 @@ trait DefaultDateRangeJob extends Job { if (nextStartDate + Days(period - 1) > endDate) None // we're done else // return a new job with the new startDate - Some(clone(args + ("date" -> List(nextStartDate.toString("yyyy-MM-dd"), endDate.toString("yyyy-MM-dd"))))) + Some( + clone(args + ("date" -> List(nextStartDate.toString("yyyy-MM-dd"), endDate.toString("yyyy-MM-dd")))) + ) } else None } @@ -495,15 +496,15 @@ trait UtcDateRangeJob extends DefaultDateRangeJob { } /** - * This is a simple job that allows you to launch Execution[T] - * instances using scalding.Tool and scald.rb. You cannot print - * the graph. + * This is a simple job that allows you to launch Execution[T] instances using scalding.Tool and scald.rb. You + * cannot print the graph. */ abstract class ExecutionJob[+T](args: Args) extends Job(args) { - import scala.concurrent.{ Await, ExecutionContext => scEC } + import scala.concurrent.{Await, ExecutionContext => scEC} + /** - * To avoid serialization issues, this should not be a val, but a def, - * and prefer to keep as much as possible inside the method. + * To avoid serialization issues, this should not be a val, but a def, and prefer to keep as much as + * possible inside the method. */ def execution: Execution[T] @@ -517,14 +518,19 @@ abstract class ExecutionJob[+T](args: Args) extends Job(args) { def result: Future[T] = resultPromise.future override def buildFlow: Flow[_] = - sys.error("ExecutionJobs do not have a single accessible flow. 
" + - "You cannot print the graph as it may be dynamically built or recurrent") + sys.error( + "ExecutionJobs do not have a single accessible flow. " + + "You cannot print the graph as it may be dynamically built or recurrent" + ) final override def run = { - val r = Config.tryFrom(config) + val r = Config + .tryFrom(config) .map { conf => - Await.result(execution.run(conf, mode)(concurrentExecutionContext), - scala.concurrent.duration.Duration.Inf) + Await.result( + execution.run(conf, mode)(concurrentExecutionContext), + scala.concurrent.duration.Duration.Inf + ) } if (!resultPromise.tryComplete(r)) { // The test framework can call this more than once. @@ -542,18 +548,15 @@ abstract class ExecutionJob[+T](args: Args) extends Job(args) { * failing command is printed to stdout. */ class ScriptJob(cmds: Iterable[String]) extends Job(Args("")) { - override def run = { + override def run = try { - cmds.dropWhile { - cmd: String => - { - new java.lang.ProcessBuilder("bash", "-c", cmd).start().waitFor() match { - case x if x != 0 => - println(cmd + " failed, exitStatus: " + x) - false - case 0 => true - } - } + cmds.dropWhile { cmd: String => + new java.lang.ProcessBuilder("bash", "-c", cmd).start().waitFor() match { + case x if x != 0 => + println(cmd + " failed, exitStatus: " + x) + false + case 0 => true + } }.isEmpty } catch { case e: Exception => { @@ -561,13 +564,13 @@ class ScriptJob(cmds: Iterable[String]) extends Job(Args("")) { false } } - } } /** * Allows custom counter verification logic when the job completes. */ trait CounterVerification extends Job { + /** * Verify counter values. The job will fail if this returns false or throws an exception. */ @@ -578,25 +581,22 @@ trait CounterVerification extends Job { */ def verifyCountersInTest: Boolean = true - override def listeners: List[FlowListener] = { + override def listeners: List[FlowListener] = if (this.mode.isInstanceOf[TestMode] && !this.verifyCountersInTest) { super.listeners } else { super.listeners :+ new StatsFlowListener(this.verifyCounters) } - } } private[scalding] case class FlowStepStrategies[A]() extends Semigroup[FlowStepStrategy[A]] { + /** * Returns a new FlowStepStrategy that runs both strategies in sequence. */ def plus(l: FlowStepStrategy[A], r: FlowStepStrategy[A]): FlowStepStrategy[A] = new FlowStepStrategy[A] { - override def apply( - flow: Flow[A], - predecessorSteps: JList[FlowStep[A]], - flowStep: FlowStep[A]): Unit = { + override def apply(flow: Flow[A], predecessorSteps: JList[FlowStep[A]], flowStep: FlowStep[A]): Unit = { l.apply(flow, predecessorSteps, flowStep) r.apply(flow, predecessorSteps, flowStep) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/JobStats.scala b/scalding-core/src/main/scala/com/twitter/scalding/JobStats.scala index 299012a4cc..78b6474965 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/JobStats.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/JobStats.scala @@ -12,30 +12,36 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import scala.collection.JavaConverters._ -import cascading.stats.{ CascadeStats, CascadingStats, FlowStats } +import cascading.stats.{CascadeStats, CascadingStats, FlowStats} -import scala.util.{ Failure, Try } +import scala.util.{Failure, Try} object JobStats { def empty: JobStats = new JobStats(Map("counters" -> Map.empty)) def apply(stats: CascadingStats): JobStats = { val m: Map[String, Any] = statsMap(stats) - new JobStats( - stats match { - case cs: CascadeStats => m - case fs: FlowStats => m + ("flow_step_stats" -> fs.getFlowStepStats.asScala.map(statsMap)) - }) + new JobStats(stats match { + case cs: CascadeStats => m + case fs: FlowStats => m + ("flow_step_stats" -> fs.getFlowStepStats.asScala.map(statsMap)) + }) } private def counterMap(stats: CascadingStats): Map[String, Map[String, Long]] = stats.getCounterGroups.asScala.map { group => - (group, stats.getCountersFor(group).asScala.map { counter => - (counter, stats.getCounterValue(group, counter)) - }.toMap) + ( + group, + stats + .getCountersFor(group) + .asScala + .map { counter => + (counter, stats.getCounterValue(group, counter)) + } + .toMap + ) }.toMap private def statsMap(stats: CascadingStats): Map[String, Any] = @@ -51,7 +57,8 @@ object JobStats { "failed" -> stats.isFailed, "skipped" -> stats.isSkipped, "stopped" -> stats.isStopped, - "successful" -> stats.isSuccessful) + "successful" -> stats.isSuccessful + ) /** * Returns the counters with Group String -> Counter String -> Long @@ -59,19 +66,21 @@ object JobStats { def toCounters(cMap: Any): Try[Map[String, Map[String, Long]]] = // This really sucks, but this is what happens when you let Map[String, Any] into your code cMap match { - case m: Map[_, _] => Try { - m.foldLeft(Map.empty[String, Map[String, Long]]) { - case (acc, (k: String, v: Any)) => v match { - case m: Map[_, _] => - acc + (k -> m.foldLeft(Map.empty[String, Long]) { - case (acc2, (k: String, v: Long)) => acc2 + (k -> v) - case (_, kv) => sys.error("inner k, v not (String, Long):" + kv) - }) - case _ => sys.error("inner values are not Maps: " + v) + case m: Map[_, _] => + Try { + m.foldLeft(Map.empty[String, Map[String, Long]]) { + case (acc, (k: String, v: Any)) => + v match { + case m: Map[_, _] => + acc + (k -> m.foldLeft(Map.empty[String, Long]) { + case (acc2, (k: String, v: Long)) => acc2 + (k -> v) + case (_, kv) => sys.error("inner k, v not (String, Long):" + kv) + }) + case _ => sys.error("inner values are not Maps: " + v) + } + case kv => sys.error("Map does not contain string keys: " + kv) } - case kv => sys.error("Map does not contain string keys: " + (kv)) } - } case _ => Failure(new Exception("%s not a Map[String, Any]".format(cMap))) } @@ -80,10 +89,9 @@ object JobStats { else { Try(a.toString.toInt) .recoverWith { case t: Throwable => Try(a.toString.toDouble) } - .recover { - case t: Throwable => - val s = a.toString - "\"%s\"".format(s) + .recover { case t: Throwable => + val s = a.toString + "\"%s\"".format(s) } .get .toString @@ -94,12 +102,14 @@ object JobStats { // If you want to write this, call toMap and use json, etc... 
to write it case class JobStats(toMap: Map[String, Any]) { def counters: Map[String, Map[String, Long]] = - toMap.get("counters") + toMap + .get("counters") .map(JobStats.toCounters(_)) .getOrElse(sys.error("counters missing from: " + toMap)) .get def toJson: String = - toMap.map { case (k, v) => "\"%s\" : %s".format(k, JobStats.toJsonValue(v)) } + toMap + .map { case (k, v) => "\"%s\" : %s".format(k, JobStats.toJsonValue(v)) } .mkString("{", ",", "}") } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/JobTest.scala b/scalding-core/src/main/scala/com/twitter/scalding/JobTest.scala index fd09fac010..ef2535e368 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/JobTest.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/JobTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import scala.collection.mutable @@ -24,17 +24,14 @@ import cascading.stats.CascadingStats import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred.JobConf - object JobTest { @deprecated(message = "Use the non-reflection based JobTest apply methods", since = "0.16.1") - def apply(jobName: String) = { + def apply(jobName: String) = new JobTest((args: Args) => Job(jobName, args)) - } - def apply(cons: (Args) => Job) = { + def apply(cons: (Args) => Job) = new JobTest(cons) - } def apply[T <: Job: Manifest] = { val cons = { (args: Args) => @@ -48,8 +45,8 @@ object JobTest { // We have to memoize to return the same buffer each time. private case class MemoizedSourceFn[T]( - fn: Source => Option[Iterable[T]], - setter: TupleSetter[T] + fn: Source => Option[Iterable[T]], + setter: TupleSetter[T] ) extends (Source => Option[mutable.Buffer[Tuple]]) { private val memo = mutable.Map[Source, Option[mutable.Buffer[Tuple]]]() private val lock = new Object() @@ -61,16 +58,13 @@ object JobTest { } object CascadeTest { - def apply(jobName: String) = { + def apply(jobName: String) = new CascadeTest((args: Args) => Job(jobName, args)) - } } /** - * This class is used to construct unit tests for scalding jobs. - * You should not use it unless you are writing tests. - * For examples of how to do that, see the tests included in the - * main scalding repository: + * This class is used to construct unit tests for scalding jobs. You should not use it unless you are writing + * tests. 
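A hedged sketch of the construction pattern, using the source and typedSink helpers defined below together with the usual arg/run/finish calls (the job under test and the expected counts are illustrative):

    JobTest(new MyWordCountJob(_))
      .arg("input", "input.txt")
      .arg("output", "output.tsv")
      .source(TextLine("input.txt"), List((0, "hello hello world")))
      .typedSink(TypedTsv[(String, Long)]("output.tsv")) { buf =>
        assert(buf.toMap == Map("hello" -> 2L, "world" -> 1L))
      }
      .run
      .finish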
For examples of how to do that, see the tests included in the main scalding repository: * https://github.com/twitter/scalding/tree/master/scalding-core/src/test/scala/com/twitter/scalding */ class JobTest(cons: (Args) => Job) { @@ -95,7 +89,7 @@ class JobTest(cons: (Args) => Job) { } private def sourceBuffer[T: TupleSetter](s: Source, tups: Iterable[T]): JobTest = { - source { src => if (src == s) Some(tups) else None } + source(src => if (src == s) Some(tups) else None) this } @@ -110,9 +104,8 @@ class JobTest(cons: (Args) => Job) { } /** - * Enables syntax like: - * .ifSource { case Tsv("in") => List(1, 2, 3) } - * We need a different function name from source to help the compiler + * Enables syntax like: .ifSource { case Tsv("in") => List(1, 2, 3) } We need a different function name from + * source to help the compiler */ def ifSource[T](fn: PartialFunction[Source, Iterable[T]])(implicit setter: TupleSetter[T]): JobTest = source(fn.lift) @@ -133,11 +126,13 @@ class JobTest(cons: (Args) => Job) { * you also modify the `finalize` function accordingly. */ sinkSet += s - callbacks += (() => op(buffer.map { tup => conv(new TupleEntry(tup)) })) + callbacks += (() => op(buffer.map(tup => conv(new TupleEntry(tup))))) this } - def typedSink[A](s: Source with TypedSink[A])(op: mutable.Buffer[A] => Unit)(implicit conv: TupleConverter[A]) = + def typedSink[A](s: Source with TypedSink[A])(op: mutable.Buffer[A] => Unit)(implicit + conv: TupleConverter[A] + ) = sink[A](s)(op) // Used to pass an assertion about a counter defined by the given group and name. @@ -216,8 +211,7 @@ class JobTest(cons: (Args) => Job) { } /** - * Run the clean ups and checks after a job - * has executed + * Run the clean ups and checks after a job has executed */ def postRunChecks(mode: Mode): Unit = { mode match { @@ -227,12 +221,12 @@ class JobTest(cons: (Args) => Job) { * you also modify the `finalize` function accordingly. */ // The sinks are written to disk, we need to clean them up: - sinkSet.foreach{ hadoopTest.finalize(_) } + sinkSet.foreach(hadoopTest.finalize(_)) } case _ => () } // Now it is time to check the test conditions: - callbacks.foreach { cb => cb() } + callbacks.foreach(cb => cb()) } // Registers test files, initializes the global mode, and creates a job. 
@@ -251,8 +245,14 @@ class JobTest(cons: (Args) => Job) { // create cascading 3.0 planner trace files during tests if (System.getenv.asScala.getOrElse("SCALDING_CASCADING3_DEBUG", "0") == "1") { System.setProperty("cascading.planner.plan.path", "target/test/cascading/traceplan/" + job.name) - System.setProperty("cascading.planner.plan.transforms.path", "target/test/cascading/traceplan/" + job.name + "/transform") - System.setProperty("cascading.planner.stats.path", "target/test/cascading/traceplan/" + job.name + "/stats") + System.setProperty( + "cascading.planner.plan.transforms.path", + "target/test/cascading/traceplan/" + job.name + "/transform" + ) + System.setProperty( + "cascading.planner.stats.path", + "target/test/cascading/traceplan/" + job.name + "/stats" + ) } if (validateJob) { @@ -262,12 +262,13 @@ class JobTest(cons: (Args) => Job) { // Make sure to clean the state: job.clear() - val next: Option[Job] = if (runNext) { job.next } else { None } + val next: Option[Job] = if (runNext) { job.next } + else { None } next match { case Some(nextjob) => runJob(nextjob, runNext) case None => postRunChecks(job.mode) - statsCallbacks.foreach { cb => cb(job.scaldingCascadingStats.get) } + statsCallbacks.foreach(cb => cb(job.scaldingCascadingStats.get)) } } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/JoinAlgorithms.scala b/scalding-core/src/main/scala/com/twitter/scalding/JoinAlgorithms.scala index bcc6e29a70..c156da6d3f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/JoinAlgorithms.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/JoinAlgorithms.scala @@ -12,14 +12,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.pipe._ import cascading.pipe.joiner._ import cascading.tuple._ -import java.util.{ Iterator => JIterator } +import java.util.{Iterator => JIterator} import java.util.Random // this one is serializable, scala.util.Random is not import scala.collection.JavaConverters._ @@ -40,72 +40,74 @@ trait JoinAlgorithms { def pipe: Pipe /** - * This method is used internally to implement all joins. - * You can use this directly if you want to implement something like a star join, - * e.g., when joining a single pipe to multiple other pipes. Make sure that you call this method - * on the larger pipe to make the grouping as efficient as possible. - * - * If you are only joining two pipes, then you are better off - * using joinWithSmaller/joinWithLarger/joinWithTiny/leftJoinWithTiny. + * This method is used internally to implement all joins. You can use this directly if you want to implement + * something like a star join, e.g., when joining a single pipe to multiple other pipes. Make sure that you + * call this method on the larger pipe to make the grouping as efficient as possible. * + * If you are only joining two pipes, then you are better off using + * joinWithSmaller/joinWithLarger/joinWithTiny/leftJoinWithTiny. */ - def coGroupBy(f: Fields, j: JoinMode = InnerJoinMode)(builder: CoGroupBuilder => GroupBuilder): Pipe = { + def coGroupBy(f: Fields, j: JoinMode = InnerJoinMode)(builder: CoGroupBuilder => GroupBuilder): Pipe = builder(new CoGroupBuilder(f, j)).schedule(pipe.getName, pipe) - } /** - * == WARNING == - * Doing a cross product with even a moderate sized pipe can - * create ENORMOUS output. 
The use-case here is attaching a constant (e.g. - * a number or a dictionary or set) to each row in another pipe. - * A common use-case comes from a groupAll and reduction to one row, - * then you want to send the results back out to every element in a pipe + * ==WARNING== + * Doing a cross product with even a moderate sized pipe can create ENORMOUS output. The use-case here is + * attaching a constant (e.g. a number or a dictionary or set) to each row in another pipe. A common + * use-case comes from a groupAll and reduction to one row, then you want to send the results back out to + * every element in a pipe * - * This uses joinWithTiny, so tiny pipe is replicated to all Mappers. If it - * is large, this will blow up. Get it: be foolish here and LOSE IT ALL! + * This uses joinWithTiny, so tiny pipe is replicated to all Mappers. If it is large, this will blow up. Get + * it: be foolish here and LOSE IT ALL! * * Use at your own risk. */ def crossWithTiny(tiny: Pipe) = { - val tinyJoin = tiny.map(() -> '__joinTiny__) { (u: Unit) => 1 } - pipe.map(() -> '__joinBig__) { (u: Unit) => 1 } + val tinyJoin = tiny.map(() -> '__joinTiny__)((u: Unit) => 1) + pipe + .map(() -> '__joinBig__)((u: Unit) => 1) .joinWithTiny('__joinBig__ -> '__joinTiny__, tinyJoin) .discard('__joinBig__, '__joinTiny__) } + /** - * Does a cross-product by doing a blockJoin. - * Useful when doing a large cross, if your cluster can take it. + * Does a cross-product by doing a blockJoin. Useful when doing a large cross, if your cluster can take it. * Prefer crossWithTiny */ def crossWithSmaller(p: Pipe, replication: Int = 20) = { - val smallJoin = p.map(() -> '__joinSmall__) { (u: Unit) => 1 } - pipe.map(() -> '__joinBig__) { (u: Unit) => 1 } + val smallJoin = p.map(() -> '__joinSmall__)((u: Unit) => 1) + pipe + .map(() -> '__joinBig__)((u: Unit) => 1) .blockJoinWithSmaller('__joinBig__ -> '__joinSmall__, smallJoin, rightReplication = replication) .discard('__joinBig__, '__joinSmall__) } /** - * Rename the collisions and return the pipe and the new names, - * and the fields to discard + * Rename the collisions and return the pipe and the new names, and the fields to discard */ - private def renameCollidingFields(p: Pipe, fields: Fields, - collisions: Set[Comparable[_]]): (Pipe, Fields, Fields) = { + private def renameCollidingFields( + p: Pipe, + fields: Fields, + collisions: Set[Comparable[_]] + ): (Pipe, Fields, Fields) = { // Here is how we rename colliding fields def rename(f: Comparable[_]): String = "__temp_join_" + f.toString // convert to list, so we are explicit that ordering is fixed below: val renaming = collisions.toList val orig = new Fields(renaming: _*) - val temp = new Fields(renaming.map { rename }: _*) + val temp = new Fields(renaming.map(rename): _*) // Now construct the new join keys, where we check for a rename // otherwise use the original key: - val newJoinKeys = new Fields(asList(fields) - .map { fname => - // If we renamed, get the rename, else just use the field - if (collisions(fname)) { - rename(fname) - } else fname - }: _*) + val newJoinKeys = new Fields( + asList(fields) + .map { fname => + // If we renamed, get the rename, else just use the field + if (collisions(fname)) { + rename(fname) + } else fname + }: _* + ) val renamedPipe = p.rename(orig -> temp) (renamedPipe, newJoinKeys, temp) } @@ -113,36 +115,37 @@ trait JoinAlgorithms { /** * Flip between LeftJoin to RightJoin */ - private def flipJoiner(j: Joiner): Joiner = { + private def flipJoiner(j: Joiner): Joiner = j match { case outer: 
OuterJoin => outer case inner: InnerJoin => inner - case left: LeftJoin => new RightJoin + case left: LeftJoin => new RightJoin case right: RightJoin => new LeftJoin - case other => throw new InvalidJoinModeException("cannot use joiner " + other + - " since it cannot be flipped safely") + case other => + throw new InvalidJoinModeException( + "cannot use joiner " + other + + " since it cannot be flipped safely" + ) } - } - def joinerToJoinModes(j: Joiner) = { + def joinerToJoinModes(j: Joiner) = j match { case i: InnerJoin => (InnerJoinMode, InnerJoinMode) - case l: LeftJoin => (InnerJoinMode, OuterJoinMode) + case l: LeftJoin => (InnerJoinMode, OuterJoinMode) case r: RightJoin => (OuterJoinMode, InnerJoinMode) case o: OuterJoin => (OuterJoinMode, OuterJoinMode) - case _ => throw new InvalidJoinModeException("cannot convert joiner to joiner modes") + case _ => throw new InvalidJoinModeException("cannot convert joiner to joiner modes") } - } /** - * Joins the first set of keys in the first pipe to the second set of keys in the second pipe. - * All keys must be unique UNLESS it is an inner join, then duplicated join keys are allowed, but - * the second copy is deleted (as cascading does not allow duplicated field names). + * Joins the first set of keys in the first pipe to the second set of keys in the second pipe. All keys must + * be unique UNLESS it is an inner join, then duplicated join keys are allowed, but the second copy is + * deleted (as cascading does not allow duplicated field names). * * Smaller here means that the values/key is smaller than the left. * - * Avoid going crazy adding more explicit join modes. Instead do for some other join - * mode with a larger pipe: + * Avoid going crazy adding more explicit join modes. Instead do for some other join mode with a larger + * pipe: * * {{{ * .then { pipe => other. @@ -150,7 +153,12 @@ trait JoinAlgorithms { * } * }}} */ - def joinWithSmaller(fs: (Fields, Fields), that: Pipe, joiner: Joiner = new InnerJoin, reducers: Int = -1) = { + def joinWithSmaller( + fs: (Fields, Fields), + that: Pipe, + joiner: Joiner = new InnerJoin, + reducers: Int = -1 + ) = { // If we are not doing an inner join, the join fields must be disjoint: val joiners = joinerToJoinModes(joiner) val intersection = asSet(fs._1).intersect(asSet(fs._2)) @@ -167,52 +175,53 @@ trait JoinAlgorithms { * So, we rename the right hand side to temporary names, then discard them after the operation */ val (renamedThat, newJoinFields, temp) = renameCollidingFields(that, fs._2, intersection) - pipe.coGroupBy(fs._1, joiners._1) { - _.coGroup(newJoinFields, renamedThat, joiners._2) - .reducers(reducers) - }.discard(temp) + pipe + .coGroupBy(fs._1, joiners._1) { + _.coGroup(newJoinFields, renamedThat, joiners._2) + .reducers(reducers) + } + .discard(temp) } else { - throw new IllegalArgumentException("join keys must be disjoint unless you are doing an InnerJoin. Found: " + - fs.toString + ", which overlap with: " + intersection.toString) + throw new IllegalArgumentException( + "join keys must be disjoint unless you are doing an InnerJoin. 
Found: " + + fs.toString + ", which overlap with: " + intersection.toString + ) } } /** * same as reversing the order on joinWithSmaller */ - def joinWithLarger(fs: (Fields, Fields), that: Pipe, joiner: Joiner = new InnerJoin, reducers: Int = -1) = { + def joinWithLarger(fs: (Fields, Fields), that: Pipe, joiner: Joiner = new InnerJoin, reducers: Int = -1) = that.joinWithSmaller((fs._2, fs._1), pipe, flipJoiner(joiner), reducers) - } /** - * This is joinWithSmaller with joiner parameter fixed to LeftJoin. If the item is absent on the right put null for the keys and values + * This is joinWithSmaller with joiner parameter fixed to LeftJoin. If the item is absent on the right put + * null for the keys and values */ - def leftJoinWithSmaller(fs: (Fields, Fields), that: Pipe, reducers: Int = -1) = { + def leftJoinWithSmaller(fs: (Fields, Fields), that: Pipe, reducers: Int = -1) = joinWithSmaller(fs, that, new LeftJoin, reducers) - } /** - * This is joinWithLarger with joiner parameter fixed to LeftJoin. If the item is absent on the right put null for the keys and values + * This is joinWithLarger with joiner parameter fixed to LeftJoin. If the item is absent on the right put + * null for the keys and values */ - def leftJoinWithLarger(fs: (Fields, Fields), that: Pipe, reducers: Int = -1) = { + def leftJoinWithLarger(fs: (Fields, Fields), that: Pipe, reducers: Int = -1) = joinWithLarger(fs, that, new LeftJoin, reducers) - } /** - * This does an assymmetric join, using cascading's "HashJoin". This only runs through - * this pipe once, and keeps the right hand side pipe in memory (but is spillable). - * - * Choose this when Left > max(mappers,reducers) * Right, or when the left side is three - * orders of magnitude larger. + * This does an assymmetric join, using cascading's "HashJoin". This only runs through this pipe once, and + * keeps the right hand side pipe in memory (but is spillable). * - * joins the first set of keys in the first pipe to the second set of keys in the second pipe. - * Duplicated join keys are allowed, but - * the second copy is deleted (as cascading does not allow duplicated field names). + * Choose this when Left > max(mappers,reducers) * Right, or when the left side is three orders of magnitude + * larger. * + * joins the first set of keys in the first pipe to the second set of keys in the second pipe. Duplicated + * join keys are allowed, but the second copy is deleted (as cascading does not allow duplicated field + * names). * - * == Warning == - * This does not work with outer joins, or right joins, only inner and - * left join versions are given. + * ==Warning== + * This does not work with outer joins, or right joins, only inner and left join versions are given. 
*/ def joinWithTiny(fs: (Fields, Fields), that: Pipe) = { val intersection = asSet(fs._1).intersect(asSet(fs._2)) @@ -220,46 +229,54 @@ trait JoinAlgorithms { new HashJoin(assignName(pipe), fs._1, assignName(that), fs._2, WrappedJoiner(new InnerJoin)) } else { val (renamedThat, newJoinFields, temp) = renameCollidingFields(that, fs._2, intersection) - (new HashJoin(assignName(pipe), fs._1, assignName(renamedThat), newJoinFields, WrappedJoiner(new InnerJoin))) + (new HashJoin( + assignName(pipe), + fs._1, + assignName(renamedThat), + newJoinFields, + WrappedJoiner(new InnerJoin) + )) .discard(temp) } } - def leftJoinWithTiny(fs: (Fields, Fields), that: Pipe) = { + def leftJoinWithTiny(fs: (Fields, Fields), that: Pipe) = //Rename these pipes to avoid cascading name conflicts new HashJoin(assignName(pipe), fs._1, assignName(that), fs._2, WrappedJoiner(new LeftJoin)) - } /** - * Performs a block join, otherwise known as a replicate fragment join (RF join). - * The input params leftReplication and rightReplication control the replication of the left and right - * pipes respectively. + * Performs a block join, otherwise known as a replicate fragment join (RF join). The input params + * leftReplication and rightReplication control the replication of the left and right pipes respectively. * - * This is useful in cases where the data has extreme skew. A symptom of this is that we may see a job stuck for - * a very long time on a small number of reducers. + * This is useful in cases where the data has extreme skew. A symptom of this is that we may see a job stuck + * for a very long time on a small number of reducers. * - * A block join is way to get around this: we add a random integer field and a replica field - * to every tuple in the left and right pipes. We then join on the original keys and - * on these new dummy fields. These dummy fields make it less likely that the skewed keys will - * be hashed to the same reducer. + * A block join is way to get around this: we add a random integer field and a replica field to every tuple + * in the left and right pipes. We then join on the original keys and on these new dummy fields. These dummy + * fields make it less likely that the skewed keys will be hashed to the same reducer. * - * The final data size is right * rightReplication + left * leftReplication - * but because of the fragmentation, we are guaranteed the same number of hits as the original join. + * The final data size is right * rightReplication + left * leftReplication but because of the + * fragmentation, we are guaranteed the same number of hits as the original join. * - * If the right pipe is really small then you are probably better off with a joinWithTiny. If however - * the right pipe is medium sized, then you are better off with a blockJoinWithSmaller, and a good rule - * of thumb is to set rightReplication = left.size / right.size and leftReplication = 1 + * If the right pipe is really small then you are probably better off with a joinWithTiny. If however the + * right pipe is medium sized, then you are better off with a blockJoinWithSmaller, and a good rule of thumb + * is to set rightReplication = left.size / right.size and leftReplication = 1 * - * Finally, if both pipes are of similar size, e.g. in case of a self join with a high data skew, - * then it makes sense to set leftReplication and rightReplication to be approximately equal. + * Finally, if both pipes are of similar size, e.g. 
in case of a self join with a high data skew, then it + * makes sense to set leftReplication and rightReplication to be approximately equal. * - * == Note == - * You can only use an InnerJoin or a LeftJoin with a leftReplication of 1 - * (or a RightJoin with a rightReplication of 1) when doing a blockJoin. + * ==Note== + * You can only use an InnerJoin or a LeftJoin with a leftReplication of 1 (or a RightJoin with a + * rightReplication of 1) when doing a blockJoin. */ - def blockJoinWithSmaller(fs: (Fields, Fields), - otherPipe: Pipe, rightReplication: Int = 1, leftReplication: Int = 1, - joiner: Joiner = new InnerJoin, reducers: Int = -1): Pipe = { + def blockJoinWithSmaller( + fs: (Fields, Fields), + otherPipe: Pipe, + rightReplication: Int = 1, + leftReplication: Int = 1, + joiner: Joiner = new InnerJoin, + reducers: Int = -1 + ): Pipe = { assert(rightReplication > 0, "Must specify a positive number for the right replication in block join") assert(leftReplication > 0, "Must specify a positive number for the left replication in block join") @@ -271,7 +288,8 @@ trait JoinAlgorithms { // Add the new dummy replication fields val newLeft = addReplicationFields(pipe, leftFields, leftReplication, rightReplication) - val newRight = addReplicationFields(otherPipe, rightFields, rightReplication, leftReplication, swap = true) + val newRight = + addReplicationFields(otherPipe, rightFields, rightReplication, leftReplication, swap = true) val leftJoinFields = Fields.join(fs._1, leftFields) val rightJoinFields = Fields.join(fs._2, rightFields) @@ -285,84 +303,97 @@ trait JoinAlgorithms { /** * Adds one random field and one replica field. */ - private def addReplicationFields(p: Pipe, f: Fields, - replication: Int, otherReplication: Int, swap: Boolean = false): Pipe = { + private def addReplicationFields( + p: Pipe, + f: Fields, + replication: Int, + otherReplication: Int, + swap: Boolean = false + ): Pipe = /** - * We need to seed exactly once and capture that seed. If we let - * each task create a seed, a restart will change the computation, - * and this could result in subtle bugs. + * We need to seed exactly once and capture that seed. If we let each task create a seed, a restart will + * change the computation, and this could result in subtle bugs. */ p.using(new Random(Seed) with Stateful).flatMap(() -> f) { (rand: Random, _: Unit) => val rfs = getReplicationFields(rand, replication, otherReplication) - if (swap) rfs.map { case (i, j) => (j, i) } else rfs + if (swap) rfs.map { case (i, j) => (j, i) } + else rfs } - } /** * Returns a list of the dummy replication fields used to replicate groups in skewed joins. * - * For example, suppose you have two pipes P1 and P2. While performing a skewed join for a particular - * key K, you want to replicate every row in P1 with this key 3 times, and every row in P2 with this - * key 5 times. + * For example, suppose you have two pipes P1 and P2. While performing a skewed join for a particular key K, + * you want to replicate every row in P1 with this key 3 times, and every row in P2 with this key 5 times. * * Then: * - * - For the P1 replication, the first element of each tuple is the same random integer between 0 and 4, - * and the second element of each tuple is the index of the replication (between 0 and 2). This first - * random element guarantees that we will match exactly one random row in P2 with the same key. - * - Similarly for the P2 replication. 
+ * - For the P1 replication, the first element of each tuple is the same random integer between 0 and 4, + * and the second element of each tuple is the index of the replication (between 0 and 2). This first + * random element guarantees that we will match exactly one random row in P2 with the same key. + * - Similarly for the P2 replication. * * Examples: * - * getReplicationFields(3, 5) - * => List( (1, 0), (1, 1), (1, 2) ) + * getReplicationFields(3, 5) + * => List( (1, 0), (1, 1), (1, 2) ) * - * getReplicationFields(5, 3) - * => List( (2, 0), (2, 1), (2, 2), (2, 3), (2, 4) ) + * getReplicationFields(5, 3) + * => List( (2, 0), (2, 1), (2, 2), (2, 3), (2, 4) ) */ - private def getReplicationFields(r: Random, replication: Int, otherReplication: Int): IndexedSeq[(Int, Int)] = { + private def getReplicationFields( + r: Random, + replication: Int, + otherReplication: Int + ): IndexedSeq[(Int, Int)] = { assert(replication >= 1 && otherReplication >= 1, "Replication counts must be >= 1") val rand = r.nextInt(otherReplication) - (0 until replication).map { rep => (rand, rep) } + (0 until replication).map(rep => (rand, rep)) } - private def assertValidJoinMode(joiner: Joiner, left: Int, right: Int): Unit = { + private def assertValidJoinMode(joiner: Joiner, left: Int, right: Int): Unit = (joiner, left, right) match { case (i: InnerJoin, _, _) => () - case (k: LeftJoin, 1, _) => () + case (k: LeftJoin, 1, _) => () case (m: RightJoin, _, 1) => () case (j, l, r) => throw new InvalidJoinModeException( - "you cannot use joiner " + j + " with left replication " + l + " and right replication " + r) + "you cannot use joiner " + j + " with left replication " + l + " and right replication " + r + ) } - } /** * Performs a skewed join, which is useful when the data has extreme skew. * - * For example, imagine joining a pipe of Twitter's follow graph against a pipe of user genders, - * in order to find the gender distribution of the accounts every Twitter user follows. Since celebrities - * (e.g., Justin Bieber and Lady Gaga) have a much larger follower base than other users, and (under - * a standard join algorithm) all their followers get sent to the same reducer, the job will likely be - * stuck on a few reducers for a long time. A skewed join attempts to alleviate this problem. + * For example, imagine joining a pipe of Twitter's follow graph against a pipe of user genders, in order to + * find the gender distribution of the accounts every Twitter user follows. Since celebrities (e.g., Justin + * Bieber and Lady Gaga) have a much larger follower base than other users, and (under a standard join + * algorithm) all their followers get sent to the same reducer, the job will likely be stuck on a few + * reducers for a long time. A skewed join attempts to alleviate this problem. * * This works as follows: * - * 1. First, we sample from the left and right pipes with some small probability, in order to determine - * approximately how often each join key appears in each pipe. - * 2. We use these estimated counts to replicate the join keys, according to the given replication strategy. - * 3. Finally, we join the replicated pipes together. + * 1. First, we sample from the left and right pipes with some small probability, in order to determine + * approximately how often each join key appears in each pipe. 2. We use these estimated counts to + * replicate the join keys, according to the given replication strategy. 3. Finally, we join the + * replicated pipes together. 
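   *
   * A hedged sketch of the call shape (pipe and field names are illustrative only):
   * {{{
   * // follows: ('srcUser, 'dstUser), heavily skewed on 'dstUser; genders: ('userId, 'gender)
   * val joined = follows.skewJoinWithSmaller(
   *   ('dstUser -> 'userId), genders,
   *   sampleRate = 0.001, reducers = 200)
   * }}}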
* - * @param sampleRate This controls how often we sample from the left and right pipes when estimating key counts. - * @param replicator Algorithm for determining how much to replicate a join key in the left and right pipes. + * @param sampleRate + * This controls how often we sample from the left and right pipes when estimating key counts. + * @param replicator + * Algorithm for determining how much to replicate a join key in the left and right pipes. * * Note: since we do not set the replication counts, only inner joins are allowed. (Otherwise, replicated * rows would stay replicated when there is no counterpart in the other pipe.) */ - def skewJoinWithSmaller(fs: (Fields, Fields), otherPipe: Pipe, - sampleRate: Double = 0.001, reducers: Int = -1, - replicator: SkewReplication = SkewReplicationA()): Pipe = { + def skewJoinWithSmaller( + fs: (Fields, Fields), + otherPipe: Pipe, + sampleRate: Double = 0.001, + reducers: Int = -1, + replicator: SkewReplication = SkewReplicationA() + ): Pipe = { assert(sampleRate > 0 && sampleRate < 1, "Sampling rate for skew joins must lie strictly between 0 and 1") @@ -384,15 +415,17 @@ trait JoinAlgorithms { val sampledCountFields = new Fields(leftSampledCountField, rightSampledCountField) /** - * We need to seed exactly once and capture that seed. If we let - * each task create a seed, a restart will change the computation, - * and this could result in subtle bugs. + * We need to seed exactly once and capture that seed. If we let each task create a seed, a restart will + * change the computation, and this could result in subtle bugs. */ - val sampledLeft = pipe.sample(sampleRate, Seed) - .groupBy(fs._1) { _.size(leftSampledCountField) } - val sampledRight = rightPipe.sample(sampleRate, Seed) - .groupBy(rightResolvedJoinFields) { _.size(rightSampledCountField) } - val sampledCounts = sampledLeft.joinWithSmaller(fs._1 -> rightResolvedJoinFields, sampledRight, joiner = new OuterJoin) + val sampledLeft = pipe + .sample(sampleRate, Seed) + .groupBy(fs._1)(_.size(leftSampledCountField)) + val sampledRight = rightPipe + .sample(sampleRate, Seed) + .groupBy(rightResolvedJoinFields)(_.size(rightSampledCountField)) + val sampledCounts = sampledLeft + .joinWithSmaller(fs._1 -> rightResolvedJoinFields, sampledRight, joiner = new OuterJoin) .project(Fields.join(mergedJoinKeys, sampledCountFields)) .map(mergedJoinKeys -> mergedJoinKeys) { t: cascading.tuple.Tuple => // Make the outer join look like an inner join so that we can join @@ -422,10 +455,25 @@ trait JoinAlgorithms { val leftReplicationFields = new Fields("__LEFT_RAND__", "__LEFT_REP__") val rightReplicationFields = new Fields("__RIGHT_REP__", "__RIGHT_RAND__") - val replicatedLeft = skewReplicate(pipe, sampledCounts, fs._1, sampledCountFields, leftReplicationFields, - replicator, reducers) - val replicatedRight = skewReplicate(rightPipe, sampledCounts, rightResolvedJoinFields, sampledCountFields, rightReplicationFields, - replicator, reducers, true) + val replicatedLeft = skewReplicate( + pipe, + sampledCounts, + fs._1, + sampledCountFields, + leftReplicationFields, + replicator, + reducers + ) + val replicatedRight = skewReplicate( + rightPipe, + sampledCounts, + rightResolvedJoinFields, + sampledCountFields, + rightReplicationFields, + replicator, + reducers, + true + ) // 3. Finally, join the replicated pipes together. 
val leftJoinFields = Fields.join(fs._1, leftReplicationFields) @@ -441,36 +489,49 @@ trait JoinAlgorithms { else joinedPipe.discard(dupeFields) } - def skewJoinWithLarger(fs: (Fields, Fields), otherPipe: Pipe, - sampleRate: Double = 0.001, reducers: Int = -1, - replicator: SkewReplication = SkewReplicationA()): Pipe = { + def skewJoinWithLarger( + fs: (Fields, Fields), + otherPipe: Pipe, + sampleRate: Double = 0.001, + reducers: Int = -1, + replicator: SkewReplication = SkewReplicationA() + ): Pipe = otherPipe.skewJoinWithSmaller((fs._2, fs._1), pipe, sampleRate, reducers, replicator) - } /** - * Helper method for performing skewed joins. This replicates the rows in {pipe} according - * to the estimated counts in {sampledCounts}. + * Helper method for performing skewed joins. This replicates the rows in {pipe} according to the estimated + * counts in {sampledCounts}. * - * @param pipe The pipe to be replicated. - * @param sampledCounts A pipe containing, for each key, the estimated counts of how often - * this key appeared in the samples of the original left and right pipes. - * @param replicator Strategy for how the pipe is replicated. - * @param isPipeOnRight Set to true when replicating the right pipe. + * @param pipe + * The pipe to be replicated. + * @param sampledCounts + * A pipe containing, for each key, the estimated counts of how often this key appeared in the samples of + * the original left and right pipes. + * @param replicator + * Strategy for how the pipe is replicated. + * @param isPipeOnRight + * Set to true when replicating the right pipe. */ - private def skewReplicate(pipe: Pipe, sampledCounts: Pipe, joinFields: Fields, - countFields: Fields, replicationFields: Fields, - replicator: SkewReplication, - numReducers: Int = -1, isPipeOnRight: Boolean = false) = { + private def skewReplicate( + pipe: Pipe, + sampledCounts: Pipe, + joinFields: Fields, + countFields: Fields, + replicationFields: Fields, + replicator: SkewReplication, + numReducers: Int = -1, + isPipeOnRight: Boolean = false + ) = { // Rename the fields to prepare for the leftJoin below. - val renamedFields = joinFields.iterator.asScala.toList.map { field => "__RENAMED_" + field + "__" } - val renamedSampledCounts = sampledCounts.rename(joinFields -> renamedFields) + val renamedFields = joinFields.iterator.asScala.toList.map(field => "__RENAMED_" + field + "__") + val renamedSampledCounts = sampledCounts + .rename(joinFields -> renamedFields) .project(Fields.join(renamedFields, countFields)) /** - * We need to seed exactly once and capture that seed. If we let - * each task create a seed, a restart will change the computation, - * and this could result in subtle bugs. + * We need to seed exactly once and capture that seed. If we let each task create a seed, a restart will + * change the computation, and this could result in subtle bugs. */ pipe // Join the pipe against the sampled counts, so that we know approximately how often each @@ -492,8 +553,8 @@ trait JoinAlgorithms { class InvalidJoinModeException(args: String) extends Exception(args) /** - * Wraps a Joiner instance so that the active FlowProcess may be noted. This allows features of Scalding that need - * access to a FlowProcess (e.g., counters) to function properly inside a Joiner. + * Wraps a Joiner instance so that the active FlowProcess may be noted. This allows features of Scalding that + * need access to a FlowProcess (e.g., counters) to function properly inside a Joiner. 
*/ private[scalding] class WrappedJoiner(val joiner: Joiner) extends Joiner { override def getIterator(joinerClosure: JoinerClosure): JIterator[Tuple] = { @@ -511,13 +572,13 @@ private[scalding] class WrappedJoiner(val joiner: Joiner) extends Joiner { } private[scalding] object WrappedJoiner { + /** * Wrap the given Joiner in a WrappedJoiner instance if it is not already wrapped. */ - def apply(joiner: Joiner): WrappedJoiner = { + def apply(joiner: Joiner): WrappedJoiner = joiner match { case wrapped: WrappedJoiner => wrapped - case _ => new WrappedJoiner(joiner) + case _ => new WrappedJoiner(joiner) } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/LibJarsExpansion.scala b/scalding-core/src/main/scala/com/twitter/scalding/LibJarsExpansion.scala index ad3f82eb61..9f82d4cc50 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/LibJarsExpansion.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/LibJarsExpansion.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.io.File @@ -27,9 +27,12 @@ object ExpandLibJarsGlobs { System.arraycopy(inputArgs, 0, newArgs, 0, inputArgs.length) val existing = newArgs(libJarsIdx) - val replacement = existing.split(",").flatMap { element => - fromGlob(element).map(_.toString) - }.mkString(",") + val replacement = existing + .split(",") + .flatMap { element => + fromGlob(element).map(_.toString) + } + .mkString(",") newArgs(libJarsIdx) = replacement newArgs @@ -39,16 +42,16 @@ object ExpandLibJarsGlobs { //tree from Duncan McGregor @ http://stackoverflow.com/questions/2637643/how-do-i-list-all-files-in-a-subdirectory-in-scala private[this] def tree(root: File, skipHidden: Boolean = false): Stream[File] = if (!root.exists || (skipHidden && root.isHidden)) Stream.empty - else root #:: ( - root.listFiles match { - case null => Stream.empty + else + root #:: (root.listFiles match { + case null => Stream.empty case files => files.toStream.flatMap(tree(_, skipHidden)) }) def fromGlob(glob: String, filesOnly: Boolean = true): Stream[Path] = { import java.nio.file._ val fs = FileSystems.getDefault() - val expandedSlash = if (glob.endsWith("/")) s"${glob}/*" else glob + val expandedSlash = if (glob.endsWith("/")) s"$glob/*" else glob val absoluteGlob = fs.getPath(expandedSlash).toAbsolutePath val matcher: PathMatcher = fs.getPathMatcher(s"glob:$absoluteGlob") @@ -61,4 +64,4 @@ object ExpandLibJarsGlobs { if (filesOnly) globMatchingPaths.filter(_.toFile.isFile) else globMatchingPaths } -} \ No newline at end of file +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/LineNumber.scala b/scalding-core/src/main/scala/com/twitter/scalding/LineNumber.scala index 58f44e5929..29e988bb93 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/LineNumber.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/LineNumber.scala @@ -12,15 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.slf4j.{ Logger, LoggerFactory } +import org.slf4j.{Logger, LoggerFactory} object LineNumber { + /** - * depth 0 means the StackTraceElement for the caller - * of this method (skipping getCurrent and the Thread.currentThread + * depth 0 means the StackTraceElement for the caller of this method (skipping getCurrent and the + * Thread.currentThread */ def getCurrent(depth: Int): StackTraceElement = getCurrent(depth, Thread.currentThread().getStackTrace) @@ -34,8 +35,12 @@ object LineNumber { private val LOG: Logger = LoggerFactory.getLogger(LineNumber.getClass) - private[this] def ignorePaths(classPrefixes: Set[String], stack: Seq[StackTraceElement]): Option[StackTraceElement] = - stack.drop(2) + private[this] def ignorePaths( + classPrefixes: Set[String], + stack: Seq[StackTraceElement] + ): Option[StackTraceElement] = + stack + .drop(2) .dropWhile { ste => classPrefixes.exists { prefix => ste.getClassName.startsWith(prefix) @@ -72,20 +77,21 @@ object LineNumber { if (it.hasNext) Some(it.next) else None - val scaldingJobCaller = headOption(stack - .iterator - .filter { se => se.getClassName.startsWith(scaldingPrefix) } - .filter { se => - try { - val cls = Class.forName(se.getClassName) - jobClass.isAssignableFrom(cls) - } catch { - // skip classes that we don't find. We seem to run into this for some lambdas on Scala 2.12 in travis - case cnf: ClassNotFoundException => - LOG.warn(s"Skipping $se.getClassName as we can't find the class") - false + val scaldingJobCaller = headOption( + stack.iterator + .filter(se => se.getClassName.startsWith(scaldingPrefix)) + .filter { se => + try { + val cls = Class.forName(se.getClassName) + jobClass.isAssignableFrom(cls) + } catch { + // skip classes that we don't find. We seem to run into this for some lambdas on Scala 2.12 in travis + case cnf: ClassNotFoundException => + LOG.warn(s"Skipping $se.getClassName as we can't find the class") + false + } } - }) + ) scaldingJobCaller .orElse(nonScalding) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/MemoryTap.scala b/scalding-core/src/main/scala/com/twitter/scalding/MemoryTap.scala index 248c4ae46e..c722a6c0ad 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/MemoryTap.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/MemoryTap.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tap.Tap @@ -25,12 +25,11 @@ import scala.collection.mutable.Buffer import scala.collection.JavaConverters._ class MemoryTap[In, Out](val scheme: Scheme[Properties, In, Out, _, _], val tupleBuffer: Buffer[Tuple]) - extends Tap[Properties, In, Out](scheme) { + extends Tap[Properties, In, Out](scheme) { private var modifiedTime: Long = 1L - def updateModifiedTime(): Unit = { + def updateModifiedTime(): Unit = modifiedTime = System.currentTimeMillis - } override def createResource(conf: Properties) = { updateModifiedTime() @@ -44,9 +43,8 @@ class MemoryTap[In, Out](val scheme: Scheme[Properties, In, Out, _, _], val tupl override def getModifiedTime(conf: Properties) = if (resourceExists(conf)) modifiedTime else 0L override lazy val getIdentifier: String = scala.math.random.toString - override def openForRead(flowProcess: FlowProcess[Properties], input: In) = { + override def openForRead(flowProcess: FlowProcess[Properties], input: In) = new TupleEntryChainIterator(scheme.getSourceFields, tupleBuffer.toIterator.asJava) - } override def openForWrite(flowProcess: FlowProcess[Properties], output: Out): TupleEntryCollector = { tupleBuffer.clear() @@ -59,7 +57,8 @@ class MemoryTap[In, Out](val scheme: Scheme[Properties, In, Out, _, _], val tupl } -class MemoryTupleEntryCollector(val tupleBuffer: Buffer[Tuple], mt: MemoryTap[_, _]) extends TupleEntryCollector { +class MemoryTupleEntryCollector(val tupleBuffer: Buffer[Tuple], mt: MemoryTap[_, _]) + extends TupleEntryCollector { override def collect(tupleEntry: TupleEntry): Unit = { mt.updateModifiedTime() diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Mode.scala b/scalding-core/src/main/scala/com/twitter/scalding/Mode.scala index ba33634746..d2ee2c294d 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Mode.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Mode.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.flow.FlowConnector @@ -25,10 +25,10 @@ case class ModeException(message: String) extends RuntimeException(message) case class ModeLoadException(message: String, origin: ClassNotFoundException) extends RuntimeException(origin) object Mode { + /** - * This is a Args and a Mode together. It is used purely as - * a work-around for the fact that Job only accepts an Args object, - * but needs a Mode inside. + * This is a Args and a Mode together. It is used purely as a work-around for the fact that Job only accepts + * an Args object, but needs a Mode inside. 
*/ private class ArgsWithMode(argsMap: Map[String, List[String]], val mode: Mode) extends Args(argsMap) { override def +(keyvals: (String, Iterable[String])): Args = @@ -41,7 +41,7 @@ object Mode { /** Get a Mode if this Args was the result of a putMode */ def getMode(args: Args): Option[Mode] = args match { case withMode: ArgsWithMode => Some(withMode.mode) - case _ => None + case _ => None } val CascadingFlowConnectorClassKey = "cascading.flow.connector.class" @@ -51,7 +51,8 @@ object Mode { val DefaultHadoopFlowProcess = "cascading.flow.hadoop.HadoopFlowProcess" val DefaultHadoop2Mr1FlowConnector = "cascading.flow.hadoop2.Hadoop2MR1FlowConnector" - val DefaultHadoop2Mr1FlowProcess = "cascading.flow.hadoop.HadoopFlowProcess" // no Hadoop2MR1FlowProcess as of Cascading 3.0.0-wip-75? + val DefaultHadoop2Mr1FlowProcess = + "cascading.flow.hadoop.HadoopFlowProcess" // no Hadoop2MR1FlowProcess as of Cascading 3.0.0-wip-75? val DefaultHadoop2TezFlowConnector = "cascading.flow.tez.Hadoop2TezFlowConnector" val DefaultHadoop2TezFlowProcess = "cascading.flow.tez.Hadoop2TezFlowProcess" @@ -66,7 +67,9 @@ object Mode { if (args.boolean("local")) Local(strictSources) - else if (args.boolean("hdfs")) /* FIXME: should we start printing deprecation warnings ? It's okay to set manually c.f.*.class though */ + else if ( + args.boolean("hdfs") + ) /* FIXME: should we start printing deprecation warnings ? It's okay to set manually c.f.*.class though */ Hdfs(strictSources, config) else if (args.boolean("hadoop1")) { config.set(CascadingFlowConnectorClassKey, DefaultHadoopFlowConnector) @@ -81,10 +84,15 @@ object Mode { config.set(CascadingFlowProcessClassKey, DefaultHadoop2TezFlowProcess) Hdfs(strictSources, config) } else - throw ArgsException("[ERROR] Mode must be one of --local, --hadoop1, --hadoop2-mr1, --hadoop2-tez or --hdfs, you provided none") + throw ArgsException( + "[ERROR] Mode must be one of --local, --hadoop1, --hadoop2-mr1, --hadoop2-tez or --hdfs, you provided none" + ) } - @deprecated("Use CascadingMode.cast(mode) or pattern match directly on known CascadingModes (e.g. Hdfs, Local)", "0.18.0") + @deprecated( + "Use CascadingMode.cast(mode) or pattern match directly on known CascadingModes (e.g. Hdfs, Local)", + "0.18.0" + ) implicit class DeprecatedCascadingModeMethods(val mode: Mode) extends AnyVal { private def cmode: CascadingMode = CascadingMode.cast(mode) @@ -111,4 +119,3 @@ trait Mode extends java.io.Serializable { */ def newWriter(): Execution.Writer } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Operations.scala b/scalding-core/src/main/scala/com/twitter/scalding/Operations.scala index 2f1337f2ab..5c3f94701c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Operations.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Operations.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding { import cascading.operation._ @@ -22,7 +22,7 @@ package com.twitter.scalding { import com.twitter.chill.MeatLocker import scala.collection.JavaConverters._ - import com.twitter.algebird.{ Semigroup, SummingWithHitsCache, AdaptiveCache } + import com.twitter.algebird.{AdaptiveCache, Semigroup, SummingWithHitsCache} import com.twitter.scalding.mathematics.Poisson import serialization.Externalizer import scala.util.Try @@ -34,9 +34,14 @@ package com.twitter.scalding { } } - class FlatMapFunction[S, T](@transient fn: S => TraversableOnce[T], fields: Fields, - conv: TupleConverter[S], set: TupleSetter[T]) - extends BaseOperation[Any](fields) with Function[Any] with ScaldingPrepare[Any] { + class FlatMapFunction[S, T]( + @transient fn: S => TraversableOnce[T], + fields: Fields, + conv: TupleConverter[S], + set: TupleSetter[T] + ) extends BaseOperation[Any](fields) + with Function[Any] + with ScaldingPrepare[Any] { val lockedFn = Externalizer(fn) /** @@ -44,17 +49,17 @@ package com.twitter.scalding { */ private[scalding] def getFunction = fn - def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = { + def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = lockedFn.get(conv(functionCall.getArguments)).foreach { arg: T => val this_tup = set(arg) functionCall.getOutputCollector.add(this_tup) } - } } - class MapFunction[S, T](@transient fn: S => T, fields: Fields, - conv: TupleConverter[S], set: TupleSetter[T]) - extends BaseOperation[Any](fields) with Function[Any] with ScaldingPrepare[Any] { + class MapFunction[S, T](@transient fn: S => T, fields: Fields, conv: TupleConverter[S], set: TupleSetter[T]) + extends BaseOperation[Any](fields) + with Function[Any] + with ScaldingPrepare[Any] { val lockedFn = Externalizer(fn) def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = { val res = lockedFn.get(conv(functionCall.getArguments)) @@ -65,16 +70,19 @@ package com.twitter.scalding { /* The IdentityFunction puts empty nodes in the cascading graph. We use these to nudge the cascading planner in some edge cases. 
- */ + */ object IdentityFunction - extends BaseOperation[Any](Fields.ALL) with Function[Any] with ScaldingPrepare[Any] { - def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = { + extends BaseOperation[Any](Fields.ALL) + with Function[Any] + with ScaldingPrepare[Any] { + def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = functionCall.getOutputCollector.add(functionCall.getArguments) - } } class CleanupIdentityFunction(@transient fn: () => Unit) - extends BaseOperation[Any](Fields.ALL) with Filter[Any] with ScaldingPrepare[Any] { + extends BaseOperation[Any](Fields.ALL) + with Filter[Any] + with ScaldingPrepare[Any] { val lockedEf = Externalizer(fn) @@ -84,9 +92,14 @@ package com.twitter.scalding { Try(lockedEf.get).foreach(_()) } - class CollectFunction[S, T](@transient fn: PartialFunction[S, T], fields: Fields, - conv: TupleConverter[S], set: TupleSetter[T]) - extends BaseOperation[Any](fields) with Function[Any] with ScaldingPrepare[Any] { + class CollectFunction[S, T]( + @transient fn: PartialFunction[S, T], + fields: Fields, + conv: TupleConverter[S], + set: TupleSetter[T] + ) extends BaseOperation[Any](fields) + with Function[Any] + with ScaldingPrepare[Any] { val lockedFn = Externalizer(fn) @@ -101,47 +114,50 @@ package com.twitter.scalding { } /** - * An implementation of map-side combining which is appropriate for associative and commutative functions - * If a cacheSize is given, it is used, else we query - * the config for cascading.aggregateby.threshold (standard cascading param for an equivalent case) - * else we use a default value of 100,000 + * An implementation of map-side combining which is appropriate for associative and commutative functions If + * a cacheSize is given, it is used, else we query the config for cascading.aggregateby.threshold (standard + * cascading param for an equivalent case) else we use a default value of 100,000 * - * This keeps a cache of keys up to the cache-size, summing values as keys collide - * On eviction, or completion of this Operation, the key-value pairs are put into outputCollector. + * This keeps a cache of keys up to the cache-size, summing values as keys collide On eviction, or + * completion of this Operation, the key-value pairs are put into outputCollector. * - * This NEVER spills to disk and generally never be a performance penalty. If you have - * poor locality in the keys, you just don't get any benefit but little added cost. + * This NEVER spills to disk and generally never be a performance penalty. If you have poor locality in the + * keys, you just don't get any benefit but little added cost. * - * Note this means that you may still have repeated keys in the output even on a single mapper - * since the key space may be so large that you can't fit all of them in the cache at the same - * time. + * Note this means that you may still have repeated keys in the output even on a single mapper since the key + * space may be so large that you can't fit all of them in the cache at the same time. 
* * You can use this with the Fields-API by doing: * {{{ - * val msr = new MapsideReduce(Semigroup.from(fn), 'key, 'value, None) - * // MUST map onto the same key,value space (may be multiple fields) - * val mapSideReduced = pipe.eachTo(('key, 'value) -> ('key, 'value)) { _ => msr } + * val msr = new MapsideReduce(Semigroup.from(fn), 'key, 'value, None) + * // MUST map onto the same key,value space (may be multiple fields) + * val mapSideReduced = pipe.eachTo(('key, 'value) -> ('key, 'value)) { _ => msr } * }}} - * That said, this is equivalent to AggregateBy, and the only value is that it is much simpler than AggregateBy. - * AggregateBy assumes several parallel reductions are happening, and thus has many loops, and array lookups - * to deal with that. Since this does many fewer allocations, and has a smaller code-path it may be faster for - * the typed-API. + * That said, this is equivalent to AggregateBy, and the only value is that it is much simpler than + * AggregateBy. AggregateBy assumes several parallel reductions are happening, and thus has many loops, and + * array lookups to deal with that. Since this does many fewer allocations, and has a smaller code-path it + * may be faster for the typed-API. */ object MapsideReduce { val COUNTER_GROUP = "MapsideReduce" } class MapsideReduce[V]( - @transient commutativeSemigroup: Semigroup[V], - keyFields: Fields, valueFields: Fields, - cacheSize: Option[Int])(implicit conv: TupleConverter[V], set: TupleSetter[V]) - extends BaseOperation[MapsideCache[Tuple, V]](Fields.join(keyFields, valueFields)) - with Function[MapsideCache[Tuple, V]] - with ScaldingPrepare[MapsideCache[Tuple, V]] { + @transient commutativeSemigroup: Semigroup[V], + keyFields: Fields, + valueFields: Fields, + cacheSize: Option[Int] + )(implicit conv: TupleConverter[V], set: TupleSetter[V]) + extends BaseOperation[MapsideCache[Tuple, V]](Fields.join(keyFields, valueFields)) + with Function[MapsideCache[Tuple, V]] + with ScaldingPrepare[MapsideCache[Tuple, V]] { val boxedSemigroup = Externalizer(commutativeSemigroup) - override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[MapsideCache[Tuple, V]]): Unit = { + override def prepare( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[Tuple, V]] + ): Unit = { //Set up the context: implicit val sg: Semigroup[V] = boxedSemigroup.get val cache = MapsideCache[Tuple, V](cacheSize, flowProcess) @@ -149,7 +165,10 @@ package com.twitter.scalding { } @inline - private def add(evicted: Option[Map[Tuple, V]], functionCall: FunctionCall[MapsideCache[Tuple, V]]): Unit = { + private def add( + evicted: Option[Map[Tuple, V]], + functionCall: FunctionCall[MapsideCache[Tuple, V]] + ): Unit = // Use iterator and while for optimal performance (avoid closures/fn calls) if (evicted.isDefined) { // Don't use pattern matching in performance-critical code @@ -163,9 +182,11 @@ package com.twitter.scalding { tecol.add(key) } } - } - override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[MapsideCache[Tuple, V]]): Unit = { + override def operate( + flowProcess: FlowProcess[_], + functionCall: FunctionCall[MapsideCache[Tuple, V]] + ): Unit = { val cache = functionCall.getContext val keyValueTE = functionCall.getArguments // Have to keep a copy of the key tuple because cascading will modify it @@ -175,7 +196,10 @@ package com.twitter.scalding { add(evicted, functionCall) } - override def flush(flowProcess: FlowProcess[_], operationCall: OperationCall[MapsideCache[Tuple, V]]): Unit = { + 
override def flush( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[Tuple, V]] + ): Unit = { // Docs say it is safe to do this cast: // http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/Operation.html#flush(cascading.flow.FlowProcess, cascading.operation.OperationCall) val functionCall = operationCall.asInstanceOf[FunctionCall[MapsideCache[Tuple, V]]] @@ -183,27 +207,34 @@ package com.twitter.scalding { add(cache.flush, functionCall) } - override def cleanup(flowProcess: FlowProcess[_], operationCall: OperationCall[MapsideCache[Tuple, V]]): Unit = { + override def cleanup( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[Tuple, V]] + ): Unit = // The cache may be large, but super sure we drop any reference to it ASAP // probably overly defensive, but it's super cheap. operationCall.setContext(null) - } } class TypedMapsideReduce[K, V]( - @transient fn: TupleEntry => TraversableOnce[(K, V)], - @transient commutativeSemigroup: Semigroup[V], - sourceFields: Fields, - keyFields: Fields, valueFields: Fields, - cacheSize: Option[Int])(implicit setKV: TupleSetter[(K, V)]) - extends BaseOperation[MapsideCache[K, V]](Fields.join(keyFields, valueFields)) - with Function[MapsideCache[K, V]] - with ScaldingPrepare[MapsideCache[K, V]] { + @transient fn: TupleEntry => TraversableOnce[(K, V)], + @transient commutativeSemigroup: Semigroup[V], + sourceFields: Fields, + keyFields: Fields, + valueFields: Fields, + cacheSize: Option[Int] + )(implicit setKV: TupleSetter[(K, V)]) + extends BaseOperation[MapsideCache[K, V]](Fields.join(keyFields, valueFields)) + with Function[MapsideCache[K, V]] + with ScaldingPrepare[MapsideCache[K, V]] { val boxedSemigroup = Externalizer(commutativeSemigroup) val lockedFn = Externalizer(fn) - override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[MapsideCache[K, V]]): Unit = { + override def prepare( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[K, V]] + ): Unit = { //Set up the context: implicit val sg: Semigroup[V] = boxedSemigroup.get val cache = MapsideCache[K, V](cacheSize, flowProcess) @@ -213,7 +244,7 @@ package com.twitter.scalding { // Don't use pattern matching in a performance-critical section @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) @inline - private def add(evicted: Option[Map[K, V]], functionCall: FunctionCall[MapsideCache[K, V]]): Unit = { + private def add(evicted: Option[Map[K, V]], functionCall: FunctionCall[MapsideCache[K, V]]): Unit = // Use iterator and while for optimal performance (avoid closures/fn calls) if (evicted.isDefined) { val it = evicted.get.iterator @@ -224,11 +255,12 @@ package com.twitter.scalding { tecol.add(setKV(key, value)) } } - } - import scala.collection.mutable.{ Map => MMap } + import scala.collection.mutable.{Map => MMap} - private[this] class CollectionBackedMap[K, V](val backingMap: MMap[K, V]) extends Map[K, V] with java.io.Serializable { + private[this] class CollectionBackedMap[K, V](val backingMap: MMap[K, V]) + extends Map[K, V] + with java.io.Serializable { def get(key: K) = backingMap.get(key) def iterator = backingMap.iterator @@ -241,7 +273,9 @@ package com.twitter.scalding { // Don't use pattern matching in a performance-critical section @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) private[this] def mergeTraversableOnce[K, V: Semigroup](items: TraversableOnce[(K, V)]): Map[K, V] = { - val mutable = scala.collection.mutable.OpenHashMap[K, V]() // Scala's 
OpenHashMap seems faster than Java and Scala's HashMap Impl's + val mutable = + scala.collection.mutable + .OpenHashMap[K, V]() // Scala's OpenHashMap seems faster than Java and Scala's HashMap Impl's val innerIter = items.toIterator while (innerIter.hasNext) { val (k, v) = innerIter.next @@ -253,7 +287,10 @@ package com.twitter.scalding { new CollectionBackedMap(mutable) } - override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[MapsideCache[K, V]]): Unit = { + override def operate( + flowProcess: FlowProcess[_], + functionCall: FunctionCall[MapsideCache[K, V]] + ): Unit = { val cache = functionCall.getContext implicit val sg: Semigroup[V] = boxedSemigroup.get val res: Map[K, V] = mergeTraversableOnce(lockedFn.get(functionCall.getArguments)) @@ -261,7 +298,10 @@ package com.twitter.scalding { add(evicted, functionCall) } - override def flush(flowProcess: FlowProcess[_], operationCall: OperationCall[MapsideCache[K, V]]): Unit = { + override def flush( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[K, V]] + ): Unit = { // Docs say it is safe to do this cast: // http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/Operation.html#flush(cascading.flow.FlowProcess, cascading.operation.OperationCall) val functionCall = operationCall.asInstanceOf[FunctionCall[MapsideCache[K, V]]] @@ -269,11 +309,13 @@ package com.twitter.scalding { add(cache.flush, functionCall) } - override def cleanup(flowProcess: FlowProcess[_], operationCall: OperationCall[MapsideCache[K, V]]): Unit = { + override def cleanup( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[K, V]] + ): Unit = // The cache may be large, but super sure we drop any reference to it ASAP // probably overly defensive, but it's super cheap. 
operationCall.setContext(null) - } } sealed trait MapsideCache[K, V] { @@ -290,12 +332,12 @@ package com.twitter.scalding { private def getCacheSize(fp: FlowProcess[_]): Int = Option(fp.getStringProperty(SIZE_CONFIG_KEY)) - .filterNot { _.isEmpty } - .map { _.toInt } + .filterNot(_.isEmpty) + .map(_.toInt) .getOrElse(DEFAULT_CACHE_SIZE) def apply[K, V: Semigroup](cacheSize: Option[Int], flowProcess: FlowProcess[_]): MapsideCache[K, V] = { - val size = cacheSize.getOrElse{ getCacheSize(flowProcess) } + val size = cacheSize.getOrElse(getCacheSize(flowProcess)) val adaptive = Option(flowProcess.getStringProperty(ADAPTIVE_CACHE_KEY)).isDefined if (adaptive) new AdaptiveMapsideCache(flowProcess, new AdaptiveCache(size)) @@ -305,7 +347,7 @@ package com.twitter.scalding { } final class SummingMapsideCache[K, V](flowProcess: FlowProcess[_], summingCache: SummingWithHitsCache[K, V]) - extends MapsideCache[K, V] { + extends MapsideCache[K, V] { private[this] val misses = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "misses")) private[this] val hits = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "hits")) private[this] val evictions = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "evictions")) @@ -338,7 +380,7 @@ package com.twitter.scalding { } final class AdaptiveMapsideCache[K, V](flowProcess: FlowProcess[_], adaptiveCache: AdaptiveCache[K, V]) - extends MapsideCache[K, V] { + extends MapsideCache[K, V] { private[this] val misses = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "misses")) private[this] val hits = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "hits")) private[this] val capacity = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "capacity")) @@ -383,30 +425,32 @@ package com.twitter.scalding { * BaseOperation with support for context */ abstract class SideEffectBaseOperation[C]( - @transient bf: => C, // begin function returns a context - @transient ef: C => Unit, // end function to clean up context object - fields: Fields) extends BaseOperation[C](fields) with ScaldingPrepare[C] { + @transient bf: => C, // begin function returns a context + @transient ef: C => Unit, // end function to clean up context object + fields: Fields + ) extends BaseOperation[C](fields) + with ScaldingPrepare[C] { val lockedBf = Externalizer(() => bf) val lockedEf = Externalizer(ef) - override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[C]): Unit = { + override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[C]): Unit = operationCall.setContext(lockedBf.get.apply) - } - override def cleanup(flowProcess: FlowProcess[_], operationCall: OperationCall[C]): Unit = { + override def cleanup(flowProcess: FlowProcess[_], operationCall: OperationCall[C]): Unit = lockedEf.get(operationCall.getContext) - } } /* * A map function that allows state object to be set up and tear down. 
*/ class SideEffectMapFunction[S, C, T]( - bf: => C, // begin function returns a context - @transient fn: (C, S) => T, // function that takes a context and a tuple and generate a new tuple - ef: C => Unit, // end function to clean up context object - fields: Fields, - conv: TupleConverter[S], - set: TupleSetter[T]) extends SideEffectBaseOperation[C](bf, ef, fields) with Function[C] { + bf: => C, // begin function returns a context + @transient fn: (C, S) => T, // function that takes a context and a tuple and generate a new tuple + ef: C => Unit, // end function to clean up context object + fields: Fields, + conv: TupleConverter[S], + set: TupleSetter[T] + ) extends SideEffectBaseOperation[C](bf, ef, fields) + with Function[C] { val lockedFn = Externalizer(fn) override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[C]): Unit = { @@ -421,42 +465,53 @@ package com.twitter.scalding { * A flatmap function that allows state object to be set up and tear down. */ class SideEffectFlatMapFunction[S, C, T]( - bf: => C, // begin function returns a context - @transient fn: (C, S) => TraversableOnce[T], // function that takes a context and a tuple, returns TraversableOnce of T - ef: C => Unit, // end function to clean up context object - fields: Fields, - conv: TupleConverter[S], - set: TupleSetter[T]) extends SideEffectBaseOperation[C](bf, ef, fields) with Function[C] { + bf: => C, // begin function returns a context + @transient fn: ( + C, + S + ) => TraversableOnce[T], // function that takes a context and a tuple, returns TraversableOnce of T + ef: C => Unit, // end function to clean up context object + fields: Fields, + conv: TupleConverter[S], + set: TupleSetter[T] + ) extends SideEffectBaseOperation[C](bf, ef, fields) + with Function[C] { val lockedFn = Externalizer(fn) override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[C]): Unit = { val context = functionCall.getContext val s = conv(functionCall.getArguments) - lockedFn.get(context, s) foreach { t => functionCall.getOutputCollector.add(set(t)) } + lockedFn.get(context, s).foreach(t => functionCall.getOutputCollector.add(set(t))) } } class FilterFunction[T](@transient fn: T => Boolean, conv: TupleConverter[T]) - extends BaseOperation[Any] with Filter[Any] with ScaldingPrepare[Any] { + extends BaseOperation[Any] + with Filter[Any] + with ScaldingPrepare[Any] { val lockedFn = Externalizer(fn) - def isRemove(flowProcess: FlowProcess[_], filterCall: FilterCall[Any]) = { + def isRemove(flowProcess: FlowProcess[_], filterCall: FilterCall[Any]) = !lockedFn.get(conv(filterCall.getArguments)) - } } // All the following are operations for use in GroupBuilder - class FoldAggregator[T, X](@transient fn: (X, T) => X, @transient init: X, fields: Fields, - conv: TupleConverter[T], set: TupleSetter[X]) - extends BaseOperation[X](fields) with Aggregator[X] with ScaldingPrepare[X] { + class FoldAggregator[T, X]( + @transient fn: (X, T) => X, + @transient init: X, + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[X] + ) extends BaseOperation[X](fields) + with Aggregator[X] + with ScaldingPrepare[X] { val lockedFn = Externalizer(fn) private val lockedInit = MeatLocker(init) def initCopy = lockedInit.copy - def start(flowProcess: FlowProcess[_], call: AggregatorCall[X]): Unit = { + def start(flowProcess: FlowProcess[_], call: AggregatorCall[X]): Unit = call.setContext(initCopy) - } def aggregate(flowProcess: FlowProcess[_], call: AggregatorCall[X]): Unit = { val left = call.getContext @@ -464,29 +519,31 @@ 
package com.twitter.scalding { call.setContext(lockedFn.get(left, right)) } - def complete(flowProcess: FlowProcess[_], call: AggregatorCall[X]): Unit = { + def complete(flowProcess: FlowProcess[_], call: AggregatorCall[X]): Unit = call.getOutputCollector.add(set(call.getContext)) - } } /* * fields are the declared fields of this aggregator */ class MRMAggregator[T, X, U]( - @transient inputFsmf: T => X, - @transient inputRfn: (X, X) => X, - @transient inputMrfn: X => U, - fields: Fields, conv: TupleConverter[T], set: TupleSetter[U]) - extends BaseOperation[Tuple](fields) with Aggregator[Tuple] with ScaldingPrepare[Tuple] { + @transient inputFsmf: T => X, + @transient inputRfn: (X, X) => X, + @transient inputMrfn: X => U, + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[U] + ) extends BaseOperation[Tuple](fields) + with Aggregator[Tuple] + with ScaldingPrepare[Tuple] { val fsmf = Externalizer(inputFsmf) val rfn = Externalizer(inputRfn) val mrfn = Externalizer(inputMrfn) // The context is a singleton Tuple, which is mutable so // we don't have to allocate at every step of the loop: - def start(flowProcess: FlowProcess[_], call: AggregatorCall[Tuple]): Unit = { + def start(flowProcess: FlowProcess[_], call: AggregatorCall[Tuple]): Unit = call.setContext(null) - } def extractArgument(call: AggregatorCall[Tuple]): X = fsmf.get(conv(call.getArguments)) @@ -520,9 +577,8 @@ package com.twitter.scalding { } /** - * This handles the mapReduceMap work on the map-side of the operation. The code below - * attempts to be optimal with respect to memory allocations and performance, not functional - * style purity. + * This handles the mapReduceMap work on the map-side of the operation. The code below attempts to be + * optimal with respect to memory allocations and performance, not functional style purity. */ abstract class FoldFunctor[X](fields: Fields) extends AggregateBy.Functor { @@ -555,7 +611,7 @@ package com.twitter.scalding { nextContext } - override final def complete(flowProcess: FlowProcess[_], context: Tuple) = { + override final def complete(flowProcess: FlowProcess[_], context: Tuple) = if (context == null) { throw new Exception("FoldFunctor completed with any aggregate calls") } else { @@ -564,20 +620,19 @@ package com.twitter.scalding { context.set(0, null) finish(res) } - } } /** - * This handles the mapReduceMap work on the map-side of the operation. The code below - * attempts to be optimal with respect to memory allocations and performance, not functional - * style purity. + * This handles the mapReduceMap work on the map-side of the operation. The code below attempts to be + * optimal with respect to memory allocations and performance, not functional style purity. 
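 *
 * Roughly, per grouping key this computes the following (a simplified sketch in terms of the
 * constructor arguments, ignoring the mutable-Tuple reuse; `groupValues` stands for the values of
 * one key):
 * {{{
 * groupValues.map(inputMrfn).reduceLeft(inputRfn)
 * }}}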
*/ class MRMFunctor[T, X]( - @transient inputMrfn: T => X, - @transient inputRfn: (X, X) => X, - fields: Fields, - conv: TupleConverter[T], set: TupleSetter[X]) - extends FoldFunctor[X](fields) { + @transient inputMrfn: T => X, + @transient inputRfn: (X, X) => X, + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[X] + ) extends FoldFunctor[X](fields) { val mrfn = Externalizer(inputMrfn) val rfn = Externalizer(inputRfn) @@ -593,33 +648,40 @@ package com.twitter.scalding { /** * MapReduceMapBy Class */ - class MRMBy[T, X, U](arguments: Fields, - middleFields: Fields, - declaredFields: Fields, - mfn: T => X, - rfn: (X, X) => X, - mfn2: X => U, - startConv: TupleConverter[T], - midSet: TupleSetter[X], - midConv: TupleConverter[X], - endSet: TupleSetter[U]) extends AggregateBy( - arguments, - new MRMFunctor[T, X](mfn, rfn, middleFields, startConv, midSet), - new MRMAggregator[X, X, U](args => args, rfn, mfn2, declaredFields, midConv, endSet)) + class MRMBy[T, X, U]( + arguments: Fields, + middleFields: Fields, + declaredFields: Fields, + mfn: T => X, + rfn: (X, X) => X, + mfn2: X => U, + startConv: TupleConverter[T], + midSet: TupleSetter[X], + midConv: TupleConverter[X], + endSet: TupleSetter[U] + ) extends AggregateBy( + arguments, + new MRMFunctor[T, X](mfn, rfn, middleFields, startConv, midSet), + new MRMAggregator[X, X, U](args => args, rfn, mfn2, declaredFields, midConv, endSet) + ) class BufferOp[I, T, X]( - @transient init: I, - @transient inputIterfn: (I, Iterator[T]) => TraversableOnce[X], - fields: Fields, conv: TupleConverter[T], set: TupleSetter[X]) - extends BaseOperation[Any](fields) with Buffer[Any] with ScaldingPrepare[Any] { + @transient init: I, + @transient inputIterfn: (I, Iterator[T]) => TraversableOnce[X], + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[X] + ) extends BaseOperation[Any](fields) + with Buffer[Any] + with ScaldingPrepare[Any] { val iterfn = Externalizer(inputIterfn) private val lockedInit = MeatLocker(init) def initCopy = lockedInit.copy def operate(flowProcess: FlowProcess[_], call: BufferCall[Any]): Unit = { val oc = call.getOutputCollector - val in = call.getArgumentsIterator.asScala.map { entry => conv(entry) } - iterfn.get(initCopy, in).foreach { x => oc.add(set(x)) } + val in = call.getArgumentsIterator.asScala.map(entry => conv(entry)) + iterfn.get(initCopy, in).foreach(x => oc.add(set(x))) } } @@ -627,13 +689,15 @@ package com.twitter.scalding { * A buffer that allows state object to be set up and tear down. 
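 *
 * Conceptually, per task it behaves like this sketch (names refer to the constructor arguments;
 * `groups` and `emit` stand for the grouped input and the output collector):
 * {{{
 * val ctx = bf                                                      // prepare
 * groups.foreach { vs => inputIterfn(init, ctx, vs).foreach(emit) } // operate, per group
 * ef(ctx)                                                           // cleanup
 * }}}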
*/ class SideEffectBufferOp[I, T, C, X]( - @transient init: I, - bf: => C, // begin function returns a context - @transient inputIterfn: (I, C, Iterator[T]) => TraversableOnce[X], - ef: C => Unit, // end function to clean up context object - fields: Fields, - conv: TupleConverter[T], - set: TupleSetter[X]) extends SideEffectBaseOperation[C](bf, ef, fields) with Buffer[C] { + @transient init: I, + bf: => C, // begin function returns a context + @transient inputIterfn: (I, C, Iterator[T]) => TraversableOnce[X], + ef: C => Unit, // end function to clean up context object + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[X] + ) extends SideEffectBaseOperation[C](bf, ef, fields) + with Buffer[C] { val iterfn = Externalizer(inputIterfn) private val lockedInit = MeatLocker(init) def initCopy = lockedInit.copy @@ -641,13 +705,15 @@ package com.twitter.scalding { def operate(flowProcess: FlowProcess[_], call: BufferCall[C]): Unit = { val context = call.getContext val oc = call.getOutputCollector - val in = call.getArgumentsIterator.asScala.map { entry => conv(entry) } - iterfn.get(initCopy, context, in).foreach { x => oc.add(set(x)) } + val in = call.getArgumentsIterator.asScala.map(entry => conv(entry)) + iterfn.get(initCopy, context, in).foreach(x => oc.add(set(x))) } } - class SampleWithReplacement(frac: Double, val seed: Int = new java.util.Random().nextInt) extends BaseOperation[Poisson]() - with Function[Poisson] with ScaldingPrepare[Poisson] { + class SampleWithReplacement(frac: Double, val seed: Int = new java.util.Random().nextInt) + extends BaseOperation[Poisson]() + with Function[Poisson] + with ScaldingPrepare[Poisson] { override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[Poisson]): Unit = { super.prepare(flowProcess, operationCall) val p = new Poisson(frac, seed) @@ -663,18 +729,19 @@ package com.twitter.scalding { /** In the typed API every reduce operation is handled by this Buffer */ class TypedBufferOp[K, V, U]( - conv: TupleConverter[K], - convV: TupleConverter[V], - @transient reduceFn: (K, Iterator[V]) => Iterator[U], - valueField: Fields) - extends BaseOperation[Any](valueField) with Buffer[Any] with ScaldingPrepare[Any] { + conv: TupleConverter[K], + convV: TupleConverter[V], + @transient reduceFn: (K, Iterator[V]) => Iterator[U], + valueField: Fields + ) extends BaseOperation[Any](valueField) + with Buffer[Any] + with ScaldingPrepare[Any] { val reduceFnSer = Externalizer(reduceFn) def operate(flowProcess: FlowProcess[_], call: BufferCall[Any]): Unit = { val oc = call.getOutputCollector val key = conv(call.getGroup) - val values = call.getArgumentsIterator - .asScala + val values = call.getArgumentsIterator.asScala .map(convV(_)) // Avoiding a lambda here @@ -688,12 +755,11 @@ package com.twitter.scalding { } /** - * This gets a pair out of a tuple, incruments the counters with the left, and passes the value - * on + * This gets a pair out of a tuple, incruments the counters with the left, and passes the value on */ class IncrementCounters[A](pass: Fields, conv: TupleConverter[(A, Iterable[((String, String), Long)])]) - extends BaseOperation[Any](pass) - with Function[Any] { + extends BaseOperation[Any](pass) + with Function[Any] { override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = { val (a, inc) = conv(functionCall.getArguments) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/OptionalSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/OptionalSource.scala index 
aefd9f4590..aa4125326c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/OptionalSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/OptionalSource.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import scala.util.{ Try, Success, Failure } +import scala.util.{Failure, Success, Try} import cascading.tap.Tap case class OptionalSource[T](src: Mappable[T]) extends Source with Mappable[T] { @@ -27,6 +27,7 @@ case class OptionalSource[T](src: Mappable[T]) extends Source with Mappable[T] { src.createTap(readOrWrite) case Failure(_) => IterableSource[T](Nil)(TupleSetter.singleSetter[T], src.converter) - .createTap(readOrWrite).asInstanceOf[Tap[_, _, _]] + .createTap(readOrWrite) + .asInstanceOf[Tap[_, _, _]] } -} \ No newline at end of file +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/PartitionSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/PartitionSource.scala index 04eb29a433..782d948996 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/PartitionSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/PartitionSource.scala @@ -12,13 +12,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tap.hadoop.Hfs -import cascading.tap.hadoop.{ PartitionTap => HPartitionTap } +import cascading.tap.hadoop.{PartitionTap => HPartitionTap} import cascading.tap.local.FileTap -import cascading.tap.local.{ PartitionTap => LPartitionTap } +import cascading.tap.local.{PartitionTap => LPartitionTap} import cascading.tap.partition.DelimitedPartition import cascading.tap.partition.Partition import cascading.tap.SinkMode @@ -28,7 +28,9 @@ import cascading.tuple.Fields /** * This is a base class for partition-based output sources */ -abstract class PartitionSource(val openWritesThreshold: Option[Int] = None) extends SchemedSource with HfsTapProvider { +abstract class PartitionSource(val openWritesThreshold: Option[Int] = None) + extends SchemedSource + with HfsTapProvider { // The root path of the partitioned output. def basePath: String @@ -38,12 +40,15 @@ abstract class PartitionSource(val openWritesThreshold: Option[Int] = None) exte /** * Creates the partition tap. * - * @param readOrWrite Describes if this source is being read from or written to. - * @param mode The mode of the job. (implicit) + * @param readOrWrite + * Describes if this source is being read from or written to. + * @param mode + * The mode of the job. (implicit) * - * @return A cascading PartitionTap. + * @return + * A cascading PartitionTap. 
*/ - override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = readOrWrite match { case Read => throw new InvalidSourceException("Using PartitionSource for input not yet implemented") case Write => { @@ -52,7 +57,7 @@ abstract class PartitionSource(val openWritesThreshold: Option[Int] = None) exte val localTap = new FileTap(localScheme, basePath, sinkMode) openWritesThreshold match { case Some(threshold) => new LPartitionTap(localTap, partition, threshold) - case None => new LPartitionTap(localTap, partition) + case None => new LPartitionTap(localTap, partition) } } case hdfsMode @ Hdfs(_, _) => { @@ -67,69 +72,83 @@ abstract class PartitionSource(val openWritesThreshold: Option[Int] = None) exte } } } - } /** * Validates the taps, makes sure there are no nulls in the path. * - * @param mode The mode of the job. + * @param mode + * The mode of the job. */ - override def validateTaps(mode: Mode): Unit = { + override def validateTaps(mode: Mode): Unit = if (basePath == null) { throw new InvalidSourceException("basePath cannot be null for PartitionTap") } - } - private[this] def getHPartitionTap(hfsTap: Hfs): HPartitionTap = { + private[this] def getHPartitionTap(hfsTap: Hfs): HPartitionTap = openWritesThreshold match { case Some(threshold) => new HPartitionTap(hfsTap, partition, threshold) - case None => new HPartitionTap(hfsTap, partition) + case None => new HPartitionTap(hfsTap, partition) } - } } /** * An implementation of TSV output, split over a partition tap. * - * Similar to TemplateSource, but with addition of tsvFields, to - * let users explicitly specify which fields they want to see in - * the TSV (allows user to discard path fields). + * Similar to TemplateSource, but with addition of tsvFields, to let users explicitly specify which fields + * they want to see in the TSV (allows user to discard path fields). * - * apply assumes user wants a DelimitedPartition (the only - * strategy bundled with Cascading). + * apply assumes user wants a DelimitedPartition (the only strategy bundled with Cascading). * - * @param basePath The root path for the output. - * @param delimiter The path delimiter, defaults to / to create sub-directory bins. - * @param pathFields The set of fields to apply to the path. - * @param writeHeader Flag to indicate that the header should be written to the file. - * @param tsvFields The set of fields to include in the TSV output. - * @param sinkMode How to handle conflicts with existing output. + * @param basePath + * The root path for the output. + * @param delimiter + * The path delimiter, defaults to / to create sub-directory bins. + * @param pathFields + * The set of fields to apply to the path. + * @param writeHeader + * Flag to indicate that the header should be written to the file. + * @param tsvFields + * The set of fields to include in the TSV output. + * @param sinkMode + * How to handle conflicts with existing output. 
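 *
 * For example, writing rows into one sub-directory per country (the path and field names are
 * hypothetical):
 * {{{
 * pipe.write(PartitionedTsv("/logs/by-country", pathFields = 'country, tsvFields = ('id, 'name)))
 * }}}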
*/ object PartitionedTsv { def apply( - basePath: String, - delimiter: String = "/", - pathFields: Fields = Fields.ALL, - writeHeader: Boolean = false, - tsvFields: Fields = Fields.ALL, - sinkMode: SinkMode = SinkMode.REPLACE) = new PartitionedTsv(basePath, new DelimitedPartition(pathFields, delimiter), writeHeader, tsvFields, sinkMode) + basePath: String, + delimiter: String = "/", + pathFields: Fields = Fields.ALL, + writeHeader: Boolean = false, + tsvFields: Fields = Fields.ALL, + sinkMode: SinkMode = SinkMode.REPLACE + ) = new PartitionedTsv( + basePath, + new DelimitedPartition(pathFields, delimiter), + writeHeader, + tsvFields, + sinkMode + ) } /** * An implementation of TSV output, split over a partition tap. * - * @param basePath The root path for the output. - * @param partition The partitioning strategy to use. - * @param writeHeader Flag to indicate that the header should be written to the file. - * @param sinkMode How to handle conflicts with existing output. + * @param basePath + * The root path for the output. + * @param partition + * The partitioning strategy to use. + * @param writeHeader + * Flag to indicate that the header should be written to the file. + * @param sinkMode + * How to handle conflicts with existing output. */ case class PartitionedTsv( - override val basePath: String, - override val partition: Partition, - override val writeHeader: Boolean, - val tsvFields: Fields, - override val sinkMode: SinkMode) - extends PartitionSource with DelimitedScheme { + override val basePath: String, + override val partition: Partition, + override val writeHeader: Boolean, + val tsvFields: Fields, + override val sinkMode: SinkMode +) extends PartitionSource + with DelimitedScheme { override val fields = tsvFields } @@ -137,38 +156,53 @@ case class PartitionedTsv( /** * An implementation of SequenceFile output, split over a partition tap. * - * apply assumes user wants a DelimitedPartition (the only - * strategy bundled with Cascading). + * apply assumes user wants a DelimitedPartition (the only strategy bundled with Cascading). * - * @param basePath The root path for the output. - * @param delimiter The path delimiter, defaults to / to create sub-directory bins. - * @param pathFields The set of fields to apply to the path. - * @param sequenceFields The set of fields to use for the sequence file. - * @param sinkMode How to handle conflicts with existing output. + * @param basePath + * The root path for the output. + * @param delimiter + * The path delimiter, defaults to / to create sub-directory bins. + * @param pathFields + * The set of fields to apply to the path. + * @param sequenceFields + * The set of fields to use for the sequence file. + * @param sinkMode + * How to handle conflicts with existing output. */ object PartitionedSequenceFile { def apply( - basePath: String, - delimiter: String = "/", - pathFields: Fields = Fields.ALL, - sequenceFields: Fields = Fields.ALL, - sinkMode: SinkMode = SinkMode.REPLACE) = new PartitionedSequenceFile(basePath, new DelimitedPartition(pathFields, delimiter), sequenceFields, sinkMode) + basePath: String, + delimiter: String = "/", + pathFields: Fields = Fields.ALL, + sequenceFields: Fields = Fields.ALL, + sinkMode: SinkMode = SinkMode.REPLACE + ) = new PartitionedSequenceFile( + basePath, + new DelimitedPartition(pathFields, delimiter), + sequenceFields, + sinkMode + ) } /** * An implementation of SequenceFile output, split over a partition tap. * - * @param basePath The root path for the output. 
- * @param partition The partitioning strategy to use. - * @param sequenceFields The set of fields to use for the sequence file. - * @param sinkMode How to handle conflicts with existing output. + * @param basePath + * The root path for the output. + * @param partition + * The partitioning strategy to use. + * @param sequenceFields + * The set of fields to use for the sequence file. + * @param sinkMode + * How to handle conflicts with existing output. */ case class PartitionedSequenceFile( - override val basePath: String, - override val partition: Partition, - val sequenceFields: Fields, - override val sinkMode: SinkMode) - extends PartitionSource with SequenceFileScheme { + override val basePath: String, + override val partition: Partition, + val sequenceFields: Fields, + override val sinkMode: SinkMode +) extends PartitionSource + with SequenceFileScheme { override val fields = sequenceFields } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/PipeDebug.scala b/scalding-core/src/main/scala/com/twitter/scalding/PipeDebug.scala index 3c5ab7347d..dddc108c3a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/PipeDebug.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/PipeDebug.scala @@ -12,24 +12,25 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.pipe.{ Pipe, Each } +import cascading.pipe.{Each, Pipe} import cascading.operation.Debug import cascading.operation.Debug.Output /** - * This is a builder for Cascading's Debug object. - * The default instance is the same default as cascading's new Debug() + * This is a builder for Cascading's Debug object. The default instance is the same default as cascading's new + * Debug() * https://github.com/cwensel/cascading/blob/wip-2.5/cascading-core/src/main/java/cascading/operation/Debug.java#L46 - * This is based on work by: https://github.com/granthenke - * https://github.com/twitter/scalding/pull/559 + * This is based on work by: https://github.com/granthenke https://github.com/twitter/scalding/pull/559 */ -case class PipeDebug(output: Output = Output.STDERR, - prefix: String = null, - printFieldsEvery: Option[Int] = None, - printTuplesEvery: Int = 1) { +case class PipeDebug( + output: Output = Output.STDERR, + prefix: String = null, + printFieldsEvery: Option[Int] = None, + printTuplesEvery: Int = 1 +) { def toStdOut: PipeDebug = copy(output = Output.STDOUT) def toStdErr: PipeDebug = copy(output = Output.STDERR) @@ -49,4 +50,3 @@ case class PipeDebug(output: Output = Output.STDERR, def apply(p: Pipe): Pipe = new Each(p, toDebug) } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ReduceOperations.scala b/scalding-core/src/main/scala/com/twitter/scalding/ReduceOperations.scala index 7645cac59c..8ba25b6365 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/ReduceOperations.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/ReduceOperations.scala @@ -12,21 +12,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tuple.Fields -import cascading.tuple.{ Tuple => CTuple } - -import com.twitter.algebird.{ - Semigroup, - Ring, - AveragedValue, - Moments, - HyperLogLogMonoid, - HLL, - Aggregator -} +import cascading.tuple.{Tuple => CTuple} + +import com.twitter.algebird.{Aggregator, AveragedValue, HLL, HyperLogLogMonoid, Moments, Ring, Semigroup} import com.twitter.algebird.mutable.PriorityQueueMonoid @@ -37,56 +29,59 @@ import scala.collection.JavaConverters._ import Dsl._ //Get the conversion implicits /** - * Implements reductions on top of a simple abstraction for the Fields-API - * This is for associative and commutive operations (particularly Monoids and Semigroups play a big role here) + * Implements reductions on top of a simple abstraction for the Fields-API This is for associative and + * commutive operations (particularly Monoids and Semigroups play a big role here) * - * We use the f-bounded polymorphism trick to return the type called Self - * in each operation. + * We use the f-bounded polymorphism trick to return the type called Self in each operation. */ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializable { + /** - * Type T is the type of the input field (input to map, T => X) - * Type X is the intermediate type, which your reduce function operates on - * (reduce is (X,X) => X) - * Type U is the final result type, (final map is: X => U) + * Type T is the type of the input field (input to map, T => X) Type X is the intermediate type, which your + * reduce function operates on (reduce is (X,X) => X) Type U is the final result type, (final map is: X => + * U) * - * The previous output goes into the reduce function on the left, like foldLeft, - * so if your operation is faster for the accumulator to be on one side, be aware. + * The previous output goes into the reduce function on the left, like foldLeft, so if your operation is + * faster for the accumulator to be on one side, be aware. * - * Assumed to be a commutative operation. If you don't want that, use .forceToReducers + * Assumed to be a commutative operation. If you don't want that, use .forceToReducers */ - def mapReduceMap[T, X, U](fieldDef: (Fields, Fields))(mapfn: T => X)(redfn: (X, X) => X)(mapfn2: X => U)(implicit startConv: TupleConverter[T], - middleSetter: TupleSetter[X], - middleConv: TupleConverter[X], - endSetter: TupleSetter[U]): Self + def mapReduceMap[T, X, U]( + fieldDef: (Fields, Fields) + )(mapfn: T => X)(redfn: (X, X) => X)(mapfn2: X => U)(implicit + startConv: TupleConverter[T], + middleSetter: TupleSetter[X], + middleConv: TupleConverter[X], + endSetter: TupleSetter[U] + ): Self ///////////////////////////////////////// // All the below functions are implemented in terms of the above ///////////////////////////////////////// /** Pretty much a synonym for mapReduceMap with the methods collected into a trait. 
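 *
 * For example, with an Algebird `Aggregator` (the `'key` and `'score -> 'maxScore` field names are
 * hypothetical):
 * {{{
 * groupBy('key) { _.aggregate('score -> 'maxScore)(Aggregator.max[Double]) }
 * }}}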
*/ - def aggregate[A, B, C](fieldDef: (Fields, Fields))(ag: Aggregator[A, B, C])(implicit startConv: TupleConverter[A], - middleSetter: TupleSetter[B], - middleConv: TupleConverter[B], - endSetter: TupleSetter[C]): Self = + def aggregate[A, B, C](fieldDef: (Fields, Fields))(ag: Aggregator[A, B, C])(implicit + startConv: TupleConverter[A], + middleSetter: TupleSetter[B], + middleConv: TupleConverter[B], + endSetter: TupleSetter[C] + ): Self = mapReduceMap[A, B, C](fieldDef)(ag.prepare _)(ag.reduce _)(ag.present _) /** - * uses a more stable online algorithm which should - * be suitable for large numbers of records + * uses a more stable online algorithm which should be suitable for large numbers of records * - * == Similar To == - * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + * ==Similar To== + * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm */ - def average(f: (Fields, Fields)) = mapPlusMap(f) { (x: Double) => AveragedValue(1L, x) } { _.value } + def average(f: (Fields, Fields)) = mapPlusMap(f)((x: Double) => AveragedValue(1L, x))(_.value) def average(f: Symbol): Self = average(f -> f) /** - * Approximate number of unique values - * We use about m = (104/errPercent)^2 bytes of memory per key - * Uses `.toString.getBytes` to serialize the data so you MUST - * ensure that .toString is an equivalance on your counted fields - * (i.e. `x.toString == y.toString` if and only if `x == y`) + * Approximate number of unique values We use about m = (104/errPercent)^2 bytes of memory per key Uses + * `.toString.getBytes` to serialize the data so you MUST ensure that .toString is an equivalance on your + * counted fields (i.e. `x.toString == y.toString` if and only if `x == y`) * * For each key: * {{{ @@ -98,57 +93,58 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ * 0.25% error ~ 256kB * }}} */ - def approximateUniqueCount[T <% Array[Byte]: TupleConverter](f: (Fields, Fields), errPercent: Double = 1.0) = { - hyperLogLogMap[T, Double](f, errPercent) { _.estimatedSize } - } - - def hyperLogLog[T <% Array[Byte]: TupleConverter](f: (Fields, Fields), errPercent: Double = 1.0) = { - hyperLogLogMap[T, HLL](f, errPercent) { hll => hll } - } - - private[this] def hyperLogLogMap[T <% Array[Byte]: TupleConverter, U: TupleSetter](f: (Fields, Fields), errPercent: Double = 1.0)(fn: HLL => U) = { + def approximateUniqueCount[T <% Array[Byte]: TupleConverter]( + f: (Fields, Fields), + errPercent: Double = 1.0 + ) = + hyperLogLogMap[T, Double](f, errPercent)(_.estimatedSize) + + def hyperLogLog[T <% Array[Byte]: TupleConverter](f: (Fields, Fields), errPercent: Double = 1.0) = + hyperLogLogMap[T, HLL](f, errPercent)(hll => hll) + + private[this] def hyperLogLogMap[T <% Array[Byte]: TupleConverter, U: TupleSetter]( + f: (Fields, Fields), + errPercent: Double = 1.0 + )(fn: HLL => U) = { //bits = log(m) == 2 *log(104/errPercent) = 2log(104) - 2*log(errPercent) def log2(x: Double) = scala.math.log(x) / scala.math.log(2.0) val bits = 2 * scala.math.ceil(log2(104) - log2(errPercent)).toInt implicit val hmm: HyperLogLogMonoid = new HyperLogLogMonoid(bits) - mapPlusMap(f) { (t: T) => hmm.create(t) } (fn) + mapPlusMap(f)((t: T) => hmm.create(t))(fn) } /** - * This is count with a predicate: only counts the tuples for which - * `fn(tuple)` is true + * This is count with a predicate: only counts the tuples for which `fn(tuple)` is true */ - def count[T: TupleConverter](fieldDef: (Fields, Fields))(fn: T => Boolean): 
Self = { - mapPlusMap(fieldDef){ (arg: T) => if (fn(arg)) 1L else 0L } { s => s } - } + def count[T: TupleConverter](fieldDef: (Fields, Fields))(fn: T => Boolean): Self = + mapPlusMap(fieldDef)((arg: T) => if (fn(arg)) 1L else 0L)(s => s) /** - * Opposite of RichPipe.unpivot. See SQL/Excel for more on this function - * converts a row-wise representation into a column-wise one. + * Opposite of RichPipe.unpivot. See SQL/Excel for more on this function converts a row-wise representation + * into a column-wise one. * - * == Example == + * ==Example== * {{{ * pivot(('feature, 'value) -> ('clicks, 'impressions, 'requests)) * }}} * - * it will find the feature named "clicks", and put the value in the column with the field named - * clicks. + * it will find the feature named "clicks", and put the value in the column with the field named clicks. * * Absent fields result in null unless a default value is provided. Unnamed output fields are ignored. * - * == Note == + * ==Note== * Duplicated fields will result in an error. * - * == Hint == + * ==Hint== * if you want more precision, first do a * * {{{ * map('value -> value) { x : AnyRef => Option(x) } * }}} * - * and you will have non-nulls for all present values, and Nones for values that were present - * but previously null. All nulls in the final output will be those truly missing. - * Similarly, if you want to check if there are any items present that shouldn't be: + * and you will have non-nulls for all present values, and Nones for values that were present but previously + * null. All nulls in the final output will be those truly missing. Similarly, if you want to check if there + * are any items present that shouldn't be: * * {{{ * map('feature -> 'feature) { fname : String => @@ -157,59 +153,58 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ * } * }}} */ - def pivot(fieldDef: (Fields, Fields), defaultVal: Any = null): Self = { + def pivot(fieldDef: (Fields, Fields), defaultVal: Any = null): Self = // Make sure the fields are strings: mapList[(String, AnyRef), CTuple](fieldDef) { outputList => val asMap = outputList.toMap assert(asMap.size == outputList.size, "Repeated pivot key fields: " + outputList.toString) - val values = fieldDef._2 - .iterator.asScala + val values = fieldDef._2.iterator.asScala // Look up this key: - .map { fname => asMap.getOrElse(fname.asInstanceOf[String], defaultVal.asInstanceOf[AnyRef]) } + .map(fname => asMap.getOrElse(fname.asInstanceOf[String], defaultVal.asInstanceOf[AnyRef])) // Create the cascading tuple new CTuple(values.toSeq: _*) } - } /** - * Compute the count, ave and standard deviation in one pass - * example: g.sizeAveStdev('x -> ('cntx, 'avex, 'stdevx)) + * Compute the count, ave and standard deviation in one pass example: g.sizeAveStdev('x -> ('cntx, 'avex, + * 'stdevx)) */ - def sizeAveStdev(fieldDef: (Fields, Fields)) = { - mapPlusMap(fieldDef) { (x: Double) => Moments(x) } { (mom: Moments) => (mom.count, mom.mean, mom.stddev) } - } + def sizeAveStdev(fieldDef: (Fields, Fields)) = + mapPlusMap(fieldDef)((x: Double) => Moments(x))((mom: Moments) => (mom.count, mom.mean, mom.stddev)) /* * check if a predicate is satisfied for all in the values for this key */ - def forall[T: TupleConverter](fieldDef: (Fields, Fields))(fn: (T) => Boolean): Self = { - mapReduceMap(fieldDef)(fn)({ (x: Boolean, y: Boolean) => x && y })({ x => x }) - } + def forall[T: TupleConverter](fieldDef: (Fields, Fields))(fn: (T) => Boolean): Self = + mapReduceMap(fieldDef)(fn) { (x: Boolean, y: 
Boolean) => x && y } { x => x } /** * Return the first, useful probably only for sorted case. */ - def head(fd: (Fields, Fields)): Self = { + def head(fd: (Fields, Fields)): Self = //CTuple's have unknown arity so we have to put them into a Tuple1 in the middle phase: - mapReduceMap(fd) { ctuple: CTuple => Tuple1(ctuple) } { (oldVal, newVal) => oldVal } { result => result._1 } - } + mapReduceMap(fd) { ctuple: CTuple => Tuple1(ctuple) }((oldVal, newVal) => oldVal) { result => + result._1 + } def head(f: Symbol*): Self = head(f -> f) - def last(fd: (Fields, Fields)) = { + def last(fd: (Fields, Fields)) = //CTuple's have unknown arity so we have to put them into a Tuple1 in the middle phase: - mapReduceMap(fd) { ctuple: CTuple => Tuple1(ctuple) } { (oldVal, newVal) => newVal } { result => result._1 } - } + mapReduceMap(fd) { ctuple: CTuple => Tuple1(ctuple) }((oldVal, newVal) => newVal) { result => + result._1 + } def last(f: Symbol*): Self = last(f -> f) /** - * Collect all the values into a List[T] and then operate on that - * list. This fundamentally uses as much memory as it takes to store the list. - * This gives you the list in the reverse order it was encounted (it is built - * as a stack for efficiency reasons). If you care about order, call .reverse in your fn + * Collect all the values into a List[T] and then operate on that list. This fundamentally uses as much + * memory as it takes to store the list. This gives you the list in the reverse order it was encounted (it + * is built as a stack for efficiency reasons). If you care about order, call .reverse in your fn * * STRONGLY PREFER TO AVOID THIS. Try reduce or plus and an O(1) memory algorithm. */ - def mapList[T, R](fieldDef: (Fields, Fields))(fn: (List[T]) => R)(implicit conv: TupleConverter[T], setter: TupleSetter[R]): Self = { + def mapList[T, R]( + fieldDef: (Fields, Fields) + )(fn: (List[T]) => R)(implicit conv: TupleConverter[T], setter: TupleSetter[R]): Self = { val midset = implicitly[TupleSetter[List[T]]] val midconv = implicitly[TupleConverter[List[T]]] @@ -217,26 +212,34 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ x => List(x) } { //Reduce, note the bigger list is likely on the left, so concat into it: (prev, current) => current ++ prev - } { fn(_) }(conv, midset, midconv, setter) + }(fn(_))(conv, midset, midconv, setter) } - def mapPlusMap[T, X, U](fieldDef: (Fields, Fields))(mapfn: T => X)(mapfn2: X => U)(implicit startConv: TupleConverter[T], - middleSetter: TupleSetter[X], - middleConv: TupleConverter[X], - endSetter: TupleSetter[U], - sgX: Semigroup[X]): Self = { - mapReduceMap[T, X, U](fieldDef) (mapfn)((x, y) => sgX.plus(x, y))(mapfn2) (startConv, middleSetter, middleConv, endSetter) - } + def mapPlusMap[T, X, U](fieldDef: (Fields, Fields))(mapfn: T => X)(mapfn2: X => U)(implicit + startConv: TupleConverter[T], + middleSetter: TupleSetter[X], + middleConv: TupleConverter[X], + endSetter: TupleSetter[U], + sgX: Semigroup[X] + ): Self = + mapReduceMap[T, X, U](fieldDef)(mapfn)((x, y) => sgX.plus(x, y))(mapfn2)( + startConv, + middleSetter, + middleConv, + endSetter + ) private def extremum(max: Boolean, fieldDef: (Fields, Fields)): Self = { //CTuple's have unknown arity so we have to put them into a Tuple1 in the middle phase: val select = if (max) { - { (a: CTuple, b: CTuple) => (a.compareTo(b) >= 0) } + (a: CTuple, b: CTuple) => (a.compareTo(b) >= 0) } else { - { (a: CTuple, b: CTuple) => (a.compareTo(b) <= 0) } + (a: CTuple, b: CTuple) => (a.compareTo(b) <= 0) } - 
mapReduceMap(fieldDef) { ctuple: CTuple => Tuple1(ctuple) } { (oldVal, newVal) => if (select(oldVal._1, newVal._1)) oldVal else newVal } { result => result._1 } + mapReduceMap(fieldDef) { ctuple: CTuple => Tuple1(ctuple) } { (oldVal, newVal) => + if (select(oldVal._1, newVal._1)) oldVal else newVal + }(result => result._1) } def max(fieldDef: (Fields, Fields)) = extremum(true, fieldDef) def max(f: Symbol*) = extremum(true, (f -> f)) @@ -244,19 +247,17 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ def min(f: Symbol*) = extremum(false, (f -> f)) /** - * Similar to the scala.collection.Iterable.mkString - * takes the source and destination fieldname, which should be a single - * field. The result will be start, each item.toString separated by sep, - * followed by end for convenience there several common variants below + * Similar to the scala.collection.Iterable.mkString takes the source and destination fieldname, which + * should be a single field. The result will be start, each item.toString separated by sep, followed by end + * for convenience there several common variants below */ - def mkString(fieldDef: (Fields, Fields), start: String, sep: String, end: String): Self = { - mapList[String, String](fieldDef) { _.mkString(start, sep, end) } - } + def mkString(fieldDef: (Fields, Fields), start: String, sep: String, end: String): Self = + mapList[String, String](fieldDef)(_.mkString(start, sep, end)) def mkString(fieldDef: (Fields, Fields), sep: String): Self = mkString(fieldDef, "", sep, "") def mkString(fieldDef: (Fields, Fields)): Self = mkString(fieldDef, "", "", "") + /** - * these will only be called if a tuple is not passed, meaning just one - * column + * these will only be called if a tuple is not passed, meaning just one column */ def mkString(fieldDef: Symbol, start: String, sep: String, end: String): Self = { val f: Fields = fieldDef @@ -268,7 +269,7 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ /** * Apply an associative/commutative operation on the left field. * - * == Example == + * ==Example== * {{{ * reduce(('mass,'allids)->('totalMass, 'idset)) { (left:(Double,Set[Long]),right:(Double,Set[Long])) => * (left._1 + right._1, left._2 ++ right._2) @@ -277,36 +278,39 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ * * Equivalent to a mapReduceMap with trivial (identity) map functions. * - * Assumed to be a commutative operation. If you don't want that, use .forceToReducers + * Assumed to be a commutative operation. If you don't want that, use .forceToReducers * - * The previous output goes into the reduce function on the left, like foldLeft, - * so if your operation is faster for the accumulator to be on one side, be aware. + * The previous output goes into the reduce function on the left, like foldLeft, so if your operation is + * faster for the accumulator to be on one side, be aware. 
*/ - def reduce[T](fieldDef: (Fields, Fields))(fn: (T, T) => T)(implicit setter: TupleSetter[T], conv: TupleConverter[T]): Self = { - mapReduceMap[T, T, T](fieldDef)({ t => t })(fn)({ t => t })(conv, setter, conv, setter) - } + def reduce[T](fieldDef: (Fields, Fields))( + fn: (T, T) => T + )(implicit setter: TupleSetter[T], conv: TupleConverter[T]): Self = + mapReduceMap[T, T, T](fieldDef) { t => t }(fn) { t => t }(conv, setter, conv, setter) //Same as reduce(f->f) - def reduce[T](fieldDef: Symbol*)(fn: (T, T) => T)(implicit setter: TupleSetter[T], - conv: TupleConverter[T]): Self = { + def reduce[T](fieldDef: Symbol*)( + fn: (T, T) => T + )(implicit setter: TupleSetter[T], conv: TupleConverter[T]): Self = reduce(fieldDef -> fieldDef)(fn)(setter, conv) - } // Abstract algebra reductions (sum, times, dot): /** - * Use `Semigroup.plus` to compute a sum. Not called sum to avoid conflicting with standard sum - * Your `Semigroup[T]` should be associated and commutative, else this doesn't make sense + * Use `Semigroup.plus` to compute a sum. Not called sum to avoid conflicting with standard sum Your + * `Semigroup[T]` should be associated and commutative, else this doesn't make sense * - * Assumed to be a commutative operation. If you don't want that, use .forceToReducers + * Assumed to be a commutative operation. If you don't want that, use .forceToReducers */ - def sum[T](fd: (Fields, Fields))(implicit sg: Semigroup[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = { + def sum[T]( + fd: (Fields, Fields) + )(implicit sg: Semigroup[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = // We reverse the order because the left is the old value in reduce, and for list concat // we are much better off concatenating into the bigger list - reduce[T](fd)({ (left, right) => sg.plus(right, left) })(tset, tconv) - } + reduce[T](fd) { (left, right) => sg.plus(right, left) }(tset, tconv) + /** - * The same as `sum(fs -> fs)` - * Assumed to be a commutative operation. If you don't want that, use .forceToReducers + * The same as `sum(fs -> fs)` Assumed to be a commutative operation. If you don't want that, use + * .forceToReducers */ def sum[T](fs: Symbol*)(implicit sg: Semigroup[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = sum[T](fs -> fs)(sg, tconv, tset) @@ -314,58 +318,59 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ /** * Returns the product of all the items in this grouping */ - def times[T](fd: (Fields, Fields))(implicit ring: Ring[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = { + def times[T]( + fd: (Fields, Fields) + )(implicit ring: Ring[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = // We reverse the order because the left is the old value in reduce, and for list concat // we are much better off concatenating into the bigger list - reduce[T](fd)({ (left, right) => ring.times(right, left) })(tset, tconv) - } + reduce[T](fd) { (left, right) => ring.times(right, left) }(tset, tconv) /** * The same as `times(fs -> fs)` */ - def times[T](fs: Symbol*)(implicit ring: Ring[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = { + def times[T](fs: Symbol*)(implicit ring: Ring[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = times[T](fs -> fs)(ring, tconv, tset) - } /** * Convert a subset of fields into a list of Tuples. Need to provide the types of the tuple fields. 
*/ - def toList[T](fieldDef: (Fields, Fields))(implicit conv: TupleConverter[T]): Self = { + def toList[T](fieldDef: (Fields, Fields))(implicit conv: TupleConverter[T]): Self = // TODO(POB) this is jank in my opinion. Nulls should be filter by the user if they want - mapList[T, List[T]](fieldDef) { _.filter { t => t != null } } - } + mapList[T, List[T]](fieldDef)(_.filter(t => t != null)) /** * First do "times" on each pair, then "plus" them all together. * - * == Example == + * ==Example== * {{{ * groupBy('x) { _.dot('y,'z, 'ydotz) } * }}} */ - def dot[T](left: Fields, right: Fields, result: Fields)(implicit ttconv: TupleConverter[Tuple2[T, T]], ring: Ring[T], - tconv: TupleConverter[T], tset: TupleSetter[T]): Self = { + def dot[T](left: Fields, right: Fields, result: Fields)(implicit + ttconv: TupleConverter[Tuple2[T, T]], + ring: Ring[T], + tconv: TupleConverter[T], + tset: TupleSetter[T] + ): Self = mapReduceMap[(T, T), T, T](Fields.merge(left, right) -> result) { init: (T, T) => ring.times(init._1, init._2) } { (left: T, right: T) => ring.plus(left, right) - } { result => result } - } + }(result => result) /** * How many values are there for this key */ def size: Self = size('size) - def size(thisF: Fields): Self = { - mapPlusMap(() -> thisF) { (u: Unit) => 1L } { s => s } - } + def size(thisF: Fields): Self = + mapPlusMap(() -> thisF)((u: Unit) => 1L)(s => s) /** - * Equivalent to sorting by a comparison function - * then take-ing k items. This is MUCH more efficient than doing a total sort followed by a take, - * since these bounded sorts are done on the mapper, so only a sort of size k is needed. + * Equivalent to sorting by a comparison function then take-ing k items. This is MUCH more efficient than + * doing a total sort followed by a take, since these bounded sorts are done on the mapper, so only a sort + * of size k is needed. * - * == Example == + * ==Example== * {{{ * sortWithTake( ('clicks, 'tweet) -> 'topClicks, 5) { * fn : (t0 :(Long,Long), t1:(Long,Long) => t0._1 < t1._1 } @@ -381,9 +386,11 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ /** * Reverse of above when the implicit ordering makes sense. */ - def sortedReverseTake[T](f: (Fields, Fields), k: Int)(implicit conv: TupleConverter[T], ord: Ordering[T]): Self = { + def sortedReverseTake[T](f: (Fields, Fields), k: Int)(implicit + conv: TupleConverter[T], + ord: Ordering[T] + ): Self = sortedTake[T](f, k)(conv, ord.reverse) - } /** * Same as above but useful when the implicit ordering makes sense. 
@@ -392,12 +399,13 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ assert(f._2.size == 1, "output field size must be 1") implicit val mon: PriorityQueueMonoid[T] = new PriorityQueueMonoid[T](k) - mapPlusMap(f) { (tup: T) => mon.build(tup) } { - (lout: PriorityQueue[T]) => lout.iterator.asScala.toList.sorted + mapPlusMap(f)((tup: T) => mon.build(tup)) { (lout: PriorityQueue[T]) => + lout.iterator.asScala.toList.sorted } } - def histogram(f: (Fields, Fields), binWidth: Double = 1.0) = { - mapPlusMap(f) { x: Double => Map((math.floor(x / binWidth) * binWidth) -> 1L) } { map => new mathematics.Histogram(map, binWidth) } - } + def histogram(f: (Fields, Fields), binWidth: Double = 1.0) = + mapPlusMap(f) { x: Double => Map((math.floor(x / binWidth) * binWidth) -> 1L) } { map => + new mathematics.Histogram(map, binWidth) + } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ReferencedClassFinder.scala b/scalding-core/src/main/scala/com/twitter/scalding/ReferencedClassFinder.scala index b05ce273eb..0a12e516c0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/ReferencedClassFinder.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/ReferencedClassFinder.scala @@ -3,7 +3,7 @@ package com.twitter.scalding import com.twitter.scalding.typed.CoGroupable import org.slf4j.LoggerFactory import scala.reflect.runtime.universe -import scala.reflect.runtime.universe.{ NullaryMethodType, RuntimeMirror, Symbol, Type, TypeRef } +import scala.reflect.runtime.universe.{NullaryMethodType, RuntimeMirror, Symbol, Type, TypeRef} import java.lang.{reflect => jReflect} object ReferencedClassFinder { @@ -16,23 +16,22 @@ object ReferencedClassFinder { classOf[TypedSink[_]], classOf[TypedSource[_]], classOf[CoGroupable[_, _]], - classOf[KeyedList[_, _]]) + classOf[KeyedList[_, _]] + ) /** - * Add the given type, as well as all referenced types to the cascading tokens list. - * note, for maximal efficiency, you should also register those types with the kryo - * instantiator being used. + * Add the given type, as well as all referenced types to the cascading tokens list. note, for maximal + * efficiency, you should also register those types with the kryo instantiator being used. */ - def addCascadingTokensFrom(c: Class[_], config: Config): Config = { + def addCascadingTokensFrom(c: Class[_], config: Config): Config = CascadingTokenUpdater.update(config, findReferencedClasses(c) + c) - } /** * Reflect over a scalding job to try and identify types it uses so they can be tokenized by cascading. * Since scala reflection is broken with the Hadoop InterfaceAudiance annotation (see - * https://issues.scala-lang.org/browse/SI-10129), we can't iterate over scalaType.members, so we instead use java - * reflection to iterate over fields to find the ones we care about, and then look those up in scala reflection to - * find the full un-erased type signatures, and try to find types from those. + * https://issues.scala-lang.org/browse/SI-10129), we can't iterate over scalaType.members, so we instead + * use java reflection to iterate over fields to find the ones we care about, and then look those up in + * scala reflection to find the full un-erased type signatures, and try to find types from those. * * Note: this not guaranteed to find every used type. 
Eg, it can't find types used in a step that isn't * referred to in a field @@ -59,7 +58,11 @@ object ReferencedClassFinder { } } - private def getFieldType(outerClass: Class[_], scalaType: universe.Type, field: jReflect.Field): Option[universe.Type] = + private def getFieldType( + outerClass: Class[_], + scalaType: universe.Type, + field: jReflect.Field + ): Option[universe.Type] = safeScalaReflectionCall(outerClass) { scalaType.member(universe.stringToTermName(field.getName)).typeSignature } @@ -75,39 +78,48 @@ object ReferencedClassFinder { } catch { // In some cases we fail to find references classes, it shouldn't be fatal. case r: RuntimeException if r.getMessage.contains("error reading Scala signature") => - LOG.warn(s"Unable to find referenced classes for: $outerClass. This is potentially due to missing dependencies", r) + LOG.warn( + s"Unable to find referenced classes for: $outerClass. This is potentially due to missing dependencies", + r + ) None case t: Throwable if t.getMessage.contains("illegal cyclic reference") => // Related to: https://issues.scala-lang.org/browse/SI-10129 - LOG.warn(s"Unable to find referenced classes for: $outerClass. Related to Scala language issue: SI-10129", t) + LOG.warn( + s"Unable to find referenced classes for: $outerClass. Related to Scala language issue: SI-10129", + t + ) None case ae: AssertionError if ae.getMessage.contains("no symbol could be loaded from interface") => // Related to: https://issues.scala-lang.org/browse/SI-10129 - LOG.warn(s"Unable to find referenced classes for: $outerClass. Related to Scala language issue: SI-10129", ae) + LOG.warn( + s"Unable to find referenced classes for: $outerClass. Related to Scala language issue: SI-10129", + ae + ) None case t: Throwable => throw t } - private def getClassesForType(mirror: RuntimeMirror, typeSignature: Type): Seq[Class[_]] = typeSignature match { - case TypeRef(_, _, args) => - args.flatMap { generic => - //If the wrapped type is a Tuple, recurse into its types - if (generic.typeSymbol.fullName.startsWith("scala.Tuple")) { - getClassesForType(mirror, generic) - } else { - getClassOpt(mirror, generic.typeSymbol) + private def getClassesForType(mirror: RuntimeMirror, typeSignature: Type): Seq[Class[_]] = + typeSignature match { + case TypeRef(_, _, args) => + args.flatMap { generic => + //If the wrapped type is a Tuple, recurse into its types + if (generic.typeSymbol.fullName.startsWith("scala.Tuple")) { + getClassesForType(mirror, generic) + } else { + getClassOpt(mirror, generic.typeSymbol) + } } - } - //.member returns the accessor method for the variable unless the field is private[this], so inspect the return type - case NullaryMethodType(resultType) => getClassesForType(mirror, resultType) - case _ => Nil - } + //.member returns the accessor method for the variable unless the field is private[this], so inspect the return type + case NullaryMethodType(resultType) => getClassesForType(mirror, resultType) + case _ => Nil + } - private def getClassOpt(mirror: RuntimeMirror, typeSymbol: Symbol): Option[Class[_]] = { + private def getClassOpt(mirror: RuntimeMirror, typeSymbol: Symbol): Option[Class[_]] = try { Some(mirror.runtimeClass(typeSymbol.asClass)) } catch { case _: ClassNotFoundException | ScalaReflectionException(_) => None } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/RichFlowDef.scala b/scalding-core/src/main/scala/com/twitter/scalding/RichFlowDef.scala index 3042175bb5..49fbea4b0b 100644 --- 
a/scalding-core/src/main/scala/com/twitter/scalding/RichFlowDef.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/RichFlowDef.scala @@ -12,18 +12,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.flow.FlowDef import cascading.pipe.Pipe import com.twitter.algebird.Monoid -import java.util.{ Map => JMap, List => JList } +import java.util.{List => JList, Map => JMap} /** - * This is an enrichment-pattern class for cascading.flow.FlowDef. - * The rule is to never use this class directly in input or return types, but - * only to add methods to FlowDef. + * This is an enrichment-pattern class for cascading.flow.FlowDef. The rule is to never use this class + * directly in input or return types, but only to add methods to FlowDef. */ class RichFlowDef(val fd: FlowDef) { // allow .asScala conversions @@ -57,12 +56,10 @@ class RichFlowDef(val fd: FlowDef) { private[this] def preferLeft[T](left: T, right: T): T = Option(left).getOrElse(right) - private[this] def mergeLeft[K, V](left: JMap[K, V], right: JMap[K, V]): Unit = { - right.asScala.foreach { - case (k, v) => - if (!left.containsKey(k)) left.put(k, v) + private[this] def mergeLeft[K, V](left: JMap[K, V], right: JMap[K, V]): Unit = + right.asScala.foreach { case (k, v) => + if (!left.containsKey(k)) left.put(k, v) } - } private[this] def appendLeft[T](left: JList[T], right: JList[T]): Unit = { val existing = left.asScala.toSet right.asScala @@ -70,13 +67,13 @@ class RichFlowDef(val fd: FlowDef) { .foreach(left.add) } - def isEmpty: Boolean = { + def isEmpty: Boolean = fd.getTraps.isEmpty && fd.getCheckpoints.isEmpty && fd.getSources.isEmpty && fd.getSinks.isEmpty && fd.getTails.isEmpty - } + /** * Mutate current flow def to add all sources/sinks/etc from given FlowDef */ @@ -141,20 +138,20 @@ class RichFlowDef(val fd: FlowDef) { } // Update the FlowState: - FlowStateMap.get(fd) + FlowStateMap + .get(fd) .foreach { thisFS => /** - * these are all the sources that are upstream - * of the pipe in question + * these are all the sources that are upstream of the pipe in question */ val subFlowState = Monoid.sum( - thisFS - .sourceMap + thisFS.sourceMap .collect { case (name, source) if headNames(name) => FlowState.withSource(name, source) - }) + } + ) /* * We assume all the old config updates need to be * done, but this may an over approximation and not diff --git a/scalding-core/src/main/scala/com/twitter/scalding/RichPipe.scala b/scalding-core/src/main/scala/com/twitter/scalding/RichPipe.scala index 3fdaa253b2..3f2046620b 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/RichPipe.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/RichPipe.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.property.ConfigDef.Getter @@ -38,29 +38,33 @@ object RichPipe extends java.io.Serializable { private[scalding] val FormerNameBitLength = 12 private[scalding] val FormerAssignedPipeNamePattern = "^_pipe_([0-9]+).*$".r - private[scalding] val FromUuidPattern = "^.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-([0-9a-f]{12}).*$".r + private[scalding] val FromUuidPattern = + "^.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-([0-9a-f]{12}).*$".r // grab some bit of the previous pipe name to help walk up the graph across name assignments private def getFormerNameBit(p: Pipe): String = p.getName match { case FormerAssignedPipeNamePattern(pipeNumber) => pipeNumber - case FromUuidPattern(lastGroup) => lastGroup /* 12 characters */ - case s if s.length > FormerNameBitLength => s.substring(s.length - FormerNameBitLength, s.length) - case s => s + case FromUuidPattern(lastGroup) => lastGroup /* 12 characters */ + case s if s.length > FormerNameBitLength => s.substring(s.length - FormerNameBitLength, s.length) + case s => s } /** * Assign a new, guaranteed-unique name to the pipe. - * @param p a pipe, whose name should be changed - * @return a pipe with a new name which is guaranteed to be new and never re-assigned by this function + * @param p + * a pipe, whose name should be changed + * @return + * a pipe with a new name which is guaranteed to be new and never re-assigned by this function * * Note: the assigned name includes a few characters from the former name to assisgit dift in debugging. */ def assignName(p: Pipe): Pipe = new Pipe(getNextName + "-" + getFormerNameBit(p), p) private val REDUCER_KEY = "mapred.reduce.tasks" + /** - * Gets the underlying config for this pipe and sets the number of reducers - * useful for cascading GroupBy/CoGroup pipes. + * Gets the underlying config for this pipe and sets the number of reducers useful for cascading + * GroupBy/CoGroup pipes. */ def setReducers(p: Pipe, reducers: Int): Pipe = { if (reducers > 0) { @@ -69,41 +73,40 @@ object RichPipe extends java.io.Serializable { p.getStepConfigDef() .setProperty(Config.WithReducersSetExplicitly, "true") } else if (reducers != -1) { - throw new IllegalArgumentException(s"Number of reducers must be non-negative. Got: ${reducers}") + throw new IllegalArgumentException(s"Number of reducers must be non-negative. Got: $reducers") } p } // A pipe can have more than one description when merged together, so we store them delimited with 255.toChar. // Cannot use 1.toChar as we get an error if it is not a printable character. - private def encodePipeDescriptions(descriptions: Seq[String]): String = { + private def encodePipeDescriptions(descriptions: Seq[String]): String = descriptions.map(_.replace(255.toChar, ' ')).filter(_.nonEmpty).mkString(255.toChar.toString) - } - private def decodePipeDescriptions(encoding: String): Seq[String] = { + private def decodePipeDescriptions(encoding: String): Seq[String] = encoding.split(255.toChar).toSeq - } - def getPipeDescriptions(p: Pipe): Seq[String] = { + def getPipeDescriptions(p: Pipe): Seq[String] = if (p.getStepConfigDef.isEmpty) Nil else { // We use empty getter so we can get latest config value of Config.PipeDescriptions in the step ConfigDef. - val encodedResult = p.getStepConfigDef.apply(Config.PipeDescriptions, new Getter { - override def update(s: String, s1: String): String = ??? 
- override def get(s: String): String = null - }) + val encodedResult = p.getStepConfigDef.apply( + Config.PipeDescriptions, + new Getter { + override def update(s: String, s1: String): String = ??? + override def get(s: String): String = null + } + ) Option(encodedResult) .filterNot(_.isEmpty) .map(decodePipeDescriptions) .getOrElse(Nil) } - } def setPipeDescriptions(p: Pipe, descriptions: Seq[String]): Pipe = { - p.getStepConfigDef().setProperty( - Config.PipeDescriptions, - encodePipeDescriptions(getPipeDescriptions(p) ++ descriptions)) + p.getStepConfigDef() + .setProperty(Config.PipeDescriptions, encodePipeDescriptions(getPipeDescriptions(p) ++ descriptions)) p } @@ -130,32 +133,30 @@ object RichPipe extends java.io.Serializable { (!pipe.isInstanceOf[Splice]) /** - * This is true if a pipe passes through all - * input fields without explicitly remapping + * This is true if a pipe passes through all input fields without explicitly remapping */ @annotation.tailrec final def isPassthrough(pipe: Pipe): Boolean = { def element(p: Pipe): Boolean = p match { case e: Each if e.isFilter => true - case cp: Checkpoint => true - case _ => false + case cp: Checkpoint => true + case _ => false } isSourcePipe(pipe) || { element(pipe) && - (getSinglePreviousPipe(pipe) match { - case Some(prev) => isPassthrough(prev) - case None => false - }) + (getSinglePreviousPipe(pipe) match { + case Some(prev) => isPassthrough(prev) + case None => false + }) } } } /** - * This is an enrichment-pattern class for cascading.pipe.Pipe. - * The rule is to never use this class directly in input or return types, but - * only to add methods to Pipe. + * This is an enrichment-pattern class for cascading.pipe.Pipe. The rule is to never use this class directly + * in input or return types, but only to add methods to Pipe. */ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms { // We need this for the implicits @@ -168,21 +169,32 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms def name(s: String): Pipe = new Pipe(s, pipe) /** - * Beginning of block with access to expensive nonserializable state. The state object should - * contain a function release() for resource management purpose. + * Beginning of block with access to expensive nonserializable state. The state object should contain a + * function release() for resource management purpose. */ def using[C <: { def release(): Unit }](bf: => C) = new { /** * For pure side effect. 
*/ - def foreach[A](f: Fields)(fn: (C, A) => Unit)(implicit conv: TupleConverter[A], set: TupleSetter[Unit], flowDef: FlowDef, mode: Mode) = { + def foreach[A](f: Fields)( + fn: (C, A) => Unit + )(implicit conv: TupleConverter[A], set: TupleSetter[Unit], flowDef: FlowDef, mode: Mode) = { conv.assertArityMatches(f) - val newPipe = new Each(pipe, f, new SideEffectMapFunction(bf, fn, - new Function1[C, Unit] with java.io.Serializable { - def apply(c: C): Unit = { c.release() } - }, - Fields.NONE, conv, set)) + val newPipe = new Each( + pipe, + f, + new SideEffectMapFunction( + bf, + fn, + new Function1[C, Unit] with java.io.Serializable { + def apply(c: C): Unit = c.release() + }, + Fields.NONE, + conv, + set + ) + ) NullSource.writeFrom(newPipe)(flowDef, mode) newPipe } @@ -190,43 +202,55 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms /** * map with state */ - def map[A, T](fs: (Fields, Fields))(fn: (C, A) => T)(implicit conv: TupleConverter[A], set: TupleSetter[T]) = { + def map[A, T]( + fs: (Fields, Fields) + )(fn: (C, A) => T)(implicit conv: TupleConverter[A], set: TupleSetter[T]) = { conv.assertArityMatches(fs._1) set.assertArityMatches(fs._2) - val mf = new SideEffectMapFunction(bf, fn, + val mf = new SideEffectMapFunction( + bf, + fn, new Function1[C, Unit] with java.io.Serializable { - def apply(c: C): Unit = { c.release() } + def apply(c: C): Unit = c.release() }, - fs._2, conv, set) + fs._2, + conv, + set + ) new Each(pipe, fs._1, mf, defaultMode(fs._1, fs._2)) } /** * flatMap with state */ - def flatMap[A, T](fs: (Fields, Fields))(fn: (C, A) => TraversableOnce[T])(implicit conv: TupleConverter[A], set: TupleSetter[T]) = { + def flatMap[A, T]( + fs: (Fields, Fields) + )(fn: (C, A) => TraversableOnce[T])(implicit conv: TupleConverter[A], set: TupleSetter[T]) = { conv.assertArityMatches(fs._1) set.assertArityMatches(fs._2) - val mf = new SideEffectFlatMapFunction(bf, fn, + val mf = new SideEffectFlatMapFunction( + bf, + fn, new Function1[C, Unit] with java.io.Serializable { - def apply(c: C): Unit = { c.release() } + def apply(c: C): Unit = c.release() }, - fs._2, conv, set) + fs._2, + conv, + set + ) new Each(pipe, fs._1, mf, defaultMode(fs._1, fs._2)) } } /** - * Keep only the given fields, and discard the rest. - * takes any number of parameters as long as we can convert - * them to a fields object + * Keep only the given fields, and discard the rest. takes any number of parameters as long as we can + * convert them to a fields object */ def project(fields: Fields): Pipe = new Each(pipe, fields, new Identity(fields)) /** - * Discard the given fields, and keep the rest. - * Kind of the opposite of project method. + * Discard the given fields, and keep the rest. Kind of the opposite of project method. */ def discard(f: Fields): Pipe = new Each(pipe, f, new NoOp, Fields.SWAP) @@ -239,10 +263,9 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms /** * group the Pipe based on fields * - * builder is typically a block that modifies the given GroupBuilder - * the final OUTPUT of the block is used to schedule the new pipe - * each method in GroupBuilder returns this, so it is recommended - * to chain them and use the default input: + * builder is typically a block that modifies the given GroupBuilder the final OUTPUT of the block is used + * to schedule the new pipe each method in GroupBuilder returns this, so it is recommended to chain them and + * use the default input: * * {{{ * _.size.max('f1) etc... 
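A minimal fields-API sketch of the GroupBuilder chaining described in the comment above, assuming a hypothetical Job and illustrative field names (none of this comes from the patch itself):

  import com.twitter.scalding._

  // Illustrative sketch only: group by 'user and chain GroupBuilder methods;
  // each method returns the builder, so the whole block is one expression.
  class ScoresJob(args: Args) extends Job(args) {
    Tsv(args("input"), ('user, 'score)).read
      .groupBy('user) {
        _.size         // adds a 'size field: number of rows per 'user
          .max('score) // and the per-user maximum of 'score
      }
      .write(Tsv(args("output")))
  }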
@@ -255,7 +278,7 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms * Returns the set of distinct tuples containing the specified fields */ def distinct(f: Fields): Pipe = - groupBy(f) { _.size('__uniquecount__) }.project(f) + groupBy(f)(_.size('__uniquecount__)).project(f) /** * Returns the set of unique tuples containing the specified fields. Same as distinct @@ -265,7 +288,7 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms /** * Merge or Concatenate several pipes together with this one: */ - def ++(that: Pipe): Pipe = { + def ++(that: Pipe): Pipe = if (this.pipe == that) { // Cascading fails on self merge: // solution by Jack Guo @@ -273,46 +296,42 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms } else { new Merge(assignName(this.pipe), assignName(that)) } - } /** - * Group all tuples down to one reducer. - * (due to cascading limitation). - * This is probably only useful just before setting a tail such as Database - * tail, so that only one reducer talks to the DB. Kind of a hack. + * Group all tuples down to one reducer. (due to cascading limitation). This is probably only useful just + * before setting a tail such as Database tail, so that only one reducer talks to the DB. Kind of a hack. */ - def groupAll: Pipe = groupAll { _.pass } + def groupAll: Pipe = groupAll(_.pass) /** - * == Warning == + * ==Warning== * This kills parallelism. All the work is sent to one reducer. * - * Only use this in the case that you truly need all the data on one - * reducer. + * Only use this in the case that you truly need all the data on one reducer. * - * Just about the only reasonable case of this method is to reduce all values of a column - * or count all the rows. + * Just about the only reasonable case of this method is to reduce all values of a column or count all the + * rows. */ def groupAll(gs: GroupBuilder => GroupBuilder) = - map(() -> '__groupAll__) { (u: Unit) => 1 } - .groupBy('__groupAll__) { gs(_).reducers(1) } + map(() -> '__groupAll__)((u: Unit) => 1) + .groupBy('__groupAll__)(gs(_).reducers(1)) .discard('__groupAll__) /** * Force a random shuffle of all the data to exactly n reducers */ - def shard(n: Int): Pipe = groupRandomly(n) { _.pass } + def shard(n: Int): Pipe = groupRandomly(n)(_.pass) + /** - * Force a random shuffle of all the data to exactly n reducers, - * with a given seed if you need repeatability. + * Force a random shuffle of all the data to exactly n reducers, with a given seed if you need + * repeatability. */ - def shard(n: Int, seed: Int): Pipe = groupRandomly(n, seed) { _.pass } + def shard(n: Int, seed: Int): Pipe = groupRandomly(n, seed)(_.pass) /** * Like groupAll, but randomly groups data into n reducers. 
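A hedged sketch of how groupAll and shard are typically applied given the warnings above, assuming some `pipe: Pipe` already built inside a Job so the Dsl implicits are in scope (names are illustrative):

  // groupAll funnels everything through a single reducer, so it is only
  // reasonable for tiny results such as a global count; shard(n) forces a
  // random redistribution across exactly n reducers.
  val totalRows: Pipe  = pipe.groupAll(_.size('total)) // one reducer, one row out
  val rebalanced: Pipe = pipe.shard(32)                // random shuffle onto 32 reducers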
* - * you can provide a seed for the random number generator - * to get reproducible results + * you can provide a seed for the random number generator to get reproducible results */ def groupRandomly(n: Int)(gs: GroupBuilder => GroupBuilder): Pipe = groupRandomlyAux(n, None)(gs) @@ -326,27 +345,25 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms // achieves the behavior that reducer i gets i_th shard // by relying on cascading to use java's hashCode, which hash ints // to themselves - protected def groupRandomlyAux(n: Int, optSeed: Option[Long])(gs: GroupBuilder => GroupBuilder): Pipe = { + protected def groupRandomlyAux(n: Int, optSeed: Option[Long])(gs: GroupBuilder => GroupBuilder): Pipe = using(statefulRandom(optSeed)) - .map(() -> '__shard__) { (r: Random, _: Unit) => r.nextInt(n) } - .groupBy('__shard__) { gs(_).reducers(n) } + .map(() -> '__shard__)((r: Random, _: Unit) => r.nextInt(n)) + .groupBy('__shard__)(gs(_).reducers(n)) .discard('__shard__) - } private def statefulRandom(optSeed: Option[Long]): Random with Stateful = { val random = new Random with Stateful - optSeed.foreach { x => random.setSeed(x) } + optSeed.foreach(x => random.setSeed(x)) random } /** * Put all rows in random order * - * you can provide a seed for the random number generator - * to get reproducible results + * you can provide a seed for the random number generator to get reproducible results */ - def shuffle(shards: Int): Pipe = groupAndShuffleRandomly(shards) { _.pass } - def shuffle(shards: Int, seed: Long): Pipe = groupAndShuffleRandomly(shards, seed) { _.pass } + def shuffle(shards: Int): Pipe = groupAndShuffleRandomly(shards)(_.pass) + def shuffle(shards: Int, seed: Long): Pipe = groupAndShuffleRandomly(shards, seed)(_.pass) /** * Like shard, except do some operation im the reducers @@ -360,19 +377,20 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms def groupAndShuffleRandomly(reducers: Int, seed: Long)(gs: GroupBuilder => GroupBuilder): Pipe = groupAndShuffleRandomlyAux(reducers, Some(seed))(gs) - private def groupAndShuffleRandomlyAux(reducers: Int, optSeed: Option[Long])(gs: GroupBuilder => GroupBuilder): Pipe = { + private def groupAndShuffleRandomlyAux(reducers: Int, optSeed: Option[Long])( + gs: GroupBuilder => GroupBuilder + ): Pipe = using(statefulRandom(optSeed)) - .map(() -> ('__shuffle__)) { (r: Random, _: Unit) => r.nextDouble() } - .groupRandomlyAux(reducers, optSeed){ g: GroupBuilder => + .map(() -> '__shuffle__)((r: Random, _: Unit) => r.nextDouble()) + .groupRandomlyAux(reducers, optSeed) { g: GroupBuilder => gs(g.sortBy('__shuffle__)) } .discard('__shuffle__) - } /** * Adds a field with a constant value. * - * == Usage == + * ==Usage== * {{{ * insert('a, 1) * }}} @@ -383,17 +401,16 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms /** * Rename some set of N fields as another set of N fields * - * == Usage == + * ==Usage== * {{{ * rename('x -> 'z) - * rename(('x,'y) -> ('X,'Y)) + * rename(('x,'y) -> ('X,'Y)) * }}} * - * == Warning == - * `rename('x,'y)` is interpreted by scala as `rename(Tuple2('x,'y))` - * which then does `rename('x -> 'y)`. This is probably not what is intended - * but the compiler doesn't resolve the ambiguity. YOU MUST CALL THIS WITH - * A TUPLE2! If you don't, expect the unexpected. + * ==Warning== + * `rename('x,'y)` is interpreted by scala as `rename(Tuple2('x,'y))` which then does `rename('x -> 'y)`. 
+ * This is probably not what is intended but the compiler doesn't resolve the ambiguity. YOU MUST CALL THIS + * WITH A TUPLE2! If you don't, expect the unexpected. */ def rename(fields: (Fields, Fields)): Pipe = { val (fromFields, toFields) = fields @@ -412,42 +429,34 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms } /** - * Keep only items that don't satisfy this predicate. - * `filterNot` is equal to negating a `filter` operation. + * Keep only items that don't satisfy this predicate. `filterNot` is equal to negating a `filter` operation. * - * {{{ filterNot('name) { name: String => name contains "a" } }}} + * {{{filterNot('name) { name: String => name contains "a" }}}} * * is the same as: * - * {{{ filter('name) { name: String => !(name contains "a") } }}} + * {{{filter('name) { name: String => !(name contains "a") }}}} */ def filterNot[A](f: Fields)(fn: (A) => Boolean)(implicit conv: TupleConverter[A]): Pipe = filter[A](f)(!fn(_)) /** - * Text files can have corrupted data. If you use this function and a - * cascading trap you can filter out corrupted data from your pipe. + * Text files can have corrupted data. If you use this function and a cascading trap you can filter out + * corrupted data from your pipe. */ - def verifyTypes[A](f: Fields)(implicit conv: TupleConverter[A]): Pipe = { - pipe.filter(f) { (a: A) => true } - } + def verifyTypes[A](f: Fields)(implicit conv: TupleConverter[A]): Pipe = + pipe.filter(f)((a: A) => true) /** - * Given a function, partitions the pipe into several groups based on the - * output of the function. Then applies a GroupBuilder function on each of the - * groups. + * Given a function, partitions the pipe into several groups based on the output of the function. Then + * applies a GroupBuilder function on each of the groups. * - * Example: - * pipe - * .mapTo(()->('age, 'weight) { ... } - * .partition('age -> 'isAdult) { _ > 18 } { _.average('weight) } - * pipe now contains the average weights of adults and minors. + * Example: pipe .mapTo(()->('age, 'weight) { ... } .partition('age -> 'isAdult) { _ > 18 } { + * _.average('weight) } pipe now contains the average weights of adults and minors. */ def partition[A, R](fs: (Fields, Fields))(fn: (A) => R)( - builder: GroupBuilder => GroupBuilder)( - implicit conv: TupleConverter[A], - ord: Ordering[R], - rset: TupleSetter[R]): Pipe = { + builder: GroupBuilder => GroupBuilder + )(implicit conv: TupleConverter[A], ord: Ordering[R], rset: TupleSetter[R]): Pipe = { val (fromFields, toFields) = fs conv.assertArityMatches(fromFields) rset.assertArityMatches(toFields) @@ -457,56 +466,59 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms map(fromFields -> tmpFields)(fn)(conv, TupleSetter.singleSetter[R]) .groupBy(tmpFields)(builder) - .map[R, R](tmpFields -> toFields){ (r: R) => r }(TupleConverter.singleConverter[R], rset) + .map[R, R](tmpFields -> toFields)((r: R) => r)(TupleConverter.singleConverter[R], rset) .discard(tmpFields) } /** - * If you use a map function that does not accept TupleEntry args, - * which is the common case, an implicit conversion in GeneratedConversions - * will convert your function into a `(TupleEntry => T)`. The result type - * T is converted to a cascading Tuple by an implicit `TupleSetter[T]`. - * acceptable T types are primitive types, cascading Tuples of those types, - * or `scala.Tuple(1-22)` of those types. 
+ * If you use a map function that does not accept TupleEntry args, which is the common case, an implicit + * conversion in GeneratedConversions will convert your function into a `(TupleEntry => T)`. The result type + * T is converted to a cascading Tuple by an implicit `TupleSetter[T]`. acceptable T types are primitive + * types, cascading Tuples of those types, or `scala.Tuple(1-22)` of those types. * - * After the map, the input arguments will be set to the output of the map, - * so following with filter or map is fine without a new using statement if - * you mean to operate on the output. + * After the map, the input arguments will be set to the output of the map, so following with filter or map + * is fine without a new using statement if you mean to operate on the output. * * {{{ * map('data -> 'stuff) * }}} * - * * if output equals input, REPLACE is used. - * * if output or input is a subset of the other SWAP is used. - * * otherwise we append the new fields (cascading Fields.ALL is used) + * * if output equals input, REPLACE is used. * if output or input is a subset of the other SWAP is used. * + * otherwise we append the new fields (cascading Fields.ALL is used) * * {{{ * mapTo('data -> 'stuff) * }}} * - * Only the results (stuff) are kept (cascading Fields.RESULTS) + * Only the results (stuff) are kept (cascading Fields.RESULTS) * - * == Note == - * Using mapTo is the same as using map followed by a project for - * selecting just the output fields + * ==Note== + * Using mapTo is the same as using map followed by a project for selecting just the output fields */ - def map[A, T](fs: (Fields, Fields))(fn: A => T)(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + def map[A, T]( + fs: (Fields, Fields) + )(fn: A => T)(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { conv.assertArityMatches(fs._1) setter.assertArityMatches(fs._2) each(fs)(new MapFunction[A, T](fn, _, conv, setter)) } - def mapTo[A, T](fs: (Fields, Fields))(fn: A => T)(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + def mapTo[A, T]( + fs: (Fields, Fields) + )(fn: A => T)(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { conv.assertArityMatches(fs._1) setter.assertArityMatches(fs._2) eachTo(fs)(new MapFunction[A, T](fn, _, conv, setter)) } - def flatMap[A, T](fs: (Fields, Fields))(fn: A => TraversableOnce[T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + def flatMap[A, T]( + fs: (Fields, Fields) + )(fn: A => TraversableOnce[T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { conv.assertArityMatches(fs._1) setter.assertArityMatches(fs._2) each(fs)(new FlatMapFunction[A, T](fn, _, conv, setter)) } - def flatMapTo[A, T](fs: (Fields, Fields))(fn: A => TraversableOnce[T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + def flatMapTo[A, T]( + fs: (Fields, Fields) + )(fn: A => TraversableOnce[T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { conv.assertArityMatches(fs._1) setter.assertArityMatches(fs._2) eachTo(fs)(new FlatMapFunction[A, T](fn, _, conv, setter)) @@ -515,12 +527,16 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms /** * Filters all data that is defined for this partial function and then applies that function */ - def collect[A, T](fs: (Fields, Fields))(fn: PartialFunction[A, T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + def collect[A, T]( + fs: (Fields, Fields) + )(fn: PartialFunction[A, 
T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { conv.assertArityMatches(fs._1) setter.assertArityMatches(fs._2) pipe.each(fs)(new CollectFunction[A, T](fn, _, conv, setter)) } - def collectTo[A, T](fs: (Fields, Fields))(fn: PartialFunction[A, T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + def collectTo[A, T]( + fs: (Fields, Fields) + )(fn: PartialFunction[A, T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { conv.assertArityMatches(fs._1) setter.assertArityMatches(fs._2) pipe.eachTo(fs)(new CollectFunction[A, T](fn, _, conv, setter)) @@ -535,8 +551,10 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms * * Common enough to be useful. */ - def flatten[T](fs: (Fields, Fields))(implicit conv: TupleConverter[TraversableOnce[T]], setter: TupleSetter[T]): Pipe = - flatMap[TraversableOnce[T], T](fs)({ it: TraversableOnce[T] => it })(conv, setter) + def flatten[T]( + fs: (Fields, Fields) + )(implicit conv: TupleConverter[TraversableOnce[T]], setter: TupleSetter[T]): Pipe = + flatMap[TraversableOnce[T], T](fs) { it: TraversableOnce[T] => it }(conv, setter) /** * the same as @@ -547,36 +565,36 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms * * Common enough to be useful. */ - def flattenTo[T](fs: (Fields, Fields))(implicit conv: TupleConverter[TraversableOnce[T]], setter: TupleSetter[T]): Pipe = - flatMapTo[TraversableOnce[T], T](fs)({ it: TraversableOnce[T] => it })(conv, setter) + def flattenTo[T]( + fs: (Fields, Fields) + )(implicit conv: TupleConverter[TraversableOnce[T]], setter: TupleSetter[T]): Pipe = + flatMapTo[TraversableOnce[T], T](fs) { it: TraversableOnce[T] => it }(conv, setter) /** - * Force a materialization to disk in the flow. - * This is useful before crossWithTiny if you filter just before. Ideally scalding/cascading would - * see this (and may in future versions), but for now it is here to aid in hand-tuning jobs + * Force a materialization to disk in the flow. This is useful before crossWithTiny if you filter just + * before. Ideally scalding/cascading would see this (and may in future versions), but for now it is here to + * aid in hand-tuning jobs */ lazy val forceToDisk: Pipe = new Checkpoint(pipe) /** * Convenience method for integrating with existing cascading Functions */ - def each(fs: (Fields, Fields))(fn: Fields => Function[_]) = { + def each(fs: (Fields, Fields))(fn: Fields => Function[_]) = new Each(pipe, fs._1, fn(fs._2), defaultMode(fs._1, fs._2)) - } /** * Same as above, but only keep the results field. */ - def eachTo(fs: (Fields, Fields))(fn: Fields => Function[_]) = { + def eachTo(fs: (Fields, Fields))(fn: Fields => Function[_]) = new Each(pipe, fs._1, fn(fs._2), Fields.RESULTS) - } /** - * This is an analog of the SQL/Excel unpivot function which converts columns of data - * into rows of data. Only the columns given as input fields are expanded in this way. - * For this operation to be reversible, you need to keep some unique key on each row. - * See GroupBuilder.pivot to reverse this operation assuming you leave behind a grouping key - * == Example == + * This is an analog of the SQL/Excel unpivot function which converts columns of data into rows of data. + * Only the columns given as input fields are expanded in this way. For this operation to be reversible, you + * need to keep some unique key on each row. 
See GroupBuilder.pivot to reverse this operation assuming you + * leave behind a grouping key + * ==Example== * {{{ * pipe.unpivot(('w,'x,'y,'z) -> ('feature, 'value)) * }}} @@ -599,32 +617,32 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms def unpivot(fieldDef: (Fields, Fields)): Pipe = { assert(fieldDef._2.size == 2, "Must specify exactly two Field names for the results") // toKeyValueList comes from TupleConversions - pipe.flatMap(fieldDef) { te: TupleEntry => TupleConverter.KeyValueList(te) } + pipe + .flatMap(fieldDef) { te: TupleEntry => TupleConverter.KeyValueList(te) } .discard(fieldDef._1) } /** - * Keep at most n elements. This is implemented by keeping - * approximately n/k elements on each of the k mappers or reducers (whichever we wind - * up being scheduled on). + * Keep at most n elements. This is implemented by keeping approximately n/k elements on each of the k + * mappers or reducers (whichever we wind up being scheduled on). */ def limit(n: Long): Pipe = new Each(pipe, new Limit(n)) /** - * Sample a fraction of elements. fraction should be between 0.00 (0%) and 1.00 (100%) - * you can provide a seed to get reproducible results - * + * Sample a fraction of elements. fraction should be between 0.00 (0%) and 1.00 (100%) you can provide a + * seed to get reproducible results */ def sample(fraction: Double): Pipe = new Each(pipe, new Sample(fraction)) def sample(fraction: Double, seed: Long): Pipe = new Each(pipe, new Sample(seed, fraction)) /** - * Sample fraction of elements with return. fraction should be between 0.00 (0%) and 1.00 (100%) - * you can provide a seed to get reproducible results - * + * Sample fraction of elements with return. fraction should be between 0.00 (0%) and 1.00 (100%) you can + * provide a seed to get reproducible results */ - def sampleWithReplacement(fraction: Double): Pipe = new Each(pipe, new SampleWithReplacement(fraction), Fields.ALL) - def sampleWithReplacement(fraction: Double, seed: Int): Pipe = new Each(pipe, new SampleWithReplacement(fraction, seed), Fields.ALL) + def sampleWithReplacement(fraction: Double): Pipe = + new Each(pipe, new SampleWithReplacement(fraction), Fields.ALL) + def sampleWithReplacement(fraction: Double, seed: Int): Pipe = + new Each(pipe, new SampleWithReplacement(fraction, seed), Fields.ALL) /** * Print all the tuples that pass to stderr @@ -632,9 +650,8 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms def debug: Pipe = debug(PipeDebug()) /** - * Print the tuples that pass with the options configured in debugger - * For instance: - * {{{ debug(PipeDebug().toStdOut.printTuplesEvery(100)) }}} + * Print the tuples that pass with the options configured in debugger For instance: + * {{{debug(PipeDebug().toStdOut.printTuplesEvery(100))}}} */ def debug(dbg: PipeDebug): Pipe = dbg(pipe) @@ -646,23 +663,21 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms A -> FlatMap -> write(tsv) -> FlatMap in the second flatmap cascading will read from the written tsv for running it. However TSV's use toString and so is not a bijection. here we stick in an identity function before the tsv write to keep to force cascading to do any fork/split beforehand. 
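A small sketch combining the sampling and debugging helpers described in this hunk, again assuming an existing `pipe: Pipe` inside a Job (the fraction, seed and counts are illustrative):

  // Keep a reproducible ~1% sample, cap it at roughly 1000 rows in total,
  // and print a passing tuple every 100 tuples while developing.
  val preview: Pipe =
    pipe
      .sample(0.01, 42L)
      .limit(1000L)
      .debug(PipeDebug().toStdOut.printTuplesEvery(100))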
- */ + */ val writePipe: Pipe = outsource match { case t: Tsv => new Each(pipe, Fields.ALL, IdentityFunction, Fields.REPLACE) - case _ => pipe + case _ => pipe } outsource.writeFrom(writePipe)(flowDef, mode) pipe } /** - * Adds a trap to the current pipe, - * which will capture all exceptions that occur in this pipe - * and save them to the trapsource given + * Adds a trap to the current pipe, which will capture all exceptions that occur in this pipe and save them + * to the trapsource given * - * Traps do not include the original fields in a tuple, - * only the fields seen in an operation. - * Traps also do not include any exception information. + * Traps do not include the original fields in a tuple, only the fields seen in an operation. Traps also do + * not include any exception information. * * There can only be at most one trap for each pipe. */ @@ -675,17 +690,18 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms * Divides sum of values for this variable by their sum; assumes without checking that division is supported * on this type and that sum is not zero * - * If those assumptions do not hold, will throw an exception -- consider checking sum sepsarately and/or using addTrap + * If those assumptions do not hold, will throw an exception -- consider checking sum sepsarately and/or + * using addTrap * * in some cases, crossWithTiny has been broken, the implementation supports a work-around */ def normalize(f: Fields, useTiny: Boolean = true): Pipe = { - val total = groupAll { _.sum[Double](f -> '__total_for_normalize__) } + val total = groupAll(_.sum[Double](f -> '__total_for_normalize__)) (if (useTiny) { - crossWithTiny(total) - } else { - crossWithSmaller(total) - }) + crossWithTiny(total) + } else { + crossWithSmaller(total) + }) .map(Fields.merge(f, '__total_for_normalize__) -> f) { args: (Double, Double) => args._1 / args._2 } @@ -698,15 +714,14 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms * pipe.pack[(Int, Int)] (('field1, 'field2) -> 'field3) * }}} * - * will pack fields 'field1 and 'field2 to field 'field3, as long as 'field1 and 'field2 - * can be cast into integers. The output field 'field3 will be of tupel `(Int, Int)` - * + * will pack fields 'field1 and 'field2 to field 'field3, as long as 'field1 and 'field2 can be cast into + * integers. The output field 'field3 will be of tupel `(Int, Int)` */ def pack[T](fs: (Fields, Fields))(implicit packer: TuplePacker[T], setter: TupleSetter[T]): Pipe = { val (fromFields, toFields) = fs assert(toFields.size == 1, "Can only output 1 field in pack") val conv = packer.newConverter(fromFields) - pipe.map(fs) { input: T => input } (conv, setter) + pipe.map(fs) { input: T => input }(conv, setter) } /** @@ -716,12 +731,11 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms val (fromFields, toFields) = fs assert(toFields.size == 1, "Can only output 1 field in pack") val conv = packer.newConverter(fromFields) - pipe.mapTo(fs) { input: T => input } (conv, setter) + pipe.mapTo(fs) { input: T => input }(conv, setter) } /** - * The opposite of pack. Unpacks the input field of type `T` into - * the output fields. For example: + * The opposite of pack. Unpacks the input field of type `T` into the output fields. 
For example: * * {{{ * pipe.unpack[(Int, Int)] ('field1 -> ('field2, 'field3)) @@ -734,18 +748,20 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms assert(fromFields.size == 1, "Can only take 1 input field in unpack") val fields = (fromFields, unpacker.getResultFields(toFields)) val setter = unpacker.newSetter(toFields) - pipe.map(fields) { input: T => input } (conv, setter) + pipe.map(fields) { input: T => input }(conv, setter) } /** * Same as unpack but only the to fields are preserved. */ - def unpackTo[T](fs: (Fields, Fields))(implicit unpacker: TupleUnpacker[T], conv: TupleConverter[T]): Pipe = { + def unpackTo[T]( + fs: (Fields, Fields) + )(implicit unpacker: TupleUnpacker[T], conv: TupleConverter[T]): Pipe = { val (fromFields, toFields) = fs assert(fromFields.size == 1, "Can only take 1 input field in unpack") val fields = (fromFields, unpacker.getResultFields(toFields)) val setter = unpacker.newSetter(toFields) - pipe.mapTo(fields) { input: T => input } (conv, setter) + pipe.mapTo(fields) { input: T => input }(conv, setter) } /** @@ -753,19 +769,24 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms */ def upstreamPipes: Set[Pipe] = Iterator - .iterate(Seq(pipe))(pipes => for (p <- pipes; prev <- p.getPrevious) yield prev) + .iterate(Seq(pipe))(pipes => + for { + p <- pipes + prev <- p.getPrevious + } yield prev + ) .takeWhile(_.length > 0) .flatten .toSet /** - * This finds all the boxed serializations stored in the flow state map for this - * flowdef. We then find all the pipes back in the DAG from this pipe and apply - * those serializations. + * This finds all the boxed serializations stored in the flow state map for this flowdef. We then find all + * the pipes back in the DAG from this pipe and apply those serializations. */ private[scalding] def applyFlowConfigProperties(flowDef: FlowDef): Pipe = { case class ToVisit[T](queue: Queue[T], inQueue: Set[T]) { - def maybeAdd(t: T): ToVisit[T] = if (inQueue(t)) this else { + def maybeAdd(t: T): ToVisit[T] = if (inQueue(t)) this + else { ToVisit(queue :+ t, inQueue + t) } def next: Option[(T, ToVisit[T])] = @@ -781,17 +802,16 @@ class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms nextToVisit.next match { case Some((h, innerNextToVisit)) => go(h, nextVisited, innerNextToVisit) - case _ => nextVisited + case _ => nextVisited } } val allPipes = go(pipe, Set[Pipe](), ToVisit[Pipe](Queue.empty, Set.empty)) FlowStateMap.get(flowDef).foreach { fstm => - fstm.flowConfigUpdates.foreach { - case (k, v) => - allPipes.foreach { p => - p.getStepConfigDef().setProperty(k, v) - } + fstm.flowConfigUpdates.foreach { case (k, v) => + allPipes.foreach { p => + p.getStepConfigDef().setProperty(k, v) + } } } pipe diff --git a/scalding-core/src/main/scala/com/twitter/scalding/SkewReplication.scala b/scalding-core/src/main/scala/com/twitter/scalding/SkewReplication.scala index 3f745ff190..31507ea4ec 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/SkewReplication.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/SkewReplication.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding /** @@ -22,12 +22,11 @@ sealed abstract class SkewReplication { val DEFAULT_NUM_REDUCERS = 100 /** - * Given the estimated frequencies of a join key in two pipes that we want to skew-join together, - * this returns the key's replication amount in each pipe. + * Given the estimated frequencies of a join key in two pipes that we want to skew-join together, this + * returns the key's replication amount in each pipe. * - * Note: if we switch to a Count-Min sketch, we'll need to change the meaning of these counts - * from "sampled counts" to "estimates of full counts", and also change how we deal with counts of - * zero. + * Note: if we switch to a Count-Min sketch, we'll need to change the meaning of these counts from "sampled + * counts" to "estimates of full counts", and also change how we deal with counts of zero. */ def getReplications(leftCount: Int, rightCount: Int, reducers: Int): (Int, Int) } @@ -52,8 +51,8 @@ final case class SkewReplicationA(replicationFactor: Int = 1) extends SkewReplic /** * See https://github.com/twitter/scalding/pull/229#issuecomment-10792296 */ -final case class SkewReplicationB(maxKeysInMemory: Int = 1E6.toInt, maxReducerOutput: Int = 1E7.toInt) - extends SkewReplication { +final case class SkewReplicationB(maxKeysInMemory: Int = 1e6.toInt, maxReducerOutput: Int = 1e7.toInt) + extends SkewReplication { override def getReplications(leftCount: Int, rightCount: Int, reducers: Int) = { val numReducers = if (reducers <= 0) DEFAULT_NUM_REDUCERS else reducers diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala b/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala index 904271d4f2..bcda603eda 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Source.scala b/scalding-core/src/main/scala/com/twitter/scalding/Source.scala index b4b8341ee4..1c2aee5a33 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Source.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Source.scala @@ -12,19 +12,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import java.io.{ InputStream, OutputStream } -import java.util.{ Map => JMap, Properties, UUID } +import java.io.{InputStream, OutputStream} +import java.util.{Map => JMap, Properties, UUID} import cascading.flow.FlowDef import cascading.flow.FlowProcess -import cascading.scheme.{ NullScheme, Scheme } +import cascading.scheme.{NullScheme, Scheme} import cascading.tap.hadoop.Hfs import cascading.tap.SinkMode -import cascading.tap.{ Tap, SourceTap, SinkTap } -import cascading.tuple.{ Fields, Tuple => CTuple, TupleEntry, TupleEntryCollector, TupleEntryIterator } +import cascading.tap.{SinkTap, SourceTap, Tap} +import cascading.tuple.{Fields, Tuple => CTuple, TupleEntry, TupleEntryCollector, TupleEntryIterator} import cascading.pipe.Pipe @@ -44,12 +44,11 @@ class InvalidSourceException(message: String, cause: Throwable) extends RuntimeE } /** - * InvalidSourceTap used in createTap method when we want to defer - * the failures to validateTaps method. + * InvalidSourceTap used in createTap method when we want to defer the failures to validateTaps method. * - * This is used because for Job classes, createTap method on sources is called - * when the class is initialized. In most cases though, we want any exceptions to be - * thrown by validateTaps method, which is called subsequently during flow planning. + * This is used because for Job classes, createTap method on sources is called when the class is initialized. + * In most cases though, we want any exceptions to be thrown by validateTaps method, which is called + * subsequently during flow planning. * * hdfsPaths represents user-supplied list that was detected as not containing any valid paths. */ @@ -66,7 +65,8 @@ class InvalidSourceTap(val e: Throwable) extends SourceTap[JobConf, RecordReader override def getModifiedTime(conf: JobConf): Long = 0L - override def openForRead(flow: FlowProcess[JobConf], input: RecordReader[_, _]): TupleEntryIterator = throw new InvalidSourceException("Encountered InvalidSourceTap!", e) + override def openForRead(flow: FlowProcess[JobConf], input: RecordReader[_, _]): TupleEntryIterator = + throw new InvalidSourceException("Encountered InvalidSourceTap!", e) override def resourceExists(conf: JobConf): Boolean = false @@ -88,13 +88,16 @@ class InvalidSourceTap(val e: Throwable) extends SourceTap[JobConf, RecordReader } /** - * Better error messaging for the occasion where an InvalidSourceTap does not - * fail in validation. + * Better error messaging for the occasion where an InvalidSourceTap does not fail in validation. */ private[scalding] class InvalidInputFormat extends InputFormat[Nothing, Nothing] { override def getSplits(conf: JobConf, numSplits: Int): Nothing = throw new InvalidSourceException("getSplits called on InvalidInputFormat") - override def getRecordReader(split: InputSplit, conf: JobConf, reporter: org.apache.hadoop.mapred.Reporter): Nothing = + override def getRecordReader( + split: InputSplit, + conf: JobConf, + reporter: org.apache.hadoop.mapred.Reporter + ): Nothing = throw new InvalidSourceException("getRecordReader called on InvalidInputFormat") } @@ -121,29 +124,24 @@ object CastHfsTap { } /** - * Every source must have a correct toString method. If you use - * case classes for instances of sources, you will get this for free. - * This is one of the several reasons we recommend using cases classes + * Every source must have a correct toString method. If you use case classes for instances of sources, you + * will get this for free. 
This is one of the several reasons we recommend using cases classes * - * java.io.Serializable is needed if the Source is going to have any - * methods attached that run on mappers or reducers, which will happen - * if you implement transformForRead or transformForWrite. + * java.io.Serializable is needed if the Source is going to have any methods attached that run on mappers or + * reducers, which will happen if you implement transformForRead or transformForWrite. */ abstract class Source extends java.io.Serializable { /** - * The mock passed in to scalding.JobTest may be considered - * as a mock of the Tap or the Source. By default, as of 0.9.0, - * it is considered as a Mock of the Source. If you set this - * to true, the mock in TestMode will be considered to be a - * mock of the Tap (which must be transformed) and not the Source. + * The mock passed in to scalding.JobTest may be considered as a mock of the Tap or the Source. By default, + * as of 0.9.0, it is considered as a Mock of the Source. If you set this to true, the mock in TestMode will + * be considered to be a mock of the Tap (which must be transformed) and not the Source. */ def transformInTest: Boolean = false /** - * This is a name the refers to this exact instance of the source - * (put another way, if s1.sourceId == s2.sourceId, the job should - * work the same if one is replaced with the other + * This is a name the refers to this exact instance of the source (put another way, if s1.sourceId == + * s2.sourceId, the job should work the same if one is replaced with the other */ def sourceId: String = toString @@ -165,13 +163,12 @@ abstract class Source extends java.io.Serializable { (mode, transformInTest) match { case (test: TestMode, false) => new Pipe(srcName) - case _ => transformForRead(new Pipe(srcName)) + case _ => transformForRead(new Pipe(srcName)) } } /** - * write the pipe but return the input so it can be chained into - * the next operation + * write the pipe but return the input so it can be chained into the next operation */ def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = { checkFlowDefNotNull() @@ -184,23 +181,22 @@ abstract class Source extends java.io.Serializable { } val newPipe = (mode, transformInTest) match { case (test: TestMode, false) => pipe - case _ => transformForWrite(pipe) + case _ => transformForWrite(pipe) } val outPipe = new Pipe(sinkName, newPipe) flowDef.addTail(outPipe) pipe } - protected def checkFlowDefNotNull()(implicit flowDef: FlowDef, mode: Mode): Unit = { + protected def checkFlowDefNotNull()(implicit flowDef: FlowDef, mode: Mode): Unit = assert(flowDef != null, "Trying to access null FlowDef while in mode: %s".format(mode)) - } protected def transformForWrite(pipe: Pipe) = pipe protected def transformForRead(pipe: Pipe) = pipe /** - * Subclasses of Source MUST override this method. They may call out to TestTapFactory for - * making Taps suitable for testing. + * Subclasses of Source MUST override this method. They may call out to TestTapFactory for making Taps + * suitable for testing. 
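A hedged illustration of the "prefer case classes for sources" advice above, assuming the stock FixedPathSource/TextLineScheme building blocks (the class name and path are made up):

  import com.twitter.scalding._

  // A case class gets a structural toString/equals for free, which is what the
  // default sourceId (and hence source equality during planning) relies on.
  case class DailyEventsSource(date: String)
      extends FixedPathSource("/logs/events/" + date)
      with TextLineScheme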
*/ def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] /* @@ -212,53 +208,58 @@ abstract class Source extends java.io.Serializable { def readAtSubmitter[T](implicit mode: Mode, conv: TupleConverter[T]): Stream[T] = { validateTaps(mode) val tap = createTap(Read)(mode) - CascadingMode.cast(mode) + CascadingMode + .cast(mode) .openForRead(Config.defaultFrom(mode), tap) - .asScala.map(conv(_)) + .asScala + .map(conv(_)) .toStream } } /** - * Usually as soon as we open a source, we read and do some mapping - * operation on a single column or set of columns. - * T is the type of the single column. If doing multiple columns - * T will be a TupleN representing the types, e.g. (Int,Long,String) + * Usually as soon as we open a source, we read and do some mapping operation on a single column or set of + * columns. T is the type of the single column. If doing multiple columns T will be a TupleN representing the + * types, e.g. (Int,Long,String) * * Prefer to use TypedSource unless you are working with the fields API * - * NOTE: If we don't make this extend Source, established implicits are ambiguous - * when TDsl is in scope. + * NOTE: If we don't make this extend Source, established implicits are ambiguous when TDsl is in scope. */ trait Mappable[+T] extends Source with TypedSource[T] { - final def mapTo[U](out: Fields)(mf: (T) => U)(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = { + final def mapTo[U](out: Fields)( + mf: (T) => U + )(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = RichPipe(read(flowDef, mode)).mapTo[T, U](sourceFields -> out)(mf)(converter, setter) - } + /** - * If you want to filter, you should use this and output a 0 or 1 length Iterable. - * Filter does not change column names, and we generally expect to change columns here + * If you want to filter, you should use this and output a 0 or 1 length Iterable. Filter does not change + * column names, and we generally expect to change columns here */ - final def flatMapTo[U](out: Fields)(mf: (T) => TraversableOnce[U])(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = { + final def flatMapTo[U](out: Fields)( + mf: (T) => TraversableOnce[U] + )(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = RichPipe(read(flowDef, mode)).flatMapTo[T, U](sourceFields -> out)(mf)(converter, setter) - } /** - * Allows you to read a Tap on the submit node NOT FOR USE IN THE MAPPERS OR REDUCERS. - * Typical use might be to read in Job.next to determine if another job is needed + * Allows you to read a Tap on the submit node NOT FOR USE IN THE MAPPERS OR REDUCERS. Typical use might be + * to read in Job.next to determine if another job is needed */ def toIterator(implicit config: Config, mode: Mode): Iterator[T] = { validateTaps(mode) val tap = createTap(Read)(mode) val conv = converter - CascadingMode.cast(mode) + CascadingMode + .cast(mode) .openForRead(config, tap) - .asScala.map { te => conv(te.selectEntry(sourceFields)) } + .asScala + .map(te => conv(te.selectEntry(sourceFields))) } /** - * Transform this Mappable into another by mapping after. - * We don't call this map because of conflicts with Mappable, unfortunately + * Transform this Mappable into another by mapping after. 
We don't call this map because of conflicts with + * Mappable, unfortunately */ override def andThen[U](fn: T => U): Mappable[U] = { val self = this // compiler generated self can cause problems with serialization @@ -276,21 +277,21 @@ trait Mappable[+T] extends Source with TypedSource[T] { } /** - * Mappable extension that defines the proper converter - * implementation for a Mappable with a single item. + * Mappable extension that defines the proper converter implementation for a Mappable with a single item. */ trait SingleMappable[T] extends Mappable[T] { override def converter[U >: T] = TupleConverter.asSuperConverter(TupleConverter.singleConverter[T]) } /** - * A tap that output nothing. It is used to drive execution of a task for side effect only. This - * can be used to drive a pipe without actually writing to HDFS. + * A tap that output nothing. It is used to drive execution of a task for side effect only. This can be used + * to drive a pipe without actually writing to HDFS. */ class NullTap[Config, Input, Output, SourceContext, SinkContext] - extends SinkTap[Config, Output]( - new NullScheme[Config, Input, Output, SourceContext, SinkContext](Fields.NONE, Fields.ALL), - SinkMode.UPDATE) { + extends SinkTap[Config, Output]( + new NullScheme[Config, Input, Output, SourceContext, SinkContext](Fields.NONE, Fields.ALL), + SinkMode.UPDATE + ) { def getIdentifier = "nullTap" def openForWrite(flowProcess: FlowProcess[Config], output: Output) = @@ -307,17 +308,18 @@ class NullTap[Config, Input, Output, SourceContext, SinkContext] } trait BaseNullSource extends Source { - override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = readOrWrite match { case Read => throw new Exception("not supported, reading from null") - case Write => mode match { - case Hdfs(_, _) => new NullTap[JobConf, RecordReader[_, _], OutputCollector[_, _], Any, Any] - case Local(_) => new NullTap[Properties, InputStream, OutputStream, Any, Any] - case Test(_) => new NullTap[Properties, InputStream, OutputStream, Any, Any] - } + case Write => + mode match { + case Hdfs(_, _) => new NullTap[JobConf, RecordReader[_, _], OutputCollector[_, _], Any, Any] + case Local(_) => new NullTap[Properties, InputStream, OutputStream, Any, Any] + case Test(_) => new NullTap[Properties, InputStream, OutputStream, Any, Any] + } } - } } + /** * A source outputs nothing. It is used to drive execution of a task for side effect only. */ diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Stats.scala b/scalding-core/src/main/scala/com/twitter/scalding/Stats.scala index 8456fd632a..9ac3087d09 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Stats.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Stats.scala @@ -1,11 +1,11 @@ package com.twitter.scalding -import cascading.flow.{ Flow, FlowListener, FlowDef, FlowProcess } +import cascading.flow.{Flow, FlowDef, FlowListener, FlowProcess} import cascading.flow.hadoop.HadoopFlowProcess import cascading.stats.CascadingStats import java.util.concurrent.ConcurrentHashMap import org.apache.hadoop.mapreduce.Counter -import org.slf4j.{ Logger, LoggerFactory } +import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ import scala.collection.mutable import scala.ref.WeakReference @@ -24,12 +24,15 @@ import scala.util.Try * which increments on the submitter before creating the function. See the difference? 
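A rough sketch of the counter pattern the Stat comment above is describing, assuming the Stat(name) factory together with the implicit UniqueID that Job provides (job, counter and field names are all illustrative):

  import com.twitter.scalding._

  class CountingJob(args: Args) extends Job(args) {
    val malformed = Stat("malformed-lines") // a counter in the Scalding group

    TypedPipe.from(TextLine(args("input")))
      .filter { line =>
        val ok = line.contains("\t")
        if (!ok) malformed.inc() // incremented inside the closure, on the cluster
        ok
      }
      .write(TypedTsv[String](args("output")))
  }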
*/ trait Stat extends java.io.Serializable { + /** * increment by the given amount */ def incBy(amount: Long): Unit + /** increment by 1L */ def inc(): Unit = incBy(1L) + /** increment by -1L (decrement) */ def dec(): Unit = incBy(-1L) def key: StatKey @@ -52,7 +55,7 @@ private[scalding] object CounterImpl { def apply(fp: FlowProcess[_], statKey: StatKey): CounterImpl = fp match { case hFP: HadoopFlowProcess => HadoopFlowPCounterImpl(hFP, statKey) - case _ => GenericFlowPCounterImpl(fp, statKey) + case _ => GenericFlowPCounterImpl(fp, statKey) } } @@ -60,11 +63,13 @@ sealed private[scalding] trait CounterImpl { def increment(amount: Long): Unit } -private[scalding] final case class GenericFlowPCounterImpl(fp: FlowProcess[_], statKey: StatKey) extends CounterImpl { +private[scalding] final case class GenericFlowPCounterImpl(fp: FlowProcess[_], statKey: StatKey) + extends CounterImpl { override def increment(amount: Long): Unit = fp.increment(statKey.group, statKey.counter, amount) } -private[scalding] final case class HadoopFlowPCounterImpl(fp: HadoopFlowProcess, statKey: StatKey) extends CounterImpl { +private[scalding] final case class HadoopFlowPCounterImpl(fp: HadoopFlowProcess, statKey: StatKey) + extends CounterImpl { // we use a nullable type here for efficiency private[this] val counter: Counter = (for { r <- Option(fp.getReporter) @@ -73,11 +78,14 @@ private[scalding] final case class HadoopFlowPCounterImpl(fp: HadoopFlowProcess, def skipNull: Boolean = fp.getProperty(Config.SkipNullCounters) match { - case null => false // by default don't skip + case null => false // by default don't skip case isset => isset.toString.toBoolean } - require((counter != null) || skipNull, s"counter for $statKey is null and ${Config.SkipNullCounters} is not set to true") + require( + (counter != null) || skipNull, + s"counter for $statKey is null and ${Config.SkipNullCounters} is not set to true" + ) override def increment(amount: Long): Unit = if (counter != null) counter.increment(amount) else () @@ -105,7 +113,8 @@ object Stats { // Returns a map of all custom counter names and their counts. def getAllCustomCounters()(implicit cascadingStats: CascadingStats): Map[String, Long] = - cascadingStats.getCountersFor(ScaldingGroup) + cascadingStats + .getCountersFor(ScaldingGroup) .asScala .map { counter => val value = getCounterValue(counter) @@ -115,10 +124,9 @@ object Stats { } /** - * Used to inject a typed unique identifier to uniquely name each scalding flow. - * This is here mostly to deal with the case of testing where there are many - * concurrent threads running Flows. Users should never have to worry about - * these + * Used to inject a typed unique identifier to uniquely name each scalding flow. This is here mostly to deal + * with the case of testing where there are many concurrent threads running Flows. 
Users should never have to + * worry about these */ case class UniqueID(get: String) { assert(get.indexOf(',') == -1, "UniqueID cannot contain ,: " + get) @@ -154,7 +162,7 @@ object RuntimeStats extends java.io.Serializable { (new ConcurrentHashMap[String, WeakReference[FlowProcess[_]]]).asScala } - def getFlowProcessForUniqueId(uniqueId: UniqueID): FlowProcess[_] = { + def getFlowProcessForUniqueId(uniqueId: UniqueID): FlowProcess[_] = (for { weakFlowProcess <- flowMappingStore.get(uniqueId.get) flowProcess <- weakFlowProcess.get @@ -163,10 +171,9 @@ object RuntimeStats extends java.io.Serializable { }).getOrElse { sys.error("Error in job deployment, the FlowProcess for unique id %s isn't available".format(uniqueId)) } - } private[this] var prevFP: FlowProcess[_] = null - def addFlowProcess(fp: FlowProcess[_]): Unit = { + def addFlowProcess(fp: FlowProcess[_]): Unit = if (!(prevFP eq fp)) { val uniqueJobIdObj = fp.getProperty(UniqueID.UNIQUE_JOB_ID) if (uniqueJobIdObj != null) { @@ -181,14 +188,11 @@ object RuntimeStats extends java.io.Serializable { } prevFP = fp } - } /** - * For serialization, you may need to do: - * val keepAlive = RuntimeStats.getKeepAliveFunction - * outside of a closure passed to map/etc..., and then call: - * keepAlive() - * inside of your closure (mapping, reducing function) + * For serialization, you may need to do: val keepAlive = RuntimeStats.getKeepAliveFunction outside of a + * closure passed to map/etc..., and then call: keepAlive() inside of your closure (mapping, reducing + * function) */ def getKeepAliveFunction(implicit flowDef: FlowDef): () => Unit = { // Don't capture the flowDef, just the id @@ -207,14 +211,14 @@ class StatsFlowListener(f: Map[StatKey, Long] => Try[Unit]) extends FlowListener private var success = true - override def onCompleted(flow: Flow[_]): Unit = { + override def onCompleted(flow: Flow[_]): Unit = if (success) { val stats = flow.getFlowStats - val keys = stats.getCounterGroups.asScala.flatMap(g => stats.getCountersFor(g).asScala.map(c => StatKey(c, g))) + val keys = + stats.getCounterGroups.asScala.flatMap(g => stats.getCountersFor(g).asScala.map(c => StatKey(c, g))) val values = keys.map(k => (k, stats.getCounterValue(k.group, k.counter))).toMap f(values).get } - } override def onThrowable(flow: Flow[_], throwable: Throwable): Boolean = { success = false diff --git a/scalding-core/src/main/scala/com/twitter/scalding/StreamOperations.scala b/scalding-core/src/main/scala/com/twitter/scalding/StreamOperations.scala index b7f2216b13..ddceef9948 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/StreamOperations.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/StreamOperations.scala @@ -12,35 +12,34 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields -import cascading.tuple.{ Tuple => CTuple, TupleEntry } +import cascading.tuple.{Tuple => CTuple, TupleEntry} /** - * Implements reductions on top of a simple abstraction for the Fields-API - * We use the f-bounded polymorphism trick to return the type called Self - * in each operation. + * Implements reductions on top of a simple abstraction for the Fields-API We use the f-bounded polymorphism + * trick to return the type called Self in each operation. 
*/ trait StreamOperations[+Self <: StreamOperations[Self]] extends Sortable[Self] with java.io.Serializable { + /** - * Corresponds to a Cascading Buffer - * which allows you to stream through the data, keeping some, dropping, scanning, etc... - * The iterator you are passed is lazy, and mapping will not trigger the - * entire evaluation. If you convert to a list (i.e. to reverse), you need to be aware - * that memory constraints may become an issue. + * Corresponds to a Cascading Buffer which allows you to stream through the data, keeping some, dropping, + * scanning, etc... The iterator you are passed is lazy, and mapping will not trigger the entire evaluation. + * If you convert to a list (i.e. to reverse), you need to be aware that memory constraints may become an + * issue. * - * WARNING: Any fields not referenced by the input fields will be aligned to the first output, - * and the final hadoop stream will have a length of the maximum of the output of this, and - * the input stream. So, if you change the length of your inputs, the other fields won't - * be aligned. YOU NEED TO INCLUDE ALL THE FIELDS YOU WANT TO KEEP ALIGNED IN THIS MAPPING! - * POB: This appears to be a Cascading design decision. + * WARNING: Any fields not referenced by the input fields will be aligned to the first output, and the final + * hadoop stream will have a length of the maximum of the output of this, and the input stream. So, if you + * change the length of your inputs, the other fields won't be aligned. YOU NEED TO INCLUDE ALL THE FIELDS + * YOU WANT TO KEEP ALIGNED IN THIS MAPPING! POB: This appears to be a Cascading design decision. * - * WARNING: mapfn needs to be stateless. Multiple calls needs to be safe (no mutable - * state captured) + * WARNING: mapfn needs to be stateless. 
Multiple calls needs to be safe (no mutable state captured) */ - def mapStream[T, X](fieldDef: (Fields, Fields))(mapfn: (Iterator[T]) => TraversableOnce[X])(implicit conv: TupleConverter[T], setter: TupleSetter[X]): Self + def mapStream[T, X](fieldDef: (Fields, Fields))( + mapfn: (Iterator[T]) => TraversableOnce[X] + )(implicit conv: TupleConverter[T], setter: TupleSetter[X]): Self ///////////////////////////////////////// // All the below functions are implemented in terms of the above @@ -49,43 +48,39 @@ trait StreamOperations[+Self <: StreamOperations[Self]] extends Sortable[Self] w /** * Remove the first cnt elements */ - def drop(cnt: Int): Self = { - mapStream[CTuple, CTuple](Fields.VALUES -> Fields.ARGS){ s => + def drop(cnt: Int): Self = + mapStream[CTuple, CTuple](Fields.VALUES -> Fields.ARGS) { s => s.drop(cnt) }(TupleConverter.CTupleConverter, TupleSetter.CTupleSetter) - } /** * Drop while the predicate is true, starting at the first false, output all */ - def dropWhile[T](f: Fields)(fn: (T) => Boolean)(implicit conv: TupleConverter[T]): Self = { - mapStream[TupleEntry, CTuple](f -> Fields.ARGS){ s => - s.dropWhile(te => fn(conv(te))).map { _.getTuple } + def dropWhile[T](f: Fields)(fn: (T) => Boolean)(implicit conv: TupleConverter[T]): Self = + mapStream[TupleEntry, CTuple](f -> Fields.ARGS) { s => + s.dropWhile(te => fn(conv(te))).map(_.getTuple) }(TupleConverter.TupleEntryConverter, TupleSetter.CTupleSetter) - } - def scanLeft[X, T](fieldDef: (Fields, Fields))(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): Self = { - mapStream[T, X](fieldDef){ s => + def scanLeft[X, T]( + fieldDef: (Fields, Fields) + )(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): Self = + mapStream[T, X](fieldDef) { s => // scala's default is not consistent in 2.8 and 2.9, this standardizes the behavior new ScanLeftIterator(s, init, fn) }(conv, setter) - } /** * Only keep the first cnt elements */ - def take(cnt: Int): Self = { - mapStream[CTuple, CTuple](Fields.VALUES -> Fields.ARGS){ s => + def take(cnt: Int): Self = + mapStream[CTuple, CTuple](Fields.VALUES -> Fields.ARGS) { s => s.take(cnt) }(TupleConverter.CTupleConverter, TupleSetter.CTupleSetter) - } /** - * Take while the predicate is true, stopping at the - * first false. Output all taken elements. + * Take while the predicate is true, stopping at the first false. Output all taken elements. 
*/ - def takeWhile[T](f: Fields)(fn: (T) => Boolean)(implicit conv: TupleConverter[T]): Self = { - mapStream[TupleEntry, CTuple](f -> Fields.ARGS){ s => - s.takeWhile(te => fn(conv(te))).map { _.getTuple } + def takeWhile[T](f: Fields)(fn: (T) => Boolean)(implicit conv: TupleConverter[T]): Self = + mapStream[TupleEntry, CTuple](f -> Fields.ARGS) { s => + s.takeWhile(te => fn(conv(te))).map(_.getTuple) }(TupleConverter.TupleEntryConverter, TupleSetter.CTupleSetter) - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/StringUtility.scala b/scalding-core/src/main/scala/com/twitter/scalding/StringUtility.scala index 1e421e0990..2543551819 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/StringUtility.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/StringUtility.scala @@ -1,7 +1,13 @@ package com.twitter.scalding object StringUtility { - private def fastSplitHelper(text: String, key: String, from: Int, textLength: Int, keyLength: Int): List[String] = { + private def fastSplitHelper( + text: String, + key: String, + from: Int, + textLength: Int, + keyLength: Int + ): List[String] = { val firstIndex = text.indexOf(key, from) if (firstIndex == -1) { if (from < textLength) { @@ -11,11 +17,16 @@ object StringUtility { } } else { // the text till the separator should be kept in any case - text.substring(from, firstIndex) :: fastSplitHelper(text, key, firstIndex + keyLength, textLength, keyLength) + text.substring(from, firstIndex) :: fastSplitHelper( + text, + key, + firstIndex + keyLength, + textLength, + keyLength + ) } } - def fastSplit(text: String, key: String): List[String] = { + def fastSplit(text: String, key: String): List[String] = fastSplitHelper(text, key, 0, text.length, key.length) - } -} \ No newline at end of file +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TemplateSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/TemplateSource.scala index 06ba0b55b4..75075ada1f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TemplateSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TemplateSource.scala @@ -12,12 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.tap.hadoop.{ TemplateTap => HTemplateTap } +import cascading.tap.hadoop.{TemplateTap => HTemplateTap} import cascading.tap.local.FileTap -import cascading.tap.local.{ TemplateTap => LTemplateTap } +import cascading.tap.local.{TemplateTap => LTemplateTap} import cascading.tap.SinkMode import cascading.tap.Tap import cascading.tuple.Fields @@ -37,12 +37,15 @@ abstract class TemplateSource extends SchemedSource with HfsTapProvider { /** * Creates the template tap. * - * @param readOrWrite Describes if this source is being read from or written to. - * @param mode The mode of the job. (implicit) + * @param readOrWrite + * Describes if this source is being read from or written to. + * @param mode + * The mode of the job. (implicit) * - * @return A cascading TemplateTap. + * @return + * A cascading TemplateTap. 
*/ - override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = readOrWrite match { case Read => throw new InvalidSourceException("Cannot use TemplateSource for input") case Write => { @@ -63,57 +66,69 @@ abstract class TemplateSource extends SchemedSource with HfsTapProvider { } } } - } /** * Validates the taps, makes sure there are no nulls as the path or template. * - * @param mode The mode of the job. + * @param mode + * The mode of the job. */ - override def validateTaps(mode: Mode): Unit = { + override def validateTaps(mode: Mode): Unit = if (basePath == null) { throw new InvalidSourceException("basePath cannot be null for TemplateTap") } else if (template == null) { throw new InvalidSourceException("template cannot be null for TemplateTap") } - } } /** * An implementation of TSV output, split over a template tap. * - * @param basePath The root path for the output. - * @param template The java formatter style string to use as the template. e.g. %s/%s. - * @param pathFields The set of fields to apply to the path. - * @param writeHeader Flag to indicate that the header should be written to the file. - * @param sinkMode How to handle conflicts with existing output. - * @param fields The set of fields to apply to the output. + * @param basePath + * The root path for the output. + * @param template + * The java formatter style string to use as the template. e.g. %s/%s. + * @param pathFields + * The set of fields to apply to the path. + * @param writeHeader + * Flag to indicate that the header should be written to the file. + * @param sinkMode + * How to handle conflicts with existing output. + * @param fields + * The set of fields to apply to the output. */ case class TemplatedTsv( - override val basePath: String, - override val template: String, - override val pathFields: Fields = Fields.ALL, - override val writeHeader: Boolean = false, - override val sinkMode: SinkMode = SinkMode.REPLACE, - override val fields: Fields = Fields.ALL) - extends TemplateSource with DelimitedScheme + override val basePath: String, + override val template: String, + override val pathFields: Fields = Fields.ALL, + override val writeHeader: Boolean = false, + override val sinkMode: SinkMode = SinkMode.REPLACE, + override val fields: Fields = Fields.ALL +) extends TemplateSource + with DelimitedScheme /** * An implementation of SequenceFile output, split over a template tap. * - * @param basePath The root path for the output. - * @param template The java formatter style string to use as the template. e.g. %s/%s. - * @param sequenceFields The set of fields to use for the sequence file. - * @param pathFields The set of fields to apply to the path. - * @param sinkMode How to handle conflicts with existing output. + * @param basePath + * The root path for the output. + * @param template + * The java formatter style string to use as the template. e.g. %s/%s. + * @param sequenceFields + * The set of fields to use for the sequence file. + * @param pathFields + * The set of fields to apply to the path. + * @param sinkMode + * How to handle conflicts with existing output. 
*/ case class TemplatedSequenceFile( - override val basePath: String, - override val template: String, - val sequenceFields: Fields = Fields.ALL, - override val pathFields: Fields = Fields.ALL, - override val sinkMode: SinkMode = SinkMode.REPLACE) - extends TemplateSource with SequenceFileScheme { + override val basePath: String, + override val template: String, + val sequenceFields: Fields = Fields.ALL, + override val pathFields: Fields = Fields.ALL, + override val sinkMode: SinkMode = SinkMode.REPLACE +) extends TemplateSource + with SequenceFileScheme { override val fields = sequenceFields } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TestTapFactory.scala b/scalding-core/src/main/scala/com/twitter/scalding/TestTapFactory.scala index 5ef5dd8209..5266a6d7ef 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TestTapFactory.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TestTapFactory.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import com.twitter.maple.tap.MemorySourceTap @@ -22,7 +22,7 @@ import cascading.tap.SinkMode import cascading.tap.Tap import cascading.scheme.NullScheme import com.twitter.scalding.tap.ScaldingHfs -import java.io.{ InputStream, OutputStream, Serializable } +import java.io.{InputStream, OutputStream, Serializable} import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapred.OutputCollector import org.apache.hadoop.mapred.RecordReader @@ -40,42 +40,47 @@ object TestTapFactory extends Serializable { "each sink in your job has a corresponding sink in the test sinks that is EXACTLY " + "equal. Call the '.sink' method on your JobTest to add test buffers for each sink." 
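To make the requirement in those error messages concrete, here is a hedged sketch of a fully wired JobTest; the job, its argument names, and the sample data are placeholders invented for this example and are not part of this patch:

// Hypothetical word-count job used only to illustrate the test wiring below.
class MyWordCountJob(args: Args) extends Job(args) {
  TextLine(args("input")).read
    .flatMapTo('line -> 'word)((line: String) => line.split("\\s+"))
    .groupBy('word)(_.size)
    .write(Tsv(args("output")))
}

object MyWordCountJobTestSketch {
  // One .source per source the job reads and one .sink per sink it writes;
  // a missing entry is exactly what triggers the messages defined above.
  def run(): Unit =
    JobTest(new MyWordCountJob(_))
      .arg("input", "inputFile")
      .arg("output", "outputFile")
      .source(TextLine("inputFile"), List((0, "hello hello world")))
      .sink[(String, Int)](Tsv("outputFile")) { buffer =>
        require(buffer.toMap == Map("hello" -> 2, "world" -> 1))
      }
      .run
      .finish()
}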
- def apply(src: Source, fields: Fields, sinkMode: SinkMode = SinkMode.REPLACE): TestTapFactory = new TestTapFactory(src, sinkMode) { - override def sourceFields: Fields = fields - override def sinkFields: Fields = fields - } - def apply[A, B](src: Source, scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], A, B]): TestTapFactory = apply(src, scheme, SinkMode.REPLACE) - def apply[A, B](src: Source, - scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], A, B], sinkMode: SinkMode): TestTapFactory = + def apply(src: Source, fields: Fields, sinkMode: SinkMode = SinkMode.REPLACE): TestTapFactory = + new TestTapFactory(src, sinkMode) { + override def sourceFields: Fields = fields + override def sinkFields: Fields = fields + } + def apply[A, B]( + src: Source, + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], A, B] + ): TestTapFactory = apply(src, scheme, SinkMode.REPLACE) + def apply[A, B]( + src: Source, + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], A, B], + sinkMode: SinkMode + ): TestTapFactory = new TestTapFactory(src, sinkMode) { override def hdfsScheme = Some(scheme) } } class TestTapFactory(src: Source, sinkMode: SinkMode) extends Serializable { def sourceFields: Fields = - hdfsScheme.map { _.getSourceFields }.getOrElse(sys.error("No sourceFields defined")) + hdfsScheme.map(_.getSourceFields).getOrElse(sys.error("No sourceFields defined")) def sinkFields: Fields = - hdfsScheme.map { _.getSinkFields }.getOrElse(sys.error("No sinkFields defined")) + hdfsScheme.map(_.getSinkFields).getOrElse(sys.error("No sinkFields defined")) def hdfsScheme: Option[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] = None @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { + def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = mode match { case Test(buffers) => { /* - * There MUST have already been a registered sink or source in the Test mode. - * to access this. You must explicitly name each of your test sources in your - * JobTest. - */ + * There MUST have already been a registered sink or source in the Test mode. + * to access this. You must explicitly name each of your test sources in your + * JobTest. 
+ */ val errorMsg = readOrWrite match { - case Read => TestTapFactory.sourceNotFoundError + case Read => TestTapFactory.sourceNotFoundError case Write => TestTapFactory.sinkNotFoundError } - require( - buffers(src).isDefined, - errorMsg.format(src)) + require(buffers(src).isDefined, errorMsg.format(src)) val buffer = if (readOrWrite == Write) { val buf = buffers(src).get @@ -86,9 +91,7 @@ class TestTapFactory(src: Source, sinkMode: SinkMode) extends Serializable { // if the source is also used as a sink, we don't want its contents to get modified buffers(src).get.clone() } - new MemoryTap[InputStream, OutputStream]( - new NullScheme(sourceFields, sinkFields), - buffer) + new MemoryTap[InputStream, OutputStream](new NullScheme(sourceFields, sinkFields), buffer) } case hdfsTest @ HadoopTest(conf, buffers) => readOrWrite match { @@ -111,5 +114,4 @@ class TestTapFactory(src: Source, sinkMode: SinkMode) extends Serializable { throw new RuntimeException("TestTapFactory doesn't support mode: " + mode.toString) } } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TimePathedSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/TimePathedSource.scala index 749f7fb4bd..17ca9caf8e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TimePathedSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TimePathedSource.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.util.TimeZone @@ -27,17 +27,22 @@ object TimePathedSource { String.format(pattern, date.toCalendar(tz)) def stepSize(pattern: String, tz: TimeZone): Option[Duration] = - List("%1$tH" -> Hours(1), "%1$td" -> Days(1)(tz), - "%1$tm" -> Months(1)(tz), "%1$tY" -> Years(1)(tz)) + List("%1$tH" -> Hours(1), "%1$td" -> Days(1)(tz), "%1$tm" -> Months(1)(tz), "%1$tY" -> Years(1)(tz)) .find { unitDur: (String, Duration) => pattern.contains(unitDur._1) } .map(_._2) /** * Gives all paths in the given daterange with windows based on the provided duration. */ - def allPathsWithDuration(pattern: String, duration: Duration, dateRange: DateRange, tz: TimeZone): Iterable[String] = + def allPathsWithDuration( + pattern: String, + duration: Duration, + dateRange: DateRange, + tz: TimeZone + ): Iterable[String] = // This method is exhaustive, but too expensive for Cascading's JobConf writing. - dateRange.each(duration) + dateRange + .each(duration) .map { dr: DateRange => toPath(pattern, dr.start, tz) } @@ -45,12 +50,11 @@ object TimePathedSource { /** * Gives all read paths in the given daterange. */ - def readPathsFor(pattern: String, dateRange: DateRange, tz: TimeZone): Iterable[String] = { + def readPathsFor(pattern: String, dateRange: DateRange, tz: TimeZone): Iterable[String] = TimePathedSource.stepSize(pattern, tz) match { case Some(duration) => allPathsWithDuration(pattern, duration, dateRange, tz) - case None => sys.error(s"No suitable step size for pattern: $pattern") + case None => sys.error(s"No suitable step size for pattern: $pattern") } - } /** * Gives the write path based on daterange end. 
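As a rough illustration of how the path helpers above fit together (the pattern, date range, and timezone are invented for the example, and com.twitter.scalding._ is assumed to be in scope):

object TimePathedSourceSketch {
  import java.util.TimeZone

  val tz: TimeZone = TimeZone.getTimeZone("UTC")
  // The pattern contains "%1$tH", so stepSize infers a step of Hours(1).
  val pattern = "/logs/%1$tY/%1$tm/%1$td/%1$tH/*"
  val step: Option[Duration] = TimePathedSource.stepSize(pattern, tz)
  // One concrete path per hourly step in the range (six hours starting at the epoch).
  val range: DateRange = DateRange(RichDate(0L), RichDate(0L) + Hours(6))
  val hourlyPaths: Iterable[String] = TimePathedSource.readPathsFor(pattern, range, tz)
}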
@@ -63,16 +67,17 @@ object TimePathedSource { } } -abstract class TimeSeqPathedSource(val patterns: Seq[String], val dateRange: DateRange, val tz: TimeZone) extends FileSource { +abstract class TimeSeqPathedSource(val patterns: Seq[String], val dateRange: DateRange, val tz: TimeZone) + extends FileSource { override def hdfsPaths = patterns - .flatMap{ pattern: String => + .flatMap { pattern: String => Globifier(pattern)(tz).globify(dateRange) } /** - * Override this if you have for instance an hourly pattern but want to run every 6 hours. - * By default, we call TimePathedSource.stepSize(pattern, tz) + * Override this if you have for instance an hourly pattern but want to run every 6 hours. By default, we + * call TimePathedSource.stepSize(pattern, tz) */ protected def defaultDurationFor(pattern: String): Option[Duration] = TimePathedSource.stepSize(pattern, tz) @@ -80,7 +85,7 @@ abstract class TimeSeqPathedSource(val patterns: Seq[String], val dateRange: Dat protected def allPathsFor(pattern: String): Iterable[String] = defaultDurationFor(pattern) match { case Some(duration) => TimePathedSource.allPathsWithDuration(pattern, duration, dateRange, tz) - case None => sys.error(s"No suitable step size for pattern: $pattern") + case None => sys.error(s"No suitable step size for pattern: $pattern") } /** These are all the paths we will read for this data completely enumerated */ @@ -88,20 +93,19 @@ abstract class TimeSeqPathedSource(val patterns: Seq[String], val dateRange: Dat patterns.flatMap(allPathsFor(_)) /** - * Get path statuses based on daterange. This tests each path with pathIsGood - * (which by default checks that there is at least on file in that directory) + * Get path statuses based on daterange. This tests each path with pathIsGood (which by default checks that + * there is at least on file in that directory) */ def getPathStatuses(conf: Configuration): Iterable[(String, Boolean)] = - allPaths.map { path => (path, pathIsGood(path, conf)) } + allPaths.map(path => (path, pathIsGood(path, conf))) // Override because we want to check UNGLOBIFIED paths that each are present. override def hdfsReadPathsAreGood(conf: Configuration): Boolean = - getPathStatuses(conf).forall { - case (path, good) => - if (!good) { - System.err.println("[ERROR] Path: " + path + " is missing in: " + toString) - } - good + getPathStatuses(conf).forall { case (path, good) => + if (!good) { + System.err.println("[ERROR] Path: " + path + " is missing in: " + toString) + } + good } override def toString = "TimeSeqPathedSource(" + patterns.mkString(",") + @@ -120,13 +124,11 @@ abstract class TimeSeqPathedSource(val patterns: Seq[String], val dateRange: Dat } /** - * This will automatically produce a globbed version of the given path. - * THIS MEANS YOU MUST END WITH A / followed by * to match a file - * For writing, we write to the directory specified by the END time. + * This will automatically produce a globbed version of the given path. THIS MEANS YOU MUST END WITH A / + * followed by * to match a file For writing, we write to the directory specified by the END time. 
*/ -abstract class TimePathedSource(val pattern: String, - dateRange: DateRange, - tz: TimeZone) extends TimeSeqPathedSource(Seq(pattern), dateRange, tz) { +abstract class TimePathedSource(val pattern: String, dateRange: DateRange, tz: TimeZone) + extends TimeSeqPathedSource(Seq(pattern), dateRange, tz) { //Write to the path defined by the end time: override def hdfsWritePath = TimePathedSource.writePathFor(pattern, dateRange, tz) @@ -141,14 +143,12 @@ abstract class TimePathedSource(val pattern: String, * A source that contains the most recent existing path in this date range. */ abstract class MostRecentGoodSource(p: String, dr: DateRange, t: TimeZone) - extends TimePathedSource(p, dr, t) { + extends TimePathedSource(p, dr, t) { override def toString = "MostRecentGoodSource(" + p + ", " + dr + ", " + t + ")" - override protected def goodHdfsPaths(hdfsMode: Hdfs) = getPathStatuses(hdfsMode.jobConf) - .toList - .reverse + override protected def goodHdfsPaths(hdfsMode: Hdfs) = getPathStatuses(hdfsMode.jobConf).toList.reverse .find(_._2) .map(_._1) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Tool.scala b/scalding-core/src/main/scala/com/twitter/scalding/Tool.scala index b3e5d89c6c..87911dd95b 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Tool.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Tool.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.flow.hadoop.HadoopFlow @@ -20,7 +20,7 @@ import cascading.flow.planner.BaseFlowStep import org.apache.hadoop.conf.Configured import org.apache.hadoop.mapred.JobConf -import org.apache.hadoop.util.{ GenericOptionsParser, Tool => HTool, ToolRunner } +import org.apache.hadoop.util.{GenericOptionsParser, Tool => HTool, ToolRunner} import scala.annotation.tailrec import scala.collection.JavaConverters._ @@ -30,13 +30,12 @@ class Tool extends Configured with HTool { var rootJob: Option[(Args) => Job] = None // Allows you to set the job for the Tool to run - def setJobConstructor(jobc: (Args) => Job): Unit = { + def setJobConstructor(jobc: (Args) => Job): Unit = if (rootJob.isDefined) { sys.error("Job is already defined") } else { rootJob = Some(jobc) } - } protected def getJob(args: Args): Job = rootJob match { case Some(job) => job(args) @@ -52,9 +51,8 @@ class Tool extends Configured with HTool { // This both updates the jobConf with hadoop arguments // and returns all the non-hadoop arguments. Should be called once if // you want to process hadoop arguments (like -libjars). 
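For orientation, the entry point works roughly as follows: GenericOptionsParser consumes the Hadoop flags, the first remaining argument names the Job class, and --hdfs or --local selects the Mode. A minimal sketch, where the job class and paths are placeholders:

object RunMyJobSketch {
  import org.apache.hadoop.mapred.JobConf
  import org.apache.hadoop.util.ToolRunner

  def main(cmdArgs: Array[String]): Unit = {
    // Roughly what `hadoop jar my-assembly.jar com.twitter.scalding.Tool com.example.MyJob --hdfs ...`
    // does, here running in local mode instead.
    val rc = ToolRunner.run(
      new JobConf,
      new com.twitter.scalding.Tool,
      Array("com.example.MyJob", "--local", "--input", "in.tsv", "--output", "out.tsv")
    )
    if (rc != 0) sys.error(s"Job exited with code $rc")
  }
}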
- protected def nonHadoopArgsFrom(args: Array[String]): Array[String] = { + protected def nonHadoopArgsFrom(args: Array[String]): Array[String] = (new GenericOptionsParser(getConf, args)).getRemainingArgs - } def parseModeArgs(args: Array[String]): (Mode, Args) = { val a = Args(nonHadoopArgsFrom(args)) @@ -77,20 +75,20 @@ class Tool extends Configured with HTool { } /* - * This is a tail recursive loop that runs all the - * jobs spawned from this one - */ + * This is a tail recursive loop that runs all the + * jobs spawned from this one + */ val jobName = job.getClass.getName @tailrec def start(j: Job, cnt: Int): Unit = { val successful = if (onlyPrintGraph) { val flow = j.buildFlow /* - * This just writes out the graph representing - * all the cascading elements that are created for this - * flow. Use graphviz to render it as a PDF. - * The job is NOT run in this case. - */ + * This just writes out the graph representing + * all the cascading elements that are created for this + * flow. Use graphviz to render it as a PDF. + * The job is NOT run in this case. + */ val thisDot = jobName + cnt + ".dot" println("writing DOT: " + thisDot) @@ -98,7 +96,7 @@ class Tool extends Configured with HTool { flow match { case hadoopFlow: HadoopFlow => val flowSteps = hadoopFlow.getFlowSteps.asScala - flowSteps.foreach(step => { + flowSteps.foreach { step => val baseFlowStep: BaseFlowStep[JobConf] = step.asInstanceOf[BaseFlowStep[JobConf]] val descriptions = baseFlowStep.getConfig.get(Config.StepDescriptions, "") if (!descriptions.isEmpty) { @@ -109,7 +107,7 @@ class Tool extends Configured with HTool { x.setAccessible(true) x.invoke(step, "%s %s".format(stepXofYData, descriptions)) } - }) + } case _ => // descriptions not yet supported in other modes } @@ -129,12 +127,14 @@ class Tool extends Configured with HTool { // we need to use match not foreach to get tail recursion j.next match { // linter:disable:UseOptionForeachNotPatMatch case Some(nextj) => start(nextj, cnt + 1) - case None => () + case None => () } } else { - throw new RuntimeException("Job failed to run: " + jobName + - (if (cnt > 0) { " child: " + cnt.toString + ", class: " + j.getClass.getName } - else { "" })) + throw new RuntimeException( + "Job failed to run: " + jobName + + (if (cnt > 0) { " child: " + cnt.toString + ", class: " + j.getClass.getName } + else { "" }) + ) } } //start a counter to see how deep we recurse: @@ -144,7 +144,7 @@ class Tool extends Configured with HTool { } object Tool { - def main(args: Array[String]): Unit = { + def main(args: Array[String]): Unit = try { ToolRunner.run(new JobConf, new Tool, ExpandLibJarsGlobs(args)) } catch { @@ -153,5 +153,4 @@ object Tool { throw new Throwable(RichXHandler(t), t) } } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Tracing.scala b/scalding-core/src/main/scala/com/twitter/scalding/Tracing.scala index 403727e1cc..978f1eda51 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Tracing.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Tracing.scala @@ -12,24 +12,21 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import java.lang.reflect.InvocationTargetException -import org.slf4j.{ Logger, LoggerFactory => LogManager } +import org.slf4j.{Logger, LoggerFactory => LogManager} /** - * Calling init registers "com.twitter.scalding" as a "tracing boundary" for - * Cascading. That means that when Cascading sends trace information to - * a DocumentService such as Driven, the trace will have information about - * the caller of Scalding instead of about the internals of Scalding. - * com.twitter.scalding.Job and its subclasses will automatically - * initialize Tracing. + * Calling init registers "com.twitter.scalding" as a "tracing boundary" for Cascading. That means that when + * Cascading sends trace information to a DocumentService such as Driven, the trace will have information + * about the caller of Scalding instead of about the internals of Scalding. com.twitter.scalding.Job and its + * subclasses will automatically initialize Tracing. * - * register and unregister methods are provided for testing, but - * should not be needed for most development + * register and unregister methods are provided for testing, but should not be needed for most development */ object Tracing { private val LOG: Logger = LogManager.getLogger(this.getClass) @@ -39,56 +36,52 @@ object Tracing { private val traceUtilClassName = "cascading.util.TraceUtil" /** - * Put a barrier at com.twitter.scalding, but exclude things like Tool - * that are common entry points for calling user code + * Put a barrier at com.twitter.scalding, but exclude things like Tool that are common entry points for + * calling user code */ private val defaultRegex = """^com\.twitter\.scalding\.(?!Tool|Job|ExecutionContext).*""" register() /** - * Forces the initialization of the Tracing object which in turn causes - * the one time registration of "com.twitter.scalding" as a - * tracing boundary in Cascading + * Forces the initialization of the Tracing object which in turn causes the one time registration of + * "com.twitter.scalding" as a tracing boundary in Cascading */ def init(): Unit = { /* do nothing */ } /** - * Explicitly registers "com.twitter.scalding" as a Cascading - * tracing boundary. Normally not needed, but may be useful - * after a call to unregister() + * Explicitly registers "com.twitter.scalding" as a Cascading tracing boundary. Normally not needed, but may + * be useful after a call to unregister() */ - def register(regex: String = defaultRegex) = invokeStaticMethod(traceUtilClassName, "registerApiBoundary", regex) + def register(regex: String = defaultRegex) = + invokeStaticMethod(traceUtilClassName, "registerApiBoundary", regex) /** - * Unregisters "com.twitter.scalding" as a Cascading - * tracing bounardy. After calling this, Cascading DocumentServices - * such as Driven will show nodes as being created by Scalding - * class such as RichPipe instead of end user written code. This - * should normally not be called but can be useful in testing - * the development of Scalding internals + * Unregisters "com.twitter.scalding" as a Cascading tracing bounardy. After calling this, Cascading + * DocumentServices such as Driven will show nodes as being created by Scalding class such as RichPipe + * instead of end user written code. 
This should normally not be called but can be useful in testing the + * development of Scalding internals */ - def unregister(regex: String = defaultRegex) = invokeStaticMethod(traceUtilClassName, "unregisterApiBoundary", regex) + def unregister(regex: String = defaultRegex) = + invokeStaticMethod(traceUtilClassName, "unregisterApiBoundary", regex) /** - * Use reflection to register/unregister tracing boundaries so that cascading versions prior to 2.6 can be used - * without completely breaking + * Use reflection to register/unregister tracing boundaries so that cascading versions prior to 2.6 can be + * used without completely breaking */ - private def invokeStaticMethod(clazz: String, methodName: String, args: AnyRef*): Unit = { + private def invokeStaticMethod(clazz: String, methodName: String, args: AnyRef*): Unit = try { - val argTypes = args map (_.getClass()) + val argTypes = args.map(_.getClass()) Class.forName(clazz).getMethod(methodName, argTypes: _*).invoke(null, args: _*) } catch { - case e @ (_: NoSuchMethodException | - _: SecurityException | - _: IllegalAccessException | - _: IllegalArgumentException | - _: InvocationTargetException | - _: NullPointerException | - _: ClassNotFoundException) => LOG.warn("There was an error initializing tracing. " + - "Tracing information in DocumentServices such as Driven may point to Scalding code instead of " + - "user code. The most likely cause is a mismatch in Cascading library version. Upgrading the " + - "Cascading library to at least 2.6 should fix this issue.The cause was [" + e + "]") + case e @ (_: NoSuchMethodException | _: SecurityException | _: IllegalAccessException | + _: IllegalArgumentException | _: InvocationTargetException | _: NullPointerException | + _: ClassNotFoundException) => + LOG.warn( + "There was an error initializing tracing. " + + "Tracing information in DocumentServices such as Driven may point to Scalding code instead of " + + "user code. The most likely cause is a mismatch in Cascading library version. Upgrading the " + + "Cascading library to at least 2.6 should fix this issue.The cause was [" + e + "]" + ) } - } -} \ No newline at end of file +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala index fc1dd7fb94..65ad9090cc 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala @@ -12,35 +12,36 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields /** - * Mixed in to both TupleConverter and TupleSetter to improve arity safety - * of cascading jobs before we run anything on Hadoop. + * Mixed in to both TupleConverter and TupleSetter to improve arity safety of cascading jobs before we run + * anything on Hadoop. 
*/ trait TupleArity { + /** - * Return the arity of product types, should probably only be used implicitly - * The use case here is to see how many fake field names we need in Cascading - * to hold an intermediate value for mapReduceMap + * Return the arity of product types, should probably only be used implicitly The use case here is to see + * how many fake field names we need in Cascading to hold an intermediate value for mapReduceMap */ def arity: Int /** - * assert that the arity of this setter matches the fields given. - * if arity == -1, we can't check, and if Fields is not a definite - * size, (such as Fields.ALL), we also cannot check, so this should - * only be considered a weak check. + * assert that the arity of this setter matches the fields given. if arity == -1, we can't check, and if + * Fields is not a definite size, (such as Fields.ALL), we also cannot check, so this should only be + * considered a weak check. */ - def assertArityMatches(f: Fields): Unit = { + def assertArityMatches(f: Fields): Unit = //Fields.size == 0 for the indefinite Fields: ALL, GROUP, VALUES, UNKNOWN, etc.. if (f.size > 0 && arity >= 0) { - assert(arity == f.size, "Arity of (" + super.getClass + ") is " - + arity + ", which doesn't match: + (" + f.toString + ")") + assert( + arity == f.size, + "Arity of (" + super.getClass + ") is " + + arity + ", which doesn't match: + (" + f.toString + ")" + ) } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala index e52511d2af..600bb51a4a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding @deprecated("This trait does nothing now", "0.9.0") diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleConverter.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleConverter.scala index 700fbc85c9..da6ff34af0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleConverter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleConverter.scala @@ -12,42 +12,43 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.TupleEntry -import cascading.tuple.{ Tuple => CTuple } +import cascading.tuple.{Tuple => CTuple} import com.twitter.scalding.serialization.Externalizer import scala.collection.breakOut /** - * Typeclass to represent converting from cascading TupleEntry to some type T. - * The most common application is to convert to scala Tuple objects for use - * with the Fields API. The typed API internally manually handles its mapping - * to cascading Tuples, so the implicit resolution mechanism is not used. + * Typeclass to represent converting from cascading TupleEntry to some type T. The most common application is + * to convert to scala Tuple objects for use with the Fields API. 
The typed API internally manually handles + * its mapping to cascading Tuples, so the implicit resolution mechanism is not used. * - * WARNING: if you are seeing issues with the singleConverter being found when you - * expect something else, you may have an issue where the enclosing scope needs to - * take an implicit TupleConverter of the correct type. + * WARNING: if you are seeing issues with the singleConverter being found when you expect something else, you + * may have an issue where the enclosing scope needs to take an implicit TupleConverter of the correct type. * - * Unfortunately, the semantics we want (prefer to flatten tuples, but otherwise - * put everything into one postition in the tuple) are somewhat difficlut to - * encode in scala. + * Unfortunately, the semantics we want (prefer to flatten tuples, but otherwise put everything into one + * postition in the tuple) are somewhat difficlut to encode in scala. */ -trait TupleConverter[@specialized(Int, Long, Float, Double) T] extends java.io.Serializable with TupleArity { self => +trait TupleConverter[@specialized(Int, Long, Float, Double) T] extends java.io.Serializable with TupleArity { + self => def apply(te: TupleEntry): T def andThen[U](fn: T => U): TupleConverter[U] = TupleConverter.AndThen(this, fn) } trait LowPriorityTupleConverters extends java.io.Serializable { - implicit def singleConverter[@specialized(Int, Long, Float, Double) A](implicit g: TupleGetter[A]): TupleConverter[A] = + implicit def singleConverter[@specialized(Int, Long, Float, Double) A](implicit + g: TupleGetter[A] + ): TupleConverter[A] = TupleConverter.Single[A](g) } object TupleConverter extends GeneratedTupleConverters { - final case class Single[@specialized(Int, Long, Float, Double) A](getter: TupleGetter[A]) extends TupleConverter[A] { + final case class Single[@specialized(Int, Long, Float, Double) A](getter: TupleGetter[A]) + extends TupleConverter[A] { def apply(tup: TupleEntry): A = getter.get(tup.getTuple, 0) def arity = 1 } @@ -66,11 +67,11 @@ object TupleConverter extends GeneratedTupleConverters { } /** - * Treat this TupleConverter as one for a superclass - * We do this because we want to use implicit resolution invariantly, - * but clearly, the operation is covariant + * Treat this TupleConverter as one for a superclass We do this because we want to use implicit resolution + * invariantly, but clearly, the operation is covariant */ - def asSuperConverter[T, U >: T](tc: TupleConverter[T]): TupleConverter[U] = tc.asInstanceOf[TupleConverter[U]] + def asSuperConverter[T, U >: T](tc: TupleConverter[T]): TupleConverter[U] = + tc.asInstanceOf[TupleConverter[U]] def build[T](thisArity: Int)(fn: TupleEntry => T): TupleConverter[T] = FromFn(fn, thisArity) def fromTupleEntry[T](t: TupleEntry)(implicit tc: TupleConverter[T]): T = tc(t) @@ -78,9 +79,8 @@ object TupleConverter extends GeneratedTupleConverters { def of[T](implicit tc: TupleConverter[T]): TupleConverter[T] = tc /** - * Copies the tupleEntry, since cascading may change it after the end of an - * operation (and it is not safe to assume the consumer has not kept a ref - * to this tuple) + * Copies the tupleEntry, since cascading may change it after the end of an operation (and it is not safe to + * assume the consumer has not kept a ref to this tuple) */ implicit lazy val TupleEntryConverter: TupleConverter[TupleEntry] = new TupleConverter[TupleEntry] { override def apply(tup: TupleEntry) = new TupleEntry(tup) @@ -88,9 +88,8 @@ object TupleConverter extends GeneratedTupleConverters { } 
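As a small usage sketch of the pieces above (the record type and field names are invented): build a converter from an arity and a TupleEntry function, then apply it with fromTupleEntry.

object UserCountConverterSketch {
  import cascading.tuple.{Fields, Tuple => CTuple, TupleEntry}

  case class UserCount(name: String, count: Long) // hypothetical record type

  val converter: TupleConverter[UserCount] =
    TupleConverter.build(2) { te =>
      val t = te.getTuple
      UserCount(t.getString(0), t.getLong(1))
    }

  val entry = new TupleEntry(new Fields("name", "count"), new CTuple("alice", java.lang.Long.valueOf(3L)))
  val converted: UserCount = TupleConverter.fromTupleEntry(entry)(converter) // UserCount("alice", 3L)
}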
/** - * Copies the tuple, since cascading may change it after the end of an - * operation (and it is not safe to assume the consumer has not kept a ref - * to this tuple + * Copies the tuple, since cascading may change it after the end of an operation (and it is not safe to + * assume the consumer has not kept a ref to this tuple */ implicit lazy val CTupleConverter: TupleConverter[CTuple] = new TupleConverter[CTuple] { override def apply(tup: TupleEntry) = tup.getTupleCopy @@ -104,7 +103,7 @@ object TupleConverter extends GeneratedTupleConverters { def wrap(tup: CTuple): Product = new Product { def canEqual(that: Any) = that match { case p: Product => true - case _ => false + case _ => false } def productArity = tup.size def productElement(idx: Int) = tup.getObject(idx) @@ -119,8 +118,8 @@ object TupleConverter extends GeneratedTupleConverters { } // Doesn't seem safe to make these implicit by default: /** - * Convert a TupleEntry to a List of CTuple, of length 2, with key, value - * from the TupleEntry (useful for RichPipe.unpivot) + * Convert a TupleEntry to a List of CTuple, of length 2, with key, value from the TupleEntry (useful for + * RichPipe.unpivot) */ object KeyValueList extends TupleConverter[List[CTuple]] { def apply(tupe: TupleEntry): List[CTuple] = { diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleGetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleGetter.scala index 4c6237fc09..508f20f524 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleGetter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleGetter.scala @@ -12,17 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.tuple.{ Tuple => CTuple } +import cascading.tuple.{Tuple => CTuple} /** - * Typeclass roughly equivalent to a Lens, which allows getting items out of a tuple. - * This is useful because cascading has type coercion (string to int, for instance) that - * users expect in the fields API. This code is not used in the typesafe API, which - * does not allow suc silent coercion. - * See the generated TupleConverters for an example of where this is used + * Typeclass roughly equivalent to a Lens, which allows getting items out of a tuple. This is useful because + * cascading has type coercion (string to int, for instance) that users expect in the fields API. This code is + * not used in the typesafe API, which does not allow suc silent coercion. See the generated TupleConverters + * for an example of where this is used */ trait TupleGetter[@specialized(Int, Long, Float, Double) T] extends java.io.Serializable { def get(tup: CTuple, i: Int): T diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TuplePacker.scala b/scalding-core/src/main/scala/com/twitter/scalding/TuplePacker.scala index 326c96aad0..2632623162 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TuplePacker.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TuplePacker.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tuple._ @@ -23,13 +23,14 @@ import java.lang.reflect.Constructor import scala.reflect.Manifest /** - * Typeclass for packing a cascading Tuple into some type T, - * this is used to put fields of a cascading tuple into Thrift, Protobuf, - * or case classes, for instance, but you can add your own instances to control - * how this is done. + * Typeclass for packing a cascading Tuple into some type T, this is used to put fields of a cascading tuple + * into Thrift, Protobuf, or case classes, for instance, but you can add your own instances to control how + * this is done. * - * @author Argyris Zymnis - * @author Oscar Boykin + * @author + * Argyris Zymnis + * @author + * Oscar Boykin */ trait TuplePacker[T] extends java.io.Serializable { def newConverter(fields: Fields): TupleConverter[T] @@ -38,7 +39,8 @@ trait TuplePacker[T] extends java.io.Serializable { object TuplePacker extends CaseClassPackers trait CaseClassPackers extends LowPriorityTuplePackers { - implicit def caseClassPacker[T <: Product](implicit mf: Manifest[T]): OrderedTuplePacker[T] = new OrderedTuplePacker[T] + implicit def caseClassPacker[T <: Product](implicit mf: Manifest[T]): OrderedTuplePacker[T] = + new OrderedTuplePacker[T] } trait LowPriorityTuplePackers extends java.io.Serializable { @@ -46,12 +48,13 @@ trait LowPriorityTuplePackers extends java.io.Serializable { } /** - * Packs a tuple into any object with set methods, e.g. thrift or proto objects. - * TODO: verify that protobuf setters for field camel_name are of the form setCamelName. - * In that case this code works for proto. + * Packs a tuple into any object with set methods, e.g. thrift or proto objects. TODO: verify that protobuf + * setters for field camel_name are of the form setCamelName. In that case this code works for proto. * - * @author Argyris Zymnis - * @author Oscar Boykin + * @author + * Argyris Zymnis + * @author + * Oscar Boykin */ class ReflectionTuplePacker[T](implicit m: Manifest[T]) extends TuplePacker[T] { override def newConverter(fields: Fields) = new ReflectionTupleConverter[T](fields)(m) @@ -71,17 +74,16 @@ class ReflectionTupleConverter[T](fields: Fields)(implicit m: Manifest[T]) exten def validate(): Unit = { //We can't touch setters because that shouldn't be accessed until map/reduce side, not //on submitter. 
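To ground the reflection machinery here, a hedged sketch of the Fields-API calls it serves; the case class, field names, and paths are invented, and packTo/unpackTo are assumed to resolve the implicit TuplePacker (caseClassPacker in this case) and TupleUnpacker:

// Both fields are Strings so the arity-2 constructor can be fed the raw Tsv values directly.
case class Person(name: String, city: String)

class PackUnpackSketch(args: Args) extends Job(args) {
  Tsv("people.tsv", ('name, 'city)).read
    .packTo[Person](('name, 'city) -> 'person)   // replaces the two fields with one Person-valued field
    .unpackTo[Person]('person -> ('name, 'city)) // back out again via the name/city accessors
    .write(Tsv("people-out.tsv"))
}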
- val missing = Dsl.asList(fields).find { f => !getSetters.contains(f.toString) } + val missing = Dsl.asList(fields).find(f => !getSetters.contains(f.toString)) assert(missing.isEmpty, "Field: " + missing.get.toString + " not in setters") } validate() - def getSetters = m.runtimeClass - .getDeclaredMethods - .filter { _.getName.startsWith("set") } - .groupBy { setterToFieldName(_) } - .mapValues { _.head } + def getSetters = m.runtimeClass.getDeclaredMethods + .filter(_.getName.startsWith("set")) + .groupBy(setterToFieldName(_)) + .mapValues(_.head) // Do all the reflection for the setters we need: // This needs to be lazy because Method is not serializable @@ -111,10 +113,10 @@ class OrderedConstructorConverter[T](fields: Fields)(implicit mf: Manifest[T]) e override val arity = fields.size // Keep this as a method, so we can validate by calling, but don't serialize it, and keep it lazy // below - def getConstructor = mf.runtimeClass - .getConstructors - .filter { _.getParameterTypes.size == fields.size } - .head.asInstanceOf[Constructor[T]] + def getConstructor = mf.runtimeClass.getConstructors + .filter(_.getParameterTypes.size == fields.size) + .head + .asInstanceOf[Constructor[T]] //Make sure we can actually get a constructor: getConstructor @@ -123,7 +125,7 @@ class OrderedConstructorConverter[T](fields: Fields)(implicit mf: Manifest[T]) e override def apply(input: TupleEntry): T = { val tup = input.getTuple - val args = (0 until tup.size).map { tup.getObject(_) } + val args = (0 until tup.size).map(tup.getObject(_)) cons.newInstance(args: _*) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleSetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleSetter.scala index 16bb5ba202..232c36efa1 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleSetter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleSetter.scala @@ -12,19 +12,18 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.tuple.{ Tuple => CTuple } +import cascading.tuple.{Tuple => CTuple} /** - * Typeclass to represent converting back to (setting into) a cascading Tuple - * This looks like it can be contravariant, but it can't because of our approach - * of falling back to the singleSetter, you really want the most specific setter - * you can get. Put more directly: a TupleSetter[Any] is not just as good as TupleSetter[(Int, Int)] - * from the scalding DSL's point of view. The latter will flatten the (Int, Int), but the former - * won't. + * Typeclass to represent converting back to (setting into) a cascading Tuple This looks like it can be + * contravariant, but it can't because of our approach of falling back to the singleSetter, you really want + * the most specific setter you can get. Put more directly: a TupleSetter[Any] is not just as good as + * TupleSetter[(Int, Int)] from the scalding DSL's point of view. The latter will flatten the (Int, Int), but + * the former won't. 
*/ trait TupleSetter[T] extends java.io.Serializable with TupleArity { self => def apply(arg: T): CTuple @@ -34,10 +33,10 @@ trait TupleSetter[T] extends java.io.Serializable with TupleArity { self => } trait LowPriorityTupleSetters extends java.io.Serializable { + /** - * If it is not a scala Tuple, and not any defined in the object TupleSetter - * we just assume it is a single entry in the tuple - * For some reason, putting a val TupleSetter[Any] here messes up implicit resolution + * If it is not a scala Tuple, and not any defined in the object TupleSetter we just assume it is a single + * entry in the tuple For some reason, putting a val TupleSetter[Any] here messes up implicit resolution */ implicit def singleSetter[A]: TupleSetter[A] = TupleSetter.Single[A]() } @@ -59,9 +58,8 @@ object TupleSetter extends GeneratedTupleSetters { } /** - * Treat this TupleSetter as one for a subclass - * We do this because we want to use implicit resolution invariantly, - * but clearly, the operation is contravariant + * Treat this TupleSetter as one for a subclass We do this because we want to use implicit resolution + * invariantly, but clearly, the operation is contravariant */ def asSubSetter[T, U <: T](ts: TupleSetter[T]): TupleSetter[U] = ts.asInstanceOf[TupleSetter[U]] diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleUnpacker.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleUnpacker.scala index ce6e367860..58660946a0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleUnpacker.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleUnpacker.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple._ @@ -20,13 +20,13 @@ import cascading.tuple._ import scala.reflect.Manifest /** - * Typeclass for objects which unpack an object into a tuple. - * The packer can verify the arity, types, and also the existence - * of the getter methods at plan time, without having the job - * blow up in the middle of a run. + * Typeclass for objects which unpack an object into a tuple. The packer can verify the arity, types, and also + * the existence of the getter methods at plan time, without having the job blow up in the middle of a run. * - * @author Argyris Zymnis - * @author Oscar Boykin + * @author + * Argyris Zymnis + * @author + * Oscar Boykin */ object TupleUnpacker extends LowPriorityTupleUnpackers trait TupleUnpacker[T] extends java.io.Serializable { @@ -39,32 +39,28 @@ trait LowPriorityTupleUnpackers { } /** - * A helper for working with class reflection. - * Allows us to avoid code repetition. + * A helper for working with class reflection. Allows us to avoid code repetition. */ object ReflectionUtils { /** - * Returns the set of fields in the given class. - * We use a List to ensure fields are in the same - * order they were declared. + * Returns the set of fields in the given class. We use a List to ensure fields are in the same order they + * were declared. */ def fieldsOf[T](c: Class[T]): List[String] = c.getDeclaredFields - .map { f => f.getName } + .map(f => f.getName) .toList .distinct /** - * For a given class, give a function that takes - * a T, and a fieldname and returns the values. 
+ * For a given class, give a function that takes a T, and a fieldname and returns the values. */ // def fieldGetters[T](c: Class[T]): (T,String) => AnyRef /** - * For a given class, give a function of T, fieldName, - * fieldValue that returns a new T (possibly a copy, - * if T is immutable). + * For a given class, give a function of T, fieldName, fieldValue that returns a new T (possibly a copy, if + * T is immutable). */ // def fieldSetters[T](c: Class[T]): (T,String,AnyRef) => T } @@ -77,9 +73,7 @@ class ReflectionTupleUnpacker[T](implicit m: Manifest[T]) extends TupleUnpacker[ lazy val allFields = new Fields(ReflectionUtils.fieldsOf(m.runtimeClass).toSeq: _*) /** - * A helper to check the passed-in - * fields to see if Fields.ALL is set. - * If it is, return lazy allFields. + * A helper to check the passed-in fields to see if Fields.ALL is set. If it is, return lazy allFields. */ def expandIfAll(fields: Fields) = if (fields.isAll) allFields else fields @@ -102,52 +96,49 @@ class ReflectionSetter[T](fields: Fields)(implicit m: Manifest[T]) extends Tuple // Methods and Fields are not serializable so we // make these defs instead of vals // TODO: filter by isAccessible, which somehow seems to fail - def methodMap = m.runtimeClass - .getDeclaredMethods + def methodMap = m.runtimeClass.getDeclaredMethods // Keep only methods with 0 parameter types - .filter { m => m.getParameterTypes.length == 0 } - .groupBy { _.getName } - .mapValues { _.head } + .filter(m => m.getParameterTypes.length == 0) + .groupBy(_.getName) + .mapValues(_.head) // TODO: filter by isAccessible, which somehow seems to fail - def fieldMap = m.runtimeClass - .getDeclaredFields - .groupBy { _.getName } - .mapValues { _.head } + def fieldMap = m.runtimeClass.getDeclaredFields + .groupBy(_.getName) + .mapValues(_.head) - def makeSetters = { + def makeSetters = (0 until fields.size).map { idx => val fieldName = fields.get(idx).toString setterForFieldName(fieldName) } - } // This validation makes sure that the setters exist // but does not save them in a val (due to serialization issues) def validate = makeSetters override def apply(input: T): Tuple = { - val values = setters.map { setFn => setFn(input) } + val values = setters.map(setFn => setFn(input)) new Tuple(values: _*) } override def arity = fields.size - private def setterForFieldName(fieldName: String): (T => AnyRef) = { + private def setterForFieldName(fieldName: String): (T => AnyRef) = getValueFromMethod(createGetter(fieldName)) .orElse(getValueFromMethod(fieldName)) .orElse(getValueFromField(fieldName)) .getOrElse( - throw new TupleUnpackerException("Unrecognized field: " + fieldName + " for class: " + m.runtimeClass.getName)) - } + throw new TupleUnpackerException( + "Unrecognized field: " + fieldName + " for class: " + m.runtimeClass.getName + ) + ) - private def getValueFromField(fieldName: String): Option[(T => AnyRef)] = { - fieldMap.get(fieldName).map { f => (x: T) => f.get(x) } - } + private def getValueFromField(fieldName: String): Option[(T => AnyRef)] = + fieldMap.get(fieldName).map(f => (x: T) => f.get(x)) - private def getValueFromMethod(methodName: String): Option[(T => AnyRef)] = { - methodMap.get(methodName).map { m => (x: T) => m.invoke(x) } - } + private def getValueFromMethod(methodName: String): Option[(T => AnyRef)] = + methodMap.get(methodName).map(m => (x: T) => m.invoke(x)) private def upperFirst(s: String) = s.substring(0, 1).toUpperCase + s.substring(1) private def createGetter(s: String) = "get" + upperFirst(s) diff --git 
a/scalding-core/src/main/scala/com/twitter/scalding/TypeDescriptor.scala b/scalding-core/src/main/scala/com/twitter/scalding/TypeDescriptor.scala index 6d09a18aca..ddd1714e54 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TypeDescriptor.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TypeDescriptor.scala @@ -1,4 +1,3 @@ - /* Copyright 2014 Twitter, Inc. @@ -13,40 +12,38 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields import scala.annotation.implicitNotFound -import scala.language.experimental.{ macros => sMacros } +import scala.language.experimental.{macros => sMacros} /** - * This class is used to bind together a Fields instance which may - * contain a type array via getTypes, a TupleConverter and TupleSetter, - * which are inverses of one another. Note the size of the Fields object - * and the arity values for the converter and setter are all the same. - * Note in the com.twitter.scalding.macros package there are macros to - * generate this for case classes, which may be very convenient. + * This class is used to bind together a Fields instance which may contain a type array via getTypes, a + * TupleConverter and TupleSetter, which are inverses of one another. Note the size of the Fields object and + * the arity values for the converter and setter are all the same. Note in the com.twitter.scalding.macros + * package there are macros to generate this for case classes, which may be very convenient. */ -@implicitNotFound("""This class is used to bind together a Fields instance to an instance of type T. There is a implicit macro that generates a TypeDescriptor[T] for any type T where T is Boolean, String, Short, Int, Long, FLoat, or Double, or an option of these (with the exception of Option[String]), or a tuple or case class of a supported type. (Nested tuples and case classes are allowed.) Note: Option[String] specifically is not allowed as Some("") and None are indistinguishable. If your type T is not one of these, then you must write your own TypeDescriptor.""") +@implicitNotFound( + """This class is used to bind together a Fields instance to an instance of type T. There is a implicit macro that generates a TypeDescriptor[T] for any type T where T is Boolean, String, Short, Int, Long, FLoat, or Double, or an option of these (with the exception of Option[String]), or a tuple or case class of a supported type. (Nested tuples and case classes are allowed.) Note: Option[String] specifically is not allowed as Some("") and None are indistinguishable. If your type T is not one of these, then you must write your own TypeDescriptor.""" +) trait TypeDescriptor[T] extends java.io.Serializable { def setter: TupleSetter[T] def converter: TupleConverter[T] def fields: Fields } object TypeDescriptor { + /** - * This type descriptor flattens tuples and case classes left to right, - * depth first. It supports any type T where T is Boolean, String, - * Short, Int, Long, Float or Double, or an Option of these, or a tuple - * of a supported type. So, ((Int, Int), Int) is supported, and is - * flattened into a length 3 cascading Tuple/Fields. - * ((Int, Int), (Int, Int)) would be a length 4 cascading tuple, - * similarly with case classes. 
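The left-to-right, depth-first flattening this Scaladoc describes can be sketched with plain Scala Products; the helper below is illustrative, leaves out cascading's Tuple/Fields, and ignores the special Option handling noted next:

  object FlattenSketch extends App {
    def flatten(value: Any): List[Any] = value match {
      case p: Product => p.productIterator.flatMap(flatten).toList
      case other      => List(other)
    }

    println(flatten(((1, 2), 3)))      // List(1, 2, 3)    -> would back a length-3 Tuple/Fields
    println(flatten(((1, 2), (3, 4)))) // List(1, 2, 3, 4) -> length 4, as described above
  }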
- * Note, the Fields types are populated at the end of this with the - * exception that Option[T] is recorded as Object (since recording it - * as the java type would have different consequences for Cascading's - * null handling. + * This type descriptor flattens tuples and case classes left to right, depth first. It supports any type T + * where T is Boolean, String, Short, Int, Long, Float or Double, or an Option of these, or a tuple of a + * supported type. So, ((Int, Int), Int) is supported, and is flattened into a length 3 cascading + * Tuple/Fields. ((Int, Int), (Int, Int)) would be a length 4 cascading tuple, similarly with case classes. + * Note, the Fields types are populated at the end of this with the exception that Option[T] is recorded as + * Object (since recording it as the java type would have different consequences for Cascading's null + * handling. */ - implicit def typeDescriptor[T]: TypeDescriptor[T] = macro com.twitter.scalding.macros.impl.TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] + implicit def typeDescriptor[T]: TypeDescriptor[T] = + macro com.twitter.scalding.macros.impl.TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TypedDelimited.scala b/scalding-core/src/main/scala/com/twitter/scalding/TypedDelimited.scala index ed5b9fe1b2..6b2654edee 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TypedDelimited.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TypedDelimited.scala @@ -21,8 +21,8 @@ import java.lang.reflect.Type import cascading.tuple.Fields /** - * Trait to assist with creating objects such as [[TypedTsv]] to read from separated files. - * Override separator, skipHeader, writeHeader as needed. + * Trait to assist with creating objects such as [[TypedTsv]] to read from separated files. Override + * separator, skipHeader, writeHeader as needed. 
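As a usage sketch of the trait documented here, assuming separator, skipHeader and writeHeader are the members to override (as the surrounding hunks suggest), a pipe-separated sibling of TypedTsv/TypedOsv might look like this:

  import com.twitter.scalding._

  // Illustrative only; follows the same pattern as TypedOsv below.
  object TypedPsv extends TypedSeperatedFile {
    override val separator: String = "|"
    override val skipHeader: Boolean = false
    override val writeHeader: Boolean = false
  }

  // Hypothetical usage, mirroring TypedTsv:
  //   val users = TypedPsv[(String, Int)]("data/users.psv")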
*/ trait TypedSeperatedFile extends Serializable { def separator: String @@ -40,7 +40,10 @@ trait TypedSeperatedFile extends Serializable { def apply[T: Manifest: TupleConverter: TupleSetter](path: String, f: Fields): FixedPathTypedDelimited[T] = apply(Seq(path), f) - def apply[T: Manifest: TupleConverter: TupleSetter](paths: Seq[String], f: Fields): FixedPathTypedDelimited[T] = + def apply[T: Manifest: TupleConverter: TupleSetter]( + paths: Seq[String], + f: Fields + ): FixedPathTypedDelimited[T] = new FixedPathTypedDelimited[T](paths, f, skipHeader, writeHeader, separator) } @@ -73,29 +76,41 @@ object TypedOsv extends TypedSeperatedFile { } object FixedPathTypedDelimited { - def apply[T: Manifest: TupleConverter: TupleSetter](path: String, separator: String): FixedPathTypedDelimited[T] = + def apply[T: Manifest: TupleConverter: TupleSetter]( + path: String, + separator: String + ): FixedPathTypedDelimited[T] = apply(Seq(path), separator) - def apply[T: Manifest: TupleConverter: TupleSetter](paths: Seq[String], separator: String): FixedPathTypedDelimited[T] = { + def apply[T: Manifest: TupleConverter: TupleSetter]( + paths: Seq[String], + separator: String + ): FixedPathTypedDelimited[T] = { val f = Dsl.intFields(0 until implicitly[TupleConverter[T]].arity) apply(paths, f, separator) } - def apply[T: Manifest: TupleConverter: TupleSetter](path: String, f: Fields, separator: String): FixedPathTypedDelimited[T] = + def apply[T: Manifest: TupleConverter: TupleSetter]( + path: String, + f: Fields, + separator: String + ): FixedPathTypedDelimited[T] = apply(Seq(path), f, separator) - def apply[T: Manifest: TupleConverter: TupleSetter](paths: Seq[String], f: Fields, separator: String): FixedPathTypedDelimited[T] = + def apply[T: Manifest: TupleConverter: TupleSetter]( + paths: Seq[String], + f: Fields, + separator: String + ): FixedPathTypedDelimited[T] = new FixedPathTypedDelimited[T](paths, f, false, false, separator) } /** - * Allows you to set the types, prefer this: - * If T is a subclass of Product, we assume it is a tuple. If it is not, wrap T in a Tuple1: - * e.g. TypedTsv[Tuple1[List[Int]]] + * Allows you to set the types, prefer this: If T is a subclass of Product, we assume it is a tuple. If it is + * not, wrap T in a Tuple1: e.g. 
TypedTsv[Tuple1[List[Int]]] */ @deprecated("Use TypedTextDelimited instead", "2015-07") -trait TypedDelimited[T] extends DelimitedScheme - with Mappable[T] with TypedSink[T] { +trait TypedDelimited[T] extends DelimitedScheme with Mappable[T] with TypedSink[T] { override val skipHeader: Boolean = false override val writeHeader: Boolean = false @@ -111,7 +126,7 @@ trait TypedDelimited[T] extends DelimitedScheme override val types: Array[Class[_]] = if (classOf[scala.Product].isAssignableFrom(mf.runtimeClass)) { //Assume this is a Tuple: - mf.typeArguments.map { _.runtimeClass }.toArray + mf.typeArguments.map(_.runtimeClass).toArray } else { //Assume there is only a single item Array(mf.runtimeClass) @@ -126,19 +141,25 @@ trait TypedDelimited[T] extends DelimitedScheme } @deprecated("Use FixedTypedText instead", "2015-07") -class FixedPathTypedDelimited[T](p: Seq[String], - override val fields: Fields = Fields.ALL, - override val skipHeader: Boolean = false, - override val writeHeader: Boolean = false, - override val separator: String = "\t")(implicit override val mf: Manifest[T], override val conv: TupleConverter[T], - override val tset: TupleSetter[T]) extends FixedPathSource(p: _*) - with TypedDelimited[T] { +class FixedPathTypedDelimited[T]( + p: Seq[String], + override val fields: Fields = Fields.ALL, + override val skipHeader: Boolean = false, + override val writeHeader: Boolean = false, + override val separator: String = "\t" +)(implicit + override val mf: Manifest[T], + override val conv: TupleConverter[T], + override val tset: TupleSetter[T] +) extends FixedPathSource(p: _*) + with TypedDelimited[T] { override lazy val toString: String = "FixedPathTypedDelimited" + ((p, fields, skipHeader, writeHeader, separator, mf).toString) override def equals(that: Any): Boolean = Option(that) - .map { _.toString == this.toString }.getOrElse(false) + .map(_.toString == this.toString) + .getOrElse(false) override lazy val hashCode: Int = toString.hashCode } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TypedPipeChecker.scala b/scalding-core/src/main/scala/com/twitter/scalding/TypedPipeChecker.scala index a14deb4e54..cf165352ed 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TypedPipeChecker.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TypedPipeChecker.scala @@ -9,7 +9,9 @@ object TypedPipeChecker { * Takes a List and a transform function. 
* The resulting TypedPipe form the transform will be run through asserts */ - def checkOutputTransform[T, U, R](input: List[T])(transform: TypedPipe[T] => TypedPipe[U])(assertions: List[U] => R): R = + def checkOutputTransform[T, U, R](input: List[T])(transform: TypedPipe[T] => TypedPipe[U])( + assertions: List[U] => R + ): R = assertions(inMemoryToList(transform(TypedPipe.from(input)))) /* @@ -24,8 +26,7 @@ object TypedPipeChecker { * Execute a TypedPipe in memory and return the result as a List */ def inMemoryToList[T](output: TypedPipe[T]): List[T] = - output - .toIterableExecution + output.toIterableExecution .waitFor(Config.unitTestDefault, Local(strictSources = true)) .get .toList diff --git a/scalding-core/src/main/scala/com/twitter/scalding/WritableSequenceFile.scala b/scalding-core/src/main/scala/com/twitter/scalding/WritableSequenceFile.scala index 8355ef1e0c..25f6da7b47 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/WritableSequenceFile.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/WritableSequenceFile.scala @@ -12,21 +12,20 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.scheme.hadoop.{ WritableSequenceFile => CHWritableSequenceFile } +import cascading.scheme.hadoop.{WritableSequenceFile => CHWritableSequenceFile} import cascading.tap.SinkMode import cascading.tuple.Fields import org.apache.hadoop.io.Writable trait WritableSequenceFileScheme extends SchemedSource { + /** - * There are three allowed cases: - * fields.size == 1 and keyType == null - * fields.size == 1 and valueType == null - * fields.size == 2 and keyType != null and valueType != null + * There are three allowed cases: fields.size == 1 and keyType == null fields.size == 1 and valueType == + * null fields.size == 2 and keyType != null and valueType != null */ def fields: Fields def keyType: Class[_ <: Writable] @@ -34,24 +33,28 @@ trait WritableSequenceFileScheme extends SchemedSource { // TODO Cascading doesn't support local mode yet override def hdfsScheme = - HadoopSchemeInstance(new CHWritableSequenceFile(fields, keyType, valueType).asInstanceOf[cascading.scheme.Scheme[_, _, _, _, _]]) + HadoopSchemeInstance( + new CHWritableSequenceFile(fields, keyType, valueType) + .asInstanceOf[cascading.scheme.Scheme[_, _, _, _, _]] + ) } object WritableSequenceFile { + /** by default uses the first two fields in the tuple */ def apply[K <: Writable: Manifest, V <: Writable: Manifest](path: String): WritableSequenceFile[K, V] = WritableSequenceFile(path, Dsl.intFields(0 to 1)) } case class WritableSequenceFile[K <: Writable: Manifest, V <: Writable: Manifest]( - p: String, - f: Fields, - override val sinkMode: SinkMode = SinkMode.REPLACE) - extends FixedPathSource(p) - with WritableSequenceFileScheme - with LocalTapSource - with TypedSink[(K, V)] - with Mappable[(K, V)] { + p: String, + f: Fields, + override val sinkMode: SinkMode = SinkMode.REPLACE +) extends FixedPathSource(p) + with WritableSequenceFileScheme + with LocalTapSource + with TypedSink[(K, V)] + with Mappable[(K, V)] { override val fields = f override val keyType = manifest[K].runtimeClass.asInstanceOf[Class[_ <: Writable]] @@ -67,21 +70,25 @@ case class WritableSequenceFile[K <: Writable: Manifest, V <: Writable: Manifest } object MultipleWritableSequenceFiles { + /** by default 
uses the first two fields in the tuple */ - def apply[K <: Writable: Manifest, V <: Writable: Manifest](paths: Seq[String]): MultipleWritableSequenceFiles[K, V] = + def apply[K <: Writable: Manifest, V <: Writable: Manifest]( + paths: Seq[String] + ): MultipleWritableSequenceFiles[K, V] = MultipleWritableSequenceFiles(paths, Dsl.intFields(0 to 1)) } /** - * This is only a TypedSource (which is a superclass of Mappable) as sinking into multiple directories - * is not well defined + * This is only a TypedSource (which is a superclass of Mappable) as sinking into multiple directories is not + * well defined */ case class MultipleWritableSequenceFiles[K <: Writable: Manifest, V <: Writable: Manifest]( - p: Seq[String], f: Fields) - extends FixedPathSource(p: _*) - with WritableSequenceFileScheme - with LocalTapSource - with Mappable[(K, V)] { + p: Seq[String], + f: Fields +) extends FixedPathSource(p: _*) + with WritableSequenceFileScheme + with LocalTapSource + with Mappable[(K, V)] { override val fields = f override val keyType = manifest[K].runtimeClass.asInstanceOf[Class[_ <: Writable]] diff --git a/scalding-core/src/main/scala/com/twitter/scalding/XHandler.scala b/scalding-core/src/main/scala/com/twitter/scalding/XHandler.scala index 7b1fb7b519..2a087de77d 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/XHandler.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/XHandler.scala @@ -4,8 +4,10 @@ import cascading.flow.planner.PlannerException /** * Provide handlers and mapping for exceptions - * @param xMap - mapping as Map with Throwable class as key and String as value - * @param dVal - default value for undefined keys in mapping + * @param xMap + * - mapping as Map with Throwable class as key and String as value + * @param dVal + * - default value for undefined keys in mapping */ class XHandler(xMap: Map[Class[_ <: Throwable], String], dVal: String) { @@ -17,8 +19,8 @@ class XHandler(xMap: Map[Class[_ <: Throwable], String], dVal: String) { } /** - * Provide apply method for creating XHandlers with default or custom settings - * and contain messages and mapping + * Provide apply method for creating XHandlers with default or custom settings and contain messages and + * mapping */ object RichXHandler { @@ -27,8 +29,9 @@ object RichXHandler { val BinaryProblem = "GUESS: This may be a problem with the binary version of a dependency. " + "Check which versions of dependencies you're pulling in." - val RequiredCascadingFabricNotInClassPath = "GUESS: Required Cascading fabric is not supplied in the classpath." + - "Check which versions and variants of dependencies you're pulling in." + val RequiredCascadingFabricNotInClassPath = + "GUESS: Required Cascading fabric is not supplied in the classpath." + + "Check which versions and variants of dependencies you're pulling in." val DataIsMissing = "GUESS: Data is missing from the path you provided." 
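RichXHandler maps a throwable's class to one of these GUESS messages by walking the cause chain until it reaches a class it knows how to explain (rootOf and peelUntilMappable in the next hunk). A self-contained sketch of that walk, with a made-up guess table and wrapped error:

  object CauseWalkSketch extends App {
    val guesses: Map[Class[_ <: Throwable], String] =
      Map(classOf[NoSuchMethodError] -> "GUESS: binary incompatibility between dependencies")

    @annotation.tailrec
    def peelUntilMappable(t: Throwable): Class[_ <: Throwable] =
      (guesses.get(t.getClass), t.getCause) match {
        case (Some(_), _)  => t.getClass // found a class we can explain
        case (None, null)  => t.getClass // reached the root; nothing mappable
        case (None, cause) => peelUntilMappable(cause)
      }

    val wrapped = new RuntimeException("wrapper", new NoSuchMethodError("missing"))
    println(guesses.get(peelUntilMappable(wrapped))) // Some(GUESS: binary incompatibility ...)
  }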
@@ -40,14 +43,15 @@ object RichXHandler { classOf[AbstractMethodError] -> BinaryProblem, classOf[NoSuchMethodError] -> BinaryProblem, classOf[InvalidSourceException] -> DataIsMissing, - classOf[PlannerException] -> RequireSinks) + classOf[PlannerException] -> RequireSinks + ) val gitHubUrl = "https://github.com/twitter/scalding/wiki/Common-Exceptions-and-possible-reasons#" @annotation.tailrec final def rootOf(t: Throwable): Throwable = t.getCause match { - case null => t + case null => t case cause => rootOf(cause) } @@ -55,8 +59,8 @@ object RichXHandler { final def peelUntilMappable(t: Throwable): Class[_ <: Throwable] = (mapping.get(t.getClass), t.getCause) match { case (Some(diag), _) => t.getClass // we're going to find a mappable cause. - case (None, null) => t.getClass // we're at the root. There won't be any cause - case (None, cause) => peelUntilMappable(cause) + case (None, null) => t.getClass // we're at the root. There won't be any cause + case (None, cause) => peelUntilMappable(cause) } def createXUrl(t: Throwable): String = @@ -66,7 +70,8 @@ object RichXHandler { new XHandler(xMap, dVal) def apply(t: Throwable): String = - mapping.get(peelUntilMappable(t)) + mapping + .get(peelUntilMappable(t)) .map(_ + "\n") .getOrElse("") + "If you know what exactly caused this error, please consider contributing to GitHub via following link.\n" + diff --git a/scalding-core/src/main/scala/com/twitter/scalding/bdd/BddDsl.scala b/scalding-core/src/main/scala/com/twitter/scalding/bdd/BddDsl.scala index 2f3c4b57d8..881d1db7e7 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/bdd/BddDsl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/bdd/BddDsl.scala @@ -16,12 +16,15 @@ trait BddDsl extends FieldConversions with PipeOperationsConversions { def withSchema(schema: Fields) = new TestSource(this, schema) } - class SimpleTypeTestSourceWithoutSchema[T](val data: Iterable[T])(implicit setter: TupleSetter[T]) extends TestSourceWithoutSchema { + class SimpleTypeTestSourceWithoutSchema[T](val data: Iterable[T])(implicit setter: TupleSetter[T]) + extends TestSourceWithoutSchema { def addSourceToJob(jobTest: JobTest, source: Source): JobTest = jobTest.source[T](source, data)(setter) } - implicit def fromSimpleTypeDataToSourceWithoutSchema[T](data: Iterable[T])(implicit setter: TupleSetter[T]): SimpleTypeTestSourceWithoutSchema[T] = + implicit def fromSimpleTypeDataToSourceWithoutSchema[T](data: Iterable[T])(implicit + setter: TupleSetter[T] + ): SimpleTypeTestSourceWithoutSchema[T] = new SimpleTypeTestSourceWithoutSchema(data)(setter) class TestSource(data: TestSourceWithoutSchema, schema: Fields) { @@ -59,12 +62,17 @@ trait BddDsl extends FieldConversions with PipeOperationsConversions { } case class TestCaseWhen(sources: List[TestSource], operation: PipeOperation) { - def Then[OutputType](assertion: Buffer[OutputType] => Unit)(implicit conv: TupleConverter[OutputType]): Unit = { + def Then[OutputType](assertion: Buffer[OutputType] => Unit)(implicit + conv: TupleConverter[OutputType] + ): Unit = CompleteTestCase(sources, operation, assertion).run() - } } - case class CompleteTestCase[OutputType](sources: List[TestSource], operation: PipeOperation, assertion: Buffer[OutputType] => Unit)(implicit conv: TupleConverter[OutputType]) { + case class CompleteTestCase[OutputType]( + sources: List[TestSource], + operation: PipeOperation, + assertion: Buffer[OutputType] => Unit + )(implicit conv: TupleConverter[OutputType]) { class DummyJob(args: Args) extends Job(args) { val inputPipes: 
List[RichPipe] = sources.map(testSource => RichPipe(testSource.asSource.read)) @@ -78,7 +86,7 @@ trait BddDsl extends FieldConversions with PipeOperationsConversions { val jobTest = JobTest(new DummyJob(_)) // Add Sources - sources foreach { _.addSourceDataToJobTest(jobTest) } + sources.foreach(_.addSourceDataToJobTest(jobTest)) // Add Sink jobTest.sink[OutputType](Tsv("output")) { diff --git a/scalding-core/src/main/scala/com/twitter/scalding/bdd/PipeOperationsConversions.scala b/scalding-core/src/main/scala/com/twitter/scalding/bdd/PipeOperationsConversions.scala index bb37ee95b0..29e4c9965c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/bdd/PipeOperationsConversions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/bdd/PipeOperationsConversions.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.bdd -import com.twitter.scalding.{ Dsl, RichPipe } +import com.twitter.scalding.{Dsl, RichPipe} import cascading.pipe.Pipe trait PipeOperationsConversions { @@ -8,8 +8,11 @@ trait PipeOperationsConversions { trait PipeOperation { def assertPipeSize(pipes: List[RichPipe], expectedSize: Int) = - require(pipes.size == expectedSize, "Cannot apply an operation for " + expectedSize + "pipes to " + pipes.size + " pipes. " + - "Verify matching of given and when clauses in test case definition") + require( + pipes.size == expectedSize, + "Cannot apply an operation for " + expectedSize + "pipes to " + pipes.size + " pipes. " + + "Verify matching of given and when clauses in test case definition" + ) def apply(pipes: List[RichPipe]): Pipe } @@ -40,27 +43,47 @@ trait PipeOperationsConversions { def apply(pipes: List[RichPipe]): Pipe = op(pipes.map(_.pipe)) } - implicit val fromSingleRichPipeFunctionToOperation: (RichPipe => RichPipe) => OnePipeOperation = (op: RichPipe => RichPipe) => new OnePipeOperation(op(_).pipe) - implicit val fromSingleRichPipeToPipeFunctionToOperation: (RichPipe => Pipe) => OnePipeOperation = (op: RichPipe => Pipe) => new OnePipeOperation(op(_)) - - implicit val fromTwoRichPipesFunctionToOperation: ((RichPipe, RichPipe) => RichPipe) => TwoPipesOperation = (op: (RichPipe, RichPipe) => RichPipe) => new TwoPipesOperation(op(_, _).pipe) - implicit val fromTwoRichPipesToRichPipeFunctionToOperation: ((RichPipe, RichPipe) => Pipe) => TwoPipesOperation = (op: (RichPipe, RichPipe) => Pipe) => new TwoPipesOperation(op(_, _)) - - implicit val fromThreeRichPipesFunctionToOperation: ((RichPipe, RichPipe, RichPipe) => RichPipe) => ThreePipesOperation = (op: (RichPipe, RichPipe, RichPipe) => RichPipe) => new ThreePipesOperation(op(_, _, _).pipe) - implicit val fromThreeRichPipesToPipeFunctionToOperation: ((RichPipe, RichPipe, RichPipe) => Pipe) => ThreePipesOperation = (op: (RichPipe, RichPipe, RichPipe) => Pipe) => new ThreePipesOperation(op(_, _, _)) - - implicit val fromRichPipeListFunctionToOperation: (List[RichPipe] => RichPipe) => ListRichPipesOperation = (op: List[RichPipe] => RichPipe) => new ListRichPipesOperation(op(_).pipe) - implicit val fromRichPipeListToPipeFunctionToOperation: (List[RichPipe] => Pipe) => ListRichPipesOperation = (op: List[RichPipe] => Pipe) => new ListRichPipesOperation(op(_)) - - implicit val fromSinglePipeFunctionToOperation: (Pipe => RichPipe) => OnePipeOperation = (op: Pipe => RichPipe) => new OnePipeOperation(op(_).pipe) - implicit val fromSinglePipeToRichPipeFunctionToOperation: (Pipe => Pipe) => OnePipeOperation = (op: Pipe => Pipe) => new OnePipeOperation(op(_)) - - implicit val fromTwoPipeFunctionToOperation: ((Pipe, Pipe) => 
RichPipe) => TwoPipesOperation = (op: (Pipe, Pipe) => RichPipe) => new TwoPipesOperation(op(_, _).pipe) - implicit val fromTwoRichPipeToPipeFunctionToOperation: ((Pipe, Pipe) => Pipe) => TwoPipesOperation = (op: (Pipe, Pipe) => Pipe) => new TwoPipesOperation(op(_, _)) - - implicit val fromThreePipeFunctionToOperation: ((Pipe, Pipe, Pipe) => RichPipe) => ThreePipesOperation = (op: (Pipe, Pipe, Pipe) => RichPipe) => new ThreePipesOperation(op(_, _, _).pipe) - implicit val fromThreeRichPipeToPipeFunctionToOperation: ((Pipe, Pipe, Pipe) => Pipe) => ThreePipesOperation = (op: (Pipe, Pipe, Pipe) => Pipe) => new ThreePipesOperation(op(_, _, _)) - - implicit val fromListPipeFunctionToOperation: (List[Pipe] => RichPipe) => ListPipesOperation = (op: List[Pipe] => RichPipe) => new ListPipesOperation(op(_).pipe) - implicit val fromListRichPipeToPipeFunctionToOperation: (List[Pipe] => Pipe) => ListPipesOperation = (op: List[Pipe] => Pipe) => new ListPipesOperation(op(_)) + implicit val fromSingleRichPipeFunctionToOperation: (RichPipe => RichPipe) => OnePipeOperation = + (op: RichPipe => RichPipe) => new OnePipeOperation(op(_).pipe) + implicit val fromSingleRichPipeToPipeFunctionToOperation: (RichPipe => Pipe) => OnePipeOperation = + (op: RichPipe => Pipe) => new OnePipeOperation(op(_)) + + implicit val fromTwoRichPipesFunctionToOperation: ((RichPipe, RichPipe) => RichPipe) => TwoPipesOperation = + (op: (RichPipe, RichPipe) => RichPipe) => new TwoPipesOperation(op(_, _).pipe) + implicit val fromTwoRichPipesToRichPipeFunctionToOperation + : ((RichPipe, RichPipe) => Pipe) => TwoPipesOperation = (op: (RichPipe, RichPipe) => Pipe) => + new TwoPipesOperation(op(_, _)) + + implicit val fromThreeRichPipesFunctionToOperation + : ((RichPipe, RichPipe, RichPipe) => RichPipe) => ThreePipesOperation = + (op: (RichPipe, RichPipe, RichPipe) => RichPipe) => new ThreePipesOperation(op(_, _, _).pipe) + implicit val fromThreeRichPipesToPipeFunctionToOperation + : ((RichPipe, RichPipe, RichPipe) => Pipe) => ThreePipesOperation = + (op: (RichPipe, RichPipe, RichPipe) => Pipe) => new ThreePipesOperation(op(_, _, _)) + + implicit val fromRichPipeListFunctionToOperation: (List[RichPipe] => RichPipe) => ListRichPipesOperation = + (op: List[RichPipe] => RichPipe) => new ListRichPipesOperation(op(_).pipe) + implicit val fromRichPipeListToPipeFunctionToOperation: (List[RichPipe] => Pipe) => ListRichPipesOperation = + (op: List[RichPipe] => Pipe) => new ListRichPipesOperation(op(_)) + + implicit val fromSinglePipeFunctionToOperation: (Pipe => RichPipe) => OnePipeOperation = + (op: Pipe => RichPipe) => new OnePipeOperation(op(_).pipe) + implicit val fromSinglePipeToRichPipeFunctionToOperation: (Pipe => Pipe) => OnePipeOperation = + (op: Pipe => Pipe) => new OnePipeOperation(op(_)) + + implicit val fromTwoPipeFunctionToOperation: ((Pipe, Pipe) => RichPipe) => TwoPipesOperation = + (op: (Pipe, Pipe) => RichPipe) => new TwoPipesOperation(op(_, _).pipe) + implicit val fromTwoRichPipeToPipeFunctionToOperation: ((Pipe, Pipe) => Pipe) => TwoPipesOperation = + (op: (Pipe, Pipe) => Pipe) => new TwoPipesOperation(op(_, _)) + + implicit val fromThreePipeFunctionToOperation: ((Pipe, Pipe, Pipe) => RichPipe) => ThreePipesOperation = + (op: (Pipe, Pipe, Pipe) => RichPipe) => new ThreePipesOperation(op(_, _, _).pipe) + implicit val fromThreeRichPipeToPipeFunctionToOperation + : ((Pipe, Pipe, Pipe) => Pipe) => ThreePipesOperation = (op: (Pipe, Pipe, Pipe) => Pipe) => + new ThreePipesOperation(op(_, _, _)) + + implicit val 
fromListPipeFunctionToOperation: (List[Pipe] => RichPipe) => ListPipesOperation = + (op: List[Pipe] => RichPipe) => new ListPipesOperation(op(_).pipe) + implicit val fromListRichPipeToPipeFunctionToOperation: (List[Pipe] => Pipe) => ListPipesOperation = + (op: List[Pipe] => Pipe) => new ListPipesOperation(op(_)) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/bdd/TBddDsl.scala b/scalding-core/src/main/scala/com/twitter/scalding/bdd/TBddDsl.scala index 7c1ace3474..9f52b95dbf 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/bdd/TBddDsl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/bdd/TBddDsl.scala @@ -8,7 +8,8 @@ import TDsl._ trait TBddDsl extends FieldConversions with TypedPipeOperationsConversions { - def Given[TypeIn](source: TypedTestSource[TypeIn]): TestCaseGiven1[TypeIn] = new TestCaseGiven1[TypeIn](source) + def Given[TypeIn](source: TypedTestSource[TypeIn]): TestCaseGiven1[TypeIn] = + new TestCaseGiven1[TypeIn](source) def GivenSources(sources: List[TypedTestSource[_]]): TestCaseGivenList = new TestCaseGivenList(sources) @@ -16,10 +17,10 @@ trait TBddDsl extends FieldConversions with TypedPipeOperationsConversions { def data: Iterable[T] def asSource: Source = - IterableSource(data map { Tuple1(_) }, 'tuple) + IterableSource(data.map(Tuple1(_)), 'tuple) def readFromSourceAsTyped(implicit flowDef: FlowDef, mode: Mode): TypedPipe[T] = - asSource.read.toTypedPipe[Tuple1[T]]('tuple) map { _._1 } + asSource.read.toTypedPipe[Tuple1[T]]('tuple).map(_._1) def addSourceDataToJobTest(jobTest: JobTest) = jobTest.source(asSource, data) } @@ -35,34 +36,54 @@ trait TBddDsl extends FieldConversions with TypedPipeOperationsConversions { case class TestCaseGiven1[TypeIn](source: TypedTestSource[TypeIn]) { def And[TypeIn2](other: TypedTestSource[TypeIn2]) = TestCaseGiven2[TypeIn, TypeIn2](source, other) - def When[TypeOut: Manifest: TupleConverter: TupleSetter](op: OneTypedPipeOperation[TypeIn, TypeOut]): TestCaseWhen[TypeOut] = TestCaseWhen(List(source), op) + def When[TypeOut: Manifest: TupleConverter: TupleSetter]( + op: OneTypedPipeOperation[TypeIn, TypeOut] + ): TestCaseWhen[TypeOut] = TestCaseWhen(List(source), op) } - case class TestCaseGiven2[TypeIn1, TypeIn2](source: TypedTestSource[TypeIn1], other: TypedTestSource[TypeIn2]) { + case class TestCaseGiven2[TypeIn1, TypeIn2]( + source: TypedTestSource[TypeIn1], + other: TypedTestSource[TypeIn2] + ) { def And[TypeIn3](third: TypedTestSource[TypeIn3]) = TestCaseGiven3(source, other, third) - def When[TypeOut: Manifest: TupleConverter: TupleSetter](op: TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut]): TestCaseWhen[TypeOut] = TestCaseWhen(List(source, other), op) + def When[TypeOut: Manifest: TupleConverter: TupleSetter]( + op: TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut] + ): TestCaseWhen[TypeOut] = TestCaseWhen(List(source, other), op) } - case class TestCaseGiven3[TypeIn1, TypeIn2, TypeIn3](source: TypedTestSource[TypeIn1], other: TypedTestSource[TypeIn2], third: TypedTestSource[TypeIn3]) { + case class TestCaseGiven3[TypeIn1, TypeIn2, TypeIn3]( + source: TypedTestSource[TypeIn1], + other: TypedTestSource[TypeIn2], + third: TypedTestSource[TypeIn3] + ) { def And(next: TypedTestSource[_]) = TestCaseGivenList(List(source, other, third, next)) - def When[TypeOut: Manifest: TupleConverter: TupleSetter](op: ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut]): TestCaseWhen[TypeOut] = TestCaseWhen(List(source, other, third), op) + def When[TypeOut: Manifest: TupleConverter: TupleSetter]( 
+ op: ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut] + ): TestCaseWhen[TypeOut] = TestCaseWhen(List(source, other, third), op) } case class TestCaseGivenList(sources: List[TypedTestSource[_]]) { def And(next: TypedTestSource[_]) = TestCaseGivenList((next :: sources.reverse).reverse) - def When[TypeOut: Manifest](op: ListOfTypedPipesOperations[TypeOut]): TestCaseWhen[TypeOut] = TestCaseWhen(sources, op) + def When[TypeOut: Manifest](op: ListOfTypedPipesOperations[TypeOut]): TestCaseWhen[TypeOut] = + TestCaseWhen(sources, op) } - case class TestCaseWhen[OutputType: Manifest](sources: List[TypedTestSource[_]], operation: TypedPipeOperation[OutputType]) { - def Then(assertion: Buffer[OutputType] => Unit): Unit = { + case class TestCaseWhen[OutputType: Manifest]( + sources: List[TypedTestSource[_]], + operation: TypedPipeOperation[OutputType] + ) { + def Then(assertion: Buffer[OutputType] => Unit): Unit = CompleteTestCase(sources, operation, assertion).run() - } } - case class CompleteTestCase[OutputType: Manifest](sources: List[TypedTestSource[_]], operation: TypedPipeOperation[OutputType], assertion: Buffer[OutputType] => Unit) { + case class CompleteTestCase[OutputType: Manifest]( + sources: List[TypedTestSource[_]], + operation: TypedPipeOperation[OutputType], + assertion: Buffer[OutputType] => Unit + ) { class DummyJob(args: Args) extends Job(args) { val inputPipes: List[TypedPipe[_]] = sources.map(testSource => testSource.readFromSourceAsTyped) @@ -81,7 +102,7 @@ trait TBddDsl extends FieldConversions with TypedPipeOperationsConversions { val jobTest = JobTest(new DummyJob(_)) // Add Sources - sources foreach { _.addSourceDataToJobTest(jobTest) } + sources.foreach(_.addSourceDataToJobTest(jobTest)) implicit val td: TypeDescriptor[OutputType] = new TypeDescriptor[OutputType] { def converter = TupleConverter.singleConverter @@ -90,8 +111,8 @@ trait TBddDsl extends FieldConversions with TypedPipeOperationsConversions { } // Add Sink - jobTest.sink[OutputType](TypedText.tsv[OutputType]("output")) { - buffer: Buffer[OutputType] => assertion(buffer) + jobTest.sink[OutputType](TypedText.tsv[OutputType]("output")) { buffer: Buffer[OutputType] => + assertion(buffer) } // Execute diff --git a/scalding-core/src/main/scala/com/twitter/scalding/bdd/TypedPipeOperationsConversions.scala b/scalding-core/src/main/scala/com/twitter/scalding/bdd/TypedPipeOperationsConversions.scala index cc497ffd66..cb0ef31537 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/bdd/TypedPipeOperationsConversions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/bdd/TypedPipeOperationsConversions.scala @@ -6,51 +6,70 @@ trait TypedPipeOperationsConversions { trait TypedPipeOperation[TypeOut] { def assertPipeSize(pipes: List[TypedPipe[_]], expectedSize: Int) = - require(pipes.size == expectedSize, "Cannot apply an operation for " + expectedSize + "pipes to " + pipes.size + " pipes. " + - "Verify matching of given and when clauses in test case definition") + require( + pipes.size == expectedSize, + "Cannot apply an operation for " + expectedSize + "pipes to " + pipes.size + " pipes. 
" + + "Verify matching of given and when clauses in test case definition" + ) def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] } - class OneTypedPipeOperation[TypeIn, TypeOut](op: TypedPipe[TypeIn] => TypedPipe[TypeOut]) extends TypedPipeOperation[TypeOut] { + class OneTypedPipeOperation[TypeIn, TypeOut](op: TypedPipe[TypeIn] => TypedPipe[TypeOut]) + extends TypedPipeOperation[TypeOut] { override def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] = { assertPipeSize(pipes, 1) op(pipes.head.asInstanceOf[TypedPipe[TypeIn]]) } } - class TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut](op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2]) => TypedPipe[TypeOut]) extends TypedPipeOperation[TypeOut] { + class TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut]( + op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2]) => TypedPipe[TypeOut] + ) extends TypedPipeOperation[TypeOut] { override def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] = { assertPipeSize(pipes, 2) op( pipes(0).asInstanceOf[TypedPipe[TypeIn1]], // linter:disable - pipes(1).asInstanceOf[TypedPipe[TypeIn2]]) + pipes(1).asInstanceOf[TypedPipe[TypeIn2]] + ) } } - class ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut](op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2], TypedPipe[TypeIn3]) => TypedPipe[TypeOut]) extends TypedPipeOperation[TypeOut] { + class ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut]( + op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2], TypedPipe[TypeIn3]) => TypedPipe[TypeOut] + ) extends TypedPipeOperation[TypeOut] { override def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] = { assertPipeSize(pipes, 3) op( pipes(0).asInstanceOf[TypedPipe[TypeIn1]], // linter:disable pipes(1).asInstanceOf[TypedPipe[TypeIn2]], - pipes(2).asInstanceOf[TypedPipe[TypeIn3]]) + pipes(2).asInstanceOf[TypedPipe[TypeIn3]] + ) } } - class ListOfTypedPipesOperations[TypeOut](op: List[TypedPipe[_]] => TypedPipe[TypeOut]) extends TypedPipeOperation[TypeOut] { + class ListOfTypedPipesOperations[TypeOut](op: List[TypedPipe[_]] => TypedPipe[TypeOut]) + extends TypedPipeOperation[TypeOut] { override def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] = op(pipes) } - implicit def fromSingleTypedPipeFunctionToOperation[TypeIn, TypeOut](op: TypedPipe[TypeIn] => TypedPipe[TypeOut]): OneTypedPipeOperation[TypeIn, TypeOut] = + implicit def fromSingleTypedPipeFunctionToOperation[TypeIn, TypeOut]( + op: TypedPipe[TypeIn] => TypedPipe[TypeOut] + ): OneTypedPipeOperation[TypeIn, TypeOut] = new OneTypedPipeOperation[TypeIn, TypeOut](op) - implicit def fromTwoTypedPipesFunctionToOperation[TypeIn1, TypeIn2, TypeOut](op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2]) => TypedPipe[TypeOut]): TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut] = + implicit def fromTwoTypedPipesFunctionToOperation[TypeIn1, TypeIn2, TypeOut]( + op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2]) => TypedPipe[TypeOut] + ): TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut] = new TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut](op) - implicit def fromThreeTypedPipesFunctionToOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut](op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2], TypedPipe[TypeIn3]) => TypedPipe[TypeOut]): ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut] = + implicit def fromThreeTypedPipesFunctionToOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut]( + op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2], TypedPipe[TypeIn3]) => TypedPipe[TypeOut] + ): ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut] = new ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, 
TypeOut](op) - implicit def fromListOfTypedPipesFunctionToOperation[TypeOut](op: List[TypedPipe[_]] => TypedPipe[TypeOut]): ListOfTypedPipesOperations[TypeOut] = + implicit def fromListOfTypedPipesFunctionToOperation[TypeOut]( + op: List[TypedPipe[_]] => TypedPipe[TypeOut] + ): ListOfTypedPipesOperations[TypeOut] = new ListOfTypedPipesOperations[TypeOut](op) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/Common.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/Common.scala index 0c8da2e65e..6b2a616a33 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/estimation/Common.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/Common.scala @@ -2,7 +2,7 @@ package com.twitter.scalding.estimation import cascading.flow.FlowStep import cascading.tap.hadoop.Hfs -import cascading.tap.{ CompositeTap, Tap } +import cascading.tap.{CompositeTap, Tap} import com.twitter.scalding.tap.GlobHfs import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory @@ -25,7 +25,7 @@ object Common { val conf = step.getConfig unrollTaps(step).flatMap { case tap: GlobHfs => Some(tap.toString -> tap.getSize(conf)) - case tap: Hfs => Some(tap.toString -> GlobHfs.getSize(tap.getPath, conf)) + case tap: Hfs => Some(tap.toString -> GlobHfs.getSize(tap.getPath, conf)) case tap => LOG.warn("InputSizeReducerEstimator unable to calculate size: " + tap) None @@ -33,4 +33,4 @@ object Common { } def totalInputSize(step: FlowStep[JobConf]): Long = inputSizes(step).map(_._2).sum -} \ No newline at end of file +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/Estimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/Estimator.scala index f16181071e..82b6036543 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/estimation/Estimator.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/Estimator.scala @@ -1,19 +1,21 @@ package com.twitter.scalding.estimation -import cascading.flow.{ Flow, FlowStep } +import cascading.flow.{Flow, FlowStep} import com.twitter.algebird.Monoid import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory -import scala.util.{ Failure, Success } +import scala.util.{Failure, Success} case class FlowStrategyInfo( - flow: Flow[JobConf], - predecessorSteps: Seq[FlowStep[JobConf]], - step: FlowStep[JobConf]) + flow: Flow[JobConf], + predecessorSteps: Seq[FlowStep[JobConf]], + step: FlowStep[JobConf] +) /** * Trait for estimation some parameters of Job. - * @tparam T return type of estimation + * @tparam T + * return type of estimation */ trait Estimator[T] { def estimate(info: FlowStrategyInfo): Option[T] @@ -22,12 +24,11 @@ trait Estimator[T] { case class FallbackEstimator[T](first: Estimator[T], fallback: Estimator[T]) extends Estimator[T] { private val LOG = LoggerFactory.getLogger(this.getClass) - override def estimate(info: FlowStrategyInfo): Option[T] = { + override def estimate(info: FlowStrategyInfo): Option[T] = first.estimate(info).orElse { LOG.warn(s"$first estimator failed. 
Falling back to $fallback.") fallback.estimate(info) } - } } class FallbackEstimatorMonoid[T] extends Monoid[Estimator[T]] { diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/HistoryService.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/HistoryService.scala index 488b45b280..94beb9db3d 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/estimation/HistoryService.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/HistoryService.scala @@ -6,35 +6,37 @@ import scala.util.Try * Info about a prior FlowStep, provided by implementers of HistoryService */ final case class FlowStepHistory( - keys: FlowStepKeys, - submitTimeMillis: Long, - launchTimeMillis: Long, - finishTimeMillis: Long, - totalMaps: Long, - totalReduces: Long, - finishedMaps: Long, - finishedReduces: Long, - failedMaps: Long, - failedReduces: Long, - mapFileBytesRead: Long, - mapFileBytesWritten: Long, - mapOutputBytes: Long, - reduceFileBytesRead: Long, - hdfsBytesRead: Long, - hdfsBytesWritten: Long, - mapperTimeMillis: Long, - reducerTimeMillis: Long, - reduceShuffleBytes: Long, - cost: Double, - tasks: Seq[Task]) + keys: FlowStepKeys, + submitTimeMillis: Long, + launchTimeMillis: Long, + finishTimeMillis: Long, + totalMaps: Long, + totalReduces: Long, + finishedMaps: Long, + finishedReduces: Long, + failedMaps: Long, + failedReduces: Long, + mapFileBytesRead: Long, + mapFileBytesWritten: Long, + mapOutputBytes: Long, + reduceFileBytesRead: Long, + hdfsBytesRead: Long, + hdfsBytesWritten: Long, + mapperTimeMillis: Long, + reducerTimeMillis: Long, + reduceShuffleBytes: Long, + cost: Double, + tasks: Seq[Task] +) final case class FlowStepKeys( - jobName: String, - user: String, - priority: String, - status: String, - version: String, - queue: String) + jobName: String, + user: String, + priority: String, + status: String, + version: String, + queue: String +) final case class Task(details: Map[String, Any], counters: Map[String, Long]) { def taskType: Option[String] = details.get(Task.TaskType).map(_.asInstanceOf[String]) @@ -47,4 +49,3 @@ object Task { trait HistoryService { def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorConfig.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorConfig.scala index 55f67e4fd2..10dc824d07 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorConfig.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorConfig.scala @@ -3,6 +3,7 @@ package com.twitter.scalding.estimation.memory import org.apache.hadoop.mapred.JobConf object MemoryEstimatorConfig { + /** Output param: what the original job map memory was. */ val originalMapMemory = "scalding.map.memory.estimator.original" @@ -10,9 +11,8 @@ object MemoryEstimatorConfig { val originalReduceMemory = "scalding.reduce.memory.estimator.original" /** - * Value of alpha for exponential smoothing. - * Lower values ensure more smoothing and less importance to newer data - * Higher values provide lesser smoothing and more importance to newer data + * Value of alpha for exponential smoothing. 
Lower values ensure more smoothing and less importance to newer + * data Higher values provide lesser smoothing and more importance to newer data */ val alphaKey = "scalding.memory.estimator.alpha" @@ -25,7 +25,7 @@ object MemoryEstimatorConfig { val minContainerMemoryKey = "scalding.memory.estimator.container.min" - /** yarn allocates in increments. So we might as well round up our container ask **/ + /** yarn allocates in increments. So we might as well round up our container ask * */ val yarnSchedulerIncrementAllocationMB = "yarn.scheduler.increment-allocation-mb" /** Maximum number of history items to use for memory estimation. */ diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategy.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategy.scala index ac961bb38e..5342e52189 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategy.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategy.scala @@ -1,10 +1,10 @@ package com.twitter.scalding.estimation.memory -import cascading.flow.{ Flow, FlowStep, FlowStepStrategy } +import cascading.flow.{Flow, FlowStep, FlowStepStrategy} import com.twitter.algebird.Monoid -import com.twitter.scalding.estimation.{ Estimator, FallbackEstimatorMonoid, FlowStrategyInfo } -import com.twitter.scalding.{ Config, StringUtility } -import java.util.{ List => JList } +import com.twitter.scalding.estimation.{Estimator, FallbackEstimatorMonoid, FlowStrategyInfo} +import com.twitter.scalding.{Config, StringUtility} +import java.util.{List => JList} import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ @@ -17,38 +17,37 @@ object MemoryEstimatorStepStrategy extends FlowStepStrategy[JobConf] { new FallbackEstimatorMonoid[MemoryEstimate] /** - * Make memory estimate, possibly overriding explicitly-set memory settings, - * and save useful info (such as the original & estimate value of memory settings) - * in JobConf for later consumption. + * Make memory estimate, possibly overriding explicitly-set memory settings, and save useful info (such as + * the original & estimate value of memory settings) in JobConf for later consumption. * * Called by Cascading at the start of each job step. 
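The alpha parameter described in MemoryEstimatorConfig above drives exponential smoothing of past memory usage. A numeric sketch of that effect, not the estimator's exact fold, with invented history values:

  object SmoothingSketch extends App {
    // smoothed(0) = x(0);  smoothed(n) = alpha * x(n) + (1 - alpha) * smoothed(n - 1)
    def smooth(history: Seq[Double], alpha: Double): Double =
      history.tail.foldLeft(history.head)((acc, x) => alpha * x + (1 - alpha) * acc)

    val maxHeapsMb = Seq(1500.0, 1550.0, 1600.0, 3000.0) // last run spiked
    println(smooth(maxHeapsMb, alpha = 0.2)) // ~1822: low alpha discounts the recent spike
    println(smooth(maxHeapsMb, alpha = 0.8)) // ~2718: high alpha follows the newest data
  }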
*/ final override def apply( - flow: Flow[JobConf], - preds: JList[FlowStep[JobConf]], - step: FlowStep[JobConf]): Unit = { - + flow: Flow[JobConf], + preds: JList[FlowStep[JobConf]], + step: FlowStep[JobConf] + ): Unit = if (skipMemoryEstimation(step)) { LOG.info(s"Skipping memory estimation as ${Config.MemoryEstimators} is not set ") } else { estimate(flow, preds.asScala, step) } - } private[estimation] def skipMemoryEstimation(step: FlowStep[JobConf]): Boolean = step.getConfig.get(Config.MemoryEstimators, "").isEmpty private[estimation] def estimate( - flow: Flow[JobConf], - preds: Seq[FlowStep[JobConf]], - step: FlowStep[JobConf]): Unit = { + flow: Flow[JobConf], + preds: Seq[FlowStep[JobConf]], + step: FlowStep[JobConf] + ): Unit = { val conf = step.getConfig Option(conf.get(Config.MemoryEstimators)).foreach { clsNames => - val clsLoader = Thread.currentThread.getContextClassLoader - val estimators = StringUtility.fastSplit(clsNames, ",") + val estimators = StringUtility + .fastSplit(clsNames, ",") .map(clsLoader.loadClass(_).newInstance.asInstanceOf[Estimator[MemoryEstimate]]) val combinedEstimator = Monoid.sum(estimators) @@ -73,7 +72,11 @@ object MemoryEstimatorStepStrategy extends FlowStepStrategy[JobConf] { } } - private[estimation] def setMemory(memorySettings: (Long, Long), keys: (String, String), conf: JobConf): Unit = { + private[estimation] def setMemory( + memorySettings: (Long, Long), + keys: (String, String), + conf: JobConf + ): Unit = { val (xmxMemory, containerMemory) = memorySettings val (xmxKey, containerKey) = keys @@ -85,7 +88,8 @@ object MemoryEstimatorStepStrategy extends FlowStepStrategy[JobConf] { private[estimation] def setXmxMemory(xmxKey: String, xmxMemory: Long, conf: JobConf): Unit = { val xmxOpts = conf.get(xmxKey, "") //remove existing xmx / xms - val xmxOptsWithoutXm = xmxOpts.split(" ").filterNot(s => s.startsWith("-Xmx") || s.startsWith("-Xms")).mkString(" ") + val xmxOptsWithoutXm = + xmxOpts.split(" ").filterNot(s => s.startsWith("-Xmx") || s.startsWith("-Xms")).mkString(" ") conf.set(xmxKey, xmxOptsWithoutXm + s" -Xmx${xmxMemory}m") } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimator.scala index 8d28749ad9..424fd0ab8c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimator.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimator.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.estimation.memory -import com.twitter.scalding.estimation.{ FlowStepHistory, FlowStrategyInfo, HistoryEstimator, Task } +import com.twitter.scalding.estimation.{FlowStepHistory, FlowStrategyInfo, HistoryEstimator, Task} import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory @@ -26,7 +26,11 @@ trait SmoothedHistoryMemoryEstimator extends HistoryEstimator[MemoryEstimate] { override def maxHistoryItems(conf: JobConf): Int = MemoryEstimatorConfig.getMaxHistory(conf) - override protected def estimate(info: FlowStrategyInfo, conf: JobConf, history: Seq[FlowStepHistory]): Option[MemoryEstimate] = { + override protected def estimate( + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[MemoryEstimate] = { // iterate over mem history // collect: for maps, list of max memory in past runs // for reduce, list of max memory in past runs @@ -41,9 +45,12 @@ trait 
SmoothedHistoryMemoryEstimator extends HistoryEstimator[MemoryEstimate] { val containerMemoryOfMapper = containerMemory(xmxMemoryOfMapper, conf) val containerMemoryOfReducer = containerMemory(xmxMemoryOfReducer, conf) - Some(MemoryEstimate( - cappedMemory(containerMemoryOfMapper, conf), - cappedMemory(containerMemoryOfReducer, conf))) + Some( + MemoryEstimate( + cappedMemory(containerMemoryOfMapper, conf), + cappedMemory(containerMemoryOfReducer, conf) + ) + ) } private def xmxMemory(historyMemory: Seq[Long], conf: JobConf): Double = { @@ -55,14 +62,15 @@ trait SmoothedHistoryMemoryEstimator extends HistoryEstimator[MemoryEstimate] { //TODO handle gc - LOG.info(s"Calculated xmx memory for: $historyMemory smoothAvg = $smoothEstimation, scaled: $scaledEstimation") + LOG.info( + s"Calculated xmx memory for: $historyMemory smoothAvg = $smoothEstimation, scaled: $scaledEstimation" + ) scaledEstimation / (1024L * 1024) } - private def containerMemory(xmxMemory: Double, conf: JobConf): Double = { + private def containerMemory(xmxMemory: Double, conf: JobConf): Double = xmxMemory * MemoryEstimatorConfig.getXmxScaleFactor(conf) - } private def cappedMemory(containerMemory: Double, conf: JobConf): Option[(Long, Long)] = { val schedulerIncrement = MemoryEstimatorConfig.getYarnSchedulerIncrement(conf) @@ -85,8 +93,8 @@ trait SmoothedHistoryMemoryEstimator extends HistoryEstimator[MemoryEstimate] { private def historyMemory(history: FlowStepHistory): (Option[Long], Option[Long]) = { LOG.debug(s"Processing tasks: ${history.tasks}") - val reduceTasks: Seq[Task] = history.tasks.filter { t => t.taskType.contains("REDUCE") } - val mapTasks: Seq[Task] = history.tasks.filter { t => t.taskType.contains("MAP") } + val reduceTasks: Seq[Task] = history.tasks.filter(t => t.taskType.contains("REDUCE")) + val mapTasks: Seq[Task] = history.tasks.filter(t => t.taskType.contains("MAP")) // handle empty task list due to either no task history / lack of reducers val maxReduceCommittedHeap: Option[Long] = @@ -101,7 +109,9 @@ trait SmoothedHistoryMemoryEstimator extends HistoryEstimator[MemoryEstimate] { else Some(mapTasks.flatMap(_.committedHeapBytes).max) - LOG.info(s"Calculated max committed heap for job: ${history.keys}, map: $maxMapCommittedHeap reduce: $maxReduceCommittedHeap") + LOG.info( + s"Calculated max committed heap for job: ${history.keys}, map: $maxMapCommittedHeap reduce: $maxReduceCommittedHeap" + ) (maxMapCommittedHeap, maxReduceCommittedHeap) } @@ -114,4 +124,4 @@ trait SmoothedHistoryMemoryEstimator extends HistoryEstimator[MemoryEstimate] { private def roundUp(value: Double, block: Double): Long = (Math.ceil(value / block) * block).toLong -} \ No newline at end of file +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/filecache/DistributedCacheFile.scala b/scalding-core/src/main/scala/com/twitter/scalding/filecache/DistributedCacheFile.scala index 1ded2d86af..e4c637af81 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/filecache/DistributedCacheFile.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/filecache/DistributedCacheFile.scala @@ -7,7 +7,7 @@ import java.net.URI import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.mapreduce.filecache.{ DistributedCache => HDistributedCache } +import org.apache.hadoop.mapreduce.filecache.{DistributedCache => HDistributedCache} import org.apache.hadoop.fs.Path object URIHasher { @@ -21,9 +21,11 @@ object URIHasher { /** * generates hashes of hdfs URIs using algebird's 
MurmurHash128 - * @param uri the URI to generate a hash for - * @return a hex-encoded string of the bytes of the 128 bit hash. The results are zero padded on the left, so - * this string will always be 32 characters long. + * @param uri + * the URI to generate a hash for + * @return + * a hex-encoded string of the bytes of the 128 bit hash. The results are zero padded on the left, so this + * string will always be 32 characters long. */ def apply(uri: URI): String = { val (h1, h2) = HashFunc(uri.toASCIIString) @@ -33,15 +35,15 @@ object URIHasher { } /** - * The distributed cache is simply hadoop's method for allowing each node local access to a - * specific file. The registration of that file must be called with the Configuration of the job, - * and not when it's on a mapper or reducer. Additionally, a unique name for the node-local access - * path must be used to prevent collisions in the cluster. This class provides this functionality. + * The distributed cache is simply hadoop's method for allowing each node local access to a specific file. The + * registration of that file must be called with the Configuration of the job, and not when it's on a mapper + * or reducer. Additionally, a unique name for the node-local access path must be used to prevent collisions + * in the cluster. This class provides this functionality. * - * In the configuration phase, the file URI is used to construct an UncachedFile instance. The name - * of the symlink to use on the mappers is only available after calling the add() method, which - * registers the file and computes the unique symlink name and returns a CachedFile instance. - * The CachedFile instance is Serializable, it's designed to be assigned to a val and accessed later. + * In the configuration phase, the file URI is used to construct an UncachedFile instance. The name of the + * symlink to use on the mappers is only available after calling the add() method, which registers the file + * and computes the unique symlink name and returns a CachedFile instance. The CachedFile instance is + * Serializable, it's designed to be assigned to a val and accessed later. * * The local symlink is available thorugh .file or .path depending on what type you need. * @@ -60,33 +62,35 @@ object URIHasher { * * {{{ * object YourExecJob extends ExecutionApp { - * override def job = - * Execution.withCachedFile("/path/to/your/file.txt") { file => - * doSomething(theCachedFile.path) - * } + * override def job = + * Execution.withCachedFile("/path/to/your/file.txt") { file => + * doSomething(theCachedFile.path) + * } * } * * example with Execution and multiple files: * * object YourExecJob extends ExecutionApp { - * override def job = - * Execution.withCachedFile("/path/to/your/one.txt") { one => - * Execution.withCachedFile("/path/to/your/second.txt") { second => - * doSomething(one.path, second.path) - * } - * } + * override def job = + * Execution.withCachedFile("/path/to/your/one.txt") { one => + * Execution.withCachedFile("/path/to/your/second.txt") { second => + * doSomething(one.path, second.path) + * } + * } * } * * }}} - * */ object DistributedCacheFile { + /** - * Create an object that can be used to register a given URI (representing an hdfs file) - * that should be added to the DistributedCache. + * Create an object that can be used to register a given URI (representing an hdfs file) that should be + * added to the DistributedCache. 
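URIHasher's contract above, a left-zero-padded 32-character hex string built from the two 64-bit halves of a 128-bit hash, comes down to fixed-width formatting. A sketch with stand-in hash values rather than algebird's MurmurHash128:

  object HexPadSketch extends App {
    def hexPad(x: Long): String = "%016x".format(x) // 16 hex digits per 64-bit half

    val (h1, h2) = (255L, 4096L) // stand-ins for the two halves of a 128-bit hash
    val symlinkHash = hexPad(h1) + hexPad(h2)
    println(symlinkHash)        // 00000000000000ff0000000000001000
    println(symlinkHash.length) // always 32
  }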
* - * @param uri The fully qualified URI that points to the hdfs file to add - * @return A CachedFile instance + * @param uri + * The fully qualified URI that points to the hdfs file to add + * @return + * A CachedFile instance */ def apply(uri: URI)(implicit mode: Mode): CachedFile = { val cachedFile = UncachedFile(Right(uri)).cached(mode) @@ -110,13 +114,12 @@ object DistributedCacheFile { private[scalding] def cachedFile(path: String, mode: Mode): CachedFile = UncachedFile(Left(path)).cached(mode) - private[scalding] def addCachedFile(cachedFile: CachedFile, mode: Mode): Unit = { + private[scalding] def addCachedFile(cachedFile: CachedFile, mode: Mode): Unit = (cachedFile, mode) match { case (hadoopFile: HadoopCachedFile, hadoopMode: HadoopMode) => HDistributedCache.addCacheFile(symlinkedUriFor(hadoopFile.sourceUri), hadoopMode.jobConf) case _ => } - } def symlinkNameFor(uri: URI): String = { val hexsum = URIHasher(uri) @@ -133,17 +136,17 @@ final case class UncachedFile private[scalding] (source: Either[String, URI]) { def cached(mode: Mode): CachedFile = mode match { - case Hdfs(_, conf) => addHdfs(conf) - case HadoopTest(conf, _) => addHdfs(conf) + case Hdfs(_, conf) => addHdfs(conf) + case HadoopTest(conf, _) => addHdfs(conf) case (Local(_) | Test(_)) => addLocal() - case _ => throw new RuntimeException("unhandled mode: %s".format(mode)) + case _ => throw new RuntimeException("unhandled mode: %s".format(mode)) } private[this] def addLocal(): CachedFile = { val path = source match { case Left(strPath) => strPath - case Right(uri) => uri.getPath + case Right(uri) => uri.getPath } LocallyCachedFile(path) @@ -164,7 +167,7 @@ final case class UncachedFile private[scalding] (source: Either[String, URI]) { val sourceUri = source match { case Left(strPath) => makeQualifiedStr(strPath, conf) - case Right(uri) => makeQualifiedURI(uri, conf) + case Right(uri) => makeQualifiedURI(uri, conf) } HadoopCachedFile(sourceUri) @@ -172,6 +175,7 @@ final case class UncachedFile private[scalding] (source: Either[String, URI]) { } sealed abstract class CachedFile { + /** The path to the cahced file on disk (the symlink registered at configuration time) */ def path: String diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/MacroImplicits.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/MacroImplicits.scala index aa3d822da7..7a98dd380c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/macros/MacroImplicits.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/MacroImplicits.scala @@ -21,10 +21,14 @@ import com.twitter.scalding._ import com.twitter.scalding.macros.impl._ object MacroImplicits { + /** * This method provides proof that the given type is a case class. 
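As a usage sketch of these macro entry points (the explicit forms live in Macros.scala just below; MacroImplicits provides the same derivations implicitly), with an illustrative case class:

  import com.twitter.scalding.{TupleConverter, TupleSetter}
  import com.twitter.scalding.macros.Macros

  case class Visit(userId: Long, page: String) // illustrative only

  object MacroUsageSketch extends App {
    val setter: TupleSetter[Visit]       = Macros.caseClassTupleSetter[Visit]
    val converter: TupleConverter[Visit] = Macros.caseClassTupleConverter[Visit]

    println(setter.arity)                // 2
    println(setter(Visit(42L, "/home"))) // a cascading Tuple holding the two values
    println(converter.arity)             // 2
  }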
*/ - implicit def materializeCaseClassTupleSetter[T]: TupleSetter[T] = macro TupleSetterImpl.caseClassTupleSetterImpl[T] - implicit def materializeCaseClassTupleConverter[T]: TupleConverter[T] = macro TupleConverterImpl.caseClassTupleConverterImpl[T] - implicit def materializeCaseClassTypeDescriptor[T]: TypeDescriptor[T] = macro TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] + implicit def materializeCaseClassTupleSetter[T]: TupleSetter[T] = + macro TupleSetterImpl.caseClassTupleSetterImpl[T] + implicit def materializeCaseClassTupleConverter[T]: TupleConverter[T] = + macro TupleConverterImpl.caseClassTupleConverterImpl[T] + implicit def materializeCaseClassTypeDescriptor[T]: TypeDescriptor[T] = + macro TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/Macros.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/Macros.scala index deec82f58e..790fc767ae 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/macros/Macros.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/Macros.scala @@ -28,10 +28,12 @@ object Macros { // In the unknown casehowever if a type is reached that we don't know what to do we store that type into the tuple. def caseClassTupleSetter[T]: TupleSetter[T] = macro TupleSetterImpl.caseClassTupleSetterImpl[T] - def caseClassTupleSetterWithUnknown[T]: TupleSetter[T] = macro TupleSetterImpl.caseClassTupleSetterWithUnknownImpl[T] + def caseClassTupleSetterWithUnknown[T]: TupleSetter[T] = + macro TupleSetterImpl.caseClassTupleSetterWithUnknownImpl[T] def caseClassTupleConverter[T]: TupleConverter[T] = macro TupleConverterImpl.caseClassTupleConverterImpl[T] - def caseClassTupleConverterWithUnknown[T]: TupleConverter[T] = macro TupleConverterImpl.caseClassTupleConverterWithUnknownImpl[T] + def caseClassTupleConverterWithUnknown[T]: TupleConverter[T] = + macro TupleConverterImpl.caseClassTupleConverterWithUnknownImpl[T] def toFields[T]: Fields = macro FieldsProviderImpl.toFieldsImpl[T] def toFieldsWithUnknown[T]: Fields = macro FieldsProviderImpl.toFieldsWithUnknownImpl[T] @@ -42,6 +44,8 @@ object Macros { def toIndexedFields[T]: Fields = macro FieldsProviderImpl.toIndexedFieldsImpl[T] def toIndexedFieldsWithUnknown[T]: Fields = macro FieldsProviderImpl.toIndexedFieldsWithUnknownImpl[T] - def caseClassTypeDescriptor[T]: TypeDescriptor[T] = macro TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] - def caseClassTypeDescriptorWithUnknown[T]: TypeDescriptor[T] = macro TypeDescriptorProviderImpl.caseClassTypeDescriptorWithUnknownImpl[T] + def caseClassTypeDescriptor[T]: TypeDescriptor[T] = + macro TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] + def caseClassTypeDescriptorWithUnknown[T]: TypeDescriptor[T] = + macro TypeDescriptorProviderImpl.caseClassTypeDescriptorWithUnknownImpl[T] } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassBasedSetterImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassBasedSetterImpl.scala index 7202be2096..1493c17172 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassBasedSetterImpl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassBasedSetterImpl.scala @@ -16,20 +16,22 @@ package com.twitter.scalding.macros.impl import scala.reflect.macros.Context -import scala.util.{ Failure, Success } +import scala.util.{Failure, Success} /** - * Helper class for generating setters from case 
class to - * other types. E.g. cascading Tuple, jdbc PreparedStatement + * Helper class for generating setters from case class to other types. E.g. cascading Tuple, jdbc + * PreparedStatement */ object CaseClassBasedSetterImpl { - def apply[T](c: Context)(container: c.TermName, allowUnknownTypes: Boolean, - fsetter: CaseClassFieldSetter)(implicit T: c.WeakTypeTag[T]): (Int, c.Tree) = { + def apply[T](c: Context)(container: c.TermName, allowUnknownTypes: Boolean, fsetter: CaseClassFieldSetter)( + implicit T: c.WeakTypeTag[T] + ): (Int, c.Tree) = { import c.universe._ sealed trait SetterBuilder { def columns: Int + /** * This Tree assumes that "val $value = ..." has been set */ @@ -39,8 +41,8 @@ object CaseClassBasedSetterImpl { def columns = 1 def setTree(value: Tree, offset: Int) = fsetter.from(c)(tpe, offset, container, value) match { case Success(tree) => tree - case Failure(e) => c.abort(c.enclosingPosition, - s"Case class ${T} is supported. Error on $tpe, ${e.getMessage}") + case Failure(e) => + c.abort(c.enclosingPosition, s"Case class $T is supported. Error on $tpe, ${e.getMessage}") } } case object DefaultSetter extends SetterBuilder { @@ -63,12 +65,12 @@ object CaseClassBasedSetterImpl { final case class CaseClassSetter(members: Vector[(Tree => Tree, SetterBuilder)]) extends SetterBuilder { val columns = members.map(_._2.columns).sum def setTree(value: Tree, offset: Int) = { - val setters = members.scanLeft((offset, Option.empty[Tree])) { - case ((off, _), (access, sb)) => + val setters = members + .scanLeft((offset, Option.empty[Tree])) { case ((off, _), (access, sb)) => val cca = newTermName(c.fresh("access")) val ccaT = q"$cca" (off + sb.columns, Some(q"val $cca = ${access(value)}; ${sb.setTree(ccaT, off)}")) - } + } .collect { case (_, Some(tree)) => tree } q"""..$setters""" } @@ -93,24 +95,22 @@ object CaseClassBasedSetterImpl { case tpe if tpe.erasure =:= typeOf[Option[Any]] => val innerType = tpe.asInstanceOf[TypeRefApi].args.head OptionSetter(matchField(innerType)) - case tpe if (tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass) => - CaseClassSetter(expandMethod(normalized(tpe)).map { - case (fn, tpe) => - (fn, matchField(tpe)) + case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass => + CaseClassSetter(expandMethod(normalized(tpe)).map { case (fn, tpe) => + (fn, matchField(tpe)) }) case tpe if allowUnknownTypes => DefaultSetter case _ => - c.abort(c.enclosingPosition, - s"Case class ${T.tpe} is not supported at type: $outerType") + c.abort(c.enclosingPosition, s"Case class ${T.tpe} is not supported at type: $outerType") } } def expandMethod(outerTpe: Type): Vector[(Tree => Tree, Type)] = - outerTpe - .declarations + outerTpe.declarations .collect { case m: MethodSymbol if m.isCaseAccessor => m } .map { accessorMethod => - val fieldType = normalized(accessorMethod.returnType.asSeenFrom(outerTpe, outerTpe.typeSymbol.asClass)) + val fieldType = + normalized(accessorMethod.returnType.asSeenFrom(outerTpe, outerTpe.typeSymbol.asClass)) ({ pTree: Tree => q"""$pTree.$accessorMethod""" }, fieldType) } @@ -118,7 +118,8 @@ object CaseClassBasedSetterImpl { // in TupleSetterImpl, the outer-most input val is called t, so we pass that in here: val sb = matchField(normalized(T.tpe)) - if (sb.columns == 0) c.abort(c.enclosingPosition, "Didn't consume any elements in the tuple, possibly empty case class?") + if (sb.columns == 0) + c.abort(c.enclosingPosition, "Didn't consume any elements in the tuple, possibly empty case class?") (sb.columns, sb.setTree(q"t", 
0)) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassFieldSetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassFieldSetter.scala index 6e3b6337c0..abe07cbc66 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassFieldSetter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassFieldSetter.scala @@ -20,8 +20,8 @@ import scala.reflect.macros.Context import scala.util.Try /** - * Helper to set fields from a case class to other "container" types - * E.g. cascading Tuple, jdbc PreparedStatement + * Helper to set fields from a case class to other "container" types E.g. cascading Tuple, jdbc + * PreparedStatement */ trait CaseClassFieldSetter { diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/FieldsProviderImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/FieldsProviderImpl.scala index a3800f7ad1..8bc2ce9391 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/FieldsProviderImpl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/FieldsProviderImpl.scala @@ -29,8 +29,7 @@ sealed trait NamingScheme case object Indexed extends NamingScheme /** - * Uses prefixes for naming nested fields. - * For e.g. for the following nested case class: + * Uses prefixes for naming nested fields. For e.g. for the following nested case class: * {{{ * case class Outer(id: Long, name: String, details: Inner) * case class Inner(phone: Int) @@ -40,8 +39,7 @@ case object Indexed extends NamingScheme case object NamedWithPrefix extends NamingScheme /** - * No prefixes for naming nested fields. - * For e.g. for the following nested case class: + * No prefixes for naming nested fields. For e.g. for the following nested case class: * {{{ * case class Outer(id: Long, name: String, details: Inner) * case class Inner(phone: Int) @@ -53,8 +51,8 @@ case object NamedWithPrefix extends NamingScheme case object NamedNoPrefix extends NamingScheme /** - * This class contains the core macro implementations. This is in a separate module to allow it to be in - * a separate compilation unit, which makes it easier to provide helper methods interfacing with macros. + * This class contains the core macro implementations. This is in a separate module to allow it to be in a + * separate compilation unit, which makes it easier to provide helper methods interfacing with macros. 
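// Editorial sketch of how the naming schemes above surface through the public Macros entry points.
// Outer/Inner mirror the scaladoc example; the prefixed field names in the comments are an
// assumption about the rendering, not taken from this patch.
import cascading.tuple.Fields
import com.twitter.scalding.macros.Macros

case class Inner(phone: Int)
case class Outer(id: Long, name: String, details: Inner)

val named: Fields = Macros.toFields[Outer]          // NamedWithPrefix, e.g. id, name, details.phone
val indexed: Fields = Macros.toIndexedFields[Outer] // Indexed: positional columns 0, 1, 2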
*/ object FieldsProviderImpl { def toFieldsImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[cascading.tuple.Fields] = @@ -63,16 +61,22 @@ object FieldsProviderImpl { def toFieldsWithUnknownImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[cascading.tuple.Fields] = toFieldsCommonImpl(c, NamedWithPrefix, true)(T) - def toFieldsWithUnknownNoPrefixImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[cascading.tuple.Fields] = + def toFieldsWithUnknownNoPrefixImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[cascading.tuple.Fields] = toFieldsCommonImpl(c, NamedNoPrefix, true)(T) def toIndexedFieldsImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[cascading.tuple.Fields] = toFieldsCommonImpl(c, Indexed, false)(T) - def toIndexedFieldsWithUnknownImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[cascading.tuple.Fields] = + def toIndexedFieldsWithUnknownImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[cascading.tuple.Fields] = toFieldsCommonImpl(c, Indexed, true)(T) - def toFieldsCommonImpl[T](c: Context, namingScheme: NamingScheme, allowUnknownTypes: Boolean)(implicit T: c.WeakTypeTag[T]): c.Expr[cascading.tuple.Fields] = { + def toFieldsCommonImpl[T](c: Context, namingScheme: NamingScheme, allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[cascading.tuple.Fields] = { import c.universe._ import TypeDescriptorProviderImpl.optionInner @@ -81,18 +85,19 @@ object FieldsProviderImpl { def isNumbered(t: Type): Boolean = t match { case tpe if tpe =:= typeOf[Boolean] => true - case tpe if tpe =:= typeOf[Short] => true - case tpe if tpe =:= typeOf[Int] => true - case tpe if tpe =:= typeOf[Long] => true - case tpe if tpe =:= typeOf[Float] => true - case tpe if tpe =:= typeOf[Double] => true - case tpe if tpe =:= typeOf[String] => true - case tpe => optionInner(c)(tpe) match { // linter:disable:UseOptionExistsNotPatMatch - case Some(t) => - // we need this match style to do tailrec - isNumbered(t) - case None => false - } + case tpe if tpe =:= typeOf[Short] => true + case tpe if tpe =:= typeOf[Int] => true + case tpe if tpe =:= typeOf[Long] => true + case tpe if tpe =:= typeOf[Float] => true + case tpe if tpe =:= typeOf[Double] => true + case tpe if tpe =:= typeOf[String] => true + case tpe => + optionInner(c)(tpe) match { // linter:disable:UseOptionExistsNotPatMatch + case Some(t) => + // we need this match style to do tailrec + isNumbered(t) + case None => false + } } object FieldBuilder { @@ -136,17 +141,17 @@ object FieldsProviderImpl { */ def matchField(fieldType: Type, name: String): FieldBuilder = fieldType match { - case tpe if tpe =:= typeOf[String] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[String] => Primitive(name, tpe) case tpe if tpe =:= typeOf[Boolean] => Primitive(name, tpe) - case tpe if tpe =:= typeOf[Short] => Primitive(name, tpe) - case tpe if tpe =:= typeOf[Int] => Primitive(name, tpe) - case tpe if tpe =:= typeOf[Long] => Primitive(name, tpe) - case tpe if tpe =:= typeOf[Float] => Primitive(name, tpe) - case tpe if tpe =:= typeOf[Double] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Short] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Int] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Long] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Float] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Double] => Primitive(name, tpe) case tpe if tpe.erasure =:= typeOf[Option[Any]] => val innerType = tpe.asInstanceOf[TypeRefApi].args.head OptionBuilder(matchField(innerType, 
name)) - case tpe if (tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass) => + case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass => CaseClassBuilder(name, expandMethod(tpe).map { case (t, s) => matchField(t, s) }) case tpe if allowUnknownTypes => Primitive(name, tpe) case tpe => @@ -154,14 +159,14 @@ object FieldsProviderImpl { } def expandMethod(outerTpe: Type): Vector[(Type, String)] = - outerTpe - .declarations + outerTpe.declarations .collect { case m: MethodSymbol if m.isCaseAccessor => m } .map { accessorMethod => val fieldName = accessorMethod.name.toString val fieldType = accessorMethod.returnType.asSeenFrom(outerTpe, outerTpe.typeSymbol.asClass) (fieldType, fieldName) - }.toVector + } + .toVector val builder = matchField(T.tpe, "") if (builder.columnTypes.isEmpty) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleConverterImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleConverterImpl.scala index e8b43768c5..8886bb38ea 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleConverterImpl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleConverterImpl.scala @@ -18,26 +18,30 @@ package com.twitter.scalding.macros.impl import scala.reflect.macros.Context import com.twitter.scalding._ + /** - * This class contains the core macro implementations. This is in a separate module to allow it to be in - * a separate compilation unit, which makes it easier to provide helper methods interfacing with macros. + * This class contains the core macro implementations. This is in a separate module to allow it to be in a + * separate compilation unit, which makes it easier to provide helper methods interfacing with macros. 
*/ object TupleConverterImpl { def caseClassTupleConverterImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[TupleConverter[T]] = caseClassTupleConverterCommonImpl(c, false) - def caseClassTupleConverterWithUnknownImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[TupleConverter[T]] = + def caseClassTupleConverterWithUnknownImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TupleConverter[T]] = caseClassTupleConverterCommonImpl(c, true) - def caseClassTupleConverterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit T: c.WeakTypeTag[T]): c.Expr[TupleConverter[T]] = { + def caseClassTupleConverterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TupleConverter[T]] = { import c.universe._ import TypeDescriptorProviderImpl.evidentColumn def membersOf(outerTpe: Type): Vector[Type] = - outerTpe - .declarations + outerTpe.declarations .collect { case m: MethodSymbol if m.isCaseAccessor => m } .map { accessorMethod => accessorMethod.returnType.asSeenFrom(outerTpe, outerTpe.typeSymbol.asClass) @@ -63,11 +67,11 @@ object TupleConverterImpl { final case class CaseClassBuilder(tpe: Type, members: Vector[ConverterBuilder]) extends ConverterBuilder { val columns = members.map(_.columns).sum def applyTree(offset: Int) = { - val trees = members.scanLeft((offset, Option.empty[Tree])) { - case ((o, _), cb) => + val trees = members + .scanLeft((offset, Option.empty[Tree])) { case ((o, _), cb) => val nextOffset = o + cb.columns (nextOffset, Some(cb.applyTree(o))) - } + } .collect { case (_, Some(tree)) => tree } q"${tpe.typeSymbol.companionSymbol}(..$trees)" @@ -104,17 +108,20 @@ object TupleConverterImpl { case Some(ev) => // we can recurse here OptionBuilder(ev, matchField(innerType)) } - case tpe if (tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass) => + case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass => CaseClassBuilder(tpe, membersOf(tpe).map(matchField)) case tpe if allowUnknownTypes => - PrimitiveBuilder(idx => q"""t.getObject(${idx}).asInstanceOf[$tpe]""") + PrimitiveBuilder(idx => q"""t.getObject($idx).asInstanceOf[$tpe]""") case tpe => - c.abort(c.enclosingPosition, - s"${T.tpe} is not pure primitives, Option of a primitive, nested case classes when looking at type ${tpe}") + c.abort( + c.enclosingPosition, + s"${T.tpe} is not pure primitives, Option of a primitive, nested case classes when looking at type $tpe" + ) } val builder = matchField(T.tpe) - if (builder.columns == 0) c.abort(c.enclosingPosition, "Didn't consume any elements in the tuple, possibly empty case class?") + if (builder.columns == 0) + c.abort(c.enclosingPosition, "Didn't consume any elements in the tuple, possibly empty case class?") val res = q""" new _root_.com.twitter.scalding.TupleConverter[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleFieldSetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleFieldSetter.scala index d6fbd921a5..d5a2726db0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleFieldSetter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleFieldSetter.scala @@ -38,20 +38,22 @@ object TupleFieldSetter extends CaseClassFieldSetter { q"""$container.set($idx, $fieldValue)""" } - override def from(c: Context)(fieldType: c.Type, idx: Int, container: c.TermName, fieldValue: c.Tree): Try[c.Tree] = Try { + override def from( 
+ c: Context + )(fieldType: c.Type, idx: Int, container: c.TermName, fieldValue: c.Tree): Try[c.Tree] = Try { import c.universe._ - def simpleType(accessor: Tree) = q"""${accessor}(${idx}, $fieldValue)""" + def simpleType(accessor: Tree) = q"""$accessor($idx, $fieldValue)""" fieldType match { - case tpe if tpe =:= typeOf[String] => simpleType(q"$container.setString") + case tpe if tpe =:= typeOf[String] => simpleType(q"$container.setString") case tpe if tpe =:= typeOf[Boolean] => simpleType(q"$container.setBoolean") - case tpe if tpe =:= typeOf[Short] => simpleType(q"$container.setShort") - case tpe if tpe =:= typeOf[Int] => simpleType(q"$container.setInteger") - case tpe if tpe =:= typeOf[Long] => simpleType(q"$container.setLong") - case tpe if tpe =:= typeOf[Float] => simpleType(q"$container.setFloat") - case tpe if tpe =:= typeOf[Double] => simpleType(q"$container.setDouble") - case _ => sys.error(s"Unsupported primitive type ${fieldType}") + case tpe if tpe =:= typeOf[Short] => simpleType(q"$container.setShort") + case tpe if tpe =:= typeOf[Int] => simpleType(q"$container.setInteger") + case tpe if tpe =:= typeOf[Long] => simpleType(q"$container.setLong") + case tpe if tpe =:= typeOf[Float] => simpleType(q"$container.setFloat") + case tpe if tpe =:= typeOf[Double] => simpleType(q"$container.setDouble") + case _ => sys.error(s"Unsupported primitive type $fieldType") } } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleSetterImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleSetterImpl.scala index bb1c67a5b9..89e29a76fe 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleSetterImpl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleSetterImpl.scala @@ -18,19 +18,24 @@ package com.twitter.scalding.macros.impl import scala.reflect.macros.Context import com.twitter.scalding._ + /** - * This class contains the core macro implementations. This is in a separate module to allow it to be in - * a separate compilation unit, which makes it easier to provide helper methods interfacing with macros. + * This class contains the core macro implementations. This is in a separate module to allow it to be in a + * separate compilation unit, which makes it easier to provide helper methods interfacing with macros. 
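// Editorial sketch of what a macro-generated setter does at runtime; the case class Point is hypothetical.
import cascading.tuple.Tuple
import com.twitter.scalding.macros.Macros

case class Point(x: Int, y: Int)

val pointSetter = Macros.caseClassTupleSetter[Point]
val tup: Tuple = pointSetter(Point(3, 4)) // a two-entry cascading Tuple holding 3 and 4
assert(pointSetter.arity == 2)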
*/ object TupleSetterImpl { def caseClassTupleSetterImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[TupleSetter[T]] = caseClassTupleSetterCommonImpl(c, false) - def caseClassTupleSetterWithUnknownImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[TupleSetter[T]] = + def caseClassTupleSetterWithUnknownImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TupleSetter[T]] = caseClassTupleSetterCommonImpl(c, true) - def caseClassTupleSetterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit T: c.WeakTypeTag[T]): c.Expr[TupleSetter[T]] = { + def caseClassTupleSetterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TupleSetter[T]] = { import c.universe._ val tupTerm = newTermName(c.fresh("tup")) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TypeDescriptorProviderImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TypeDescriptorProviderImpl.scala index 349cfe70f3..9d12969044 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TypeDescriptorProviderImpl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TypeDescriptorProviderImpl.scala @@ -18,29 +18,31 @@ package com.twitter.scalding.macros.impl import scala.reflect.macros.Context import com.twitter.scalding._ + /** - * This class contains the core macro implementations. This is in a separate module to allow it to be in - * a separate compilation unit, which makes it easier to provide helper methods interfacing with macros. + * This class contains the core macro implementations. This is in a separate module to allow it to be in a + * separate compilation unit, which makes it easier to provide helper methods interfacing with macros. */ object TypeDescriptorProviderImpl { def caseClassTypeDescriptorImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[TypeDescriptor[T]] = caseClassTypeDescriptorCommonImpl(c, false)(T) - def caseClassTypeDescriptorWithUnknownImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[TypeDescriptor[T]] = + def caseClassTypeDescriptorWithUnknownImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TypeDescriptor[T]] = caseClassTypeDescriptorCommonImpl(c, true)(T) /** - * When flattening a nested structure with Options, the evidentColumn is a column, relative to the - * the first 0-offset column, that represents evidence of this T, and hence set of columns, are - * present or absent. This is to handle Option types in text files such as CSV and TSV. - * a type T is evident if it the evidentColumn.exists + * When flattening a nested structure with Options, the evidentColumn is a column, relative to the the first + * 0-offset column, that represents evidence of this T, and hence set of columns, are present or absent. + * This is to handle Option types in text files such as CSV and TSV. a type T is evident if it the + * evidentColumn.exists * - * primitive numbers are evident - * case classes are evident if they have at least one evident member. + * primitive numbers are evident case classes are evident if they have at least one evident member. * - * Strings are not evident (we can't distinguish Empty from "") - * Option[T] is not evident (we can't tell Some(None) from None). + * Strings are not evident (we can't distinguish Empty from "") Option[T] is not evident (we can't tell + * Some(None) from None). 
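// Editorial sketch of why the evident-column rules above matter once a case class with Option
// fields is flattened to text columns; Reading and its fields are hypothetical.
import com.twitter.scalding.macros.Macros

case class Reading(sensor: Int, value: Option[Double])

val readingDescriptor = Macros.caseClassTypeDescriptor[Reading]
// Double is evident, so an empty second column can be decoded as None for value;
// an Option[String] field could not distinguish Some("") from None, as noted above.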
*/ def evidentColumn(c: Context, allowUnknown: Boolean = false)(tpe: c.universe.Type): Option[Int] = { import c.universe._ @@ -61,17 +63,17 @@ object TypeDescriptorProviderImpl { if (allowUnknown) thisColumn else (offset + 1, None) case tpe if tpe =:= typeOf[Boolean] => thisColumn - case tpe if tpe =:= typeOf[Short] => thisColumn - case tpe if tpe =:= typeOf[Int] => thisColumn - case tpe if tpe =:= typeOf[Long] => thisColumn - case tpe if tpe =:= typeOf[Float] => thisColumn - case tpe if tpe =:= typeOf[Double] => thisColumn + case tpe if tpe =:= typeOf[Short] => thisColumn + case tpe if tpe =:= typeOf[Int] => thisColumn + case tpe if tpe =:= typeOf[Long] => thisColumn + case tpe if tpe =:= typeOf[Float] => thisColumn + case tpe if tpe =:= typeOf[Double] => thisColumn // We recurse on Option and case classes case tpe if tpe.erasure =:= typeOf[Option[Any]] => val innerTpe = optionInner(c)(tpe).get // we have no evidentColumn, but we need to compute the next index (go(innerTpe, offset)._1, None) - case tpe if (tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass) => + case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass => val flattened = flattenOnce(tpe) .scanLeft((offset, Option.empty[Int])) { case ((off, _), t) => go(t, off) } @@ -80,7 +82,7 @@ object TypeDescriptorProviderImpl { (nextPos, ev) case _ if allowUnknown => thisColumn case t => - c.abort(c.enclosingPosition, s"Case class ${tpe} at $t is not pure primitives or nested case classes") + c.abort(c.enclosingPosition, s"Case class $tpe at $t is not pure primitives or nested case classes") } } go(tpe, 0)._2 @@ -93,7 +95,8 @@ object TypeDescriptorProviderImpl { def isTuple[T](c: Context)(implicit T: c.WeakTypeTag[T]): Boolean = { import c.universe._ - val tupleTypes = List(typeOf[Tuple1[Any]], + val tupleTypes = List( + typeOf[Tuple1[Any]], typeOf[Tuple2[Any, Any]], typeOf[Tuple3[Any, Any, Any]], typeOf[Tuple4[Any, Any, Any, Any]], @@ -110,15 +113,88 @@ object TypeDescriptorProviderImpl { typeOf[Tuple15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], typeOf[Tuple16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], typeOf[Tuple17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], - typeOf[Tuple18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], - typeOf[Tuple19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], - typeOf[Tuple20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], - typeOf[Tuple21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], - typeOf[Tuple22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]) - (tupleTypes.exists { _ =:= T.tpe.erasure }) + typeOf[ + Tuple18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any] + ], + typeOf[ + Tuple19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any] + ], + typeOf[Tuple20[ + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]], + typeOf[Tuple21[ + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]], + typeOf[Tuple22[ + Any, + Any, + Any, + Any, + 
Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]] + ) + tupleTypes.exists { _ =:= T.tpe.erasure } } - def caseClassTypeDescriptorCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit T: c.WeakTypeTag[T]): c.Expr[TypeDescriptor[T]] = { + def caseClassTypeDescriptorCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TypeDescriptor[T]] = { import c.universe._ val converter = TupleConverterImpl.caseClassTupleConverterCommonImpl[T](c, allowUnknownTypes) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Combinatorics.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Combinatorics.scala index 442cd4fedb..8249bfbf71 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Combinatorics.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Combinatorics.scala @@ -6,42 +6,31 @@ import cascading.tuple.TupleEntry import cascading.pipe.Pipe /** - * Serve as a repo for self-contained combinatorial functions with no dependencies - * such as - * combinations, aka n choose k, nCk - * permutations , aka nPk - * subset sum : numbers that add up to a finite sum - * weightedSum: For weights (a,b,c, ...), want integers (x,y,z,...) to satisfy constraint |ax + by + cz + ... - result | < error - * ... + * Serve as a repo for self-contained combinatorial functions with no dependencies such as combinations, aka n + * choose k, nCk permutations , aka nPk subset sum : numbers that add up to a finite sum weightedSum: For + * weights (a,b,c, ...), want integers (x,y,z,...) to satisfy constraint |ax + by + cz + ... - result | < + * error ... * - * @author : Krishnan Raman, kraman@twitter.com + * @author + * : Krishnan Raman, kraman@twitter.com */ object Combinatorics { /** - * Given an int k, and an input of size n, - * return a pipe with nCk combinations, with k columns per row - * + * Given an int k, and an input of size n, return a pipe with nCk combinations, with k columns per row * * Computes nCk = n choose k, for large values of nCk * - * Use-case: Say you have 100 hashtags sitting in an array - * You want a table with 5 hashtags per row, all possible combinations - * If the hashtags are sitting in a string array, then - * combinations[String]( hashtags, 5) - * will create the 100 chose 5 combinations. + * Use-case: Say you have 100 hashtags sitting in an array You want a table with 5 hashtags per row, all + * possible combinations If the hashtags are sitting in a string array, then combinations[String]( hashtags, + * 5) will create the 100 chose 5 combinations. * * Algorithm: Use k pipes, cross pipes two at a time, filter out non-monotonic entries * - * eg. 10C2 = 10 choose 2 - * Use 2 pipes. 
- * Pipe1 = (1,2,3,...10) - * Pipe2 = (2,3,4....10) - * Cross Pipe1 with Pipe2 for 10*9 = 90 tuples - * Filter out tuples that are non-monotonic - * For (t1,t2) we want t1 Symbol("n" + x)) // all column names - val pipes = allc.zipWithIndex.map(x => { + val pipes = allc.zipWithIndex.map { x => val num = x._2 + 1 val pipe = IterableSource((num to n), x._1).read (pipe, num) - }) + } - val res = pipes.reduceLeft((a, b) => { + val res = pipes.reduceLeft { (a, b) => val num = b._2 val prevname = Symbol("n" + (num - 1)) val myname = Symbol("n" + num) val mypipe = a._1 .crossWithSmaller(b._1) - .filter(prevname, myname){ - foo: (Int, Int) => - val (nn1, nn2) = foo - nn1 < nn2 + .filter(prevname, myname) { foo: (Int, Int) => + val (nn1, nn2) = foo + nn1 < nn2 } (mypipe, -1) - })._1 + }._1 - (1 to k).foldLeft(res)((a, b) => { + (1 to k).foldLeft(res) { (a, b) => val myname = Symbol("n" + b) val newname = Symbol("k" + b) - a.map(myname -> newname){ - inpc: Int => input(inpc - 1) + a.map(myname -> newname) { inpc: Int => + input(inpc - 1) }.discard(myname) - }) + } } /** * Return a pipe with all nCk combinations, with k columns per row */ - def combinations(n: Int, k: Int)(implicit flowDef: FlowDef, mode: Mode) = combinations[Int]((1 to n).toArray, k) + def combinations(n: Int, k: Int)(implicit flowDef: FlowDef, mode: Mode) = + combinations[Int]((1 to n).toArray, k) /** - * Return a pipe with all nPk permutations, with k columns per row - * For details, see combinations(...) above + * Return a pipe with all nPk permutations, with k columns per row For details, see combinations(...) above */ def permutations[T](input: IndexedSeq[T], k: Int)(implicit flowDef: FlowDef, mode: Mode): Pipe = { @@ -101,125 +89,135 @@ object Combinatorics { // on a given row, we cannot have duplicate columns in a permutation val res = pipes - .reduceLeft((a, b) => { a.crossWithSmaller(b) }) - .filter(allc) { - x: TupleEntry => - Boolean - val values = (0 until allc.size).map(i => x.getInteger(i.asInstanceOf[java.lang.Integer])) - values.size == values.distinct.size + .reduceLeft((a, b) => a.crossWithSmaller(b)) + .filter(allc) { x: TupleEntry => + Boolean + val values = (0 until allc.size).map(i => x.getInteger(i.asInstanceOf[java.lang.Integer])) + values.size == values.distinct.size } // map numerals to actual data - (1 to k).foldLeft(res)((a, b) => { + (1 to k).foldLeft(res) { (a, b) => val myname = Symbol("n" + b) val newname = Symbol("k" + b) - a.map(myname -> newname){ - inpc: Int => input(inpc - 1) + a.map(myname -> newname) { inpc: Int => + input(inpc - 1) }.discard(myname) - }) + } } /** * Return a pipe with all nPk permutations, with k columns per row */ - def permutations(n: Int, k: Int)(implicit flowDef: FlowDef, mode: Mode) = permutations[Int]((1 to n).toArray, k) + def permutations(n: Int, k: Int)(implicit flowDef: FlowDef, mode: Mode) = + permutations[Int]((1 to n).toArray, k) /** - * Goal: Given weights (a,b,c, ...), we seek integers (x,y,z,...) to satisft - * the constraint |ax + by + cz + ... - result | < error + * Goal: Given weights (a,b,c, ...), we seek integers (x,y,z,...) to satisft the constraint |ax + by + cz + + * ... - result | < error * - * Parameters: The weights (a,b,c,...) must be non-negative doubles. - * Our search space is 0 to result/min(weights) - * The returned pipe will contain integer tuples (x,y,z,...) that satisfy ax+by+cz +... = result + * Parameters: The weights (a,b,c,...) must be non-negative doubles. 
Our search space is 0 to + * result/min(weights) The returned pipe will contain integer tuples (x,y,z,...) that satisfy ax+by+cz +... + * = result * - * Note: This is NOT Simplex - * WE use a slughtly-improved brute-force algorithm that performs well on account of parallelization. - * Algorithm: - * Create as many pipes as the number of weights - * Each pipe copntains integral multiples of the weight w ie. (0,1w,2w,3w,4w,....) - * Iterate as below - - * Cross two pipes - * Create a temp column that stores intermediate results - * Apply progressive filtering on the temp column - * Discard the temp column - * Once all pipes are crossed, test for temp column within error bounds of result - * Discard duplicates at end of process + * Note: This is NOT Simplex WE use a slughtly-improved brute-force algorithm that performs well on account + * of parallelization. Algorithm: Create as many pipes as the number of weights Each pipe copntains integral + * multiples of the weight w ie. (0,1w,2w,3w,4w,....) Iterate as below - Cross two pipes Create a temp + * column that stores intermediate results Apply progressive filtering on the temp column Discard the temp + * column Once all pipes are crossed, test for temp column within error bounds of result Discard duplicates + * at end of process * * Usecase: We'd like to generate all integer tuples for typical usecases like * - * 0. How many ways can you invest $1000 in facebook, microsoft, hp ? - * val cash = 1000.0 - * val error = 5.0 // max error $5, so its ok if we cannot invest the last $5 or less - * val (FB, MSFT, HP) = (23.3,27.4,51.2) // share prices - * val stocks = IndexedSeq( FB,MSFT,HP ) - * weightedSum( stocks, cash, error).write( Tsv("invest.txt")) - * - * 1. find all (x,y,z) such that 2x+3y+5z = 23, with max error 1 - * weightedSum( IndexedSeq(2.0,3.0,5.0), 23.0, 1.0) + * 0. How many ways can you invest $1000 in facebook, microsoft, hp ? val cash = 1000.0 val error = 5.0 // + * max error $5, so its ok if we cannot invest the last $5 or less val (FB, MSFT, HP) = (23.3,27.4,51.2) // + * share prices val stocks = IndexedSeq( FB,MSFT,HP ) weightedSum( stocks, cash, error).write( + * Tsv("invest.txt")) * - * 2. find all (a,b,c,d) such that 2a+12b+12.5c+34.7d = 3490 with max error 3 - * weightedSum( IndexedSeq(2.0,12.0,2.5,34.7),3490.0,3.0) + * 1. find all (x,y,z) such that 2x+3y+5z = 23, with max error 1 weightedSum( IndexedSeq(2.0,3.0,5.0), + * 23.0, 1.0) * - * This is at the heart of portfolio mgmt( Markowitz optimization), subset-sum, operations-research LP problems. + * 2. find all (a,b,c,d) such that 2a+12b+12.5c+34.7d = 3490 with max error 3 weightedSum( + * IndexedSeq(2.0,12.0,2.5,34.7),3490.0,3.0) * + * This is at the heart of portfolio mgmt( Markowitz optimization), subset-sum, operations-research LP + * problems. 
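// Editorial sketch of driving weightedSum from a fields-API Job, along the lines of the
// "invest $1000" use case above; the prices, Job name and output argument are hypothetical.
import com.twitter.scalding._
import com.twitter.scalding.mathematics.Combinatorics

class InvestJob(args: Args) extends Job(args) {
  // share prices for three hypothetical stocks; tolerate up to $5 of the $1000 staying uninvested
  Combinatorics
    .weightedSum(IndexedSeq(23.3, 27.4, 51.2), 1000.0, 5.0)
    .write(Tsv(args("output")))
}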
*/ - def weightedSum(weights: IndexedSeq[Double], result: Double, error: Double)(implicit flowDef: FlowDef, mode: Mode): Pipe = { + def weightedSum(weights: IndexedSeq[Double], result: Double, error: Double)(implicit + flowDef: FlowDef, + mode: Mode + ): Pipe = { val numWeights = weights.size val allColumns = (1 to numWeights).map(x => Symbol("k" + x)) // create as many single-column pipes as the number of weights - val pipes = allColumns.zip(weights).map(x => { - val (name, wt) = x - val points = Stream.iterate(0.0) { _ + wt }.takeWhile(_ <= result) - IterableSource(points, name).read - }).zip(allColumns) + val pipes = allColumns + .zip(weights) + .map { x => + val (name, wt) = x + val points = Stream.iterate(0.0)(_ + wt).takeWhile(_ <= result) + IterableSource(points, name).read + } + .zip(allColumns) val first = pipes.head val accum = (first._1, List[Symbol](first._2)) val rest = pipes.tail - val res = rest.foldLeft(accum)((a, b) => { - - val (apipe, aname) = a - val (bpipe, bname) = b - val allc = (List(aname)).flatten ++ List[Symbol](bname) - - // Algorithm: - // Cross two pipes - // Create a temp column that stores intermediate results - // Apply progressive filtering on the temp column - // Discard the temp column - // Once all pipes are crossed, test for temp column within error bounds of result - // Discard duplicates at end of process - - (apipe.crossWithSmaller(bpipe) - .map(allc -> 'temp){ - x: TupleEntry => - val values = (0 until allc.size).map(i => x.getDouble(i.asInstanceOf[java.lang.Integer])) - values.sum - }.filter('temp){ - x: Double => if (allc.size == numWeights) (math.abs(x - result) <= error) else (x <= result) - }.discard('temp), allc) - })._1.unique(allColumns) - - (1 to numWeights).zip(weights).foldLeft(res) ((a, b) => { - val (num, wt) = b - val myname = Symbol("k" + num) - a.map(myname -> myname){ x: Int => (x / wt).toInt } - }) + val res = rest + .foldLeft(accum) { (a, b) => + val (apipe, aname) = a + val (bpipe, bname) = b + val allc = (List(aname)).flatten ++ List[Symbol](bname) + + // Algorithm: + // Cross two pipes + // Create a temp column that stores intermediate results + // Apply progressive filtering on the temp column + // Discard the temp column + // Once all pipes are crossed, test for temp column within error bounds of result + // Discard duplicates at end of process + + ( + apipe + .crossWithSmaller(bpipe) + .map(allc -> 'temp) { x: TupleEntry => + val values = (0 until allc.size).map(i => x.getDouble(i.asInstanceOf[java.lang.Integer])) + values.sum + } + .filter('temp) { x: Double => + if (allc.size == numWeights) (math.abs(x - result) <= error) else (x <= result) + } + .discard('temp), + allc + ) + } + ._1 + .unique(allColumns) + + (1 to numWeights) + .zip(weights) + .foldLeft(res) { (a, b) => + val (num, wt) = b + val myname = Symbol("k" + num) + a.map(myname -> myname) { x: Int => (x / wt).toInt } + } } /** - * Does the exact same thing as weightedSum, but filters out tuples with a weight of 0 - * The returned pipe contain only positive non-zero weights. + * Does the exact same thing as weightedSum, but filters out tuples with a weight of 0 The returned pipe + * contain only positive non-zero weights. 
*/ - def positiveWeightedSum(weights: IndexedSeq[Double], result: Double, error: Double)(implicit flowDef: FlowDef, mode: Mode): Pipe = { + def positiveWeightedSum(weights: IndexedSeq[Double], result: Double, error: Double)(implicit + flowDef: FlowDef, + mode: Mode + ): Pipe = { val allColumns = (1 to weights.size).map(x => Symbol("k" + x)) weightedSum(weights, result, error) .filter(allColumns) { x: TupleEntry => - (0 until allColumns.size).forall { i => x.getDouble(java.lang.Integer.valueOf(i)) != 0.0 } + (0 until allColumns.size).forall(i => x.getDouble(java.lang.Integer.valueOf(i)) != 0.0) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala index 0922bec394..29916be750 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala @@ -2,14 +2,16 @@ package com.twitter.scalding.mathematics class Histogram(map: Map[Double, Long], binWidth: Double) { lazy val size = map.values.sum - lazy val sum = map.foldLeft(0.0){ case (acc, (bin, count)) => acc + bin * count } + lazy val sum = map.foldLeft(0.0) { case (acc, (bin, count)) => acc + bin * count } lazy val keys = map.keys.toList.sorted lazy val min = keys.head lazy val max = keys.last lazy val stdDev = { - val squaredDiff = map.foldLeft(0.0){ case (acc, (bin, count)) => acc + count * math.pow(bin - mean, 2.0) } + val squaredDiff = map.foldLeft(0.0) { case (acc, (bin, count)) => + acc + count * math.pow(bin - mean, 2.0) + } math.sqrt(squaredDiff / size) } @@ -35,7 +37,7 @@ class Histogram(map: Map[Double, Long], binWidth: Double) { result } - def percentile(p: Int) = keys.find{ bin => cdf(bin) * 100 >= p }.getOrElse(-1d) + def percentile(p: Int) = keys.find(bin => cdf(bin) * 100 >= p).getOrElse(-1d) lazy val median = percentile(50) lazy val q1 = percentile(25) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix.scala index b20598937c..958b684284 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics -import com.twitter.algebird.{ Monoid, Group, Ring, Field } +import com.twitter.algebird.{Field, Group, Monoid, Ring} import com.twitter.algebird.field._ // backwards compatiblity support import com.twitter.scalding._ @@ -28,91 +28,117 @@ import cascading.flow._ import com.twitter.scalding.Dsl._ /** - * Matrix class - represents an infinite (hopefully sparse) matrix. - * any elements without a row are interpretted to be zero. - * the pipe hold ('rowIdx, 'colIdx, 'val) where in principle - * each row/col/value type is generic, with the constraint that ValT is a Ring[T] - * In practice, RowT and ColT are going to be Strings, Integers or Longs in the usual case. + * Matrix class - represents an infinite (hopefully sparse) matrix. any elements without a row are + * interpretted to be zero. 
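// Editorial sketch of the Histogram summary statistics touched in the hunk above; the bin counts are made up.
import com.twitter.scalding.mathematics.Histogram

// bins keyed by bin value mapped to counts, with a bin width of 1.0
val hist = new Histogram(Map(0.0 -> 2L, 1.0 -> 5L, 2.0 -> 3L), 1.0)
val middle = hist.median  // 50th-percentile bin
val spread = hist.stdDev  // standard deviation over the binned values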
the pipe hold ('rowIdx, 'colIdx, 'val) where in principle each row/col/value type + * is generic, with the constraint that ValT is a Ring[T] In practice, RowT and ColT are going to be Strings, + * Integers or Longs in the usual case. * - * WARNING: - * It is NOT OKAY to use the same instance of Matrix/Row/Col with DIFFERENT Monoids/Rings/Fields. - * If you want to change, midstream, the Monoid on your ValT, you have to construct a new Matrix. - * This is due to caching of internal computation graphs. + * WARNING: It is NOT OKAY to use the same instance of Matrix/Row/Col with DIFFERENT Monoids/Rings/Fields. If + * you want to change, midstream, the Monoid on your ValT, you have to construct a new Matrix. This is due to + * caching of internal computation graphs. * - * RowVector - handles matrices of row dimension one. It is the result of some of the matrix methods and has methods - * that return ColVector and diagonal matrix + * RowVector - handles matrices of row dimension one. It is the result of some of the matrix methods and has + * methods that return ColVector and diagonal matrix * - * ColVector - handles matrices of col dimension one. It is the result of some of the matrix methods and has methods - * that return RowVector and diagonal matrix + * ColVector - handles matrices of col dimension one. It is the result of some of the matrix methods and has + * methods that return RowVector and diagonal matrix */ // Implicit coversions // Add methods we want to add to pipes here: class MatrixPipeExtensions(pipe: Pipe) { - def toMatrix[RowT, ColT, ValT](fields: Fields)(implicit conv: TupleConverter[(RowT, ColT, ValT)], setter: TupleSetter[(RowT, ColT, ValT)]) = { - val matPipe = RichPipe(pipe).mapTo(fields -> ('row, 'col, 'val))((tup: (RowT, ColT, ValT)) => tup)(conv, setter) + def toMatrix[RowT, ColT, ValT]( + fields: Fields + )(implicit conv: TupleConverter[(RowT, ColT, ValT)], setter: TupleSetter[(RowT, ColT, ValT)]) = { + val matPipe = + RichPipe(pipe).mapTo(fields -> ('row, 'col, 'val))((tup: (RowT, ColT, ValT)) => tup)(conv, setter) new Matrix[RowT, ColT, ValT]('row, 'col, 'val, matPipe) } - def mapToMatrix[T, RowT, ColT, ValT](fields: Fields)(mapfn: T => (RowT, ColT, ValT))(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ColT, ValT)]) = { + def mapToMatrix[T, RowT, ColT, ValT](fields: Fields)( + mapfn: T => (RowT, ColT, ValT) + )(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ColT, ValT)]) = { val matPipe = RichPipe(pipe).mapTo(fields -> ('row, 'col, 'val))(mapfn)(conv, setter) new Matrix[RowT, ColT, ValT]('row, 'col, 'val, matPipe) } - def flatMapToMatrix[T, RowT, ColT, ValT](fields: Fields)(flatMapfn: T => Iterable[(RowT, ColT, ValT)])(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ColT, ValT)]) = { + def flatMapToMatrix[T, RowT, ColT, ValT](fields: Fields)( + flatMapfn: T => Iterable[(RowT, ColT, ValT)] + )(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ColT, ValT)]) = { val matPipe = RichPipe(pipe).flatMapTo(fields -> ('row, 'col, 'val))(flatMapfn)(conv, setter) new Matrix[RowT, ColT, ValT]('row, 'col, 'val, matPipe) } - private def groupPipeIntoMap[ColT, ValT](pipe: Pipe): Pipe = { - pipe.groupBy('group, 'row) { - _.mapReduceMap[(ColT, ValT), Map[ColT, ValT], Map[ColT, ValT]](('col, 'val) -> 'val) { (colval: (ColT, ValT)) => Map(colval._1 -> colval._2) } { (l: Map[ColT, ValT], r: Map[ColT, ValT]) => l ++ r } { (red: Map[ColT, ValT]) => red } - } + private def groupPipeIntoMap[ColT, ValT](pipe: Pipe): Pipe = + pipe + 
.groupBy('group, 'row) { + _.mapReduceMap[(ColT, ValT), Map[ColT, ValT], Map[ColT, ValT]](('col, 'val) -> 'val) { + (colval: (ColT, ValT)) => Map(colval._1 -> colval._2) + }((l: Map[ColT, ValT], r: Map[ColT, ValT]) => l ++ r)((red: Map[ColT, ValT]) => red) + } .rename('group, 'col) - } - def toBlockMatrix[GroupT, RowT, ColT, ValT](fields: Fields)(implicit conv: TupleConverter[(GroupT, RowT, ColT, ValT)], setter: TupleSetter[(GroupT, RowT, ColT, ValT)]) = { + def toBlockMatrix[GroupT, RowT, ColT, ValT](fields: Fields)(implicit + conv: TupleConverter[(GroupT, RowT, ColT, ValT)], + setter: TupleSetter[(GroupT, RowT, ColT, ValT)] + ) = { val matPipe = RichPipe(pipe) .mapTo(fields -> ('group, 'row, 'col, 'val))((tup: (GroupT, RowT, ColT, ValT)) => tup)(conv, setter) new BlockMatrix[GroupT, RowT, ColT, ValT](new Matrix('row, 'col, 'val, groupPipeIntoMap(matPipe))) } - def mapToBlockMatrix[T, GroupT, RowT, ColT, ValT](fields: Fields)(mapfn: T => (GroupT, RowT, ColT, ValT))(implicit conv: TupleConverter[T], setter: TupleSetter[(GroupT, RowT, ColT, ValT)]) = { + def mapToBlockMatrix[T, GroupT, RowT, ColT, ValT](fields: Fields)( + mapfn: T => (GroupT, RowT, ColT, ValT) + )(implicit conv: TupleConverter[T], setter: TupleSetter[(GroupT, RowT, ColT, ValT)]) = { val matPipe = RichPipe(pipe) .mapTo(fields -> ('group, 'row, 'col, 'val))(mapfn)(conv, setter) new BlockMatrix[GroupT, RowT, ColT, ValT](new Matrix('row, 'col, 'val, groupPipeIntoMap(matPipe))) } - def flatMapToBlockMatrix[T, GroupT, RowT, ColT, ValT](fields: Fields)(flatMapfn: T => Iterable[(GroupT, RowT, ColT, ValT)])(implicit conv: TupleConverter[T], setter: TupleSetter[(GroupT, RowT, ColT, ValT)]) = { + def flatMapToBlockMatrix[T, GroupT, RowT, ColT, ValT](fields: Fields)( + flatMapfn: T => Iterable[(GroupT, RowT, ColT, ValT)] + )(implicit conv: TupleConverter[T], setter: TupleSetter[(GroupT, RowT, ColT, ValT)]) = { val matPipe = RichPipe(pipe).flatMapTo(fields -> ('group, 'row, 'col, 'val))(flatMapfn)(conv, setter) new BlockMatrix[GroupT, RowT, ColT, ValT](new Matrix('row, 'col, 'val, groupPipeIntoMap(matPipe))) } - def toColVector[RowT, ValT](fields: Fields)(implicit conv: TupleConverter[(RowT, ValT)], setter: TupleSetter[(RowT, ValT)]) = { + def toColVector[RowT, ValT]( + fields: Fields + )(implicit conv: TupleConverter[(RowT, ValT)], setter: TupleSetter[(RowT, ValT)]) = { val vecPipe = RichPipe(pipe).mapTo(fields -> ('row, 'val))((tup: (RowT, ValT)) => tup)(conv, setter) new ColVector[RowT, ValT]('row, 'val, vecPipe) } - def mapToColVector[T, RowT, ValT](fields: Fields)(mapfn: T => (RowT, ValT))(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ValT)]) = { + def mapToColVector[T, RowT, ValT]( + fields: Fields + )(mapfn: T => (RowT, ValT))(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ValT)]) = { val vecPipe = RichPipe(pipe).mapTo(fields -> ('row, 'val))(mapfn)(conv, setter) new ColVector[RowT, ValT]('row, 'val, vecPipe) } - def flatMapToColVector[T, RowT, ValT](fields: Fields)(flatMapfn: T => Iterable[(RowT, ValT)])(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ValT)]) = { + def flatMapToColVector[T, RowT, ValT](fields: Fields)( + flatMapfn: T => Iterable[(RowT, ValT)] + )(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ValT)]) = { val vecPipe = RichPipe(pipe).flatMapTo(fields -> ('row, 'val))(flatMapfn)(conv, setter) new ColVector[RowT, ValT]('row, 'val, vecPipe) } - def toRowVector[ColT, ValT](fields: Fields)(implicit conv: TupleConverter[(ColT, ValT)], setter: TupleSetter[(ColT, 
ValT)]) = { + def toRowVector[ColT, ValT]( + fields: Fields + )(implicit conv: TupleConverter[(ColT, ValT)], setter: TupleSetter[(ColT, ValT)]) = { val vecPipe = RichPipe(pipe).mapTo(fields -> ('col, 'val))((tup: (ColT, ValT)) => tup)(conv, setter) new RowVector[ColT, ValT]('col, 'val, vecPipe) } - def mapToRowVector[T, ColT, ValT](fields: Fields)(mapfn: T => (ColT, ValT))(implicit conv: TupleConverter[T], setter: TupleSetter[(ColT, ValT)]) = { + def mapToRowVector[T, ColT, ValT]( + fields: Fields + )(mapfn: T => (ColT, ValT))(implicit conv: TupleConverter[T], setter: TupleSetter[(ColT, ValT)]) = { val vecPipe = RichPipe(pipe).mapTo(fields -> ('col, 'val))(mapfn)(conv, setter) new RowVector[ColT, ValT]('col, 'val, vecPipe) } - def flatMapToRowVector[T, ColT, ValT](fields: Fields)(flatMapfn: T => Iterable[(ColT, ValT)])(implicit conv: TupleConverter[T], setter: TupleSetter[(ColT, ValT)]) = { + def flatMapToRowVector[T, ColT, ValT](fields: Fields)( + flatMapfn: T => Iterable[(ColT, ValT)] + )(implicit conv: TupleConverter[T], setter: TupleSetter[(ColT, ValT)]) = { val vecPipe = RichPipe(pipe).flatMapTo(fields -> ('col, 'val))(flatMapfn)(conv, setter) new RowVector[ColT, ValT]('col, 'val, vecPipe) } @@ -123,44 +149,58 @@ class MatrixPipeExtensions(pipe: Pipe) { * This is the enrichment pattern on Mappable[T] for converting to Matrix types */ class MatrixMappableExtensions[T](mappable: Mappable[T])(implicit fd: FlowDef, mode: Mode) { - def toMatrix[Row, Col, Val](implicit ev: <:<[T, (Row, Col, Val)], - setter: TupleSetter[(Row, Col, Val)]): Matrix[Row, Col, Val] = - mapToMatrix { _.asInstanceOf[(Row, Col, Val)] } - - def mapToMatrix[Row, Col, Val](fn: (T) => (Row, Col, Val))(implicit setter: TupleSetter[(Row, Col, Val)]): Matrix[Row, Col, Val] = { + def toMatrix[Row, Col, Val](implicit + ev: <:<[T, (Row, Col, Val)], + setter: TupleSetter[(Row, Col, Val)] + ): Matrix[Row, Col, Val] = + mapToMatrix(_.asInstanceOf[(Row, Col, Val)]) + + def mapToMatrix[Row, Col, Val]( + fn: (T) => (Row, Col, Val) + )(implicit setter: TupleSetter[(Row, Col, Val)]): Matrix[Row, Col, Val] = { val fields = ('row, 'col, 'val) val matPipe = mappable.mapTo(fields)(fn) new Matrix[Row, Col, Val]('row, 'col, 'val, matPipe) } - def toBlockMatrix[Group, Row, Col, Val](implicit ev: <:<[T, (Group, Row, Col, Val)], ord: Ordering[(Group, Row)], - setter: TupleSetter[(Group, Row, Col, Val)]): BlockMatrix[Group, Row, Col, Val] = - mapToBlockMatrix { _.asInstanceOf[(Group, Row, Col, Val)] } + def toBlockMatrix[Group, Row, Col, Val](implicit + ev: <:<[T, (Group, Row, Col, Val)], + ord: Ordering[(Group, Row)], + setter: TupleSetter[(Group, Row, Col, Val)] + ): BlockMatrix[Group, Row, Col, Val] = + mapToBlockMatrix(_.asInstanceOf[(Group, Row, Col, Val)]) - def mapToBlockMatrix[Group, Row, Col, Val](fn: (T) => (Group, Row, Col, Val))(implicit ord: Ordering[(Group, Row)]): BlockMatrix[Group, Row, Col, Val] = { + def mapToBlockMatrix[Group, Row, Col, Val]( + fn: (T) => (Group, Row, Col, Val) + )(implicit ord: Ordering[(Group, Row)]): BlockMatrix[Group, Row, Col, Val] = { val matPipe = TypedPipe .from(mappable) .map(fn) .groupBy(t => (t._1, t._2)) - .mapValueStream(s => Iterator(s.map{ case (_, _, c, v) => (c, v) }.toMap)) + .mapValueStream(s => Iterator(s.map { case (_, _, c, v) => (c, v) }.toMap)) .toTypedPipe - .map{ case ((g, r), m) => (r, g, m) } + .map { case ((g, r), m) => (r, g, m) } .toPipe(('row, 'col, 'val)) new BlockMatrix[Group, Row, Col, Val](new Matrix('row, 'col, 'val, matPipe)) } - def toRow[Row, Val](implicit ev: <:<[T, 
(Row, Val)], setter: TupleSetter[(Row, Val)]): RowVector[Row, Val] = mapToRow { _.asInstanceOf[(Row, Val)] } + def toRow[Row, Val](implicit ev: <:<[T, (Row, Val)], setter: TupleSetter[(Row, Val)]): RowVector[Row, Val] = + mapToRow(_.asInstanceOf[(Row, Val)]) - def mapToRow[Row, Val](fn: (T) => (Row, Val))(implicit setter: TupleSetter[(Row, Val)], fd: FlowDef): RowVector[Row, Val] = { + def mapToRow[Row, Val]( + fn: (T) => (Row, Val) + )(implicit setter: TupleSetter[(Row, Val)], fd: FlowDef): RowVector[Row, Val] = { val fields = ('row, 'val) val rowPipe = mappable.mapTo(fields)(fn) new RowVector[Row, Val]('row, 'val, rowPipe) } def toCol[Col, Val](implicit ev: <:<[T, (Col, Val)], setter: TupleSetter[(Col, Val)]): ColVector[Col, Val] = - mapToCol { _.asInstanceOf[(Col, Val)] } + mapToCol(_.asInstanceOf[(Col, Val)]) - def mapToCol[Col, Val](fn: (T) => (Col, Val))(implicit setter: TupleSetter[(Col, Val)]): ColVector[Col, Val] = { + def mapToCol[Col, Val]( + fn: (T) => (Col, Val) + )(implicit setter: TupleSetter[(Col, Val)]): ColVector[Col, Val] = { val fields = ('col, 'val) val colPipe = mappable.mapTo(fields)(fn) new ColVector[Col, Val]('col, 'val, colPipe) @@ -170,19 +210,20 @@ class MatrixMappableExtensions[T](mappable: Mappable[T])(implicit fd: FlowDef, m object Matrix { // If this function is implicit, you can use the PipeExtensions methods on pipe implicit def pipeExtensions[P <% Pipe](p: P): MatrixPipeExtensions = new MatrixPipeExtensions(p) - implicit def mappableExtensions[T](mt: Mappable[T])(implicit fd: FlowDef, mode: Mode): MatrixMappableExtensions[T] = + implicit def mappableExtensions[T]( + mt: Mappable[T] + )(implicit fd: FlowDef, mode: Mode): MatrixMappableExtensions[T] = new MatrixMappableExtensions(mt)(fd, mode) - def filterOutZeros[ValT](fSym: Symbol, group: Monoid[ValT])(fpipe: Pipe): Pipe = { + def filterOutZeros[ValT](fSym: Symbol, group: Monoid[ValT])(fpipe: Pipe): Pipe = fpipe.filter(fSym) { tup: Tuple1[ValT] => group.isNonZero(tup._1) } - } def meanCenter[T](vct: Iterable[(T, Double)]): Iterable[(T, Double)] = { - val valList = vct.map { _._2 } + val valList = vct.map(_._2) val sum = valList.sum val count = valList.size val avg = sum / count - vct.map { tup => (tup._1, tup._2 - avg) } + vct.map(tup => (tup._1, tup._2 - avg)) } implicit def literalToScalar[ValT](v: ValT): LiteralScalar[ValT] = new LiteralScalar(v) @@ -190,7 +231,7 @@ object Matrix { // Converts to Matrix for addition implicit def diagonalToMatrix[RowT, ValT](diag: DiagonalMatrix[RowT, ValT]): Matrix[RowT, RowT, ValT] = { val colSym = newSymbol(Set(diag.idxSym, diag.valSym), 'col) - val newPipe = diag.pipe.map(diag.idxSym -> colSym) { (x: RowT) => x } + val newPipe = diag.pipe.map(diag.idxSym -> colSym)((x: RowT) => x) new Matrix[RowT, RowT, ValT](diag.idxSym, colSym, diag.valSym, newPipe, diag.sizeHint) } } @@ -207,9 +248,14 @@ trait WrappedPipe { } } -class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSym: Symbol, - inPipe: Pipe, val sizeHint: SizeHint = NoClue) - extends WrappedPipe with java.io.Serializable { +class Matrix[RowT, ColT, ValT]( + val rowSym: Symbol, + val colSym: Symbol, + val valSym: Symbol, + inPipe: Pipe, + val sizeHint: SizeHint = NoClue +) extends WrappedPipe + with java.io.Serializable { import Matrix._ import MatrixProduct._ import Dsl.ensureUniqueFields @@ -224,29 +270,31 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy def hasHint = sizeHint != NoClue override def hashCode = inPipe.hashCode - override def 
equals(that: Any): Boolean = { + override def equals(that: Any): Boolean = (that != null) && (that.isInstanceOf[Matrix[_, _, _]]) && { val thatM = that.asInstanceOf[Matrix[RowT, ColT, ValT]] (this.rowSym == thatM.rowSym) && (this.colSym == thatM.colSym) && - (this.valSym == thatM.valSym) && (this.pipe == thatM.pipe) + (this.valSym == thatM.valSym) && (this.pipe == thatM.pipe) } - } // Value operations def mapValues[ValU](fn: (ValT) => ValU)(implicit mon: Monoid[ValU]): Matrix[RowT, ColT, ValU] = { val newPipe = pipe.flatMap(valSym -> valSym) { imp: Tuple1[ValT] => //Ensure an arity of 1 //This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. - mon.nonZeroOption(fn(imp._1)).map { Tuple1(_) } + mon.nonZeroOption(fn(imp._1)).map(Tuple1(_)) } new Matrix[RowT, ColT, ValU](this.rowSym, this.colSym, this.valSym, newPipe, sizeHint) } + /** - * like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. - * Note you will only see non-zero elements on the matrix. This does not enumerate the zeros + * like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. Note you will only see non-zero elements + * on the matrix. This does not enumerate the zeros */ - def mapWithIndex[ValNew](fn: (ValT, RowT, ColT) => ValNew)(implicit mon: Monoid[ValNew]): Matrix[RowT, ColT, ValNew] = { + def mapWithIndex[ValNew]( + fn: (ValT, RowT, ColT) => ValNew + )(implicit mon: Monoid[ValNew]): Matrix[RowT, ColT, ValNew] = { val newPipe = pipe.flatMap(fields -> fields) { imp: (RowT, ColT, ValT) => - mon.nonZeroOption(fn(imp._3, imp._1, imp._2)).map { (imp._1, imp._2, _) } + mon.nonZeroOption(fn(imp._3, imp._1, imp._2)).map((imp._1, imp._2, _)) } new Matrix[RowT, ColT, ValNew](rowSym, colSym, valSym, newPipe, sizeHint) } @@ -261,16 +309,18 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy } // Binarize values, all x != 0 become 1 - def binarizeAs[NewValT](implicit mon: Monoid[ValT], ring: Ring[NewValT]): Matrix[RowT, ColT, NewValT] = { - mapValues(x => if (mon.isNonZero(x)) { ring.one } else { ring.zero })(ring) - } + def binarizeAs[NewValT](implicit mon: Monoid[ValT], ring: Ring[NewValT]): Matrix[RowT, ColT, NewValT] = + mapValues(x => + if (mon.isNonZero(x)) { ring.one } + else { ring.zero } + )(ring) // Row Operations // Get a specific row def getRow(index: RowT): RowVector[ColT, ValT] = { val newPipe = inPipe - .filter(rowSym){ input: RowT => input == index } + .filter(rowSym) { input: RowT => input == index } .project(colSym, valSym) val newHint = sizeHint.setRows(1L) new RowVector[ColT, ValT](colSym, valSym, newPipe, newHint) @@ -280,10 +330,10 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy def reduceRowVectors(fn: (ValT, ValT) => ValT)(implicit mon: Monoid[ValT]): RowVector[ColT, ValT] = { val newPipe = filterOutZeros(valSym, mon) { pipe.groupBy(colSym) { - _.reduce(valSym) { (x: Tuple1[ValT], y: Tuple1[ValT]) => Tuple1(fn(x._1, y._1)) } - // Matrices are generally huge and cascading has problems with diverse key spaces and - // mapside operations - // TODO continually evaluate if this is needed to avoid OOM + _.reduce(valSym)((x: Tuple1[ValT], y: Tuple1[ValT]) => Tuple1(fn(x._1, y._1))) + // Matrices are generally huge and cascading has problems with diverse key spaces and + // mapside operations + // TODO continually evaluate if this is needed to avoid OOM .reducers(MatrixProduct.numOfReducers(sizeHint)) .forceToReducers } @@ -293,75 +343,71 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: 
Symbol, val valSy } // Sums all the rows per column - def sumRowVectors(implicit mon: Monoid[ValT]): RowVector[ColT, ValT] = { + def sumRowVectors(implicit mon: Monoid[ValT]): RowVector[ColT, ValT] = this.reduceRowVectors((x, y) => mon.plus(x, y)) - } // Maps rows using a per-row mapping function // Use this for non-decomposable vector processing functions // and with vectors that can fit in one-single machine memory - def mapRows(fn: Iterable[(ColT, ValT)] => Iterable[(ColT, ValT)])(implicit mon: Monoid[ValT]): Matrix[RowT, ColT, ValT] = { + def mapRows( + fn: Iterable[(ColT, ValT)] => Iterable[(ColT, ValT)] + )(implicit mon: Monoid[ValT]): Matrix[RowT, ColT, ValT] = { val newListSym = Symbol(colSym.name + "_" + valSym.name + "_list") // TODO, I think we can count the rows/cols for free here val newPipe = filterOutZeros(valSym, mon) { - pipe.groupBy(rowSym) { - _.toList[(ColT, ValT)]((colSym, valSym) -> newListSym) - } + pipe + .groupBy(rowSym) { + _.toList[(ColT, ValT)]((colSym, valSym) -> newListSym) + } .flatMapTo((rowSym, newListSym) -> (rowSym, colSym, valSym)) { tup: (RowT, List[(ColT, ValT)]) => val row = tup._1 val list = fn(tup._2) // Now flatten out to (row, col, val): - list.map{ imp: (ColT, ValT) => (row, imp._1, imp._2) } + list.map { imp: (ColT, ValT) => (row, imp._1, imp._2) } } } new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, newPipe, sizeHint) } - def topRowElems(k: Int)(implicit ord: Ordering[ValT]): Matrix[RowT, ColT, ValT] = { + def topRowElems(k: Int)(implicit ord: Ordering[ValT]): Matrix[RowT, ColT, ValT] = if (k < 1000) { topRowWithTiny(k) } else { - val newPipe = pipe.groupBy(rowSym){ - _ - .sortBy(valSym) - .reverse - .take(k) - } + val newPipe = pipe + .groupBy(rowSym) { + _.sortBy(valSym).reverse + .take(k) + } .project(rowSym, colSym, valSym) new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, newPipe, FiniteHint(-1L, k)) } - } protected def topRowWithTiny(k: Int)(implicit ord: Ordering[ValT]): Matrix[RowT, ColT, ValT] = { val topSym = Symbol(colSym.name + "_topK") - val newPipe = pipe.groupBy(rowSym){ - _ - .sortWithTake((colSym, valSym) -> 'top_vals, k) ((t0: (ColT, ValT), t1: (ColT, ValT)) => ord.gt(t0._2, t1._2)) - } + val newPipe = pipe + .groupBy(rowSym) { + _.sortWithTake((colSym, valSym) -> 'top_vals, k)((t0: (ColT, ValT), t1: (ColT, ValT)) => + ord.gt(t0._2, t1._2) + ) + } .flatMapTo((0, 1) -> (rowSym, topSym, valSym)) { imp: (RowT, List[(ColT, ValT)]) => val row = imp._1 val list = imp._2 - list.map{ imp: (ColT, ValT) => (row, imp._1, imp._2) } + list.map { imp: (ColT, ValT) => (row, imp._1, imp._2) } } new Matrix[RowT, ColT, ValT](rowSym, topSym, valSym, newPipe, FiniteHint(-1L, k)) } protected lazy val rowL0Norm = { val matD = this.asInstanceOf[Matrix[RowT, ColT, Double]] - (matD.mapValues { x => 1.0 } - .sumColVectors - .diag - .inverse) * matD + (matD.mapValues(x => 1.0).sumColVectors.diag.inverse) * matD } def rowL0Normalize(implicit ev: =:=[ValT, Double]): Matrix[RowT, ColT, Double] = rowL0Norm protected lazy val rowL1Norm = { val matD = this.asInstanceOf[Matrix[RowT, ColT, Double]] - (matD.mapValues { x => x.abs } - .sumColVectors - .diag - .inverse) * matD + (matD.mapValues(x => x.abs).sumColVectors.diag.inverse) * matD } // Row L1 normalization, only makes sense for Doubles @@ -370,10 +416,11 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy protected lazy val rowL2Norm = { val matD = this.asInstanceOf[Matrix[RowT, ColT, Double]] - (matD.mapValues { x => x * x } + (matD + .mapValues(x => x * x) 
.sumColVectors .diag - .mapValues { x => scala.math.sqrt(x) } + .mapValues(x => scala.math.sqrt(x)) .diagonal .inverse) * matD } @@ -385,7 +432,7 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy // Double ValT only (only over the observed values, not dividing by the unobserved ones) def rowMeanCentering(implicit ev: =:=[ValT, Double]) = { val matD = this.asInstanceOf[Matrix[RowT, ColT, Double]] - matD.mapRows { Matrix.meanCenter } + matD.mapRows(Matrix.meanCenter) } // Row non-zeroes, ave and standard deviation in one pass - Double ValT only @@ -396,13 +443,14 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy val newValSym = Symbol(valSym.name + "_newVal") val newPipe = inPipe - .groupBy(rowSym) { _.sizeAveStdev((valSym) -> ('size, 'ave, 'stdev)) } - .flatMapTo((rowSym, 'size, 'ave, 'stdev) -> (rowSym, newColSym, newValSym)) { tup: (RowT, Long, Double, Double) => - val row = tup._1 - val size = tup._2.toDouble - val avg = tup._3 - val stdev = tup._4 - List((row, 1, size), (row, 2, avg), (row, 3, stdev)) + .groupBy(rowSym)(_.sizeAveStdev(valSym -> ('size, 'ave, 'stdev))) + .flatMapTo((rowSym, 'size, 'ave, 'stdev) -> (rowSym, newColSym, newValSym)) { + tup: (RowT, Long, Double, Double) => + val row = tup._1 + val size = tup._2.toDouble + val avg = tup._3 + val stdev = tup._4 + List((row, 1, size), (row, 2, avg), (row, 3, stdev)) } val newHint = sizeHint.setCols(3L) new Matrix[RowT, Int, Double](rowSym, newColSym, newValSym, newPipe, newHint) @@ -412,126 +460,112 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy // Column operations - see Row operations above - def getCol(index: ColT): ColVector[RowT, ValT] = { + def getCol(index: ColT): ColVector[RowT, ValT] = this.transpose.getRow(index).transpose - } - def reduceColVectors(fn: (ValT, ValT) => ValT)(implicit mon: Monoid[ValT]): ColVector[RowT, ValT] = { + def reduceColVectors(fn: (ValT, ValT) => ValT)(implicit mon: Monoid[ValT]): ColVector[RowT, ValT] = this.transpose.reduceRowVectors(fn)(mon).transpose - } - def sumColVectors(implicit mon: Monoid[ValT]): ColVector[RowT, ValT] = { + def sumColVectors(implicit mon: Monoid[ValT]): ColVector[RowT, ValT] = this.transpose.sumRowVectors(mon).transpose - } - def mapCols(fn: Iterable[(RowT, ValT)] => Iterable[(RowT, ValT)])(implicit mon: Monoid[ValT]): Matrix[RowT, ColT, ValT] = { + def mapCols(fn: Iterable[(RowT, ValT)] => Iterable[(RowT, ValT)])(implicit + mon: Monoid[ValT] + ): Matrix[RowT, ColT, ValT] = this.transpose.mapRows(fn)(mon).transpose - } - def topColElems(k: Int)(implicit ord: Ordering[ValT]): Matrix[RowT, ColT, ValT] = { + def topColElems(k: Int)(implicit ord: Ordering[ValT]): Matrix[RowT, ColT, ValT] = this.transpose.topRowElems(k)(ord).transpose - } - def colL0Normalize(implicit ev: =:=[ValT, Double]) = { + def colL0Normalize(implicit ev: =:=[ValT, Double]) = this.transpose.rowL0Normalize.transpose - } - def colL1Normalize(implicit ev: =:=[ValT, Double]) = { + def colL1Normalize(implicit ev: =:=[ValT, Double]) = this.transpose.rowL1Normalize.transpose - } - def colL2Normalize(implicit ev: =:=[ValT, Double]) = { + def colL2Normalize(implicit ev: =:=[ValT, Double]) = this.transpose.rowL2Normalize.transpose - } - def colMeanCentering(implicit ev: =:=[ValT, Double]) = { + def colMeanCentering(implicit ev: =:=[ValT, Double]) = this.transpose.rowMeanCentering.transpose - } - def colSizeAveStdev(implicit ev: =:=[ValT, Double]) = { + def colSizeAveStdev(implicit ev: =:=[ValT, Double]) = 
this.transpose.rowSizeAveStdev - } - def *[That, Res](that: That)(implicit prod: MatrixProduct[Matrix[RowT, ColT, ValT], That, Res]): Res = { + def *[That, Res](that: That)(implicit prod: MatrixProduct[Matrix[RowT, ColT, ValT], That, Res]): Res = prod(this, that) - } def /(that: LiteralScalar[ValT])(implicit field: Field[ValT]) = { field.assertNotZero(that.value) mapValues(elem => field.div(elem, that.value)) } - def /(that: Scalar[ValT])(implicit field: Field[ValT]) = { + def /(that: Scalar[ValT])(implicit field: Field[ValT]) = nonZerosWith(that) - .mapValues({ leftRight: (ValT, ValT) => + .mapValues { leftRight: (ValT, ValT) => val (left, right) = leftRight field.div(left, right) - }) - } + } // Between Matrix value reduction - Generalizes matrix addition with an arbitrary value aggregation function // It assumes that the function fn(0,0) = 0 // This function assumes only one value in each matrix for a given row and column index. (no stacking of operations yet) // TODO: Optimize this later and be lazy on groups and joins. - def elemWiseOp(that: Matrix[RowT, ColT, ValT])(fn: (ValT, ValT) => ValT)(implicit mon: Monoid[ValT]): Matrix[RowT, ColT, ValT] = { + def elemWiseOp(that: Matrix[RowT, ColT, ValT])(fn: (ValT, ValT) => ValT)(implicit + mon: Monoid[ValT] + ): Matrix[RowT, ColT, ValT] = // If the following is not true, it's not clear this is meaningful // assert(mon.isZero(fn(mon.zero,mon.zero)), "f is illdefined") - zip(that).mapValues({ pair => fn(pair._1, pair._2) })(mon) - } + zip(that).mapValues { pair => fn(pair._1, pair._2) }(mon) // Matrix summation - def +(that: Matrix[RowT, ColT, ValT])(implicit mon: Monoid[ValT]): Matrix[RowT, ColT, ValT] = { + def +(that: Matrix[RowT, ColT, ValT])(implicit mon: Monoid[ValT]): Matrix[RowT, ColT, ValT] = if (equals(that)) { // No need to do any groupBy operation - mapValues { v => mon.plus(v, v) }(mon) + mapValues(v => mon.plus(v, v))(mon) } else { elemWiseOp(that)((x, y) => mon.plus(x, y))(mon) } - } // Matrix difference - def -(that: Matrix[RowT, ColT, ValT])(implicit grp: Group[ValT]): Matrix[RowT, ColT, ValT] = { + def -(that: Matrix[RowT, ColT, ValT])(implicit grp: Group[ValT]): Matrix[RowT, ColT, ValT] = elemWiseOp(that)((x, y) => grp.minus(x, y))(grp) - } // Matrix elementwise product / Hadamard product // see http://en.wikipedia.org/wiki/Hadamard_product_(matrices) - def hProd(mat: Matrix[RowT, ColT, ValT])(implicit ring: Ring[ValT]): Matrix[RowT, ColT, ValT] = { + def hProd(mat: Matrix[RowT, ColT, ValT])(implicit ring: Ring[ValT]): Matrix[RowT, ColT, ValT] = elemWiseOp(mat)((x, y) => ring.times(x, y))(ring) - } /** - * Considering the matrix as a graph, propagate the column: - * Does the calculation: \sum_{j where M(i,j) == true) c_j + * Considering the matrix as a graph, propagate the column: Does the calculation: \sum_{j where M(i,j) == + * true) c_j */ - def propagate[ColValT](vec: ColVector[ColT, ColValT])(implicit ev: =:=[ValT, Boolean], monT: Monoid[ColValT]): ColVector[RowT, ColValT] = { + def propagate[ColValT]( + vec: ColVector[ColT, ColValT] + )(implicit ev: =:=[ValT, Boolean], monT: Monoid[ColValT]): ColVector[RowT, ColValT] = { //This cast will always succeed: val boolMat = this.asInstanceOf[Matrix[RowT, ColT, Boolean]] - boolMat.zip(vec.transpose) - .mapValues { boolT => if (boolT._1) boolT._2 else monT.zero } - .sumColVectors + boolMat.zip(vec.transpose).mapValues(boolT => if (boolT._1) boolT._2 else monT.zero).sumColVectors } // Compute the sum of the main diagonal. 
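// A minimal sketch of the elementwise combinators above; the names are illustrative and the
// algebird Monoid/Group/Ring[Double] instances provide the zero-handling they rely on (a Ring is
// also a Group and a Monoid, so one implicit covers +, - and hProd).
import com.twitter.algebird.Ring
def weekOverWeek(thisWeek: Matrix[Long, Long, Double], lastWeek: Matrix[Long, Long, Double])(implicit
    ring: Ring[Double]
): Matrix[Long, Long, Double] = {
  val combined = thisWeek + lastWeek // union of the non-zeros, overlapping entries summed
  val delta    = thisWeek - lastWeek // group.minus on overlapping entries
  combined.hProd(delta)              // Hadamard product: only entries present on both sides survive
}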
Only makes sense cases where the row and col type are // equal - def trace(implicit mon: Monoid[ValT], ev: =:=[RowT, ColT]): Scalar[ValT] = { + def trace(implicit mon: Monoid[ValT], ev: =:=[RowT, ColT]): Scalar[ValT] = diagonal.trace(mon) - } // Compute the sum of all the elements in the matrix - def sum(implicit mon: Monoid[ValT]): Scalar[ValT] = { + def sum(implicit mon: Monoid[ValT]): Scalar[ValT] = sumRowVectors.sum - } - def transpose: Matrix[ColT, RowT, ValT] = { + def transpose: Matrix[ColT, RowT, ValT] = new Matrix[ColT, RowT, ValT](colSym, rowSym, valSym, inPipe, sizeHint.transpose) - } // This should only be called by def diagonal, which verifies that RowT == ColT protected lazy val mainDiagonal: DiagonalMatrix[RowT, ValT] = { - val diagPipe = pipe.filter(rowSym, colSym) { input: (RowT, RowT) => - (input._1 == input._2) - } + val diagPipe = pipe + .filter(rowSym, colSym) { input: (RowT, RowT) => + (input._1 == input._2) + } .project(rowSym, valSym) new DiagonalMatrix[RowT, ValT](rowSym, valSym, diagPipe, SizeHint.asDiagonal(sizeHint)) } @@ -543,7 +577,9 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy /* * This just removes zeros after the join inside a zip */ - private def cleanUpZipJoin[ValU](otherVSym: Fields, pairMonoid: Monoid[(ValT, ValU)])(joinedPipe: Pipe): Pipe = { + private def cleanUpZipJoin[ValU](otherVSym: Fields, pairMonoid: Monoid[(ValT, ValU)])( + joinedPipe: Pipe + ): Pipe = joinedPipe //Make sure the zeros are set correctly: .map(valSym -> valSym) { (x: ValT) => @@ -555,7 +591,6 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy //Put the pair into a single item, ugly in scalding sadly... .map(valSym.append(otherVSym) -> valSym) { tup: (ValT, ValU) => Tuple1(tup) } .project(rowColValSymbols) - } /* * This ensures both side rows and columns have correct indexes (fills in nulls from the other side @@ -575,7 +610,8 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy // Similar to zip, but combine the scalar on the right with all non-zeros in this matrix: def nonZerosWith[ValU](that: Scalar[ValU]): Matrix[RowT, ColT, (ValT, ValU)] = { val (newRFields, newRPipe) = ensureUniqueFields(rowColValSymbols, that.valSym, that.pipe) - val newPipe = inPipe.crossWithTiny(newRPipe) + val newPipe = inPipe + .crossWithTiny(newRPipe) .map(valSym.append(getField(newRFields, 0)) -> valSym) { leftRight: (ValT, ValU) => Tuple1(leftRight) } .project(rowColValSymbols) new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, newPipe, sizeHint) @@ -583,20 +619,22 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy // Similar to zip, but combine the scalar on the right with all non-zeros in this matrix: def nonZerosWith[ValU](that: LiteralScalar[ValU]): Matrix[RowT, ColT, (ValT, ValU)] = { - val newPipe = inPipe.map(valSym -> valSym) { left: Tuple1[ValT] => - Tuple1((left._1, that.value)) - } + val newPipe = inPipe + .map(valSym -> valSym) { left: Tuple1[ValT] => + Tuple1((left._1, that.value)) + } .project(rowColValSymbols) new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, newPipe, sizeHint) } // Override the size hint - def withSizeHint(sh: SizeHint): Matrix[RowT, ColT, ValT] = { + def withSizeHint(sh: SizeHint): Matrix[RowT, ColT, ValT] = new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, pipe, sh) - } // Zip the given row with all the rows of the matrix - def zip[ValU](that: ColVector[RowT, ValU])(implicit pairMonoid: Monoid[(ValT, ValU)]): 
Matrix[RowT, ColT, (ValT, ValU)] = { + def zip[ValU]( + that: ColVector[RowT, ValU] + )(implicit pairMonoid: Monoid[(ValT, ValU)]): Matrix[RowT, ColT, (ValT, ValU)] = { val (newRFields, newRPipe) = ensureUniqueFields(rowColValSymbols, (that.rowS, that.valS), that.pipe) // we must do an outer join to preserve zeros on one side or the other. // joinWithTiny can't do outer. And since the number @@ -605,12 +643,14 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy val zipped = cleanUpZipJoin(getField(newRFields, 1), pairMonoid) { pipe .joinWithSmaller(rowSym -> getField(newRFields, 0), newRPipe, new OuterJoin) - .thenDo{ p: RichPipe => cleanUpIndexZipJoin(rowSym.append(getField(newRFields, 0)), p) } + .thenDo { p: RichPipe => cleanUpIndexZipJoin(rowSym.append(getField(newRFields, 0)), p) } } new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, zipped, sizeHint + that.sizeH) } // Zip the given row with all the rows of the matrix - def zip[ValU](that: RowVector[ColT, ValU])(implicit pairMonoid: Monoid[(ValT, ValU)]): Matrix[RowT, ColT, (ValT, ValU)] = { + def zip[ValU]( + that: RowVector[ColT, ValU] + )(implicit pairMonoid: Monoid[(ValT, ValU)]): Matrix[RowT, ColT, (ValT, ValU)] = { val (newRFields, newRPipe) = ensureUniqueFields(rowColValSymbols, (that.colS, that.valS), that.pipe) // we must do an outer join to preserve zeros on one side or the other. // joinWithTiny can't do outer. And since the number @@ -619,13 +659,15 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy val zipped = cleanUpZipJoin(getField(newRFields, 1), pairMonoid) { pipe .joinWithSmaller(colSym -> getField(newRFields, 0), newRPipe, new OuterJoin) - .thenDo{ p: RichPipe => cleanUpIndexZipJoin(colSym.append(getField(newRFields, 0)), p) } + .thenDo { p: RichPipe => cleanUpIndexZipJoin(colSym.append(getField(newRFields, 0)), p) } } new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, zipped, sizeHint + that.sizeH) } // This creates the matrix with pairs for the entries - def zip[ValU](that: Matrix[RowT, ColT, ValU])(implicit pairMonoid: Monoid[(ValT, ValU)]): Matrix[RowT, ColT, (ValT, ValU)] = { + def zip[ValU]( + that: Matrix[RowT, ColT, ValU] + )(implicit pairMonoid: Monoid[(ValT, ValU)]): Matrix[RowT, ColT, (ValT, ValU)] = { val (newRFields, newRPipe) = ensureUniqueFields(rowColValSymbols, that.rowColValSymbols, that.pipe) // we must do an outer join to preserve zeros on one side or the other. // joinWithTiny can't do outer. 
And since the number @@ -633,18 +675,20 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy // TODO optimize the number of reducers val zipped = cleanUpZipJoin[ValU](getField(newRFields, 2), pairMonoid) { pipe - .joinWithSmaller((rowSym, colSym) -> - (getField(newRFields, 0).append(getField(newRFields, 1))), - newRPipe, new OuterJoin) - .thenDo{ p: RichPipe => cleanUpIndexZipJoin(rowSym.append(getField(newRFields, 0)), p) } - .thenDo{ p: RichPipe => cleanUpIndexZipJoin(colSym.append(getField(newRFields, 1)), p) } + .joinWithSmaller( + (rowSym, colSym) -> + (getField(newRFields, 0).append(getField(newRFields, 1))), + newRPipe, + new OuterJoin + ) + .thenDo { p: RichPipe => cleanUpIndexZipJoin(rowSym.append(getField(newRFields, 0)), p) } + .thenDo { p: RichPipe => cleanUpIndexZipJoin(colSym.append(getField(newRFields, 1)), p) } } new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, zipped, sizeHint + that.sizeHint) } - def toBlockMatrix[G](grouping: (RowT) => (G, RowT)): BlockMatrix[G, RowT, ColT, ValT] = { + def toBlockMatrix[G](grouping: (RowT) => (G, RowT)): BlockMatrix[G, RowT, ColT, ValT] = inPipe.map('row -> ('group, 'row))(grouping).toBlockMatrix(('group, 'row, 'col, 'val)) - } /** * removes any elements in this matrix that also appear in the argument matrix @@ -654,9 +698,12 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy val filterC = '___filterC___ val filterV = '___filterV___ - val joined = pipe.joinWithSmaller((rowSym, colSym) -> (filterR, filterC), - that.pipe.rename((that.rowSym, that.colSym, that.valSym) -> (filterR, filterC, filterV)), new LeftJoin) - val filtered = joined.filter(filterV){ x: ValU => null == x } + val joined = pipe.joinWithSmaller( + (rowSym, colSym) -> (filterR, filterC), + that.pipe.rename((that.rowSym, that.colSym, that.valSym) -> (filterR, filterC, filterV)), + new LeftJoin + ) + val filtered = joined.filter(filterV) { x: ValU => null == x } new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, filtered.project(rowSym, colSym, valSym)) } @@ -668,8 +715,10 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy val keepC = '___keepC___ val keepV = '___keepV___ - val joined = pipe.joinWithSmaller((rowSym, colSym) -> (keepR, keepC), - that.pipe.rename((that.rowSym, that.colSym, that.valSym) -> (keepR, keepC, keepV))) + val joined = pipe.joinWithSmaller( + (rowSym, colSym) -> (keepR, keepC), + that.pipe.rename((that.rowSym, that.colSym, that.valSym) -> (keepR, keepC, keepV)) + ) new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, joined.project(rowSym, colSym, valSym)) } @@ -696,9 +745,19 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy */ def removeRowsBy[ValU](that: ColVector[RowT, ValU]): Matrix[RowT, ColT, ValT] = { val index = '____index____ - val joined = pipe.joinWithSmaller(rowSym -> index, that.pipe.rename(that.rowS -> index).project(index), joiner = new LeftJoin) - new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, joined.filter(index){ x: RowT => null == x } - .project(rowSym, colSym, valSym)) + val joined = pipe.joinWithSmaller( + rowSym -> index, + that.pipe.rename(that.rowS -> index).project(index), + joiner = new LeftJoin + ) + new Matrix[RowT, ColT, ValT]( + rowSym, + colSym, + valSym, + joined + .filter(index) { x: RowT => null == x } + .project(rowSym, colSym, valSym) + ) } /** @@ -706,32 +765,45 @@ class Matrix[RowT, ColT, ValT](val rowSym: Symbol, val colSym: Symbol, val valSy */ def 
removeColsBy[ValU](that: RowVector[ColT, ValU]): Matrix[RowT, ColT, ValT] = { val index = '____index____ - val joined = pipe.joinWithSmaller(colSym -> index, that.pipe.rename(that.colS -> index).project(index), joiner = new LeftJoin) - new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, joined.filter(index){ x: ColT => null == x } - .project(rowSym, colSym, valSym)) + val joined = pipe.joinWithSmaller( + colSym -> index, + that.pipe.rename(that.colS -> index).project(index), + joiner = new LeftJoin + ) + new Matrix[RowT, ColT, ValT]( + rowSym, + colSym, + valSym, + joined + .filter(index) { x: ColT => null == x } + .project(rowSym, colSym, valSym) + ) } /** - * Write the matrix, optionally renaming row,col,val fields to the given fields - * then return this. + * Write the matrix, optionally renaming row,col,val fields to the given fields then return this. */ - def write(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode): Matrix[RowT, ColT, ValT] = { + def write(src: Source, outFields: Fields = Fields.NONE)(implicit + fd: FlowDef, + mode: Mode + ): Matrix[RowT, ColT, ValT] = { writePipe(src, outFields) this } } class LiteralScalar[ValT](val value: ValT) extends java.io.Serializable { - def *[That, Res](that: That)(implicit prod: MatrixProduct[LiteralScalar[ValT], That, Res]): Res = { prod(this, that) } + def *[That, Res](that: That)(implicit prod: MatrixProduct[LiteralScalar[ValT], That, Res]): Res = + prod(this, that) } class Scalar[ValT](val valSym: Symbol, inPipe: Pipe) extends WrappedPipe with java.io.Serializable { def pipe = inPipe def fields = valSym - def *[That, Res](that: That)(implicit prod: MatrixProduct[Scalar[ValT], That, Res]): Res = { prod(this, that) } + def *[That, Res](that: That)(implicit prod: MatrixProduct[Scalar[ValT], That, Res]): Res = prod(this, that) + /** - * Write the Scalar, optionally renaming val fields to the given fields - * then return this. + * Write the Scalar, optionally renaming val fields to the given fields then return this. 
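// A minimal end-to-end sketch of the fields-based Matrix API above; the input path, field names
// and job name are illustrative assumptions only.
import com.twitter.scalding._
import com.twitter.scalding.mathematics.Matrix._

class NormalizedTopScoresJob(args: Args) extends Job(args) {
  // (user, item, score) triples read as a sparse Double matrix
  val scores = Tsv(args("input"), ('user, 'item, 'score)).read
    .toMatrix[Long, Long, Double]('user, 'item, 'score)

  // L2-normalize every row, keep the 20 largest entries per row, write (row, col, val) triples
  scores.rowL2Normalize
    .topRowElems(20)
    .write(Tsv(args("output")))
}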
*/ def write(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode) = { writePipe(src, outFields) @@ -739,11 +811,16 @@ class Scalar[ValT](val valSym: Symbol, inPipe: Pipe) extends WrappedPipe with ja } } -class DiagonalMatrix[IdxT, ValT](val idxSym: Symbol, - val valSym: Symbol, inPipe: Pipe, val sizeHint: SizeHint = FiniteHint(1L, -1L)) - extends WrappedPipe with java.io.Serializable { +class DiagonalMatrix[IdxT, ValT]( + val idxSym: Symbol, + val valSym: Symbol, + inPipe: Pipe, + val sizeHint: SizeHint = FiniteHint(1L, -1L) +) extends WrappedPipe + with java.io.Serializable { - def *[That, Res](that: That)(implicit prod: MatrixProduct[DiagonalMatrix[IdxT, ValT], That, Res]): Res = { prod(this, that) } + def *[That, Res](that: That)(implicit prod: MatrixProduct[DiagonalMatrix[IdxT, ValT], That, Res]): Res = + prod(this, that) def pipe = inPipe def fields = (idxSym, valSym) @@ -755,17 +832,16 @@ class DiagonalMatrix[IdxT, ValT](val idxSym: Symbol, } new Scalar[ValT](valSym, scalarPipe) } - def toCol: ColVector[IdxT, ValT] = { + def toCol: ColVector[IdxT, ValT] = new ColVector[IdxT, ValT](idxSym, valSym, inPipe, sizeHint.setRows(1L)) - } - def toRow: RowVector[IdxT, ValT] = { + def toRow: RowVector[IdxT, ValT] = new RowVector[IdxT, ValT](idxSym, valSym, inPipe, sizeHint.setCols(1L)) - } // Inverse of this matrix *IGNORING ZEROS* def inverse(implicit field: Field[ValT]): DiagonalMatrix[IdxT, ValT] = { val diagPipe = inPipe.flatMap(valSym -> valSym) { element: ValT => - field.nonZeroOption(element) - .map { field.inverse } + field + .nonZeroOption(element) + .map(field.inverse) } new DiagonalMatrix[IdxT, ValT](idxSym, valSym, diagPipe, sizeHint) } @@ -774,14 +850,13 @@ class DiagonalMatrix[IdxT, ValT](val idxSym: Symbol, def mapValues[ValU](fn: (ValT) => ValU)(implicit mon: Monoid[ValU]): DiagonalMatrix[IdxT, ValU] = { val newPipe = pipe.flatMap(valSym -> valSym) { imp: Tuple1[ValT] => // Ensure an arity of 1 //This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. - mon.nonZeroOption(fn(imp._1)).map { Tuple1(_) } + mon.nonZeroOption(fn(imp._1)).map(Tuple1(_)) } new DiagonalMatrix[IdxT, ValU](this.idxSym, this.valSym, newPipe, sizeHint) } /** - * Write optionally renaming val fields to the given fields - * then return this. + * Write optionally renaming val fields to the given fields then return this. 
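// The normalization idiom that rowL1Normalize uses internally, spelled out with ColVector.diag and
// DiagonalMatrix.inverse (inverse skips zeros, so empty rows simply stay empty). The name is
// illustrative and Field[Double] is assumed to come from algebird's implicit instances.
import com.twitter.algebird.Field
def rowNormalizeBySum(mat: Matrix[Long, Long, Double])(implicit
    field: Field[Double]
): Matrix[Long, Long, Double] = {
  val rowSums = mat.mapValues(_.abs).sumColVectors // one entry per row: the sum of |values|
  rowSums.diag.inverse * mat                       // scale each row by 1 / its sum
}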
*/ def write(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode) = { writePipe(src, outFields) @@ -789,23 +864,31 @@ class DiagonalMatrix[IdxT, ValT](val idxSym: Symbol, } } -class RowVector[ColT, ValT](val colS: Symbol, val valS: Symbol, inPipe: Pipe, val sizeH: SizeHint = FiniteHint(1L, -1L)) - extends java.io.Serializable with WrappedPipe { +class RowVector[ColT, ValT]( + val colS: Symbol, + val valS: Symbol, + inPipe: Pipe, + val sizeH: SizeHint = FiniteHint(1L, -1L) +) extends java.io.Serializable + with WrappedPipe { def pipe = inPipe.project(colS, valS) def fields = (colS, valS) - def *[That, Res](that: That)(implicit prod: MatrixProduct[RowVector[ColT, ValT], That, Res]): Res = { prod(this, that) } + def *[That, Res](that: That)(implicit prod: MatrixProduct[RowVector[ColT, ValT], That, Res]): Res = + prod(this, that) - def +(that: RowVector[ColT, ValT])(implicit mon: Monoid[ValT]) = (this.toMatrix(true) + that.toMatrix(true)).getRow(true) + def +(that: RowVector[ColT, ValT])(implicit mon: Monoid[ValT]) = + (this.toMatrix(true) + that.toMatrix(true)).getRow(true) - def -(that: RowVector[ColT, ValT])(implicit group: Group[ValT]) = (this.toMatrix(true) - that.toMatrix(true)).getRow(true) + def -(that: RowVector[ColT, ValT])(implicit group: Group[ValT]) = + (this.toMatrix(true) - that.toMatrix(true)).getRow(true) - def hProd(that: RowVector[ColT, ValT])(implicit ring: Ring[ValT]): RowVector[ColT, ValT] = (this.transpose hProd that.transpose).transpose + def hProd(that: RowVector[ColT, ValT])(implicit ring: Ring[ValT]): RowVector[ColT, ValT] = + this.transpose.hProd(that.transpose).transpose - def transpose: ColVector[ColT, ValT] = { + def transpose: ColVector[ColT, ValT] = new ColVector[ColT, ValT](colS, valS, inPipe, sizeH.transpose) - } def diag: DiagonalMatrix[ColT, ValT] = { val newHint = SizeHint.asDiagonal(sizeH.setRowsToCols) @@ -813,12 +896,15 @@ class RowVector[ColT, ValT](val colS: Symbol, val valS: Symbol, inPipe: Pipe, va } /** - * like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. - * Note you will only see non-zero elements on the vector. This does not enumerate the zeros + * like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. Note you will only see non-zero elements + * on the vector. This does not enumerate the zeros */ - def mapWithIndex[ValNew](fn: (ValT, ColT) => ValNew)(implicit mon: Monoid[ValNew]): RowVector[ColT, ValNew] = { - val newPipe = pipe.mapTo((valS, colS) -> (valS, colS)) { tup: (ValT, ColT) => (fn(tup._1, tup._2), tup._2) } - .filter(valS) { (v: ValNew) => mon.isNonZero(v) } + def mapWithIndex[ValNew]( + fn: (ValT, ColT) => ValNew + )(implicit mon: Monoid[ValNew]): RowVector[ColT, ValNew] = { + val newPipe = pipe + .mapTo((valS, colS) -> (valS, colS)) { tup: (ValT, ColT) => (fn(tup._1, tup._2), tup._2) } + .filter(valS)((v: ValNew) => mon.isNonZero(v)) new RowVector(colS, valS, newPipe, sizeH) } @@ -826,7 +912,7 @@ class RowVector[ColT, ValT](val colS: Symbol, val valS: Symbol, inPipe: Pipe, va def mapValues[ValU](fn: (ValT) => ValU)(implicit mon: Monoid[ValU]): RowVector[ColT, ValU] = { val newPipe = pipe.flatMap(valS -> valS) { imp: Tuple1[ValT] => // Ensure an arity of 1 //This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. 
- mon.nonZeroOption(fn(imp._1)).map { Tuple1(_) } + mon.nonZeroOption(fn(imp._1)).map(Tuple1(_)) } new RowVector[ColT, ValU](this.colS, this.valS, newPipe, sizeH) } @@ -834,26 +920,31 @@ class RowVector[ColT, ValT](val colS: Symbol, val valS: Symbol, inPipe: Pipe, va /** * Do a right-propogation of a row, transpose of Matrix.propagate */ - def propagate[MatColT](mat: Matrix[ColT, MatColT, Boolean])(implicit monT: Monoid[ValT]): RowVector[MatColT, ValT] = { + def propagate[MatColT](mat: Matrix[ColT, MatColT, Boolean])(implicit + monT: Monoid[ValT] + ): RowVector[MatColT, ValT] = mat.transpose.propagate(this.transpose).transpose - } def L0Normalize(implicit ev: =:=[ValT, Double]): RowVector[ColT, ValT] = { val normedMatrix = this.toMatrix(0).rowL0Normalize - new RowVector(normedMatrix.colSym, + new RowVector( + normedMatrix.colSym, normedMatrix.valSym, - normedMatrix.pipe.project(normedMatrix.colSym, normedMatrix.valSym)) + normedMatrix.pipe.project(normedMatrix.colSym, normedMatrix.valSym) + ) } def L1Normalize(implicit ev: =:=[ValT, Double]): RowVector[ColT, ValT] = { val normedMatrix = this.toMatrix(0).rowL1Normalize - new RowVector(normedMatrix.colSym, + new RowVector( + normedMatrix.colSym, normedMatrix.valSym, - normedMatrix.pipe.project(normedMatrix.colSym, normedMatrix.valSym)) + normedMatrix.pipe.project(normedMatrix.colSym, normedMatrix.valSym) + ) } def sum(implicit mon: Monoid[ValT]): Scalar[ValT] = { - val scalarPipe = pipe.groupAll{ + val scalarPipe = pipe.groupAll { _.reduce(valS -> valS) { (left: Tuple1[ValT], right: Tuple1[ValT]) => Tuple1(mon.plus(left._1, right._1)) } @@ -861,7 +952,7 @@ class RowVector[ColT, ValT](val colS: Symbol, val valS: Symbol, inPipe: Pipe, va new Scalar[ValT](valS, scalarPipe) } - def topElems(k: Int)(implicit ord: Ordering[ValT]): RowVector[ColT, ValT] = { + def topElems(k: Int)(implicit ord: Ordering[ValT]): RowVector[ColT, ValT] = // TODO this should be tunable: if (k < 1000) { topWithTiny(k) } else { @@ -869,41 +960,41 @@ class RowVector[ColT, ValT](val colS: Symbol, val valS: Symbol, inPipe: Pipe, va val ordValS = new Fields(fieldName) ordValS.setComparator(fieldName, ord) - val newPipe = pipe.groupAll{ - _ - .sortBy(ordValS) - .reverse - .take(k) - }.project(colS, valS) + val newPipe = pipe + .groupAll { + _.sortBy(ordValS).reverse + .take(k) + } + .project(colS, valS) new RowVector[ColT, ValT](colS, valS, newPipe, sizeH.setCols(k).setRows(1L)) } - } protected def topWithTiny(k: Int)(implicit ord: Ordering[ValT]): RowVector[ColT, ValT] = { val topSym = Symbol(colS.name + "_topK") - val newPipe = pipe.groupAll{ - _ - .sortWithTake((colS, valS) -> 'top_vals, k) ((t0: (ColT, ValT), t1: (ColT, ValT)) => ord.gt(t0._2, t1._2)) - } + val newPipe = pipe + .groupAll { + _.sortWithTake((colS, valS) -> 'top_vals, k)((t0: (ColT, ValT), t1: (ColT, ValT)) => + ord.gt(t0._2, t1._2) + ) + } .flatMap('top_vals -> (topSym, valS)) { imp: List[(ColT, ValT)] => imp } new RowVector[ColT, ValT](topSym, valS, newPipe, sizeH.setCols(k).setRows(1L)) } def toMatrix[RowT](rowId: RowT): Matrix[RowT, ColT, ValT] = { val rowSym = newSymbol(Set(colS, valS), 'row) //Matrix.newSymbol(Set(colS, valS), 'row) - val newPipe = inPipe.map(() -> rowSym){ u: Unit => rowId } + val newPipe = inPipe + .map(() -> rowSym) { u: Unit => rowId } .project(rowSym, colS, valS) new Matrix[RowT, ColT, ValT](rowSym, colS, valS, newPipe, sizeH.setRows(1L)) } // Override the size hint - def withColsHint(cols: Long): RowVector[ColT, ValT] = { + def withColsHint(cols: Long): RowVector[ColT, ValT] = 
new RowVector[ColT, ValT](colS, valS, pipe, sizeH.setRows(1L).setCols(cols)) - } /** - * Write optionally renaming val fields to the given fields - * then return this. + * Write optionally renaming val fields to the given fields then return this. */ def write(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode) = { writePipe(src, outFields) @@ -911,23 +1002,31 @@ class RowVector[ColT, ValT](val colS: Symbol, val valS: Symbol, inPipe: Pipe, va } } -class ColVector[RowT, ValT](val rowS: Symbol, val valS: Symbol, inPipe: Pipe, val sizeH: SizeHint = FiniteHint(-1L, 1L)) - extends java.io.Serializable with WrappedPipe { +class ColVector[RowT, ValT]( + val rowS: Symbol, + val valS: Symbol, + inPipe: Pipe, + val sizeH: SizeHint = FiniteHint(-1L, 1L) +) extends java.io.Serializable + with WrappedPipe { def pipe = inPipe.project(rowS, valS) def fields = (rowS, valS) - def *[That, Res](that: That)(implicit prod: MatrixProduct[ColVector[RowT, ValT], That, Res]): Res = { prod(this, that) } + def *[That, Res](that: That)(implicit prod: MatrixProduct[ColVector[RowT, ValT], That, Res]): Res = + prod(this, that) - def +(that: ColVector[RowT, ValT])(implicit mon: Monoid[ValT]) = (this.toMatrix(true) + that.toMatrix(true)).getCol(true) + def +(that: ColVector[RowT, ValT])(implicit mon: Monoid[ValT]) = + (this.toMatrix(true) + that.toMatrix(true)).getCol(true) - def -(that: ColVector[RowT, ValT])(implicit group: Group[ValT]) = (this.toMatrix(true) - that.toMatrix(true)).getCol(true) + def -(that: ColVector[RowT, ValT])(implicit group: Group[ValT]) = + (this.toMatrix(true) - that.toMatrix(true)).getCol(true) - def hProd(that: ColVector[RowT, ValT])(implicit ring: Ring[ValT]): ColVector[RowT, ValT] = (this.toMatrix(true) hProd that.toMatrix(true)).getCol(true) + def hProd(that: ColVector[RowT, ValT])(implicit ring: Ring[ValT]): ColVector[RowT, ValT] = + this.toMatrix(true).hProd(that.toMatrix(true)).getCol(true) - def transpose: RowVector[RowT, ValT] = { + def transpose: RowVector[RowT, ValT] = new RowVector[RowT, ValT](rowS, valS, inPipe, sizeH.transpose) - } def diag: DiagonalMatrix[RowT, ValT] = { val newHint = SizeHint.asDiagonal(sizeH.setColsToRows) @@ -935,22 +1034,24 @@ class ColVector[RowT, ValT](val rowS: Symbol, val valS: Symbol, inPipe: Pipe, va } /** - * like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. - * Note you will only see non-zero elements on the vector. This does not enumerate the zeros + * like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. Note you will only see non-zero elements + * on the vector. This does not enumerate the zeros */ - def mapWithIndex[ValNew](fn: (ValT, RowT) => ValNew)(implicit mon: Monoid[ValNew]): ColVector[RowT, ValNew] = transpose.mapWithIndex(fn).transpose + def mapWithIndex[ValNew](fn: (ValT, RowT) => ValNew)(implicit + mon: Monoid[ValNew] + ): ColVector[RowT, ValNew] = transpose.mapWithIndex(fn).transpose // Value operations def mapValues[ValU](fn: (ValT) => ValU)(implicit mon: Monoid[ValU]): ColVector[RowT, ValU] = { val newPipe = pipe.flatMap(valS -> valS) { imp: Tuple1[ValT] => // Ensure an arity of 1 //This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. 
- mon.nonZeroOption(fn(imp._1)).map { Tuple1(_) } + mon.nonZeroOption(fn(imp._1)).map(Tuple1(_)) } new ColVector[RowT, ValU](this.rowS, this.valS, newPipe, sizeH) } def sum(implicit mon: Monoid[ValT]): Scalar[ValT] = { - val scalarPipe = pipe.groupAll{ + val scalarPipe = pipe.groupAll { _.reduce(valS -> valS) { (left: Tuple1[ValT], right: Tuple1[ValT]) => Tuple1(mon.plus(left._1, right._1)) } @@ -960,56 +1061,60 @@ class ColVector[RowT, ValT](val rowS: Symbol, val valS: Symbol, inPipe: Pipe, va def L0Normalize(implicit ev: =:=[ValT, Double]): ColVector[RowT, ValT] = { val normedMatrix = this.toMatrix(0).colL0Normalize - new ColVector(normedMatrix.rowSym, + new ColVector( + normedMatrix.rowSym, normedMatrix.valSym, - normedMatrix.pipe.project(normedMatrix.rowSym, normedMatrix.valSym)) + normedMatrix.pipe.project(normedMatrix.rowSym, normedMatrix.valSym) + ) } def L1Normalize(implicit ev: =:=[ValT, Double]): ColVector[RowT, ValT] = { val normedMatrix = this.toMatrix(0).colL1Normalize - new ColVector(normedMatrix.rowSym, + new ColVector( + normedMatrix.rowSym, normedMatrix.valSym, - normedMatrix.pipe.project(normedMatrix.rowSym, normedMatrix.valSym)) + normedMatrix.pipe.project(normedMatrix.rowSym, normedMatrix.valSym) + ) } - def topElems(k: Int)(implicit ord: Ordering[ValT]): ColVector[RowT, ValT] = { + def topElems(k: Int)(implicit ord: Ordering[ValT]): ColVector[RowT, ValT] = if (k < 1000) { topWithTiny(k) } else { - val newPipe = pipe.groupAll{ - _ - .sortBy(valS) - .reverse - .take(k) - }.project(rowS, valS) + val newPipe = pipe + .groupAll { + _.sortBy(valS).reverse + .take(k) + } + .project(rowS, valS) new ColVector[RowT, ValT](rowS, valS, newPipe, sizeH.setCols(1L).setRows(k)) } - } protected def topWithTiny(k: Int)(implicit ord: Ordering[ValT]): ColVector[RowT, ValT] = { val topSym = Symbol(rowS.name + "_topK") - val newPipe = pipe.groupAll{ - _ - .sortWithTake((rowS, valS) -> 'top_vals, k) ((t0: (RowT, ValT), t1: (RowT, ValT)) => ord.gt(t0._2, t1._2)) - } + val newPipe = pipe + .groupAll { + _.sortWithTake((rowS, valS) -> 'top_vals, k)((t0: (RowT, ValT), t1: (RowT, ValT)) => + ord.gt(t0._2, t1._2) + ) + } .flatMap('top_vals -> (topSym, valS)) { imp: List[(RowT, ValT)] => imp } new ColVector[RowT, ValT](topSym, valS, newPipe, sizeH.setCols(1L).setRows(k)) } def toMatrix[ColT](colIdx: ColT): Matrix[RowT, ColT, ValT] = { val colSym = newSymbol(Set(rowS, valS), 'col) //Matrix.newSymbol(Set(rowS, valS), 'col) - val newPipe = inPipe.map(() -> colSym){ u: Unit => colIdx } + val newPipe = inPipe + .map(() -> colSym) { u: Unit => colIdx } .project(rowS, colSym, valS) new Matrix[RowT, ColT, ValT](rowS, colSym, valS, newPipe, sizeH.setCols(1L)) } // Override the size hint - def withRowsHint(rows: Long): ColVector[RowT, ValT] = { + def withRowsHint(rows: Long): ColVector[RowT, ValT] = new ColVector[RowT, ValT](rowS, valS, pipe, sizeH.setRows(rows).setCols(1L)) - } /** - * Write optionally renaming val fields to the given fields - * then return this. + * Write optionally renaming val fields to the given fields then return this. */ def write(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode) = { writePipe(src, outFields) @@ -1018,21 +1123,24 @@ class ColVector[RowT, ValT](val rowS: Symbol, val valS: Symbol, inPipe: Pipe, va } /** - * BlockMatrix is 3 dimensional matrix where the rows are grouped - * It is useful for when we want to multiply groups of vectors only between themselves. 
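// Vector-level counterpart of the row operations above, as a sketch with illustrative names and
// sink path: pull one column out of a matrix, L1-normalize it and keep its five largest entries.
import cascading.flow.FlowDef
import com.twitter.scalding.{Mode, Tsv}
def topOfColumn(mat: Matrix[Long, Long, Double], col: Long)(implicit
    fd: FlowDef,
    mode: Mode
): ColVector[Long, Double] =
  mat
    .getCol(col)
    .L1Normalize
    .topElems(5)
    .write(Tsv("top-of-column.tsv"))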
- * For example, grouping users by countries and calculating products only between users from the same country + * BlockMatrix is 3 dimensional matrix where the rows are grouped It is useful for when we want to multiply + * groups of vectors only between themselves. For example, grouping users by countries and calculating + * products only between users from the same country */ class BlockMatrix[RowT, GroupT, ColT, ValT](private val mat: Matrix[RowT, GroupT, Map[ColT, ValT]]) { - def dotProd[RowT2](that: BlockMatrix[GroupT, RowT2, ColT, ValT])(implicit prod: MatrixProduct[Matrix[RowT, GroupT, Map[ColT, ValT]], Matrix[GroupT, RowT2, Map[ColT, ValT]], Matrix[RowT, RowT2, Map[ColT, ValT]]], - mon: Monoid[ValT]): Matrix[RowT, RowT2, ValT] = { + def dotProd[RowT2](that: BlockMatrix[GroupT, RowT2, ColT, ValT])(implicit + prod: MatrixProduct[ + Matrix[RowT, GroupT, Map[ColT, ValT]], + Matrix[GroupT, RowT2, Map[ColT, ValT]], + Matrix[RowT, RowT2, Map[ColT, ValT]] + ], + mon: Monoid[ValT] + ): Matrix[RowT, RowT2, ValT] = prod(mat, that.mat).mapValues(_.values.foldLeft(mon.zero)(mon.plus)) - } - def transpose: BlockMatrix[GroupT, RowT, ColT, ValT] = { + def transpose: BlockMatrix[GroupT, RowT, ColT, ValT] = new BlockMatrix(mat.transpose) - } - def withSizeHint(hint: SizeHint) = { + def withSizeHint(hint: SizeHint) = new BlockMatrix(mat.withSizeHint(hint)) - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala index fd3fe49daa..a8d56fa14a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala @@ -12,14 +12,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics import cascading.flow.FlowDef import com.twitter.scalding.serialization.OrderedSerialization2 import com.twitter.scalding._ -import com.twitter.scalding.typed.{ ValuePipe, EmptyValue, LiteralValue, ComputedValue } -import com.twitter.algebird.{ Semigroup, Monoid, Ring, Group, Field } +import com.twitter.scalding.typed.{ComputedValue, EmptyValue, LiteralValue, ValuePipe} +import com.twitter.algebird.{Field, Group, Monoid, Ring, Semigroup} import scala.collection.mutable.Map import scala.collection.mutable.HashMap @@ -28,16 +28,15 @@ import java.io.Serializable /** * This is the future Matrix API. The old one will be removed in scalding 0.10.0 (or 1.0.0). * - * Create Matrix2 instances with methods in the Matrix2 object. - * Note that this code optimizes the order in which it evaluates matrices, and replaces equivalent - * terms to avoid recomputation. Also, this code puts the parenthesis in the optimal place in - * terms of size according to the sizeHints. For instance: - * (A*B)*C == A*(B*C) but if B is a 10 x 10^6 matrix, and C is 10^6 x 100, - * it is better to do the B*C product first in order to avoid storing as much intermediate output. + * Create Matrix2 instances with methods in the Matrix2 object. Note that this code optimizes the order in + * which it evaluates matrices, and replaces equivalent terms to avoid recomputation. Also, this code puts the + * parenthesis in the optimal place in terms of size according to the sizeHints. 
For instance: (A*B)*C == + * A*(B*C) but if B is a 10 x 10^6 matrix, and C is 10^6 x 100, it is better to do the B*C product first in + * order to avoid storing as much intermediate output. * - * NOTE THIS REQUIREMENT: for each formula, you can only have one Ring[V] in scope. If you - * evaluate part of the formula with one Ring, and another part with another, you must go through - * a TypedPipe (call toTypedPipe) or the result may not be correct. + * NOTE THIS REQUIREMENT: for each formula, you can only have one Ring[V] in scope. If you evaluate part of + * the formula with one Ring, and another part with another, you must go through a TypedPipe (call + * toTypedPipe) or the result may not be correct. */ sealed trait Matrix2[R, C, V] extends Serializable { implicit def rowOrd: Ordering[R] @@ -47,10 +46,12 @@ sealed trait Matrix2[R, C, V] extends Serializable { def -(that: Matrix2[R, C, V])(implicit g: Group[V]): Matrix2[R, C, V] = Sum(this, that.negate, g) def unary_-(implicit g: Group[V]): Matrix2[R, C, V] = negate def negate(implicit g: Group[V]): Matrix2[R, C, V] + /** * Represents the pointwise, or Hadamard, product of two matrices. */ - def #*#(that: Matrix2[R, C, V])(implicit ring: Ring[V]): Matrix2[R, C, V] = HadamardProduct(this, that, ring) + def #*#(that: Matrix2[R, C, V])(implicit ring: Ring[V]): Matrix2[R, C, V] = + HadamardProduct(this, that, ring) // Matrix product def *[C2](that: Matrix2[C, C2, V])(implicit ring: Ring[V], mj: MatrixJoiner2): Matrix2[R, C2, V] = Product(this, that, ring) @@ -58,16 +59,17 @@ sealed trait Matrix2[R, C, V] extends Serializable { def *(that: Scalar2[V])(implicit ring: Ring[V], mj: MatrixJoiner2): Matrix2[R, C, V] = that * this def /(that: Scalar2[V])(implicit field: Field[V]): Matrix2[R, C, V] = - that divMatrix this + that.divMatrix(this) + /** * Convert the current Matrix to a TypedPipe */ def toTypedPipe: TypedPipe[(R, C, V)] def transpose: Matrix2[C, R, V] + /** - * Users should never need this. This is the current Matrix2, but in most optimized - * form. Usually, you will just do matrix operations until you eventually call write - * or toTypedPipe + * Users should never need this. This is the current Matrix2, but in most optimized form. Usually, you will + * just do matrix operations until you eventually call write or toTypedPipe */ def optimizedSelf: Matrix2[R, C, V] = Matrix2.optimize(this.asInstanceOf[Matrix2[Any, Any, V]])._2.asInstanceOf[Matrix2[R, C, V]] @@ -90,65 +92,70 @@ sealed trait Matrix2[R, C, V] extends Serializable { Product(this, OneC()(colOrd), ring) /** - * the result is the same as considering everything on the this to be like a 1 value - * so we just sum, using only a monoid on VecV, where this Matrix has the value true. - * This is useful for graph propagation of monoids, such as sketchs like HyperLogLog, - * BloomFilters or CountMinSketch. - * TODO This is a special kind of product that could be optimized like Product is + * the result is the same as considering everything on the this to be like a 1 value so we just sum, using + * only a monoid on VecV, where this Matrix has the value true. This is useful for graph propagation of + * monoids, such as sketchs like HyperLogLog, BloomFilters or CountMinSketch. 
TODO This is a special kind of + * product that could be optimized like Product is */ - def propagate[C2, VecV](vec: Matrix2[C, C2, VecV])(implicit ev: =:=[V, Boolean], - mon: Monoid[VecV], - mj: MatrixJoiner2): Matrix2[R, C2, VecV] = { + def propagate[C2, VecV]( + vec: Matrix2[C, C2, VecV] + )(implicit ev: =:=[V, Boolean], mon: Monoid[VecV], mj: MatrixJoiner2): Matrix2[R, C2, VecV] = { //This cast will always succeed: lazy val joinedBool = mj.join(this.asInstanceOf[Matrix2[R, C, Boolean]], vec) implicit val ord2: Ordering[C2] = vec.colOrd - lazy val resultPipe = joinedBool.flatMap { - case (key, ((row, bool), (col2, v))) => + lazy val resultPipe = joinedBool + .flatMap { case (key, ((row, bool), (col2, v))) => if (bool) Some((row, col2), v) else None // filter early - } + } .group // TODO we could be lazy with this group and combine with a sum .sum - .filter { kv => mon.isNonZero(kv._2) } + .filter(kv => mon.isNonZero(kv._2)) .map { case ((r, c2), v) => (r, c2, v) } MatrixLiteral(resultPipe, this.sizeHint) } - def propagateRow[C2](mat: Matrix2[C, C2, Boolean])(implicit ev: =:=[R, Unit], mon: Monoid[V], mj: MatrixJoiner2): Matrix2[Unit, C2, V] = + def propagateRow[C2]( + mat: Matrix2[C, C2, Boolean] + )(implicit ev: =:=[R, Unit], mon: Monoid[V], mj: MatrixJoiner2): Matrix2[Unit, C2, V] = mat.transpose.propagate(this.transpose.asInstanceOf[Matrix2[C, Unit, V]]).transpose // Binarize values, all x != 0 become 1 def binarizeAs[NewValT](implicit mon: Monoid[V], ring: Ring[NewValT]): Matrix2[R, C, NewValT] = { - lazy val newPipe = toTypedPipe.map { - case (r, c, x) => - (r, c, if (mon.isNonZero(x)) { ring.one } else { ring.zero }) - } - .filter { kv => ring.isNonZero(kv._3) } + lazy val newPipe = toTypedPipe + .map { case (r, c, x) => + ( + r, + c, + if (mon.isNonZero(x)) { ring.one } + else { ring.zero } + ) + } + .filter(kv => ring.isNonZero(kv._3)) MatrixLiteral(newPipe, this.sizeHint) } /** - * Row L2 normalization - * After this operation, the sum(|x|^2) along each row will be 1. + * Row L2 normalization After this operation, the sum(|x|^2) along each row will be 1. */ def rowL2Normalize(implicit num: Numeric[V], mj: MatrixJoiner2): Matrix2[R, C, Double] = { - val matD = MatrixLiteral(this.toTypedPipe.map{ case (r, c, x) => (r, c, num.toDouble(x)) }, this.sizeHint) - lazy val result = MatrixLiteral(this.toTypedPipe.map { case (r, c, x) => (r, c, num.toDouble(x) * num.toDouble(x)) }, this.sizeHint) - .sumColVectors - .toTypedPipe + val matD = + MatrixLiteral(this.toTypedPipe.map { case (r, c, x) => (r, c, num.toDouble(x)) }, this.sizeHint) + lazy val result = MatrixLiteral( + this.toTypedPipe.map { case (r, c, x) => (r, c, num.toDouble(x) * num.toDouble(x)) }, + this.sizeHint + ).sumColVectors.toTypedPipe .map { case (r, c, x) => (r, r, 1 / scala.math.sqrt(x)) } // diagonal + inverse MatrixLiteral(result, SizeHint.asDiagonal(this.sizeHint.setRowsToCols)) * matD } /** - * Row L1 normalization - * After this operation, the sum(|x|) alone each row will be 1. + * Row L1 normalization After this operation, the sum(|x|) alone each row will be 1. 
*/ def rowL1Normalize(implicit num: Numeric[V], mj: MatrixJoiner2): Matrix2[R, C, Double] = { - val matD = MatrixLiteral(this.toTypedPipe.map{ case (r, c, x) => (r, c, num.toDouble(x).abs) }, this.sizeHint) - lazy val result = matD - .sumColVectors - .toTypedPipe + val matD = + MatrixLiteral(this.toTypedPipe.map { case (r, c, x) => (r, c, num.toDouble(x).abs) }, this.sizeHint) + lazy val result = matD.sumColVectors.toTypedPipe .map { case (r, c, x) => (r, r, 1 / x) } // diagonal + inverse MatrixLiteral(result, SizeHint.asDiagonal(this.sizeHint.setRowsToCols)) * matD } @@ -157,17 +164,21 @@ sealed trait Matrix2[R, C, V] extends Serializable { MatrixLiteral( toTypedPipe .filter { case (r, c, v) => Ordering[R].equiv(r, index) } - .map { case (r, c, v) => ((), c, v) }, this.sizeHint.setRows(1L)) + .map { case (r, c, v) => ((), c, v) }, + this.sizeHint.setRows(1L) + ) def getColumn(index: C): Matrix2[R, Unit, V] = MatrixLiteral( toTypedPipe .filter { case (r, c, v) => Ordering[C].equiv(c, index) } - .map { case (r, c, v) => (r, (), v) }, this.sizeHint.setCols(1L)) + .map { case (r, c, v) => (r, (), v) }, + this.sizeHint.setCols(1L) + ) /** - * Consider this Matrix as the r2 row of a matrix. The current matrix must be a row, - * which is to say, its row type must be Unit. + * Consider this Matrix as the r2 row of a matrix. The current matrix must be a row, which is to say, its + * row type must be Unit. */ def asRow[R2](r2: R2)(implicit ev: R =:= Unit, rowOrd: Ordering[R2]): Matrix2[R2, C, V] = MatrixLiteral(toTypedPipe.map { case (r, c, v) => (r2, c, v) }, this.sizeHint) @@ -178,23 +189,28 @@ sealed trait Matrix2[R, C, V] extends Serializable { // Compute the sum of the main diagonal. Only makes sense cases where the row and col type are // equal def trace(implicit mon: Monoid[V], ev: =:=[R, C]): Scalar2[V] = - Scalar2(toTypedPipe.asInstanceOf[TypedPipe[(R, R, V)]] - .filter{ case (r1, r2, _) => Ordering[R].equiv(r1, r2) } - .map{ case (_, _, x) => x } - .sum(mon)) + Scalar2( + toTypedPipe + .asInstanceOf[TypedPipe[(R, R, V)]] + .filter { case (r1, r2, _) => Ordering[R].equiv(r1, r2) } + .map { case (_, _, x) => x } + .sum(mon) + ) def write(sink: TypedSink[(R, C, V)])(implicit fd: FlowDef, m: Mode): Matrix2[R, C, V] = MatrixLiteral(toTypedPipe.write(sink), sizeHint) } /** - * This trait allows users to plug in join algorithms - * where they are needed to improve products and propagations. - * The default works well in most cases, but highly skewed matrices may need some - * special handling + * This trait allows users to plug in join algorithms where they are needed to improve products and + * propagations. 
The default works well in most cases, but highly skewed matrices may need some special + * handling */ trait MatrixJoiner2 extends java.io.Serializable { - def join[R, C, V, C2, V2](left: Matrix2[R, C, V], right: Matrix2[C, C2, V2]): TypedPipe[(C, ((R, V), (C2, V2)))] + def join[R, C, V, C2, V2]( + left: Matrix2[R, C, V], + right: Matrix2[C, C2, V2] + ): TypedPipe[(C, ((R, V), (C2, V2)))] } object MatrixJoiner2 { @@ -202,25 +218,30 @@ object MatrixJoiner2 { // comment this out to verify we are not hiding the user's suppled values implicit def default: MatrixJoiner2 = new DefaultMatrixJoiner(10000L) - def join[R, C, V, C2, V2](left: Matrix2[R, C, V], - right: Matrix2[C, C2, V2])(implicit mj: MatrixJoiner2): TypedPipe[(C, ((R, V), (C2, V2)))] = + def join[R, C, V, C2, V2](left: Matrix2[R, C, V], right: Matrix2[C, C2, V2])(implicit + mj: MatrixJoiner2 + ): TypedPipe[(C, ((R, V), (C2, V2)))] = mj.join(left, right) } /** - * This uses standard join if the matrices are comparable size and large, - * otherwise, if one is much smaller than the other, we use a hash join + * This uses standard join if the matrices are comparable size and large, otherwise, if one is much smaller + * than the other, we use a hash join */ class DefaultMatrixJoiner(sizeRatioThreshold: Long) extends MatrixJoiner2 { - def join[R, C, V, C2, V2](left: Matrix2[R, C, V], - right: Matrix2[C, C2, V2]): TypedPipe[(C, ((R, V), (C2, V2)))] = { + def join[R, C, V, C2, V2]( + left: Matrix2[R, C, V], + right: Matrix2[C, C2, V2] + ): TypedPipe[(C, ((R, V), (C2, V2)))] = { implicit val cOrd: Ordering[C] = left.colOrd val one = left.toTypedPipe.map { case (r, c, v) => (c, (r, v)) }.group val two = right.toTypedPipe.map { case (c, c2, v2) => (c, (c2, v2)) }.group val sizeOne = left.sizeHint.total.getOrElse(BigInt(1L)) val sizeTwo = right.sizeHint.total.getOrElse(BigInt(1L)) - def swapInner[M, N](t: TypedPipe[(C, (M, N))]): TypedPipe[(C, (N, M))] = t.mapValues { t: (M, N) => t.swap } + def swapInner[M, N](t: TypedPipe[(C, (M, N))]): TypedPipe[(C, (N, M))] = t.mapValues { t: (M, N) => + t.swap + } // TODO: // use block join on tall skinny times skinny tall (or skewed): the result really big, // but the direct approach can't get much parallelism. 
@@ -244,7 +265,8 @@ final case class OneC[R, V](implicit override val rowOrd: Ordering[R]) extends M override val sizeHint: SizeHint = FiniteHint(Long.MaxValue, 1) override def colOrd = Ordering[Unit] def transpose = OneR() - override def negate(implicit g: Group[V]) = sys.error("Only used in intermediate computations, try (-1 * OneC)") + override def negate(implicit g: Group[V]) = + sys.error("Only used in intermediate computations, try (-1 * OneC)") def toTypedPipe = sys.error("Only used in intermediate computations") } @@ -255,30 +277,38 @@ final case class OneR[C, V](implicit override val colOrd: Ordering[C]) extends M override val sizeHint: SizeHint = FiniteHint(1, Long.MaxValue) override def rowOrd = Ordering[Unit] def transpose = OneC() - override def negate(implicit g: Group[V]) = sys.error("Only used in intermediate computations, try (-1 * OneR)") + override def negate(implicit g: Group[V]) = + sys.error("Only used in intermediate computations, try (-1 * OneR)") def toTypedPipe = sys.error("Only used in intermediate computations") } /** * Class representing a matrix product * - * @param left multiplicand - * @param right multiplier + * @param left + * multiplicand + * @param right + * multiplier * @param ring - * @param expressions a HashMap of common subtrees; None if possibly not optimal (did not go through optimize), Some(...) with a HashMap that was created in optimize + * @param expressions + * a HashMap of common subtrees; None if possibly not optimal (did not go through optimize), Some(...) with + * a HashMap that was created in optimize */ -final case class Product[R, C, C2, V](left: Matrix2[R, C, V], - right: Matrix2[C, C2, V], - ring: Ring[V], - expressions: Option[Map[Matrix2[R, C2, V], TypedPipe[(R, C2, V)]]] = None)(implicit val joiner: MatrixJoiner2) extends Matrix2[R, C2, V] { +final case class Product[R, C, C2, V]( + left: Matrix2[R, C, V], + right: Matrix2[C, C2, V], + ring: Ring[V], + expressions: Option[Map[Matrix2[R, C2, V], TypedPipe[(R, C2, V)]]] = None +)(implicit val joiner: MatrixJoiner2) + extends Matrix2[R, C2, V] { /** - * Structural, NOT mathematical equality (e.g. (A*B) * C != A * (B*C)) - * Used for the Matrix2OptimizationTest (so that it doesn't care about expressions) + * Structural, NOT mathematical equality (e.g. 
(A*B) * C != A * (B*C)) Used for the Matrix2OptimizationTest + * (so that it doesn't care about expressions) */ override def equals(obj: Any): Boolean = obj match { case Product(tl, tr, _, _) => left.equals(tl) && right.equals(tr) - case _ => false + case _ => false } override def hashCode(): Int = left.hashCode ^ right.hashCode @@ -292,14 +322,15 @@ final case class Product[R, C, C2, V](left: Matrix2[R, C, V], val localRing = ring val joined = (if (leftMatrix) { - val ord: Ordering[R] = left.rowOrd - left.toTypedPipe.groupBy(x => x._1)(ord) - } else { - val ord: Ordering[C] = right.rowOrd - right.toTypedPipe.groupBy(x => x._1)(ord) - }).mapValues { _._3 } + val ord: Ordering[R] = left.rowOrd + left.toTypedPipe.groupBy(x => x._1)(ord) + } else { + val ord: Ordering[C] = right.rowOrd + right.toTypedPipe.groupBy(x => x._1)(ord) + }) + .mapValues(_._3) .sum(localRing) - .filter { kv => localRing.isNonZero(kv._2) } + .filter(kv => localRing.isNonZero(kv._2)) if (leftMatrix) { joined.map { case (r, v) => (r, (), v) }.asInstanceOf[TypedPipe[(R, C2, V)]] // we know C2 is Unit @@ -316,7 +347,8 @@ final case class Product[R, C, C2, V](left: Matrix2[R, C, V], } else { implicit val ord: Ordering[C] = right.rowOrd val localRing = ring - joiner.join(left, right) + joiner + .join(left, right) .map { case (key, ((l1, lv), (r2, rv))) => (l1, r2, localRing.times(lv, rv)) } } } else { @@ -326,25 +358,27 @@ final case class Product[R, C, C2, V](left: Matrix2[R, C, V], } } - private def computePipe(joined: TypedPipe[(R, C2, V)] = toOuterSum): TypedPipe[(R, C2, V)] = { + private def computePipe(joined: TypedPipe[(R, C2, V)] = toOuterSum): TypedPipe[(R, C2, V)] = if (isSpecialCase) { joined } else { val localRing = ring - joined.groupBy(w => (w._1, w._2)).mapValues { _._3 } + joined + .groupBy(w => (w._1, w._2)) + .mapValues(_._3) .sum(localRing) - .filter { kv => localRing.isNonZero(kv._2) } + .filter(kv => localRing.isNonZero(kv._2)) .map { case ((r, c), v) => (r, c, v) } } - } override lazy val toTypedPipe: TypedPipe[(R, C2, V)] = { expressions match { - case Some(m) => m.get(this).getOrElse { - val result = computePipe() - m.put(this, result) - result - } + case Some(m) => + m.get(this).getOrElse { + val result = computePipe() + m.put(this, result) + result + } case None => optimizedSelf.toTypedPipe } } @@ -353,16 +387,16 @@ final case class Product[R, C, C2, V](left: Matrix2[R, C, V], implicit override val rowOrd: Ordering[R] = left.rowOrd implicit override val colOrd: Ordering[C2] = right.colOrd - implicit def withOrderedSerialization: Ordering[(R, C2)] = OrderedSerialization2.maybeOrderedSerialization2(rowOrd, colOrd) + implicit def withOrderedSerialization: Ordering[(R, C2)] = + OrderedSerialization2.maybeOrderedSerialization2(rowOrd, colOrd) override lazy val transpose: Product[C2, C, R, V] = Product(right.transpose, left.transpose, ring) - override def negate(implicit g: Group[V]): Product[R, C, C2, V] = { + override def negate(implicit g: Group[V]): Product[R, C, C2, V] = if (left.sizeHint.total.getOrElse(BigInt(0L)) > right.sizeHint.total.getOrElse(BigInt(0L))) { Product(left, right.negate, ring, expressions) } else { Product(left.negate, right, ring, expressions) } - } /** * Trace(A B) = Trace(B A) so we optimize to choose the lowest cost item @@ -371,33 +405,34 @@ final case class Product[R, C, C2, V](left: Matrix2[R, C, V], val (cost1, plan1) = Matrix2.optimize(this.asInstanceOf[Matrix2[Any, Any, V]]) // linter:ignore val (cost2, plan2) = Matrix2.optimize( // linter:ignore 
Product(right.asInstanceOf[Matrix2[C, R, V]], left.asInstanceOf[Matrix2[R, C, V]], ring, None) - .asInstanceOf[Matrix2[Any, Any, V]]) + .asInstanceOf[Matrix2[Any, Any, V]] + ) if (cost1 > cost2) { val product2 = plan2.asInstanceOf[Product[C, R, C, V]] val ord = left.colOrd - val filtered = product2.toOuterSum.filter{ case (c1, c2, _) => ord.equiv(c1, c2) } - Scalar2(product2.computePipe(filtered).map{ case (_, _, x) => x }.sum(mon)) + val filtered = product2.toOuterSum.filter { case (c1, c2, _) => ord.equiv(c1, c2) } + Scalar2(product2.computePipe(filtered).map { case (_, _, x) => x }.sum(mon)) } else { val product1 = plan1.asInstanceOf[Product[R, C, R, V]] val ord = left.rowOrd - val filtered = product1.toOuterSum.filter{ case (r1, r2, _) => ord.equiv(r1, r2) } - Scalar2(product1.computePipe(filtered).map{ case (_, _, x) => x }.sum(mon)) + val filtered = product1.toOuterSum.filter { case (r1, r2, _) => ord.equiv(r1, r2) } + Scalar2(product1.computePipe(filtered).map { case (_, _, x) => x }.sum(mon)) } } } -final case class Sum[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], mon: Monoid[V]) extends Matrix2[R, C, V] { +final case class Sum[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], mon: Monoid[V]) + extends Matrix2[R, C, V] { def collectAddends(sum: Sum[R, C, V]): List[TypedPipe[(R, C, V)]] = { - def getLiteral(mat: Matrix2[R, C, V]): TypedPipe[(R, C, V)] = { + def getLiteral(mat: Matrix2[R, C, V]): TypedPipe[(R, C, V)] = mat match { - case x @ Product(_, _, _, _) => x.toOuterSum - case x @ MatrixLiteral(_, _) => x.toTypedPipe + case x @ Product(_, _, _, _) => x.toOuterSum + case x @ MatrixLiteral(_, _) => x.toTypedPipe case x @ HadamardProduct(_, _, _) => x.optimizedSelf.toTypedPipe - case _ => sys.error("Invalid addend") + case _ => sys.error("Invalid addend") } - } sum match { case Sum(l @ Sum(_, _, _), r @ Sum(_, _, _), _) => { @@ -421,9 +456,10 @@ final case class Sum[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], m } else { collectAddends(this) .reduce((x, y) => x ++ y) - .groupBy(x => (x._1, x._2)).mapValues { _._3 } + .groupBy(x => (x._1, x._2)) + .mapValues(_._3) .sum(mon) - .filter { kv => mon.isNonZero(kv._2) } + .filter(kv => mon.isNonZero(kv._2)) .map { case ((r, c), v) => (r, c, v) } } } @@ -432,7 +468,8 @@ final case class Sum[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], m implicit override val rowOrd: Ordering[R] = left.rowOrd implicit override val colOrd: Ordering[C] = left.colOrd - implicit def withOrderedSerialization: Ordering[(R, C)] = OrderedSerialization2.maybeOrderedSerialization2(rowOrd, colOrd) + implicit def withOrderedSerialization: Ordering[(R, C)] = + OrderedSerialization2.maybeOrderedSerialization2(rowOrd, colOrd) override lazy val transpose: Sum[C, R, V] = Sum(left.transpose, right.transpose, mon) override def negate(implicit g: Group[V]): Sum[R, C, V] = Sum(left.negate, right.negate, mon) @@ -440,16 +477,21 @@ final case class Sum[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], m Sum(left.sumColVectors, right.sumColVectors, mon) override def trace(implicit mon: Monoid[V], ev: =:=[R, C]): Scalar2[V] = - Scalar2(collectAddends(this).map { pipe => - pipe.asInstanceOf[TypedPipe[(R, R, V)]] - .filter { case (r, c, v) => Ordering[R].equiv(r, c) } - .map { _._3 } - }.reduce(_ ++ _).sum) + Scalar2( + collectAddends(this) + .map { pipe => + pipe + .asInstanceOf[TypedPipe[(R, R, V)]] + .filter { case (r, c, v) => Ordering[R].equiv(r, c) } + .map(_._3) + } + .reduce(_ ++ _) + .sum + ) } -final case class 
HadamardProduct[R, C, V](left: Matrix2[R, C, V], - right: Matrix2[R, C, V], - ring: Ring[V]) extends Matrix2[R, C, V] { +final case class HadamardProduct[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], ring: Ring[V]) + extends Matrix2[R, C, V] { // TODO: optimize / combine with Sums: https://github.com/tomtau/scalding/issues/14#issuecomment-22971582 override lazy val toTypedPipe: TypedPipe[(R, C, V)] = { @@ -460,14 +502,15 @@ final case class HadamardProduct[R, C, V](left: Matrix2[R, C, V], (left.optimizedSelf.toTypedPipe.map { case (r, c, v) => (r, c, (v, false)) } ++ right.optimizedSelf.toTypedPipe.map { case (r, c, v) => (r, c, (v, false)) }) .groupBy(x => (x._1, x._2)) - .mapValues { _._3 } + .mapValues(_._3) .reduce((x, y) => (ring.times(x._1, y._1), true)) - .filter { kv => kv._2._2 && ring.isNonZero(kv._2._1) } + .filter(kv => kv._2._2 && ring.isNonZero(kv._2._1)) .map { case ((r, c), v) => (r, c, v._1) } } } - override lazy val transpose: MatrixLiteral[C, R, V] = MatrixLiteral(toTypedPipe.map(x => (x._2, x._1, x._3)), sizeHint.transpose)(colOrd, rowOrd) + override lazy val transpose: MatrixLiteral[C, R, V] = + MatrixLiteral(toTypedPipe.map(x => (x._2, x._1, x._3)), sizeHint.transpose)(colOrd, rowOrd) override val sizeHint = left.sizeHint #*# right.sizeHint override def negate(implicit g: Group[V]): HadamardProduct[R, C, V] = if (left.sizeHint.total.getOrElse(BigInt(0L)) > right.sizeHint.total.getOrElse(BigInt(0L))) @@ -477,12 +520,15 @@ final case class HadamardProduct[R, C, V](left: Matrix2[R, C, V], implicit override val rowOrd: Ordering[R] = left.rowOrd implicit override val colOrd: Ordering[C] = left.colOrd - implicit def withOrderedSerialization: Ordering[(R, C)] = OrderedSerialization2.maybeOrderedSerialization2(rowOrd, colOrd) + implicit def withOrderedSerialization: Ordering[(R, C)] = + OrderedSerialization2.maybeOrderedSerialization2(rowOrd, colOrd) } -final case class MatrixLiteral[R, C, V](override val toTypedPipe: TypedPipe[(R, C, V)], - override val sizeHint: SizeHint)(implicit override val rowOrd: Ordering[R], override val colOrd: Ordering[C]) - extends Matrix2[R, C, V] { +final case class MatrixLiteral[R, C, V]( + override val toTypedPipe: TypedPipe[(R, C, V)], + override val sizeHint: SizeHint +)(implicit override val rowOrd: Ordering[R], override val colOrd: Ordering[C]) + extends Matrix2[R, C, V] { override lazy val transpose: MatrixLiteral[C, R, V] = MatrixLiteral(toTypedPipe.map(x => (x._2, x._1, x._3)), sizeHint.transpose)(colOrd, rowOrd) @@ -497,17 +543,16 @@ final case class MatrixLiteral[R, C, V](override val toTypedPipe: TypedPipe[(R, trait Scalar2[V] extends Serializable { def value: ValuePipe[V] - def +(that: Scalar2[V])(implicit sg: Semigroup[V]): Scalar2[V] = { + def +(that: Scalar2[V])(implicit sg: Semigroup[V]): Scalar2[V] = (value, that.value) match { - case (EmptyValue, _) => that + case (EmptyValue, _) => that case (LiteralValue(v1), _) => that.map(sg.plus(v1, _)) - case (_, EmptyValue) => this + case (_, EmptyValue) => this case (_, LiteralValue(v2)) => map(sg.plus(_, v2)) // TODO: optimize sums of scalars like sums of matrices: // only one M/R pass for the whole Sum. 
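Stepping back to the HadamardProduct hunk a little above: the Boolean tag plus the reduce exists only to detect cells present on both sides. A minimal standalone sketch of the same semantics on in-memory sparse maps (plain Scala, with Int arithmetic standing in for the Ring):

object HadamardSketch {
  // A sparse matrix here is just a Map from (row, col) to value.
  def hadamard[R, C](left: Map[(R, C), Int], right: Map[(R, C), Int]): Map[(R, C), Int] =
    left.collect {
      // keep a cell only if both sides have it and the product is non-zero
      case (rc, lv) if right.contains(rc) && lv * right(rc) != 0 => rc -> lv * right(rc)
    }

  def main(args: Array[String]): Unit = {
    val a = Map((0, 0) -> 2, (0, 1) -> 3)
    val b = Map((0, 0) -> 5, (1, 1) -> 7)
    println(hadamard(a, b)) // Map((0,0) -> 10); (0,1) and (1,1) drop out
  }
}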
case (_, ComputedValue(v2)) => Scalar2((value ++ v2).sum(sg)) } - } def -(that: Scalar2[V])(implicit g: Group[V]): Scalar2[V] = this + that.map(x => g.negate(x)) def *(that: Scalar2[V])(implicit ring: Ring[V]): Scalar2[V] = Scalar2(ValuePipe.fold(value, that.value)(ring.times _)) @@ -528,7 +573,7 @@ trait Scalar2[V] extends Serializable { else HadamardProduct(this * left, right, ring) case s @ Sum(left, right, mon) => Sum(this * left, this * right, mon) - case m @ MatrixLiteral(_, _) => timesLiteral(m) // handle literals here + case m @ MatrixLiteral(_, _) => timesLiteral(m) // handle literals here case x @ OneC() => Product(OneC[Unit, V](), toMatrix, ring) .asInstanceOf[Matrix2[R, C, V]] @@ -540,20 +585,20 @@ trait Scalar2[V] extends Serializable { def divMatrix[R, C](that: Matrix2[R, C, V])(implicit f: Field[V]): MatrixLiteral[R, C, V] = MatrixLiteral( that.toTypedPipe - .mapWithValue(value) { - case ((r, c, v), optV) => - (r, c, f.div(v, optV.getOrElse(f.zero))) + .mapWithValue(value) { case ((r, c, v), optV) => + (r, c, f.div(v, optV.getOrElse(f.zero))) }, - that.sizeHint)(that.rowOrd, that.colOrd) + that.sizeHint + )(that.rowOrd, that.colOrd) def timesLiteral[R, C](that: Matrix2[R, C, V])(implicit ring: Ring[V]): MatrixLiteral[R, C, V] = MatrixLiteral( that.toTypedPipe - .mapWithValue(value) { - case ((r, c, v), optV) => - (r, c, ring.times(optV.getOrElse(ring.zero), v)) + .mapWithValue(value) { case ((r, c, v), optV) => + (r, c, ring.times(optV.getOrElse(ring.zero), v)) }, - that.sizeHint)(that.rowOrd, that.colOrd) + that.sizeHint + )(that.rowOrd, that.colOrd) def map[U](fn: V => U): Scalar2[U] = Scalar2(value.map(fn)) def toMatrix: Matrix2[Unit, Unit, V] = @@ -580,21 +625,26 @@ object Matrix2 { def apply[R: Ordering, C: Ordering, V](t: TypedPipe[(R, C, V)], hint: SizeHint): Matrix2[R, C, V] = MatrixLiteral(t, hint) - def read[R, C, V](t: TypedSource[(R, C, V)], - hint: SizeHint)(implicit ordr: Ordering[R], ordc: Ordering[C]): Matrix2[R, C, V] = + def read[R, C, V](t: TypedSource[(R, C, V)], hint: SizeHint)(implicit + ordr: Ordering[R], + ordc: Ordering[C] + ): Matrix2[R, C, V] = MatrixLiteral(TypedPipe.from(t), hint) def J[R, C, V](implicit ordR: Ordering[R], ordC: Ordering[C], ring: Ring[V], mj: MatrixJoiner2) = Product(OneC[R, V]()(ordR), OneR[C, V]()(ordC), ring) /** - * The original prototype that employs the standard O(n^3) dynamic programming - * procedure to optimize a matrix chain factorization. + * The original prototype that employs the standard O(n^3) dynamic programming procedure to optimize a + * matrix chain factorization. * - * Now, it also "prefers" more spread out / bushy / less deep factorization - * which reflects more the Map/Reduce nature. + * Now, it also "prefers" more spread out / bushy / less deep factorization which reflects more the + * Map/Reduce nature. */ - def optimizeProductChain[V](p: IndexedSeq[Matrix2[Any, Any, V]], product: Option[(Ring[V], MatrixJoiner2)]): (BigInt, Matrix2[Any, Any, V]) = { + def optimizeProductChain[V]( + p: IndexedSeq[Matrix2[Any, Any, V]], + product: Option[(Ring[V], MatrixJoiner2)] + ): (BigInt, Matrix2[Any, Any, V]) = { val subchainCosts = HashMap.empty[(Int, Int), BigInt] @@ -625,7 +675,7 @@ object Matrix2 { * intermediate matrix (like `OneC`). This is not yet forbidden in the types. 
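Since optimizeProductChain appears in this hunk only through its signature, here is a standalone illustration of the classical O(n^3) matrix-chain dynamic program its scaladoc refers to (plain Scala; the cost model below is raw scalar-multiplication count rather than the size-hint cost the real method uses, and the "prefer bushy plans" tweak is omitted):

object ChainOrderSketch {
  // dims(i), dims(i+1) are the row/column counts of the i-th matrix in the chain
  def matrixChainOrder(dims: IndexedSeq[BigInt]): (BigInt, String) = {
    val n = dims.length - 1 // number of matrices
    val cost = Array.fill(n, n)(BigInt(0))
    val split = Array.fill(n, n)(0)
    for (len <- 2 to n; i <- 0 to n - len) {
      val j = i + len - 1
      // cost of multiplying the subchain i..k times the subchain (k+1)..j
      def costAt(k: Int) = cost(i)(k) + cost(k + 1)(j) + dims(i) * dims(k + 1) * dims(j + 1)
      val bestK = (i until j).minBy(costAt)
      cost(i)(j) = costAt(bestK)
      split(i)(j) = bestK
    }
    def plan(i: Int, j: Int): String =
      if (i == j) s"M$i" else s"(${plan(i, split(i)(j))} * ${plan(split(i)(j) + 1, j)})"
    (cost(0)(n - 1), plan(0, n - 1))
  }

  def main(args: Array[String]): Unit =
    // 10x100, 100x5, 5x50: the best plan is ((M0 * M1) * M2) at 7500 multiplications
    println(matrixChainOrder(IndexedSeq(BigInt(10), BigInt(100), BigInt(5), BigInt(50))))
}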
*/ @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - def generatePlan(i: Int, j: Int): Matrix2[Any, Any, V] = { + def generatePlan(i: Int, j: Int): Matrix2[Any, Any, V] = if (i == j) p(i) else { val k = splitMarkers((i, j)) @@ -635,8 +685,6 @@ object Matrix2 { Product(left, right, ring, Some(sharedMap))(joiner) } - } - val best = computeCosts(p, 0, p.length - 1) (best, generatePlan(0, p.length - 1)) @@ -644,27 +692,30 @@ object Matrix2 { /** * This function walks the input tree, finds basic blocks to optimize, - * i.e. matrix product chains that are not interrupted by summations. - * One example: - * A*B*C*(D+E)*(F*G) => "basic blocks" are ABC, D, E, and FG + * i.e. matrix product chains that are not interrupted by summations. One example: A*B*C*(D+E)*(F*G) => + * "basic blocks" are ABC, D, E, and FG * - * + it now does "global" optimization - i.e. over optimize over basic blocks. - * In the above example, we'd treat (D+E) as a temporary matrix T and optimize the whole chain ABCTFG + * + it now does "global" optimization - i.e. over optimize over basic blocks. In the above example, we'd + * treat (D+E) as a temporary matrix T and optimize the whole chain ABCTFG * - * Not sure if making use of distributivity to generate more variants would be good. - * In the above example, we could also generate ABCDFG + ABCEFG and have basic blocks: ABCDFG, and ABCEFG. - * But this would be almost twice as much work with the current cost estimation. + * Not sure if making use of distributivity to generate more variants would be good. In the above example, + * we could also generate ABCDFG + ABCEFG and have basic blocks: ABCDFG, and ABCEFG. But this would be + * almost twice as much work with the current cost estimation. */ def optimize[V](mf: Matrix2[Any, Any, V]): (BigInt, Matrix2[Any, Any, V]) = { def pair[X, Y](x: Option[X], y: Option[Y]): Option[(X, Y)] = - for { xi <- x; yi <- y } yield (xi, yi) + for { + xi <- x + yi <- y + } yield (xi, yi) /** * Recursive function - returns a flatten product chain and optimizes product chains under sums */ - def optimizeBasicBlocks(mf: Matrix2[Any, Any, V]): (List[Matrix2[Any, Any, V]], BigInt, Option[Ring[V]], Option[MatrixJoiner2]) = { - + def optimizeBasicBlocks( + mf: Matrix2[Any, Any, V] + ): (List[Matrix2[Any, Any, V]], BigInt, Option[Ring[V]], Option[MatrixJoiner2]) = mf match { // basic block of one matrix case element @ MatrixLiteral(_, _) => (List(element), 0, None, None) @@ -672,22 +723,30 @@ object Matrix2 { case Sum(left, right, mon) => { val (lastLChain, lastCost1, ringL, joinerL) = optimizeBasicBlocks(left) val (lastRChain, lastCost2, ringR, joinerR) = optimizeBasicBlocks(right) - val (cost1, newLeft) = optimizeProductChain(lastLChain.toIndexedSeq, pair(ringL, joinerL)) // linter:ignore - val (cost2, newRight) = optimizeProductChain(lastRChain.toIndexedSeq, pair(ringR, joinerR)) // linter:ignore - (List(Sum(newLeft, newRight, mon)), + val (cost1, newLeft) = + optimizeProductChain(lastLChain.toIndexedSeq, pair(ringL, joinerL)) // linter:ignore + val (cost2, newRight) = + optimizeProductChain(lastRChain.toIndexedSeq, pair(ringR, joinerR)) // linter:ignore + ( + List(Sum(newLeft, newRight, mon)), lastCost1 + lastCost2 + cost1 + cost2, ringL.orElse(ringR), - joinerL.orElse(joinerR)) + joinerL.orElse(joinerR) + ) } case HadamardProduct(left, right, ring) => { val (lastLChain, lastCost1, ringL, joinerL) = optimizeBasicBlocks(left) val (lastRChain, lastCost2, ringR, joinerR) = optimizeBasicBlocks(right) - val (cost1, newLeft) = 
optimizeProductChain(lastLChain.toIndexedSeq, pair(ringL, joinerL)) // linter:ignore - val (cost2, newRight) = optimizeProductChain(lastRChain.toIndexedSeq, pair(ringR, joinerR)) // linter:ignore - (List(HadamardProduct(newLeft, newRight, ring)), + val (cost1, newLeft) = + optimizeProductChain(lastLChain.toIndexedSeq, pair(ringL, joinerL)) // linter:ignore + val (cost2, newRight) = + optimizeProductChain(lastRChain.toIndexedSeq, pair(ringR, joinerR)) // linter:ignore + ( + List(HadamardProduct(newLeft, newRight, ring)), lastCost1 + lastCost2 + cost1 + cost2, ringL.orElse(ringR), - joinerL.orElse(joinerR)) + joinerL.orElse(joinerR) + ) } // chain (...something...)*(...something...) case p @ Product(left, right, ring, _) => { @@ -698,9 +757,9 @@ object Matrix2 { // OneC, OneR and potentially other intermediate matrices case el => (List(el), 0, None, None) } - } val (lastChain, lastCost, ring, joiner) = optimizeBasicBlocks(mf) - val (potentialCost, finalResult) = optimizeProductChain(lastChain.toIndexedSeq, pair(ring, joiner)) // linter:ignore + val (potentialCost, finalResult) = + optimizeProductChain(lastChain.toIndexedSeq, pair(ring, joiner)) // linter:ignore (lastCost + potentialCost, finalResult) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/MatrixProduct.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/MatrixProduct.scala index cf518eaf19..762d6608ea 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/MatrixProduct.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/MatrixProduct.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
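The optimizeBasicBlocks walk above is easiest to see on a toy expression tree. A standalone sketch (plain Scala; Expr, Lit, Prod and Add are illustrative names, not the Matrix2 types) of how A*B*C*(D+E)*(F*G) flattens into the factors A, B, C, (D+E), F, G, with the sum kept as a single temporary factor:

object BasicBlocksSketch {
  sealed trait Expr
  final case class Lit(name: String) extends Expr
  final case class Prod(l: Expr, r: Expr) extends Expr
  final case class Add(l: Expr, r: Expr) extends Expr

  // Factors of the outermost product chain, left to right; anything that is not a product
  // (a literal or a sum) ends the chain and becomes one factor. The real code additionally
  // recurses into the sides of a sum to optimize their own chains.
  def blocks(e: Expr): List[Expr] = e match {
    case Prod(l, r) => blocks(l) ++ blocks(r)
    case other      => List(other)
  }

  def main(args: Array[String]): Unit = {
    val expr = Prod(
      Prod(Prod(Prod(Lit("A"), Lit("B")), Lit("C")), Add(Lit("D"), Lit("E"))),
      Prod(Lit("F"), Lit("G"))
    )
    // List(Lit(A), Lit(B), Lit(C), Add(Lit(D),Lit(E)), Lit(F), Lit(G))
    println(blocks(expr))
  }
}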
-*/ + */ package com.twitter.scalding.mathematics @@ -35,14 +35,12 @@ abstract class MatrixJoiner extends java.io.Serializable { } case object AnyToTiny extends MatrixJoiner { - override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = { + override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = RichPipe(left).joinWithTiny(joinFields, right) - } } class BigToSmall(red: Int) extends MatrixJoiner { - override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = { + override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = RichPipe(left).joinWithSmaller(joinFields, right, reducers = red) - } } case object TinyToAny extends MatrixJoiner { @@ -52,9 +50,8 @@ case object TinyToAny extends MatrixJoiner { } } class SmallToBig(red: Int) extends MatrixJoiner { - override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = { + override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = RichPipe(left).joinWithLarger(joinFields, right, reducers = red) - } } abstract class MatrixCrosser extends java.io.Serializable { @@ -62,15 +59,13 @@ abstract class MatrixCrosser extends java.io.Serializable { } case object AnyCrossTiny extends MatrixCrosser { - override def apply(left: Pipe, right: Pipe): Pipe = { + override def apply(left: Pipe, right: Pipe): Pipe = RichPipe(left).crossWithTiny(right) - } } case object AnyCrossSmall extends MatrixCrosser { - override def apply(left: Pipe, right: Pipe): Pipe = { + override def apply(left: Pipe, right: Pipe): Pipe = RichPipe(left).crossWithSmaller(right) - } } trait MatrixProduct[Left, Right, Result] extends java.io.Serializable { @@ -78,41 +73,48 @@ trait MatrixProduct[Left, Right, Result] extends java.io.Serializable { } /** - * TODO: Muliplication is the expensive stuff. We need to optimize the methods below: - * This object holds the implicits to handle matrix products between various types + * TODO: Muliplication is the expensive stuff. 
We need to optimize the methods below: This object holds the + * implicits to handle matrix products between various types */ object MatrixProduct extends java.io.Serializable { // These are VARS, so you can set them before you start: var maxTinyJoin = 100000L // Bigger than this, and we use joinWithSmaller var maxReducers = 200 - def numOfReducers(hint: SizeHint) = { - hint.total.map { tot => - // + 1L is to make sure there is at least once reducer - (tot / MatrixProduct.maxTinyJoin + 1L).toInt min MatrixProduct.maxReducers - }.getOrElse(-1) - } + def numOfReducers(hint: SizeHint) = + hint.total + .map { tot => + // + 1L is to make sure there is at least once reducer + (tot / MatrixProduct.maxTinyJoin + 1L).toInt.min(MatrixProduct.maxReducers) + } + .getOrElse(-1) def getJoiner(leftSize: SizeHint, rightSize: SizeHint): MatrixJoiner = { val newHint = leftSize * rightSize if (SizeHintOrdering.lteq(leftSize, rightSize)) { // If leftsize is definite: - leftSize.total.map { t => if (t < maxTinyJoin) TinyToAny else new SmallToBig(numOfReducers(newHint)) } + leftSize.total + .map(t => if (t < maxTinyJoin) TinyToAny else new SmallToBig(numOfReducers(newHint))) // Else just assume the right is smaller, but both are unknown: .getOrElse(new BigToSmall(numOfReducers(newHint))) } else { // left > right - rightSize.total.map { rs => - if (rs < maxTinyJoin) AnyToTiny else new BigToSmall(numOfReducers(newHint)) - }.getOrElse(new BigToSmall(numOfReducers(newHint))) + rightSize.total + .map { rs => + if (rs < maxTinyJoin) AnyToTiny else new BigToSmall(numOfReducers(newHint)) + } + .getOrElse(new BigToSmall(numOfReducers(newHint))) } } def getCrosser(rightSize: SizeHint): MatrixCrosser = - rightSize.total.map { t => if (t < maxTinyJoin) AnyCrossTiny else AnyCrossSmall } + rightSize.total + .map(t => if (t < maxTinyJoin) AnyCrossTiny else AnyCrossSmall) .getOrElse(AnyCrossSmall) - implicit def literalScalarRightProduct[Row, Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[Matrix[Row, Col, ValT], LiteralScalar[ValT], Matrix[Row, Col, ValT]] = + implicit def literalScalarRightProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[Row, Col, ValT], LiteralScalar[ValT], Matrix[Row, Col, ValT]] = new MatrixProduct[Matrix[Row, Col, ValT], LiteralScalar[ValT], Matrix[Row, Col, ValT]] { def apply(left: Matrix[Row, Col, ValT], right: LiteralScalar[ValT]) = { val newPipe = left.pipe.map(left.valSym -> left.valSym) { (v: ValT) => @@ -122,7 +124,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def literalRightProduct[Row, Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[Matrix[Row, Col, ValT], ValT, Matrix[Row, Col, ValT]] = + implicit def literalRightProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[Row, Col, ValT], ValT, Matrix[Row, Col, ValT]] = new MatrixProduct[Matrix[Row, Col, ValT], ValT, Matrix[Row, Col, ValT]] { def apply(left: Matrix[Row, Col, ValT], right: ValT) = { val newPipe = left.pipe.map(left.valSym -> left.valSym) { (v: ValT) => @@ -132,7 +136,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def literalScalarLeftProduct[Row, Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[LiteralScalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, ValT]] = + implicit def literalScalarLeftProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[LiteralScalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, ValT]] = new MatrixProduct[LiteralScalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, 
ValT]] { def apply(left: LiteralScalar[ValT], right: Matrix[Row, Col, ValT]) = { val newPipe = right.pipe.map(right.valSym -> right.valSym) { (v: ValT) => @@ -142,27 +148,35 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def scalarPipeRightProduct[Row, Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[Matrix[Row, Col, ValT], Scalar[ValT], Matrix[Row, Col, ValT]] = + implicit def scalarPipeRightProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[Row, Col, ValT], Scalar[ValT], Matrix[Row, Col, ValT]] = new MatrixProduct[Matrix[Row, Col, ValT], Scalar[ValT], Matrix[Row, Col, ValT]] { - def apply(left: Matrix[Row, Col, ValT], right: Scalar[ValT]) = { - left.nonZerosWith(right).mapValues({ leftRight => - val (left, right) = leftRight - ring.times(left, right) - })(ring) - } + def apply(left: Matrix[Row, Col, ValT], right: Scalar[ValT]) = + left + .nonZerosWith(right) + .mapValues { leftRight => + val (left, right) = leftRight + ring.times(left, right) + }(ring) } - implicit def scalarPipeLeftProduct[Row, Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[Scalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, ValT]] = + implicit def scalarPipeLeftProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Scalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, ValT]] = new MatrixProduct[Scalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, ValT]] { - def apply(left: Scalar[ValT], right: Matrix[Row, Col, ValT]) = { - right.nonZerosWith(left).mapValues({ matScal => - val (matVal, scalarVal) = matScal - ring.times(scalarVal, matVal) - })(ring) - } + def apply(left: Scalar[ValT], right: Matrix[Row, Col, ValT]) = + right + .nonZerosWith(left) + .mapValues { matScal => + val (matVal, scalarVal) = matScal + ring.times(scalarVal, matVal) + }(ring) } - implicit def scalarRowRightProduct[Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[RowVector[Col, ValT], Scalar[ValT], RowVector[Col, ValT]] = + implicit def scalarRowRightProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[Col, ValT], Scalar[ValT], RowVector[Col, ValT]] = new MatrixProduct[RowVector[Col, ValT], Scalar[ValT], RowVector[Col, ValT]] { def apply(left: RowVector[Col, ValT], right: Scalar[ValT]): RowVector[Col, ValT] = { val prod = left.toMatrix(0) * right @@ -171,7 +185,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def scalarRowLeftProduct[Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[Scalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] = + implicit def scalarRowLeftProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Scalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] = new MatrixProduct[Scalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] { def apply(left: Scalar[ValT], right: RowVector[Col, ValT]): RowVector[Col, ValT] = { val prod = (right.transpose.toMatrix(0)) * left @@ -180,7 +196,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def scalarColRightProduct[Row, ValT](implicit ring: Ring[ValT]): MatrixProduct[ColVector[Row, ValT], Scalar[ValT], ColVector[Row, ValT]] = + implicit def scalarColRightProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[ColVector[Row, ValT], Scalar[ValT], ColVector[Row, ValT]] = new MatrixProduct[ColVector[Row, ValT], Scalar[ValT], ColVector[Row, ValT]] { def apply(left: ColVector[Row, ValT], right: Scalar[ValT]): ColVector[Row, ValT] = { val prod = left.toMatrix(0) * right @@ -189,7 +207,9 @@ object MatrixProduct extends 
java.io.Serializable { } } - implicit def scalarColLeftProduct[Row, ValT](implicit ring: Ring[ValT]): MatrixProduct[Scalar[ValT], ColVector[Row, ValT], ColVector[Row, ValT]] = + implicit def scalarColLeftProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Scalar[ValT], ColVector[Row, ValT], ColVector[Row, ValT]] = new MatrixProduct[Scalar[ValT], ColVector[Row, ValT], ColVector[Row, ValT]] { def apply(left: Scalar[ValT], right: ColVector[Row, ValT]): ColVector[Row, ValT] = { val prod = (right.toMatrix(0)) * left @@ -198,7 +218,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def litScalarRowRightProduct[Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[RowVector[Col, ValT], LiteralScalar[ValT], RowVector[Col, ValT]] = + implicit def litScalarRowRightProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[Col, ValT], LiteralScalar[ValT], RowVector[Col, ValT]] = new MatrixProduct[RowVector[Col, ValT], LiteralScalar[ValT], RowVector[Col, ValT]] { def apply(left: RowVector[Col, ValT], right: LiteralScalar[ValT]): RowVector[Col, ValT] = { val prod = left.toMatrix(0) * right @@ -207,7 +229,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def litScalarRowLeftProduct[Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[LiteralScalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] = + implicit def litScalarRowLeftProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[LiteralScalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] = new MatrixProduct[LiteralScalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] { def apply(left: LiteralScalar[ValT], right: RowVector[Col, ValT]): RowVector[Col, ValT] = { val prod = (right.transpose.toMatrix(0)) * left @@ -216,7 +240,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def litScalarColRightProduct[Row, ValT](implicit ring: Ring[ValT]): MatrixProduct[ColVector[Row, ValT], LiteralScalar[ValT], ColVector[Row, ValT]] = + implicit def litScalarColRightProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[ColVector[Row, ValT], LiteralScalar[ValT], ColVector[Row, ValT]] = new MatrixProduct[ColVector[Row, ValT], LiteralScalar[ValT], ColVector[Row, ValT]] { def apply(left: ColVector[Row, ValT], right: LiteralScalar[ValT]): ColVector[Row, ValT] = { val prod = left.toMatrix(0) * right @@ -225,7 +251,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def litScalarColLeftProduct[Row, ValT](implicit ring: Ring[ValT]): MatrixProduct[LiteralScalar[ValT], ColVector[Row, ValT], ColVector[Row, ValT]] = + implicit def litScalarColLeftProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[LiteralScalar[ValT], ColVector[Row, ValT], ColVector[Row, ValT]] = new MatrixProduct[LiteralScalar[ValT], ColVector[Row, ValT], ColVector[Row, ValT]] { def apply(left: LiteralScalar[ValT], right: ColVector[Row, ValT]): ColVector[Row, ValT] = { val prod = (right.toMatrix(0)) * left @@ -234,7 +262,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def scalarDiagRightProduct[Row, ValT](implicit ring: Ring[ValT]): MatrixProduct[DiagonalMatrix[Row, ValT], Scalar[ValT], DiagonalMatrix[Row, ValT]] = + implicit def scalarDiagRightProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[DiagonalMatrix[Row, ValT], Scalar[ValT], DiagonalMatrix[Row, ValT]] = new MatrixProduct[DiagonalMatrix[Row, ValT], Scalar[ValT], DiagonalMatrix[Row, ValT]] { def apply(left: DiagonalMatrix[Row, ValT], right: Scalar[ValT]): 
DiagonalMatrix[Row, ValT] = { val prod = (left.toCol.toMatrix(0)) * right @@ -243,7 +273,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def scalarDiagLeftProduct[Row, ValT](implicit ring: Ring[ValT]): MatrixProduct[Scalar[ValT], DiagonalMatrix[Row, ValT], DiagonalMatrix[Row, ValT]] = + implicit def scalarDiagLeftProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Scalar[ValT], DiagonalMatrix[Row, ValT], DiagonalMatrix[Row, ValT]] = new MatrixProduct[Scalar[ValT], DiagonalMatrix[Row, ValT], DiagonalMatrix[Row, ValT]] { def apply(left: Scalar[ValT], right: DiagonalMatrix[Row, ValT]): DiagonalMatrix[Row, ValT] = { val prod = (right.toCol.toMatrix(0)) * left @@ -252,7 +284,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def litScalarDiagRightProduct[Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[DiagonalMatrix[Col, ValT], LiteralScalar[ValT], DiagonalMatrix[Col, ValT]] = + implicit def litScalarDiagRightProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[DiagonalMatrix[Col, ValT], LiteralScalar[ValT], DiagonalMatrix[Col, ValT]] = new MatrixProduct[DiagonalMatrix[Col, ValT], LiteralScalar[ValT], DiagonalMatrix[Col, ValT]] { def apply(left: DiagonalMatrix[Col, ValT], right: LiteralScalar[ValT]): DiagonalMatrix[Col, ValT] = { val prod = (left.toRow.toMatrix(0)) * right @@ -261,7 +295,9 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def litScalarDiagLeftProduct[Col, ValT](implicit ring: Ring[ValT]): MatrixProduct[LiteralScalar[ValT], DiagonalMatrix[Col, ValT], DiagonalMatrix[Col, ValT]] = + implicit def litScalarDiagLeftProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[LiteralScalar[ValT], DiagonalMatrix[Col, ValT], DiagonalMatrix[Col, ValT]] = new MatrixProduct[LiteralScalar[ValT], DiagonalMatrix[Col, ValT], DiagonalMatrix[Col, ValT]] { def apply(left: LiteralScalar[ValT], right: DiagonalMatrix[Col, ValT]): DiagonalMatrix[Col, ValT] = { val prod = (right.toCol.toMatrix(0)) * left @@ -271,10 +307,14 @@ object MatrixProduct extends java.io.Serializable { } //TODO: remove in 0.9.0, only here just for compatibility. 
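Returning to the numOfReducers heuristic near the top of this object: it turns the size hint's total cell count into a reducer count, roughly one reducer per maxTinyJoin cells, capped at maxReducers, with -1 meaning "no hint, let the cluster default decide". A standalone sketch with the same constants (plain Scala):

object ReducerHeuristicSketch {
  val maxTinyJoin = 100000L
  val maxReducers = 200

  def numOfReducers(total: Option[BigInt]): Int =
    total
      .map { tot =>
        // the + 1 guarantees at least one reducer even for tiny matrices
        (tot / maxTinyJoin + 1).toInt.min(maxReducers)
      }
      .getOrElse(-1)

  def main(args: Array[String]): Unit = {
    println(numOfReducers(Some(BigInt(50000))))          // 1
    println(numOfReducers(Some(BigInt(5000000))))        // 51
    println(numOfReducers(Some(BigInt("10000000000")))) // capped at 200
    println(numOfReducers(None))                         // -1 (no hint)
  }
}

getJoiner and getCrosser then use the same totals to pick between the tiny/small/large join strategies declared at the top of the file.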
- def vectorInnerProduct[IdxT, ValT](implicit ring: Ring[ValT]): MatrixProduct[RowVector[IdxT, ValT], ColVector[IdxT, ValT], Scalar[ValT]] = + def vectorInnerProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[IdxT, ValT], ColVector[IdxT, ValT], Scalar[ValT]] = rowColProduct(ring) - implicit def rowColProduct[IdxT, ValT](implicit ring: Ring[ValT]): MatrixProduct[RowVector[IdxT, ValT], ColVector[IdxT, ValT], Scalar[ValT]] = + implicit def rowColProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[IdxT, ValT], ColVector[IdxT, ValT], Scalar[ValT]] = new MatrixProduct[RowVector[IdxT, ValT], ColVector[IdxT, ValT], Scalar[ValT]] { def apply(left: RowVector[IdxT, ValT], right: ColVector[IdxT, ValT]): Scalar[ValT] = { // Normal matrix multiplication works here, but we need to convert to a Scalar @@ -283,90 +323,101 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def rowMatrixProduct[Common, ColR, ValT](implicit ring: Ring[ValT]): MatrixProduct[RowVector[Common, ValT], Matrix[Common, ColR, ValT], RowVector[ColR, ValT]] = + implicit def rowMatrixProduct[Common, ColR, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[Common, ValT], Matrix[Common, ColR, ValT], RowVector[ColR, ValT]] = new MatrixProduct[RowVector[Common, ValT], Matrix[Common, ColR, ValT], RowVector[ColR, ValT]] { - def apply(left: RowVector[Common, ValT], right: Matrix[Common, ColR, ValT]) = { + def apply(left: RowVector[Common, ValT], right: Matrix[Common, ColR, ValT]) = (left.toMatrix(true) * right).getRow(true) - } } - implicit def matrixColProduct[RowR, Common, ValT](implicit ring: Ring[ValT]): MatrixProduct[Matrix[RowR, Common, ValT], ColVector[Common, ValT], ColVector[RowR, ValT]] = + implicit def matrixColProduct[RowR, Common, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[RowR, Common, ValT], ColVector[Common, ValT], ColVector[RowR, ValT]] = new MatrixProduct[Matrix[RowR, Common, ValT], ColVector[Common, ValT], ColVector[RowR, ValT]] { - def apply(left: Matrix[RowR, Common, ValT], right: ColVector[Common, ValT]) = { + def apply(left: Matrix[RowR, Common, ValT], right: ColVector[Common, ValT]) = (left * right.toMatrix(true)).getCol(true) - } } - implicit def vectorOuterProduct[RowT, ColT, ValT](implicit ring: Ring[ValT]): MatrixProduct[ColVector[RowT, ValT], RowVector[ColT, ValT], Matrix[RowT, ColT, ValT]] = + implicit def vectorOuterProduct[RowT, ColT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[ColVector[RowT, ValT], RowVector[ColT, ValT], Matrix[RowT, ColT, ValT]] = new MatrixProduct[ColVector[RowT, ValT], RowVector[ColT, ValT], Matrix[RowT, ColT, ValT]] { def apply(left: ColVector[RowT, ValT], right: RowVector[ColT, ValT]): Matrix[RowT, ColT, ValT] = { - val (newRightFields, newRightPipe) = ensureUniqueFields( - (left.rowS, left.valS), - (right.colS, right.valS), - right.pipe) + val (newRightFields, newRightPipe) = + ensureUniqueFields((left.rowS, left.valS), (right.colS, right.valS), right.pipe) val newColSym = Symbol(right.colS.name + "_newCol") val newHint = left.sizeH * right.sizeH - val productPipe = Matrix.filterOutZeros(left.valS, ring) { - getCrosser(right.sizeH) - .apply(left.pipe, newRightPipe) - .map(left.valS.append(getField(newRightFields, 1)) -> left.valS) { pair: (ValT, ValT) => - ring.times(pair._1, pair._2) - } - } + val productPipe = Matrix + .filterOutZeros(left.valS, ring) { + getCrosser(right.sizeH) + .apply(left.pipe, newRightPipe) + .map(left.valS.append(getField(newRightFields, 1)) -> left.valS) { 
pair: (ValT, ValT) => + ring.times(pair._1, pair._2) + } + } .rename(getField(newRightFields, 0) -> newColSym) new Matrix[RowT, ColT, ValT](left.rowS, newColSym, left.valS, productPipe, newHint) } } - implicit def standardMatrixProduct[RowL, Common, ColR, ValT](implicit ring: Ring[ValT]): MatrixProduct[Matrix[RowL, Common, ValT], Matrix[Common, ColR, ValT], Matrix[RowL, ColR, ValT]] = + implicit def standardMatrixProduct[RowL, Common, ColR, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[RowL, Common, ValT], Matrix[Common, ColR, ValT], Matrix[RowL, ColR, ValT]] = new MatrixProduct[Matrix[RowL, Common, ValT], Matrix[Common, ColR, ValT], Matrix[RowL, ColR, ValT]] { def apply(left: Matrix[RowL, Common, ValT], right: Matrix[Common, ColR, ValT]) = { val (newRightFields, newRightPipe) = ensureUniqueFields( (left.rowSym, left.colSym, left.valSym), (right.rowSym, right.colSym, right.valSym), - right.pipe) + right.pipe + ) val newHint = left.sizeHint * right.sizeHint // Hint of groupBy reducer size val grpReds = numOfReducers(newHint) - val productPipe = Matrix.filterOutZeros(left.valSym, ring) { - getJoiner(left.sizeHint, right.sizeHint) - // TODO: we should use the size hints to set the number of reducers: - .apply(left.pipe, (left.colSym -> getField(newRightFields, 0)), newRightPipe) - // Do the product: - .map((left.valSym.append(getField(newRightFields, 2))) -> left.valSym) { pair: (ValT, ValT) => - ring.times(pair._1, pair._2) - } - .groupBy(left.rowSym.append(getField(newRightFields, 1))) { - // We should use the size hints to set the number of reducers here - _.reduce(left.valSym) { (x: Tuple1[ValT], y: Tuple1[ValT]) => Tuple1(ring.plus(x._1, y._1)) } + val productPipe = Matrix + .filterOutZeros(left.valSym, ring) { + getJoiner(left.sizeHint, right.sizeHint) + // TODO: we should use the size hints to set the number of reducers: + .apply(left.pipe, (left.colSym -> getField(newRightFields, 0)), newRightPipe) + // Do the product: + .map((left.valSym.append(getField(newRightFields, 2))) -> left.valSym) { pair: (ValT, ValT) => + ring.times(pair._1, pair._2) + } + .groupBy(left.rowSym.append(getField(newRightFields, 1))) { + // We should use the size hints to set the number of reducers here + _.reduce(left.valSym)((x: Tuple1[ValT], y: Tuple1[ValT]) => Tuple1(ring.plus(x._1, y._1))) // There is a low chance that many (row,col) keys are co-located, and the keyspace // is likely huge, just push to reducers - .forceToReducers - .reducers(grpReds) - } - } + .forceToReducers + .reducers(grpReds) + } + } // Keep the names from the left: .rename(getField(newRightFields, 1) -> left.colSym) new Matrix[RowL, ColR, ValT](left.rowSym, left.colSym, left.valSym, productPipe, newHint) } } - implicit def diagMatrixProduct[RowT, ColT, ValT](implicit ring: Ring[ValT]): MatrixProduct[DiagonalMatrix[RowT, ValT], Matrix[RowT, ColT, ValT], Matrix[RowT, ColT, ValT]] = + implicit def diagMatrixProduct[RowT, ColT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[DiagonalMatrix[RowT, ValT], Matrix[RowT, ColT, ValT], Matrix[RowT, ColT, ValT]] = new MatrixProduct[DiagonalMatrix[RowT, ValT], Matrix[RowT, ColT, ValT], Matrix[RowT, ColT, ValT]] { def apply(left: DiagonalMatrix[RowT, ValT], right: Matrix[RowT, ColT, ValT]) = { val (newRightFields, newRightPipe) = ensureUniqueFields( (left.idxSym, left.valSym), (right.rowSym, right.colSym, right.valSym), - right.pipe) + right.pipe + ) val newHint = left.sizeHint * right.sizeHint val productPipe = Matrix.filterOutZeros(right.valSym, ring) { getJoiner(left.sizeHint, 
right.sizeHint) // TODO: we should use the size hints to set the number of reducers: .apply(left.pipe, (left.idxSym -> getField(newRightFields, 0)), newRightPipe) // Do the product: - .map((left.valSym.append(getField(newRightFields, 2))) -> getField(newRightFields, 2)) { pair: (ValT, ValT) => - ring.times(pair._1, pair._2) + .map((left.valSym.append(getField(newRightFields, 2))) -> getField(newRightFields, 2)) { + pair: (ValT, ValT) => + ring.times(pair._1, pair._2) } // Keep the names from the right: .project(newRightFields) @@ -376,48 +427,52 @@ object MatrixProduct extends java.io.Serializable { } } - implicit def matrixDiagProduct[RowT, ColT, ValT](implicit ring: Ring[ValT]): MatrixProduct[Matrix[RowT, ColT, ValT], DiagonalMatrix[ColT, ValT], Matrix[RowT, ColT, ValT]] = + implicit def matrixDiagProduct[RowT, ColT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[RowT, ColT, ValT], DiagonalMatrix[ColT, ValT], Matrix[RowT, ColT, ValT]] = new MatrixProduct[Matrix[RowT, ColT, ValT], DiagonalMatrix[ColT, ValT], Matrix[RowT, ColT, ValT]] { - def apply(left: Matrix[RowT, ColT, ValT], right: DiagonalMatrix[ColT, ValT]) = { + def apply(left: Matrix[RowT, ColT, ValT], right: DiagonalMatrix[ColT, ValT]) = // (A * B) = (B^T * A^T)^T // note diagonal^T = diagonal (right * (left.transpose)).transpose - } } - implicit def diagDiagProduct[IdxT, ValT](implicit ring: Ring[ValT]): MatrixProduct[DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT]] = + implicit def diagDiagProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT]] = new MatrixProduct[DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT]] { def apply(left: DiagonalMatrix[IdxT, ValT], right: DiagonalMatrix[IdxT, ValT]) = { - val (newRightFields, newRightPipe) = ensureUniqueFields( - (left.idxSym, left.valSym), - (right.idxSym, right.valSym), - right.pipe) + val (newRightFields, newRightPipe) = + ensureUniqueFields((left.idxSym, left.valSym), (right.idxSym, right.valSym), right.pipe) val newHint = left.sizeHint * right.sizeHint - val productPipe = Matrix.filterOutZeros(left.valSym, ring) { - getJoiner(left.sizeHint, right.sizeHint) - // TODO: we should use the size hints to set the number of reducers: - .apply(left.pipe, (left.idxSym -> getField(newRightFields, 0)), newRightPipe) - // Do the product: - .map((left.valSym.append(getField(newRightFields, 1))) -> left.valSym) { pair: (ValT, ValT) => - ring.times(pair._1, pair._2) - } - } + val productPipe = Matrix + .filterOutZeros(left.valSym, ring) { + getJoiner(left.sizeHint, right.sizeHint) + // TODO: we should use the size hints to set the number of reducers: + .apply(left.pipe, (left.idxSym -> getField(newRightFields, 0)), newRightPipe) + // Do the product: + .map((left.valSym.append(getField(newRightFields, 1))) -> left.valSym) { pair: (ValT, ValT) => + ring.times(pair._1, pair._2) + } + } // Keep the names from the left: .project(left.idxSym, left.valSym) new DiagonalMatrix[IdxT, ValT](left.idxSym, left.valSym, productPipe, newHint) } } - implicit def diagColProduct[IdxT, ValT](implicit ring: Ring[ValT]): MatrixProduct[DiagonalMatrix[IdxT, ValT], ColVector[IdxT, ValT], ColVector[IdxT, ValT]] = + implicit def diagColProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[DiagonalMatrix[IdxT, ValT], ColVector[IdxT, ValT], ColVector[IdxT, ValT]] = new MatrixProduct[DiagonalMatrix[IdxT, ValT], ColVector[IdxT, ValT], 
ColVector[IdxT, ValT]] { - def apply(left: DiagonalMatrix[IdxT, ValT], right: ColVector[IdxT, ValT]) = { + def apply(left: DiagonalMatrix[IdxT, ValT], right: ColVector[IdxT, ValT]) = (left * (right.diag)).toCol - } } - implicit def rowDiagProduct[IdxT, ValT](implicit ring: Ring[ValT]): MatrixProduct[RowVector[IdxT, ValT], DiagonalMatrix[IdxT, ValT], RowVector[IdxT, ValT]] = + implicit def rowDiagProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[IdxT, ValT], DiagonalMatrix[IdxT, ValT], RowVector[IdxT, ValT]] = new MatrixProduct[RowVector[IdxT, ValT], DiagonalMatrix[IdxT, ValT], RowVector[IdxT, ValT]] { - def apply(left: RowVector[IdxT, ValT], right: DiagonalMatrix[IdxT, ValT]) = { + def apply(left: RowVector[IdxT, ValT], right: DiagonalMatrix[IdxT, ValT]) = ((left.diag) * right).toRow - } } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala index e9112f983f..67f38cccf1 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala @@ -3,9 +3,8 @@ package com.twitter.scalding.mathematics import scala.util.Random /** - * Generating Poisson-distributed random variables - * according to Donald Knuth's algorithm as shown on Wikipedia's - * Poisson Distribution page + * Generating Poisson-distributed random variables according to Donald Knuth's algorithm as shown on + * Wikipedia's Poisson Distribution page */ class Poisson(fraction: Double, seed: Int) { diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala index 8fc4e8fcae..d8408358ed 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala @@ -12,23 +12,24 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
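For the Poisson hunk just above: the class body is not shown in this diff, but the comment points at Knuth's product-of-uniforms method, which looks like the following standalone sketch (plain Scala; treating the constructor's first argument as the Poisson mean is an assumption, since only the (fraction, seed) signature is visible here):

import scala.util.Random

object PoissonSketch {
  // Knuth's algorithm: multiply uniforms until the product drops below exp(-lambda)
  def nextPoisson(lambda: Double, rng: Random): Int = {
    val limit = math.exp(-lambda)
    var k = 0
    var p = 1.0
    do {
      k += 1
      p *= rng.nextDouble()
    } while (p > limit)
    k - 1
  }

  def main(args: Array[String]): Unit = {
    val rng = new Random(123)
    val samples = Seq.fill(100000)(nextPoisson(2.0, rng))
    println(samples.sum.toDouble / samples.size) // should be close to lambda = 2.0
  }
}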
-*/ + */ package com.twitter.scalding.mathematics object SizeHint { implicit val ordering: Ordering[SizeHint] = SizeHintOrdering // Return a sparsity assuming all the diagonal is present, but nothing else def asDiagonal(h: SizeHint): SizeHint = { - def make(r: BigInt, c: BigInt) = { - h.total.map { tot => - val maxElements = (r min c) - val sparsity = 1.0 / maxElements.doubleValue - SparseHint(sparsity, maxElements, maxElements) - }.getOrElse(NoClue) - } + def make(r: BigInt, c: BigInt) = + h.total + .map { tot => + val maxElements = r.min(c) + val sparsity = 1.0 / maxElements.doubleValue + SparseHint(sparsity, maxElements, maxElements) + } + .getOrElse(NoClue) h match { - case NoClue => NoClue - case FiniteHint(r, c) => make(r, c) + case NoClue => NoClue + case FiniteHint(r, c) => make(r, c) case SparseHint(sp, r, c) => make(r, c) } } @@ -61,30 +62,28 @@ case object NoClue extends SizeHint { } final case class FiniteHint(rows: BigInt = -1L, cols: BigInt = -1L) extends SizeHint { - def *(other: SizeHint) = { + def *(other: SizeHint) = other match { - case NoClue => NoClue + case NoClue => NoClue case FiniteHint(orows, ocols) => FiniteHint(rows, ocols) case sp @ SparseHint(_, _, _) => (SparseHint(1.0, rows, cols) * sp) } - } - def +(other: SizeHint) = { + def +(other: SizeHint) = other match { case NoClue => NoClue // In this case, a hint on one side, will overwrite lack of knowledge (-1L) case FiniteHint(orows, ocols) => FiniteHint(rows.max(orows), cols.max(ocols)) case sp @ SparseHint(_, _, _) => (sp + this) } - } - def #*#(other: SizeHint) = { + def #*#(other: SizeHint) = other match { case NoClue => NoClue // In this case, a hint on one side, will overwrite lack of knowledge (-1L) case FiniteHint(orows, ocols) => FiniteHint(rows.min(orows), cols.min(ocols)) case sp @ SparseHint(_, _, _) => (sp #*# this) } - } - def total = if (rows >= 0 && cols >= 0) { Some(rows * cols) } else None + def total = if (rows >= 0 && cols >= 0) { Some(rows * cols) } + else None def setCols(ncols: Long) = FiniteHint(rows, ncols) def setRows(nrows: Long) = FiniteHint(nrows, cols) def setColsToRows = FiniteHint(rows, rows) @@ -94,15 +93,15 @@ final case class FiniteHint(rows: BigInt = -1L, cols: BigInt = -1L) extends Size // sparsity is the fraction of the rows and columns that are expected to be present final case class SparseHint(sparsity: Double, rows: BigInt, cols: BigInt) extends SizeHint { - def *(other: SizeHint): SizeHint = { + def *(other: SizeHint): SizeHint = other match { - case NoClue => NoClue + case NoClue => NoClue case FiniteHint(r, c) => (this * SparseHint(1.0, r, c)) case SparseHint(sp, r, c) => { // if I occupy a bin with probability p, and you q, then both: pq // There are cols samples of the, above, so the probability one is present: // 1-(1-pq)^cols ~ (cols * p * q) min 1.0 - val newSp = (BigDecimal(cols) * sp * sparsity) + val newSp = BigDecimal(cols) * sp * sparsity if (newSp >= 1.0) { FiniteHint(rows, c) } else { @@ -110,38 +109,34 @@ final case class SparseHint(sparsity: Double, rows: BigInt, cols: BigInt) extend } } } - } - def +(other: SizeHint): SizeHint = { + def +(other: SizeHint): SizeHint = other match { - case NoClue => NoClue + case NoClue => NoClue case FiniteHint(r, c) => (this + SparseHint(1.0, r, c)) case SparseHint(sp, r, c) => { // if I occupy a bin with probability p, and you q, then either: p + q - pq if ((sparsity == 1.0) || (sp == 1.0)) { - FiniteHint(rows max r, cols max c) + FiniteHint(rows.max(r), cols.max(c)) } else { val newSp = sparsity + sp - sp * sparsity - 
SparseHint(newSp, rows max r, cols max c) + SparseHint(newSp, rows.max(r), cols.max(c)) } } } - } - def #*#(other: SizeHint): SizeHint = { + def #*#(other: SizeHint): SizeHint = other match { - case NoClue => NoClue + case NoClue => NoClue case FiniteHint(r, c) => (this #*# SparseHint(1.0, r, c)) case SparseHint(sp, r, c) => { - val newSp = sp min sparsity - SparseHint(newSp, rows min r, cols min c) + val newSp = sp.min(sparsity) + SparseHint(newSp, rows.min(r), cols.min(c)) } } - } - def total: Option[BigInt] = { + def total: Option[BigInt] = if ((rows >= 0) && (cols >= 0)) { Some((BigDecimal(rows) * BigDecimal(cols) * sparsity).toBigInt) } else None - } def setCols(c: Long): SizeHint = copy(cols = c) def setRows(r: Long): SizeHint = copy(rows = r) def setColsToRows: SizeHint = copy(cols = rows) @@ -153,9 +148,8 @@ final case class SparseHint(sparsity: Double, rows: BigInt, cols: BigInt) extend * Allows us to sort matrices by approximate type */ object SizeHintOrdering extends Ordering[SizeHint] with java.io.Serializable { - def compare(left: SizeHint, right: SizeHint): Int = { - left.total.getOrElse(BigInt(-1L)) + def compare(left: SizeHint, right: SizeHint): Int = + left.total + .getOrElse(BigInt(-1L)) .compare(right.total.getOrElse(BigInt(-1L))) - } } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/TypedSimilarity.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/TypedSimilarity.scala index 42da7c6bbc..42e343f173 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/TypedSimilarity.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/TypedSimilarity.scala @@ -12,17 +12,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
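The SparseHint arithmetic above rests on a small probability argument spelled out in its comments; here it is as a standalone sketch (plain Scala), with the exact product-cell probability shown next to the (k * p * q) min 1.0 approximation the hint uses:

object SparsitySketch {
  // p, q: fraction of cells present on each side; k: the shared (inner) dimension
  def productSparsityExact(p: Double, q: Double, k: Long): Double =
    1.0 - math.pow(1.0 - p * q, k.toDouble)

  def productSparsityApprox(p: Double, q: Double, k: Long): Double =
    math.min(k * p * q, 1.0)

  // A + B: a cell is present if it is present on either side
  def sumSparsity(p: Double, q: Double): Double = p + q - p * q

  // A #*# B (element-wise): bounded by the sparser side
  def hadamardSparsity(p: Double, q: Double): Double = math.min(p, q)

  def main(args: Array[String]): Unit = {
    println(productSparsityExact(0.001, 0.002, 1000))  // ~0.001998
    println(productSparsityApprox(0.001, 0.002, 1000)) // 0.002
    println(sumSparsity(0.3, 0.5))                     // 0.65
    println(hadamardSparsity(0.3, 0.5))                // 0.3
  }
}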
-*/ + */ package com.twitter.scalding.mathematics -import com.twitter.scalding.typed.{ Grouped, TypedPipe, WithReducers } +import com.twitter.scalding.typed.{Grouped, TypedPipe, WithReducers} import java.io.Serializable /** * Implementation of DISCO and DIMSUM approximation similarity algorithm - * @author Oscar Boykin - * @author Kevin Lin + * @author + * Oscar Boykin + * @author + * Kevin Lin */ /** @@ -40,34 +42,42 @@ final case class Weight(weight: Double) final case class L2Norm(norm: Double) object GraphOperations extends Serializable { + /** * For each N, aggregate all the edges, and attach Edge state */ - def joinAggregate[N, E, T](grouped: Grouped[N, Edge[N, E]])(agfn: Iterable[Edge[N, E]] => T): TypedPipe[Edge[N, (E, T)]] = - grouped.cogroup(grouped) { - (to: N, left: Iterator[Edge[N, E]], right: Iterable[Edge[N, E]]) => + def joinAggregate[N, E, T]( + grouped: Grouped[N, Edge[N, E]] + )(agfn: Iterable[Edge[N, E]] => T): TypedPipe[Edge[N, (E, T)]] = + grouped + .cogroup(grouped) { (to: N, left: Iterator[Edge[N, E]], right: Iterable[Edge[N, E]]) => val newState = agfn(right) - left.map { _.mapData { e: E => (e, newState) } } - } + left.map(_.mapData { e: E => (e, newState) }) + } .values // Returns all Vertices with non-zero in-degree - def withInDegree[N, E](g: TypedPipe[Edge[N, E]])(implicit ord: Ordering[N]): TypedPipe[Edge[N, (E, InDegree)]] = joinAggregate(g.groupBy { _.to }) { it => + def withInDegree[N, E]( + g: TypedPipe[Edge[N, E]] + )(implicit ord: Ordering[N]): TypedPipe[Edge[N, (E, InDegree)]] = joinAggregate(g.groupBy(_.to)) { it => InDegree(it.size) } // Returns all Vertices with non-zero out-degree - def withOutDegree[N, E](g: TypedPipe[Edge[N, E]])(implicit ord: Ordering[N]): TypedPipe[Edge[N, (E, OutDegree)]] = joinAggregate(g.groupBy { _.from }) { it => + def withOutDegree[N, E](g: TypedPipe[Edge[N, E]])(implicit + ord: Ordering[N] + ): TypedPipe[Edge[N, (E, OutDegree)]] = joinAggregate(g.groupBy(_.from)) { it => OutDegree(it.size) } // Returns all Vertices with weights and non-zero norms - def withInNorm[N, E](g: TypedPipe[Edge[N, Weight]])(implicit ord: Ordering[N]): TypedPipe[Edge[N, (Weight, L2Norm)]] = joinAggregate(g.groupBy { _.to }) { it => - val norm = scala.math.sqrt( - it.iterator.map { a => - val x = a.data.weight - x * x - }.sum) + def withInNorm[N, E](g: TypedPipe[Edge[N, Weight]])(implicit + ord: Ordering[N] + ): TypedPipe[Edge[N, (Weight, L2Norm)]] = joinAggregate(g.groupBy(_.to)) { it => + val norm = scala.math.sqrt(it.iterator.map { a => + val x = a.data.weight + x * x + }.sum) L2Norm(norm) } @@ -89,15 +99,14 @@ case class SetSimilarity(intersection: Int, sizeLeft: Int, sizeRight: Int) { trait TypedSimilarity[N, E, S] extends Serializable { def nodeOrdering: Ordering[N] + /** - * Given a TypedPipe of edges, and a predicate for a smaller group (smallpred) of nodes - * and a bigger group (bigpred), compute the similarity between each item in the two sets - * The Edge.from nodes in the result will all satisfy smallpred, and the Edge.to will - * all satisfy bigpred. It is more efficient if you keep the smallpred set smaller. + * Given a TypedPipe of edges, and a predicate for a smaller group (smallpred) of nodes and a bigger group + * (bigpred), compute the similarity between each item in the two sets The Edge.from nodes in the result + * will all satisfy smallpred, and the Edge.to will all satisfy bigpred. It is more efficient if you keep + * the smallpred set smaller. 
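SetSimilarity's cosine method is elided by the hunk above; the usual definition for 0/1 incidence vectors, intersection / sqrt(sizeLeft * sizeRight), is assumed in this standalone sketch (plain Scala):

object SetCosineSketch {
  def cosine(intersection: Int, sizeLeft: Int, sizeRight: Int): Option[Double] =
    if (sizeLeft <= 0 || sizeRight <= 0) None
    else Some(intersection / math.sqrt(sizeLeft.toDouble * sizeRight.toDouble))

  def main(args: Array[String]): Unit =
    // two words sharing 3 documents, appearing in 4 and 9 documents respectively
    println(cosine(intersection = 3, sizeLeft = 4, sizeRight = 9)) // Some(0.5)
}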
*/ - def apply(g: TypedPipe[Edge[N, E]], - smallpred: N => Boolean, - bigpred: N => Boolean): TypedPipe[Edge[N, S]] + def apply(g: TypedPipe[Edge[N, E]], smallpred: N => Boolean, bigpred: N => Boolean): TypedPipe[Edge[N, S]] // Do similarity on all the nodes def apply(g: TypedPipe[Edge[N, E]]): TypedPipe[Edge[N, S]] = { val always = { n: N => true } @@ -109,35 +118,39 @@ object TypedSimilarity extends Serializable { private def maybeWithReducers[T <: WithReducers[T]](withReds: T, reds: Option[Int]) = reds match { case Some(i) => withReds.withReducers(i) - case None => withReds + case None => withReds } // key: document, // value: (word, documentsWithWord) // return: Edge of similarity between words measured by documents - def exactSetSimilarity[N: Ordering](g: Grouped[N, (N, Int)], - smallpred: N => Boolean, bigpred: N => Boolean): TypedPipe[Edge[N, SetSimilarity]] = + def exactSetSimilarity[N: Ordering]( + g: Grouped[N, (N, Int)], + smallpred: N => Boolean, + bigpred: N => Boolean + ): TypedPipe[Edge[N, SetSimilarity]] = /* E_{ij} = 1 if document -> word exists * (E^T E)_ij = # of shared documents of i,j * = \sum_k E_ki E_kj */ // First compute (i,j) => E_{ki} E_{kj} - maybeWithReducers(g.join(g) - .values - .flatMap { - case ((node1, deg1), (node2, deg2)) => + maybeWithReducers( + g.join(g) + .values + .flatMap { case ((node1, deg1), (node2, deg2)) => if (smallpred(node1) && bigpred(node2)) Some(((node1, node2), (1, deg1, deg2))) else None - } - .group, g.reducers) + } + .group, + g.reducers + ) // Use reduceLeft to push to reducers, no benefit in mapside here .reduceLeft { (left, right) => // The degrees we always take the left: val (leftCnt, deg1, deg2) = left (leftCnt + right._1, deg1, deg2) } - .map { - case ((node1, node2), (cnt, deg1, deg2)) => - Edge(node1, node2, SetSimilarity(cnt, deg1, deg2)) + .map { case ((node1, node2), (cnt, deg1, deg2)) => + Edge(node1, node2, SetSimilarity(cnt, deg1, deg2)) } /* @@ -146,17 +159,20 @@ object TypedSimilarity extends Serializable { * return: Edge of similarity between words measured by documents * See: https://arxiv.org/pdf/1206.2082v2.pdf */ - def discoCosineSimilarity[N: Ordering](smallG: Grouped[N, (N, Int)], - bigG: Grouped[N, (N, Int)], oversample: Double): TypedPipe[Edge[N, Double]] = { + def discoCosineSimilarity[N: Ordering]( + smallG: Grouped[N, (N, Int)], + bigG: Grouped[N, (N, Int)], + oversample: Double + ): TypedPipe[Edge[N, Double]] = { // 1) make rnd lazy due to serialization, // 2) fix seed so that map-reduce speculative execution does not give inconsistent results. 
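The sampling that follows keeps each shared-document contribution with probability oversample * weight and credits 1 / oversample when it is kept, so its expectation equals the exact weight 1 / sqrt(deg1 * deg2); the oversample factor is the one DiscoInCosine computes further down from (minCos, delta, boundedProb). A standalone sketch of that arithmetic (plain Scala):

import scala.util.Random

object DiscoSamplingSketch {
  def oversample(minCos: Double, delta: Double, boundedProb: Double): Double =
    (-2.0 * math.log(boundedProb) / (delta * delta)) / minCos

  // Contribution of one shared document for nodes with degrees deg1 and deg2
  def contribution(deg1: Int, deg2: Int, os: Double, rng: Random): Double = {
    val weight = 1.0 / math.sqrt(deg1.toDouble * deg2.toDouble)
    val prob = os * weight
    if (prob >= 1.0) weight               // small-degree case: emit exactly
    else if (rng.nextDouble() < prob) 1.0 / os // sampled: credit the inverse oversample
    else 0.0
  }

  def main(args: Array[String]): Unit = {
    val os = oversample(minCos = 0.1, delta = 0.05, boundedProb = 0.01)
    val rng = new Random(1024)
    val exact = 1.0 / math.sqrt(40000.0 * 90000.0)
    // averaging many sampled contributions should approach the exact weight
    val sampled = Seq.fill(200000)(contribution(40000, 90000, os, rng)).sum / 200000
    println(s"oversample=$os exact=$exact sampled=$sampled")
  }
}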
lazy val rnd = new scala.util.Random(1024) - maybeWithReducers(smallG.cogroup(bigG) { (n: N, leftit: Iterator[(N, Int)], rightit: Iterable[(N, Int)]) => - // Use a co-group to ensure this happens in the reducer: - leftit.flatMap { - case (node1, deg1) => - rightit.iterator.flatMap { - case (node2, deg2) => + maybeWithReducers( + smallG + .cogroup(bigG) { (n: N, leftit: Iterator[(N, Int)], rightit: Iterable[(N, Int)]) => + // Use a co-group to ensure this happens in the reducer: + leftit.flatMap { case (node1, deg1) => + rightit.iterator.flatMap { case (node2, deg2) => val weight = 1.0 / scala.math.sqrt(deg1.toDouble * deg2.toDouble) val prob = oversample * weight if (prob >= 1.0) { @@ -167,13 +183,13 @@ object TypedSimilarity extends Serializable { Iterator(((node1, node2), 1.0 / oversample)) } else Iterator.empty + } } - } - } - .values - .group, smallG.reducers) - .forceToReducers - .sum + } + .values + .group, + smallG.reducers + ).forceToReducers.sum .map { case ((node1, node2), sim) => Edge(node1, node2, sim) } } @@ -183,77 +199,89 @@ object TypedSimilarity extends Serializable { * return: Edge of similarity between words measured by documents * See: https://stanford.edu/~rezab/papers/dimsum.pdf */ - def dimsumCosineSimilarity[N: Ordering](smallG: Grouped[N, (N, Double, Double)], - bigG: Grouped[N, (N, Double, Double)], oversample: Double): TypedPipe[Edge[N, Double]] = { + def dimsumCosineSimilarity[N: Ordering]( + smallG: Grouped[N, (N, Double, Double)], + bigG: Grouped[N, (N, Double, Double)], + oversample: Double + ): TypedPipe[Edge[N, Double]] = { lazy val rnd = new scala.util.Random(1024) - maybeWithReducers(smallG.cogroup(bigG) { (n: N, leftit: Iterator[(N, Double, Double)], rightit: Iterable[(N, Double, Double)]) => - // Use a co-group to ensure this happens in the reducer: - leftit.flatMap { - case (node1, weight1, norm1) => - rightit.iterator.flatMap { - case (node2, weight2, norm2) => - val weight = 1.0 / (norm1 * norm2) - val prob = oversample * weight - if (prob >= 1.0) { - // Small degree case, just output all of them: - Iterator(((node1, node2), weight * weight1 * weight2)) - } else if (rnd.nextDouble < prob) { - // Sample - Iterator(((node1, node2), 1.0 / oversample * weight1 * weight2)) - } else - Iterator.empty - } - } - } - .values - .group, smallG.reducers) - .forceToReducers - .sum + maybeWithReducers( + smallG + .cogroup(bigG) { + (n: N, leftit: Iterator[(N, Double, Double)], rightit: Iterable[(N, Double, Double)]) => + // Use a co-group to ensure this happens in the reducer: + leftit.flatMap { case (node1, weight1, norm1) => + rightit.iterator.flatMap { case (node2, weight2, norm2) => + val weight = 1.0 / (norm1 * norm2) + val prob = oversample * weight + if (prob >= 1.0) { + // Small degree case, just output all of them: + Iterator(((node1, node2), weight * weight1 * weight2)) + } else if (rnd.nextDouble < prob) { + // Sample + Iterator(((node1, node2), 1.0 / oversample * weight1 * weight2)) + } else + Iterator.empty + } + } + } + .values + .group, + smallG.reducers + ).forceToReducers.sum .map { case ((node1, node2), sim) => Edge(node1, node2, sim) } } } /** - * This algothm is just matrix multiplication done by hand to make it - * clearer when we do the sampling implementation + * This algothm is just matrix multiplication done by hand to make it clearer when we do the sampling + * implementation */ -class ExactInCosine[N](reducers: Int = -1)(implicit override val nodeOrdering: Ordering[N]) extends TypedSimilarity[N, InDegree, Double] { +class 
ExactInCosine[N](reducers: Int = -1)(implicit override val nodeOrdering: Ordering[N]) + extends TypedSimilarity[N, InDegree, Double] { - def apply(graph: TypedPipe[Edge[N, InDegree]], - smallpred: N => Boolean, bigpred: N => Boolean): TypedPipe[Edge[N, Double]] = { + def apply( + graph: TypedPipe[Edge[N, InDegree]], + smallpred: N => Boolean, + bigpred: N => Boolean + ): TypedPipe[Edge[N, Double]] = { val groupedOnSrc = graph - .filter { e => smallpred(e.to) || bigpred(e.to) } - .map { e => (e.from, (e.to, e.data.degree)) } + .filter(e => smallpred(e.to) || bigpred(e.to)) + .map(e => (e.from, (e.to, e.data.degree))) .group .withReducers(reducers) - TypedSimilarity.exactSetSimilarity(groupedOnSrc, smallpred, bigpred) - .flatMap { e => e.data.cosine.map { c => e.mapData { s => c } } } + TypedSimilarity + .exactSetSimilarity(groupedOnSrc, smallpred, bigpred) + .flatMap(e => e.data.cosine.map(c => e.mapData(s => c))) } } /** - * Params: - * minCos: the minimum cosine similarity you care about accuracy for - * delta: the error on the approximated cosine (e.g. 0.05 = 5%) - * boundedProb: the probability we have larger than delta error - * see: https://arxiv.org/pdf/1206.2082v2.pdf for more details + * Params: minCos: the minimum cosine similarity you care about accuracy for delta: the error on the + * approximated cosine (e.g. 0.05 = 5%) boundedProb: the probability we have larger than delta error see: + * https://arxiv.org/pdf/1206.2082v2.pdf for more details */ -class DiscoInCosine[N](minCos: Double, delta: Double, boundedProb: Double, reducers: Int = -1)(implicit override val nodeOrdering: Ordering[N]) extends TypedSimilarity[N, InDegree, Double] { +class DiscoInCosine[N](minCos: Double, delta: Double, boundedProb: Double, reducers: Int = -1)(implicit + override val nodeOrdering: Ordering[N] +) extends TypedSimilarity[N, InDegree, Double] { // The probability of being more than delta error is approx: // boundedProb ~ exp(-p delta^2 / 2) private val oversample = (-2.0 * scala.math.log(boundedProb) / (delta * delta)) / minCos - def apply(graph: TypedPipe[Edge[N, InDegree]], - smallpred: N => Boolean, bigpred: N => Boolean): TypedPipe[Edge[N, Double]] = { + def apply( + graph: TypedPipe[Edge[N, InDegree]], + smallpred: N => Boolean, + bigpred: N => Boolean + ): TypedPipe[Edge[N, Double]] = { val bigGroupedOnSrc = graph - .filter { e => bigpred(e.to) } - .map { e => (e.from, (e.to, e.data.degree)) } + .filter(e => bigpred(e.to)) + .map(e => (e.from, (e.to, e.data.degree))) .group .withReducers(reducers) val smallGroupedOnSrc = graph - .filter { e => smallpred(e.to) } - .map { e => (e.from, (e.to, e.data.degree)) } + .filter(e => smallpred(e.to)) + .map(e => (e.from, (e.to, e.data.degree))) .group .withReducers(reducers) @@ -262,22 +290,27 @@ class DiscoInCosine[N](minCos: Double, delta: Double, boundedProb: Double, reduc } -class DimsumInCosine[N](minCos: Double, delta: Double, boundedProb: Double, reducers: Int = -1)(implicit override val nodeOrdering: Ordering[N]) extends TypedSimilarity[N, (Weight, L2Norm), Double] { +class DimsumInCosine[N](minCos: Double, delta: Double, boundedProb: Double, reducers: Int = -1)(implicit + override val nodeOrdering: Ordering[N] +) extends TypedSimilarity[N, (Weight, L2Norm), Double] { // The probability of being more than delta error is approx: // boundedProb ~ exp(-p delta^2 / 2) private val oversample = (-2.0 * scala.math.log(boundedProb) / (delta * delta)) / minCos - def apply(graph: TypedPipe[Edge[N, (Weight, L2Norm)]], - smallpred: N => Boolean, bigpred: 
N => Boolean): TypedPipe[Edge[N, Double]] = { + def apply( + graph: TypedPipe[Edge[N, (Weight, L2Norm)]], + smallpred: N => Boolean, + bigpred: N => Boolean + ): TypedPipe[Edge[N, Double]] = { val bigGroupedOnSrc = graph - .filter { e => bigpred(e.to) } - .map { e => (e.from, (e.to, e.data._1.weight, e.data._2.norm)) } + .filter(e => bigpred(e.to)) + .map(e => (e.from, (e.to, e.data._1.weight, e.data._2.norm))) .group .withReducers(reducers) val smallGroupedOnSrc = graph - .filter { e => smallpred(e.to) } - .map { e => (e.from, (e.to, e.data._1.weight, e.data._2.norm)) } + .filter(e => smallpred(e.to)) + .map(e => (e.from, (e.to, e.data._1.weight, e.data._2.norm))) .group .withReducers(reducers) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/InputSizeReducerEstimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/InputSizeReducerEstimator.scala index 45e8cda0ce..1a5d89eb05 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/InputSizeReducerEstimator.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/InputSizeReducerEstimator.scala @@ -1,7 +1,7 @@ package com.twitter.scalding.reducer_estimation import cascading.tap.hadoop.Hfs -import com.twitter.scalding.estimation.{ Common, Estimator, FlowStrategyInfo } +import com.twitter.scalding.estimation.{Common, Estimator, FlowStrategyInfo} import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory @@ -12,10 +12,9 @@ object InputSizeReducerEstimator { val defaultBytesPerReducer = 1L << 32 // 4 GB /** - * Get the target bytes/reducer from the JobConf. - * Supported formats are long or human readable format. - * For human readable format you can use the following suffix (case insensitive): - * k(kilo), m(mega), g(giga), t(tera), p(peta), e(exa). + * Get the target bytes/reducer from the JobConf. Supported formats are long or human readable + * format. For human readable format you can use the following suffix (case insensitive): k(kilo), m(mega), + * g(giga), t(tera), p(peta), e(exa). * * Examples: 1024, 128m, 1g. */ @@ -23,18 +22,18 @@ object InputSizeReducerEstimator { conf.getLongBytes(BytesPerReducer, defaultBytesPerReducer) /** - * Same as estimateReducers, except doesn't round or ceil the result. - * This is useful for composing with other estimation strategies that - * don't want to lose the fractional number of reducers. Especially - * helpful for when less than 1 reducer is needed, but this fraction - * will be multiplied by a scaling factor later. + * Same as estimateReducers, except doesn't round or ceil the result. This is useful for composing with + * other estimation strategies that don't want to lose the fractional number of reducers. Especially helpful + * for when less than 1 reducer is needed, but this fraction will be multiplied by a scaling factor later. 
*/ - def estimateReducersWithoutRounding(info: FlowStrategyInfo): Option[Double] = { + def estimateReducersWithoutRounding(info: FlowStrategyInfo): Option[Double] = Common.inputSizes(info.step) match { case Nil => - LOG.warn("InputSizeReducerEstimator unable to estimate reducers; " + - "cannot compute size of (is it a non hfs tap?):\n - " + - Common.unrollTaps(info.step).filterNot(_.isInstanceOf[Hfs]).mkString("\n - ")) + LOG.warn( + "InputSizeReducerEstimator unable to estimate reducers; " + + "cannot compute size of (is it a non hfs tap?):\n - " + + Common.unrollTaps(info.step).filterNot(_.isInstanceOf[Hfs]).mkString("\n - ") + ) None case inputSizes => val bytesPerReducer = @@ -43,19 +42,22 @@ object InputSizeReducerEstimator { val totalBytes = inputSizes.map(_._2).sum val nReducers = totalBytes.toDouble / bytesPerReducer.toDouble - lazy val logStr = inputSizes.map { - case (name, bytes) => s" - $name\t$bytes" - }.mkString("\n") + lazy val logStr = inputSizes + .map { case (name, bytes) => + s" - $name\t$bytes" + } + .mkString("\n") - LOG.info("\nInputSizeReducerEstimator" + - "\n - input size (bytes): " + totalBytes + - "\n - reducer estimate: " + nReducers + - "\n - Breakdown:\n" + - logStr) + LOG.info( + "\nInputSizeReducerEstimator" + + "\n - input size (bytes): " + totalBytes + + "\n - reducer estimate: " + nReducers + + "\n - Breakdown:\n" + + logStr + ) Some(nReducers) } - } } @@ -68,5 +70,5 @@ class InputSizeReducerEstimator extends Estimator[Int] { import InputSizeReducerEstimator._ override def estimate(info: FlowStrategyInfo): Option[Int] = - estimateReducersWithoutRounding(info).map { _.ceil.toInt.max(1) } -} \ No newline at end of file + estimateReducersWithoutRounding(info).map(_.ceil.toInt.max(1)) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimator.scala index 71f6bbaad8..719d653747 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimator.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimator.scala @@ -1,15 +1,15 @@ package com.twitter.scalding.reducer_estimation -import com.twitter.scalding.estimation.{ Common, FlowStepHistory, FlowStrategyInfo } +import com.twitter.scalding.estimation.{Common, FlowStepHistory, FlowStrategyInfo} import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory object RatioBasedEstimator { + /** - * RatioBasedEstimator optionally ignores history items whose input size is - * drastically different than the current job. This parameter specifies the - * lower bound on allowable input size ratio. Defaults to 0.10 (10%), which - * sets the upper bound to 10x. + * RatioBasedEstimator optionally ignores history items whose input size is drastically different than the + * current job. This parameter specifies the lower bound on allowable input size ratio. Defaults to 0.10 + * (10%), which sets the upper bound to 10x. */ val inputRatioThresholdKey = "scalding.reducer.estimator.input.ratio.threshold" def getInputRatioThreshold(conf: JobConf) = conf.getFloat(inputRatioThresholdKey, 0.10f) @@ -19,29 +19,32 @@ abstract class RatioBasedEstimator extends ReducerHistoryEstimator { private val LOG = LoggerFactory.getLogger(this.getClass) /** - * Determines if this input and the previous input are close enough. 
- * If they're drastically different, we have no business trying to - * make an estimate based on the past job. + * Determines if this input and the previous input are close enough. If they're drastically different, we + * have no business trying to make an estimate based on the past job. * - * @param threshold Specify lower bound on ratio (e.g. 0.10 for 10%) + * @param threshold + * Specify lower bound on ratio (e.g. 0.10 for 10%) */ private def acceptableInputRatio(current: Long, past: Long, threshold: Double): Boolean = { val ratio = current / past.toDouble if (threshold > 0 && (ratio < threshold || ratio > 1 / threshold)) { - LOG.warn("Input sizes differ too much to use for estimation: " + - "current: " + current + ", past: " + past) + LOG.warn( + "Input sizes differ too much to use for estimation: " + + "current: " + current + ", past: " + past + ) false } else true } /** - * Compute the average ratio of mapper bytes to reducer bytes and use that to - * scale the estimate produced by InputSizeReducerEstimator. + * Compute the average ratio of mapper bytes to reducer bytes and use that to scale the estimate produced by + * InputSizeReducerEstimator. */ override protected def estimate( - info: FlowStrategyInfo, - conf: JobConf, - history: Seq[FlowStepHistory]): Option[Int] = { + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[Int] = { val threshold = RatioBasedEstimator.getInputRatioThreshold(conf) val inputBytes = Common.totalInputSize(info.step) @@ -68,9 +71,11 @@ abstract class RatioBasedEstimator extends ReducerHistoryEstimator { // scale reducer estimate based on the historical input ratio val e = (baseEstimate * reducerRatio).ceil.toInt.max(1) - LOG.info("\nRatioBasedEstimator" - + "\n - past reducer ratio: " + reducerRatio - + "\n - reducer estimate: " + e) + LOG.info( + "\nRatioBasedEstimator" + + "\n - past reducer ratio: " + reducerRatio + + "\n - reducer estimate: " + e + ) e } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorConfig.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorConfig.scala index a8153e71d4..62c6cac3be 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorConfig.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorConfig.scala @@ -8,9 +8,9 @@ object ReducerEstimatorConfig { val estimatedNumReducers = "scalding.reducer.estimator.result" /** - * Output param: same as estimatedNumReducers but with the cap specified by maxEstimatedReducersKey - * applied. Can be used to determine whether a cap was applied to the estimated number of reducers - * and potentially to trigger alerting / logging. + * Output param: same as estimatedNumReducers but with the cap specified by maxEstimatedReducersKey applied. + * Can be used to determine whether a cap was applied to the estimated number of reducers and potentially to + * trigger alerting / logging. 
*/ val cappedEstimatedNumReducersKey = "scalding.reducer.estimator.result.capped" @@ -18,8 +18,7 @@ object ReducerEstimatorConfig { val originalNumReducers = "scalding.reducer.estimator.original.mapred.reduce.tasks" /** - * If we estimate more than this number of reducers, - * we will use this number instead of the estimated value + * If we estimate more than this number of reducers, we will use this number instead of the estimated value */ val maxEstimatedReducersKey = "scalding.reducer.estimator.max.estimated.reducers" diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorStepStrategy.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorStepStrategy.scala index 4ac7139dc2..557cddfd6f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorStepStrategy.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorStepStrategy.scala @@ -1,10 +1,10 @@ package com.twitter.scalding.reducer_estimation -import cascading.flow.{ Flow, FlowStep, FlowStepStrategy } +import cascading.flow.{Flow, FlowStep, FlowStepStrategy} import com.twitter.algebird.Monoid -import com.twitter.scalding.estimation.{ Estimator, FallbackEstimatorMonoid, FlowStrategyInfo } -import com.twitter.scalding.{ Config, StringUtility } -import java.util.{ List => JList } +import com.twitter.scalding.estimation.{Estimator, FallbackEstimatorMonoid, FlowStrategyInfo} +import com.twitter.scalding.{Config, StringUtility} +import java.util.{List => JList} import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ @@ -17,16 +17,16 @@ object ReducerEstimatorStepStrategy extends FlowStepStrategy[JobConf] { new FallbackEstimatorMonoid[Int] /** - * Make reducer estimate, possibly overriding explicitly-set numReducers, - * and save useful info (such as the default & estimate) in JobConf for - * later consumption. + * Make reducer estimate, possibly overriding explicitly-set numReducers, and save useful info (such as the + * default & estimate) in JobConf for later consumption. * * Called by Cascading at the start of each job step. */ final override def apply( - flow: Flow[JobConf], - preds: JList[FlowStep[JobConf]], - step: FlowStep[JobConf]): Unit = { + flow: Flow[JobConf], + preds: JList[FlowStep[JobConf]], + step: FlowStep[JobConf] + ): Unit = { val conf = step.getConfig // for steps with reduce phase, mapred.reduce.tasks is set in the jobconf at this point // so we check that to determine if this is a map-only step. @@ -34,8 +34,7 @@ object ReducerEstimatorStepStrategy extends FlowStepStrategy[JobConf] { case 0 => LOG.info(s"${flow.getName} is a map-only step. Skipping reducer estimation.") case _ => if (skipReducerEstimation(step)) { - LOG.info( - s""" + LOG.info(s""" |Flow step ${step.getName} was configured with reducers |set explicitly (${Config.WithReducersSetExplicitly}=true) and the estimator |explicit override turned off (${Config.ReducerEstimatorOverride}=false). 
Skipping @@ -58,18 +57,15 @@ object ReducerEstimatorStepStrategy extends FlowStepStrategy[JobConf] { private def skipReducerEstimation(step: FlowStep[JobConf]) = reducersSetExplicitly(step) && !overrideExplicitReducers(step) - private def estimate( - flow: Flow[JobConf], - preds: Seq[FlowStep[JobConf]], - step: FlowStep[JobConf]): Unit = { + private def estimate(flow: Flow[JobConf], preds: Seq[FlowStep[JobConf]], step: FlowStep[JobConf]): Unit = { val conf = step.getConfig val stepNumReducers = conf.get(Config.HadoopNumReducers) Option(conf.get(Config.ReducerEstimators)).foreach { clsNames => - val clsLoader = Thread.currentThread.getContextClassLoader - val estimators = StringUtility.fastSplit(clsNames, ",") + val estimators = StringUtility + .fastSplit(clsNames, ",") .map(clsLoader.loadClass(_).newInstance.asInstanceOf[Estimator[Int]]) val combinedEstimator = Monoid.sum(estimators) @@ -80,11 +76,13 @@ object ReducerEstimatorStepStrategy extends FlowStepStrategy[JobConf] { // apply cap if needed val cappedNumReducers = estimatedNumReducers.map { n => - val configuredMax = conf.getInt(ReducerEstimatorConfig.maxEstimatedReducersKey, ReducerEstimatorConfig.defaultMaxEstimatedReducers) + val configuredMax = conf.getInt( + ReducerEstimatorConfig.maxEstimatedReducersKey, + ReducerEstimatorConfig.defaultMaxEstimatedReducers + ) if (n > configuredMax) { - LOG.warn( - s""" + LOG.warn(s""" |Reducer estimator estimated $n reducers, which is more than the configured maximum of $configuredMax. |Will use $configuredMax instead. """.stripMargin) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerHistoryEstimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerHistoryEstimator.scala index 4cffffee9c..f7dd2291f5 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerHistoryEstimator.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerHistoryEstimator.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.reducer_estimation -import com.twitter.scalding.estimation.{ HistoryEstimator, Task } +import com.twitter.scalding.estimation.{HistoryEstimator, Task} import org.apache.hadoop.mapred.JobConf object ReducerHistoryEstimator { diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimator.scala index d771126731..eb9b56a0ee 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimator.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimator.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.reducer_estimation -import com.twitter.scalding.estimation.{ Common, FlowStepHistory, FlowStrategyInfo } +import com.twitter.scalding.estimation.{Common, FlowStepHistory, FlowStrategyInfo} import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory @@ -12,18 +12,15 @@ import org.slf4j.LoggerFactory trait RuntimeEstimationScheme { /** - * Given a list of times that each reducer took in a certain FlowStep, - * aggregates these times into a single estimate of the time that - * a "typical" reducer took. - * Suggested implementation: mean or median. + * Given a list of times that each reducer took in a certain FlowStep, aggregates these times into a single + * estimate of the time that a "typical" reducer took. 
Suggested implementation: mean or median. */ def estimateTaskTime(times: Seq[Double]): Option[Double] /** - * Given a list of "typical" times observed in a series of jobs of - * the same FlowStep, aggregates these times into a single estimate of - * the time that a "typical" reducer took in a "typical" job. - * Suggested implementation: mean or median. + * Given a list of "typical" times observed in a series of jobs of the same FlowStep, aggregates these times + * into a single estimate of the time that a "typical" reducer took in a "typical" job. Suggested + * implementation: mean or median. */ def estimateJobTime(times: Seq[Double]): Option[Double] } @@ -52,13 +49,12 @@ object RuntimeReducerEstimator { } /** - * Whether to use the median or the mean in the runtime estimation process. - * Default is median. + * Whether to use the median or the mean in the runtime estimation process. Default is median. */ def getRuntimeEstimationScheme(conf: JobConf): RuntimeEstimationScheme = { val default = "median" conf.get(EstimationScheme, default) match { - case "mean" => MeanEstimationScheme + case "mean" => MeanEstimationScheme case "median" => MedianEstimationScheme case _ => throw new Exception(s"""Value of $EstimationScheme must be "mean", "median", or not specified.""") @@ -66,11 +62,9 @@ object RuntimeReducerEstimator { } /** - * Whether to ignore the input size of the data. - * If true, RuntimeReducerEstimator uses a non-input scaled estimator. - * If false, RuntimeReducerEstimator uses an input-scaled estimator - * first, and uses a non-input-scaled estimator as a fallback. - * Default is false. + * Whether to ignore the input size of the data. If true, RuntimeReducerEstimator uses a non-input scaled + * estimator. If false, RuntimeReducerEstimator uses an input-scaled estimator first, and uses a + * non-input-scaled estimator as a fallback. Default is false. 
*/ def getRuntimeIgnoreInputSize(conf: JobConf): Boolean = { val default = false @@ -80,16 +74,15 @@ object RuntimeReducerEstimator { def getReduceTimes(history: Seq[FlowStepHistory]): Seq[Seq[Double]] = history.map { h => h.tasks - .filter { t => t.taskType.contains("REDUCE") && t.status.contains("SUCCEEDED") } + .filter(t => t.taskType.contains("REDUCE") && t.status.contains("SUCCEEDED")) .flatMap { t => - t.finishTime.zip(t.startTime) - .filter { - case (finishedTime, startTime) => - finishedTime > startTime + t.finishTime + .zip(t.startTime) + .filter { case (finishedTime, startTime) => + finishedTime > startTime } - .map { - case (finishedTime, startTime) => - (finishedTime - startTime).toDouble + .map { case (finishedTime, startTime) => + (finishedTime - startTime).toDouble } } } @@ -103,13 +96,13 @@ trait BasicRuntimeReducerEstimator extends ReducerHistoryEstimator { def runtimeEstimationScheme: RuntimeEstimationScheme override protected def estimate( - info: FlowStrategyInfo, - conf: JobConf, - history: Seq[FlowStepHistory]): Option[Int] = { + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[Int] = { val reduceTimes: Seq[Seq[Double]] = getReduceTimes(history) - LOG.info( - s"""| + LOG.info(s"""| |History items have the following numbers of tasks: | ${history.map(_.tasks.length)}, |and the following numbers of tasks have valid task histories: @@ -117,7 +110,7 @@ trait BasicRuntimeReducerEstimator extends ReducerHistoryEstimator { // total time taken in the step = time per reducer * number of reducers val jobTimes: Seq[Option[Double]] = reduceTimes - .map { xs => runtimeEstimationScheme.estimateTaskTime(xs).map(_ * xs.length) } + .map(xs => runtimeEstimationScheme.estimateTaskTime(xs).map(_ * xs.length)) // time per step, averaged over all the steps val typicalJobTime: Option[Double] = runtimeEstimationScheme.estimateJobTime(jobTimes.flatten) @@ -126,8 +119,7 @@ trait BasicRuntimeReducerEstimator extends ReducerHistoryEstimator { val estimate = typicalJobTime.map { t: Double => (t / desiredRuntime).ceil.toInt } - LOG.info( - s""" + LOG.info(s""" | - Typical job time: $typicalJobTime | - Desired runtime: $desiredRuntime | - Estimate: $estimate @@ -145,13 +137,13 @@ trait InputScaledRuntimeReducerEstimator extends ReducerHistoryEstimator { def runtimeEstimationScheme: RuntimeEstimationScheme override protected def estimate( - info: FlowStrategyInfo, - conf: JobConf, - history: Seq[FlowStepHistory]): Option[Int] = { + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[Int] = { val reduceTimes: Seq[Seq[Double]] = getReduceTimes(history) - LOG.info( - s"""| + LOG.info(s"""| |History items have the following numbers of tasks: | ${history.map(_.tasks.length)}, |and the following numbers of tasks have valid task histories: @@ -159,7 +151,7 @@ trait InputScaledRuntimeReducerEstimator extends ReducerHistoryEstimator { // total time taken in the step = time per reducer * number of reducers val jobTimes: Seq[Option[Double]] = reduceTimes - .map { xs => runtimeEstimationScheme.estimateTaskTime(xs).map(_ * xs.length) } + .map(xs => runtimeEstimationScheme.estimateTaskTime(xs).map(_ * xs.length)) // time-to-byte ratio for a step = time per reducer * number of reducers / number of bytes val timeToByteRatios: Seq[Double] = jobTimes @@ -184,8 +176,7 @@ trait InputScaledRuntimeReducerEstimator extends ReducerHistoryEstimator { (t * inputBytes / desiredRuntime).ceil.toInt } - LOG.info( - s""" + LOG.info(s""" | - HDFS bytes read: 
${history.map(_.hdfsBytesRead)} | - Time-to-byte-ratios: $timeToByteRatios | - Typical type-to-byte-ratio: $typicalTimeToByteRatio @@ -224,6 +215,10 @@ trait RuntimeReducerEstimator extends ReducerHistoryEstimator { combinedEstimator.estimate(info) } - override protected def estimate(info: FlowStrategyInfo, conf: JobConf, history: Seq[FlowStepHistory]): Option[Int] = + override protected def estimate( + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[Int] = estimate(info) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/CascadingBinaryComparator.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/CascadingBinaryComparator.scala index 264bf06807..791bdbac73 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/CascadingBinaryComparator.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/CascadingBinaryComparator.scala @@ -12,26 +12,27 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization import cascading.flow.Flow import cascading.flow.planner.BaseFlowStep -import cascading.tuple.{ Hasher => CHasher, StreamComparator } +import cascading.tuple.{Hasher => CHasher, StreamComparator} import com.twitter.scalding.ExecutionContext.getDesc import java.io.InputStream import java.util.Comparator -import scala.util.{ Failure, Success, Try } +import scala.util.{Failure, Success, Try} import org.slf4j.LoggerFactory /** * This is the type that should be fed to cascading to enable binary comparators */ -class CascadingBinaryComparator[T](ob: OrderedSerialization[T]) extends Comparator[T] - with StreamComparator[InputStream] - with CHasher[T] - with Serializable { +class CascadingBinaryComparator[T](ob: OrderedSerialization[T]) + extends Comparator[T] + with StreamComparator[InputStream] + with CHasher[T] + with Serializable { override def compare(a: T, b: T) = ob.compare(a, b) override def hashCode(t: T): Int = ob.hash(t) @@ -44,10 +45,13 @@ object CascadingBinaryComparator { private val LOG = LoggerFactory.getLogger(this.getClass) /** - * This method will walk the flowDef and make sure all the - * groupBy/cogroups are using a CascadingBinaryComparator + * This method will walk the flowDef and make sure all the groupBy/cogroups are using a + * CascadingBinaryComparator */ - private[scalding] def checkForOrderedSerialization[T](flow: Flow[T], mode: RequireOrderedSerializationMode): Try[Unit] = { + private[scalding] def checkForOrderedSerialization[T]( + flow: Flow[T], + mode: RequireOrderedSerializationMode + ): Try[Unit] = { import collection.JavaConverters._ import cascading.pipe._ @@ -73,26 +77,27 @@ object CascadingBinaryComparator { if (m.isEmpty) failure(s"Splice must have KeySelectors: $s") else { - reduce(m.map { - case (pipename, fields) => - /* - * Scalding typed-API ALWAYS puts the key into field position 0. - * If OrderedSerialization is enabled, this must be a CascadingBinaryComparator - */ - if (fields.getComparators()(0).isInstanceOf[CascadingBinaryComparator[_]]) - Success(()) - else failure(s"pipe: $s, fields: $fields, comparators: ${fields.getComparators.toList}") + reduce(m.map { case (pipename, fields) => + /* + * Scalding typed-API ALWAYS puts the key into field position 0. 
+ * If OrderedSerialization is enabled, this must be a CascadingBinaryComparator + */ + if (fields.getComparators()(0).isInstanceOf[CascadingBinaryComparator[_]]) + Success(()) + else failure(s"pipe: $s, fields: $fields, comparators: ${fields.getComparators.toList}") }) } } def getDescriptionsForMissingOrdSer[U](bfs: BaseFlowStep[U]): Option[String] = // does this job have any Splices without OrderedSerialization: - if (bfs.getGraph.vertexSet.asScala.exists { - case gb: GroupBy => check(gb).isFailure - case cg: CoGroup => check(cg).isFailure - case _ => false // only do sorting in groupBy/cogroupBy - }) { + if ( + bfs.getGraph.vertexSet.asScala.exists { + case gb: GroupBy => check(gb).isFailure + case cg: CoGroup => check(cg).isFailure + case _ => false // only do sorting in groupBy/cogroupBy + } + ) { Some(getDesc(bfs).mkString(", ")) } else None diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala index dbe5ca4826..d0e503ec0b 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization -import com.twitter.chill.{ Externalizer => ChillExtern } +import com.twitter.chill.{Externalizer => ChillExtern} import com.esotericsoftware.kryo.DefaultSerializer import com.esotericsoftware.kryo.serializers.JavaSerializer @@ -38,4 +38,3 @@ class Externalizer[T] extends ChillExtern[T] { protected override def kryo = new KryoHadoop(ScalaAnyRefMapConfig(Map("scalding.kryo.setreferences" -> "true"))) } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoHadoop.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoHadoop.scala index 1d36f8cc9f..8a9caaf12c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoHadoop.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoHadoop.scala @@ -12,15 +12,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.serialization import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.serializers.FieldSerializer -import com.twitter.scalding.{ Args, CascadingTokenUpdater, DateRange, RichDate, Config => ScaldingConfig } +import com.twitter.scalding.{Args, CascadingTokenUpdater, Config => ScaldingConfig, DateRange, RichDate} import com.twitter.chill.algebird._ import com.twitter.chill.config.Config -import com.twitter.chill.{ IKryoRegistrar, KryoInstantiator, ScalaKryoInstantiator, SingletonSerializer } +import com.twitter.chill.{IKryoRegistrar, KryoInstantiator, ScalaKryoInstantiator, SingletonSerializer} class KryoHadoop(@transient config: Config) extends KryoInstantiator { // keeping track of references is costly for memory, and often triggers OOM on Hadoop @@ -28,13 +28,10 @@ class KryoHadoop(@transient config: Config) extends KryoInstantiator { val cascadingSerializationTokens = config.get(ScaldingConfig.CascadingSerializationTokens) /** - * TODO!!! - * Deal with this issue. The problem is grouping by Kryo serialized - * objects silently breaks the results. If Kryo gets in front of TupleSerialization - * (and possibly Writable, unclear at this time), grouping is broken. - * There are two issues here: - * 1) Kryo objects not being compared properly. - * 2) Kryo being used instead of cascading. + * TODO!!! Deal with this issue. The problem is grouping by Kryo serialized objects silently breaks the + * results. If Kryo gets in front of TupleSerialization (and possibly Writable, unclear at this time), + * grouping is broken. There are two issues here: 1) Kryo objects not being compared properly. 2) Kryo being + * used instead of cascading. * * We must identify each and fix these bugs. */ @@ -63,38 +60,46 @@ class KryoHadoop(@transient config: Config) extends KryoInstantiator { } /** - * AdaptiveVector is IndexedSeq, which picks up the chill IndexedSeq serializer - * (which is its own bug), force using the fields serializer here + * AdaptiveVector is IndexedSeq, which picks up the chill IndexedSeq serializer (which is its own bug), + * force using the fields serializer here */ - newK.register(classOf[com.twitter.algebird.DenseVector[_]], - new FieldSerializer[com.twitter.algebird.DenseVector[_]](newK, - classOf[com.twitter.algebird.DenseVector[_]])) - - newK.register(classOf[com.twitter.algebird.SparseVector[_]], - new FieldSerializer[com.twitter.algebird.SparseVector[_]](newK, - classOf[com.twitter.algebird.SparseVector[_]])) - - newK.addDefaultSerializer(classOf[com.twitter.algebird.AdaptiveVector[_]], - classOf[FieldSerializer[_]]) + newK.register( + classOf[com.twitter.algebird.DenseVector[_]], + new FieldSerializer[com.twitter.algebird.DenseVector[_]]( + newK, + classOf[com.twitter.algebird.DenseVector[_]] + ) + ) + + newK.register( + classOf[com.twitter.algebird.SparseVector[_]], + new FieldSerializer[com.twitter.algebird.SparseVector[_]]( + newK, + classOf[com.twitter.algebird.SparseVector[_]] + ) + ) + + newK.addDefaultSerializer(classOf[com.twitter.algebird.AdaptiveVector[_]], classOf[FieldSerializer[_]]) /** - * Pipes can be swept up into closures inside of case classes. This can generally - * be safely ignored. If the case class has a method that actually accesses something - * in the pipe (what would that even be?), you will get a null pointer exception, - * so it shouldn't cause data corruption. - * a more robust solution is to use Spark's closure cleaner approach on every object that - * is serialized, but that's very expensive. 
+ * Pipes can be swept up into closures inside of case classes. This can generally be safely ignored. If + * the case class has a method that actually accesses something in the pipe (what would that even be?), + * you will get a null pointer exception, so it shouldn't cause data corruption. a more robust solution is + * to use Spark's closure cleaner approach on every object that is serialized, but that's very expensive. */ newK.addDefaultSerializer(classOf[cascading.pipe.Pipe], new SingletonSerializer(null)) newK.addDefaultSerializer(classOf[com.twitter.scalding.typed.TypedPipe[_]], new SingletonSerializer(null)) newK.addDefaultSerializer(classOf[com.twitter.scalding.Execution[_]], new SingletonSerializer(null)) - newK.addDefaultSerializer(classOf[com.twitter.scalding.Execution.ToWrite[_]], new SingletonSerializer(null)) + newK.addDefaultSerializer( + classOf[com.twitter.scalding.Execution.ToWrite[_]], + new SingletonSerializer(null) + ) newK.setReferences(useRefs) /** - * Make sure we use the thread's context class loader to ensure the classes of the - * submitted jar and any -libjars arguments can be found + * Make sure we use the thread's context class loader to ensure the classes of the submitted jar and any + * -libjars arguments can be found */ val classLoader = Thread.currentThread.getContextClassLoader newK.setClassLoader(classLoader) @@ -129,8 +134,8 @@ class KryoHadoop(@transient config: Config) extends KryoInstantiator { } /** - * If you override KryoHadoop, prefer to add registrations here instead of overriding [[newKryo]]. - * That way, any additional default serializers will be used for registering cascading tokenized classes. + * If you override KryoHadoop, prefer to add registrations here instead of overriding [[newKryo]]. That way, + * any additional default serializers will be used for registering cascading tokenized classes. */ def customRegistrar: IKryoRegistrar = new IKryoRegistrar { override def apply(k: Kryo): Unit = {} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoSerializers.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoSerializers.scala index d93b6362d0..0268a2b9a7 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoSerializers.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoSerializers.scala @@ -12,12 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.serialization import com.esotericsoftware.kryo.Kryo -import com.esotericsoftware.kryo.{ Serializer => KSerializer } -import com.esotericsoftware.kryo.io.{ Input, Output } +import com.esotericsoftware.kryo.{Serializer => KSerializer} +import com.esotericsoftware.kryo.io.{Input, Output} import com.twitter.scalding._ @@ -25,9 +25,8 @@ import com.twitter.scalding._ * This is a runtime check for types we should never be serializing */ class ThrowingSerializer[T] extends KSerializer[T] { - override def write(kryo: Kryo, output: Output, t: T): Unit = { + override def write(kryo: Kryo, output: Output, t: T): Unit = sys.error(s"Kryo should never be used to serialize an instance: $t") - } override def read(kryo: Kryo, input: Input, t: Class[T]): T = sys.error(s"Kryo should never be used to serialize an instance, class: $t") } @@ -39,15 +38,13 @@ class SerializeAsUnit[T >: Null] extends KSerializer[T] { } /** - * * - * Below are some serializers for objects in the scalding project. + * * Below are some serializers for objects in the scalding project. */ class RichDateSerializer extends KSerializer[RichDate] { // RichDates are immutable, no need to copy them setImmutable(true) - def write(kser: Kryo, out: Output, date: RichDate): Unit = { + def write(kser: Kryo, out: Output, date: RichDate): Unit = out.writeLong(date.timestamp, true) - } def read(kser: Kryo, in: Input, cls: Class[RichDate]): RichDate = RichDate(in.readLong(true)) @@ -61,17 +58,15 @@ class DateRangeSerializer extends KSerializer[DateRange] { out.writeLong(range.end.timestamp, true) } - def read(kser: Kryo, in: Input, cls: Class[DateRange]): DateRange = { + def read(kser: Kryo, in: Input, cls: Class[DateRange]): DateRange = DateRange(RichDate(in.readLong(true)), RichDate(in.readLong(true))) - } } class ArgsSerializer extends KSerializer[Args] { // Args are immutable, no need to copy them setImmutable(true) - def write(kser: Kryo, out: Output, a: Args): Unit = { + def write(kser: Kryo, out: Output, a: Args): Unit = out.writeString(a.toString) - } def read(kser: Kryo, in: Input, cls: Class[Args]): Args = Args(in.readString) } @@ -107,4 +102,3 @@ class StringFieldSerializer extends KSerializer[StringField[_]] { StringField[Any](id)(ord, mf) } } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparators.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparators.scala index b188c982f8..652df358d2 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparators.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparators.scala @@ -2,30 +2,33 @@ package com.twitter.scalding.serialization import com.twitter.scalding._ -import scala.language.experimental.{ macros => smacros } +import scala.language.experimental.{macros => smacros} /** - * RequiredBinaryComparators provide comparators (or Ordering in Scala) that are capable of comparing keys in their - * serialized form reducing the amount of time spent in serialization/deserialization. These comparators are implemented - * using Scala macros, and currently provide binary comparators for primitives, strings, Options, tuples, collections, case classes - * and Scrooge objects. + * RequiredBinaryComparators provide comparators (or Ordering in Scala) that are capable of comparing keys in + * their serialized form reducing the amount of time spent in serialization/deserialization. 
These comparators + * are implemented using Scala macros, and currently provide binary comparators for primitives, strings, + * Options, tuples, collections, case classes and Scrooge objects. */ trait RequiredBinaryComparators extends RequiredBinaryComparatorsConfig { - implicit def ordSer[T]: OrderedSerialization[T] = macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] + implicit def ordSer[T]: OrderedSerialization[T] = + macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] } object RequiredBinaryComparators { - implicit def orderedSerialization[T]: OrderedSerialization[T] = macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] + implicit def orderedSerialization[T]: OrderedSerialization[T] = + macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] } /** * Use this for an ExecutionApp. */ trait RequiredBinaryComparatorsExecutionApp extends ExecutionApp { - implicit def ordSer[T]: OrderedSerialization[T] = macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] + implicit def ordSer[T]: OrderedSerialization[T] = + macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] def requireOrderedSerializationMode: RequireOrderedSerializationMode = RequireOrderedSerializationMode.Fail override def config(inputArgs: Array[String]): (Config, Mode) = { val (conf, m) = super.config(inputArgs) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparatorsConfig.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparatorsConfig.scala index d14872d6cc..6b2b6a2c6e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparatorsConfig.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparatorsConfig.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.serialization -import com.twitter.scalding.{ Config, Job } +import com.twitter.scalding.{Config, Job} sealed trait RequireOrderedSerializationMode object RequireOrderedSerializationMode { @@ -10,5 +10,6 @@ object RequireOrderedSerializationMode { trait RequiredBinaryComparatorsConfig extends Job { def requireOrderedSerializationMode: RequireOrderedSerializationMode = RequireOrderedSerializationMode.Fail - override def config = super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode.toString) + override def config = + super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode.toString) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/WrappedSerialization.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/WrappedSerialization.scala index 7062c66ec4..a81841f8cd 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/WrappedSerialization.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/WrappedSerialization.scala @@ -12,20 +12,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.serialization -import org.apache.hadoop.io.serializer.{ Serialization => HSerialization, Deserializer, Serializer } -import org.apache.hadoop.conf.{ Configurable, Configuration } +import org.apache.hadoop.io.serializer.{Deserializer, Serialization => HSerialization, Serializer} +import org.apache.hadoop.conf.{Configurable, Configuration} -import java.io.{ InputStream, OutputStream } -import com.twitter.bijection.{ Injection, JavaSerializationInjection, Base64String } +import java.io.{InputStream, OutputStream} +import com.twitter.bijection.{Base64String, Injection, JavaSerializationInjection} import scala.collection.JavaConverters._ /** - * WrappedSerialization wraps a value in a wrapper class that - * has an associated Binary that is used to deserialize - * items wrapped in the wrapper + * WrappedSerialization wraps a value in a wrapper class that has an associated Binary that is used to + * deserialize items wrapped in the wrapper */ class WrappedSerialization[T] extends HSerialization[T] with Configurable { @@ -45,26 +44,30 @@ class WrappedSerialization[T] extends HSerialization[T] with Configurable { def accept(c: Class[_]): Boolean = serializations.contains(c) def getSerialization(c: Class[T]): Option[Serialization[T]] = - serializations.get(c) + serializations + .get(c) // This cast should never fail since we matched the class .asInstanceOf[Option[Serialization[T]]] def getSerializer(c: Class[T]): Serializer[T] = - new BinarySerializer(getSerialization(c) - .getOrElse(sys.error(s"Serialization for class: ${c} not found"))) + new BinarySerializer( + getSerialization(c) + .getOrElse(sys.error(s"Serialization for class: $c not found")) + ) def getDeserializer(c: Class[T]): Deserializer[T] = - new BinaryDeserializer(getSerialization(c) - .getOrElse(sys.error(s"Serialization for class: ${c} not found"))) + new BinaryDeserializer( + getSerialization(c) + .getOrElse(sys.error(s"Serialization for class: $c not found")) + ) } class BinarySerializer[T](buf: Serialization[T]) extends Serializer[T] { private var out: OutputStream = _ - def open(os: OutputStream): Unit = { + def open(os: OutputStream): Unit = out = os - } - def close(): Unit = { out = null } + def close(): Unit = out = null def serialize(t: T): Unit = { if (out == null) throw new NullPointerException("OutputStream is null") buf.write(out, t).get @@ -73,8 +76,8 @@ class BinarySerializer[T](buf: Serialization[T]) extends Serializer[T] { class BinaryDeserializer[T](buf: Serialization[T]) extends Deserializer[T] { private var is: InputStream = _ - def open(i: InputStream): Unit = { is = i } - def close(): Unit = { is = null } + def open(i: InputStream): Unit = is = i + def close(): Unit = is = null def deserialize(t: T): T = { if (is == null) throw new NullPointerException("InputStream is null") buf.read(is).get @@ -85,7 +88,8 @@ object WrappedSerialization { type ClassSerialization[T] = (Class[T], Serialization[T]) private def getSerializer[U]: Injection[Externalizer[U], String] = { - implicit val initialInj: Injection[Externalizer[U], Array[Byte]] = JavaSerializationInjection[Externalizer[U]] + implicit val initialInj: Injection[Externalizer[U], Array[Byte]] = + JavaSerializationInjection[Externalizer[U]] Injection.connect[Externalizer[U], Array[Byte], Base64String, String] } @@ -97,28 +101,26 @@ object WrappedSerialization { private val confKey = "com.twitter.scalding.serialization.WrappedSerialization" - def rawSetBinary(bufs: Iterable[ClassSerialization[_]], fn: (String, String) => Unit) = { 
+ def rawSetBinary(bufs: Iterable[ClassSerialization[_]], fn: (String, String) => Unit) = fn(confKey, bufs.map { case (cls, buf) => s"${cls.getName}:${serialize(buf)}" }.mkString(",")) - } def setBinary(conf: Configuration, bufs: Iterable[ClassSerialization[_]]): Unit = rawSetBinary(bufs, { case (k, v) => conf.set(k, v) }) def getBinary(conf: Configuration): Map[Class[_], Serialization[_]] = - conf - .iterator - .asScala + conf.iterator.asScala .map { it => (it.getKey, it.getValue) } .filter(_._1.startsWith(confKey)) - .map { - case (_, clsbuf) => - clsbuf.split(":") match { - case Array(className, serialization) => - // Jump through a hoop to get scalac happy - def deser[T](cls: Class[T]): ClassSerialization[T] = (cls, deserialize[Serialization[T]](serialization)) - deser(conf.getClassByName(className)) - case _ => sys.error(s"ill formed bufferables: ${clsbuf}") - } - }.toMap + .map { case (_, clsbuf) => + clsbuf.split(":") match { + case Array(className, serialization) => + // Jump through a hoop to get scalac happy + def deser[T](cls: Class[T]): ClassSerialization[T] = + (cls, deserialize[Serialization[T]](serialization)) + deser(conf.getClassByName(className)) + case _ => sys.error(s"ill formed bufferables: $clsbuf") + } + } + .toMap } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala index dc9e2f801e..a4b4eab19f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.source @@ -20,9 +20,8 @@ import com.twitter.bijection.Injection import java.io.Serializable /** - * Handles the error checking for Injection inversion - * if check fails, it will throw an unrecoverable exception stopping the job - * TODO: probably belongs in Bijection + * Handles the error checking for Injection inversion if check fails, it will throw an unrecoverable exception + * stopping the job TODO: probably belongs in Bijection */ trait CheckedInversion[T, U] extends Serializable { def injection: Injection[T, U] diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/CodecSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/CodecSource.scala index a4cef7e3a3..23d5c1f887 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/CodecSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/CodecSource.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.source @@ -20,7 +20,7 @@ import cascading.pipe.Pipe import cascading.scheme.Scheme import cascading.scheme.hadoop.WritableSequenceFile import cascading.tuple.Fields -import com.twitter.bijection.{ Bijection, Injection } +import com.twitter.bijection.{Bijection, Injection} import com.twitter.chill.Externalizer import com.twitter.scalding._ @@ -29,8 +29,7 @@ import org.apache.hadoop.io.BytesWritable import scala.collection.JavaConverters._ /** - * Source used to write some type T into a WritableSequenceFile using a codec on T - * for serialization. + * Source used to write some type T into a WritableSequenceFile using a codec on T for serialization. */ object BytesWritableCodec { @@ -46,32 +45,36 @@ object CodecSource { def apply[T](paths: String*)(implicit codec: Injection[T, Array[Byte]]) = new CodecSource[T](paths) } -class CodecSource[T] private (val hdfsPaths: Seq[String], val maxFailures: Int = 0)(implicit @transient injection: Injection[T, Array[Byte]]) - extends FileSource - with Mappable[T] - with LocalTapSource { +class CodecSource[T] private (val hdfsPaths: Seq[String], val maxFailures: Int = 0)(implicit + @transient injection: Injection[T, Array[Byte]] +) extends FileSource + with Mappable[T] + with LocalTapSource { import Dsl._ val fieldSym = 'encodedBytes lazy val field = new Fields(fieldSym.name) - val injectionBox = Externalizer(injection andThen BytesWritableCodec.get) + val injectionBox = Externalizer(injection.andThen(BytesWritableCodec.get)) def localPaths = hdfsPaths override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](TupleConverter.singleConverter[T]) override def hdfsScheme = - HadoopSchemeInstance(new WritableSequenceFile(field, classOf[BytesWritable]).asInstanceOf[Scheme[_, _, _, _, _]]) + HadoopSchemeInstance( + new WritableSequenceFile(field, classOf[BytesWritable]).asInstanceOf[Scheme[_, _, _, _, _]] + ) protected lazy val checkedInversion = new MaxFailuresCheck[T, BytesWritable](maxFailures)(injectionBox.get) override def transformForRead(pipe: Pipe) = - pipe.flatMap((fieldSym) -> (fieldSym)) { (bw: BytesWritable) => checkedInversion(bw) } + pipe.flatMap(fieldSym -> fieldSym)((bw: BytesWritable) => checkedInversion(bw)) override def transformForWrite(pipe: Pipe) = - pipe.mapTo((0) -> (fieldSym)) { injectionBox.get.apply(_: T) } + pipe.mapTo(0 -> fieldSym)(injectionBox.get.apply(_: T)) override def toIterator(implicit config: Config, mode: Mode): Iterator[T] = { val tap = createTap(Read)(mode) - CascadingMode.cast(mode) + CascadingMode + .cast(mode) .openForRead(config, tap) .asScala .flatMap { te => diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/DailySources.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/DailySources.scala index 46015dcc0a..3224f9d6d0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/DailySources.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/DailySources.scala @@ -1,17 +1,14 @@ /** * Copyright 2012 Twitter, Inc. * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. */ package com.twitter.scalding.source @@ -20,51 +17,79 @@ import com.twitter.scalding._ import cascading.tuple.Fields abstract class DailyPrefixSuffixSource(prefixTemplate: String, suffixTemplate: String, dateRange: DateRange) - extends TimePathedSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + suffixTemplate + "/*", dateRange, DateOps.UTC) + extends TimePathedSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + suffixTemplate + "/*", + dateRange, + DateOps.UTC + ) -abstract class DailyPrefixSuffixMostRecentSource(prefixTemplate: String, suffixTemplate: String, dateRange: DateRange) - extends MostRecentGoodSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + suffixTemplate + "/*", dateRange, DateOps.UTC) +abstract class DailyPrefixSuffixMostRecentSource( + prefixTemplate: String, + suffixTemplate: String, + dateRange: DateRange +) extends MostRecentGoodSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + suffixTemplate + "/*", + dateRange, + DateOps.UTC + ) abstract class DailySuffixSource(prefixTemplate: String, dateRange: DateRange) - extends TimePathedSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + "/*", dateRange, DateOps.UTC) + extends TimePathedSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + "/*", dateRange, DateOps.UTC) abstract class DailySuffixMostRecentSource(prefixTemplate: String, dateRange: DateRange) - extends MostRecentGoodSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + "/*", dateRange, DateOps.UTC) + extends MostRecentGoodSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + "/*", + dateRange, + DateOps.UTC + ) object DailySuffixTsv { - def apply(prefix: String, fs: Fields = Fields.ALL)(implicit dateRange: DateRange) = new DailySuffixTsv(prefix, fs) + def apply(prefix: String, fs: Fields = Fields.ALL)(implicit dateRange: DateRange) = + new DailySuffixTsv(prefix, fs) } class DailySuffixTsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends DailySuffixSource(prefix, dateRange) with DelimitedScheme { + extends DailySuffixSource(prefix, dateRange) + with DelimitedScheme { override val fields = fs } object DailySuffixTypedTsv { - def apply[T](prefix: String)(implicit dateRange: DateRange, mf: Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T]) = + def apply[T]( + prefix: String + )(implicit dateRange: DateRange, mf: Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T]) = new DailySuffixTypedTsv[T](prefix) } -class DailySuffixTypedTsv[T](prefix: String)(implicit override val dateRange: DateRange, override val mf: Manifest[T], override val conv: TupleConverter[T], - override val tset: TupleSetter[T]) - extends DailySuffixSource(prefix, dateRange) with TypedDelimited[T] +class DailySuffixTypedTsv[T](prefix: String)(implicit + override val 
dateRange: DateRange, + override val mf: Manifest[T], + override val conv: TupleConverter[T], + override val tset: TupleSetter[T] +) extends DailySuffixSource(prefix, dateRange) + with TypedDelimited[T] object DailySuffixCsv { - def apply(prefix: String, fs: Fields = Fields.ALL)(implicit dateRange: DateRange) = new DailySuffixCsv(prefix, fs) + def apply(prefix: String, fs: Fields = Fields.ALL)(implicit dateRange: DateRange) = + new DailySuffixCsv(prefix, fs) } class DailySuffixCsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends DailySuffixSource(prefix, dateRange) with DelimitedScheme { + extends DailySuffixSource(prefix, dateRange) + with DelimitedScheme { override val fields = fs override val separator = "," } object DailySuffixMostRecentCsv { - def apply(prefix: String, fs: Fields = Fields.ALL)(implicit dateRange: DateRange) = new DailySuffixMostRecentCsv(prefix, fs) + def apply(prefix: String, fs: Fields = Fields.ALL)(implicit dateRange: DateRange) = + new DailySuffixMostRecentCsv(prefix, fs) } -class DailySuffixMostRecentCsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends DailySuffixMostRecentSource(prefix, dateRange) with DelimitedScheme { +class DailySuffixMostRecentCsv(prefix: String, fs: Fields = Fields.ALL)( + override implicit val dateRange: DateRange +) extends DailySuffixMostRecentSource(prefix, dateRange) + with DelimitedScheme { override val fields = fs override val separator = "," } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/HourlySources.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/HourlySources.scala index 4e836952af..06fa819487 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/HourlySources.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/HourlySources.scala @@ -1,17 +1,14 @@ /** * Copyright 2012 Twitter, Inc. * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. 
*/ package com.twitter.scalding.source @@ -19,32 +16,48 @@ package com.twitter.scalding.source import com.twitter.scalding._ abstract class HourlySuffixSource(prefixTemplate: String, dateRange: DateRange) - extends TimePathedSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*", dateRange, DateOps.UTC) + extends TimePathedSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*", + dateRange, + DateOps.UTC + ) abstract class HourlySuffixMostRecentSource(prefixTemplate: String, dateRange: DateRange) - extends MostRecentGoodSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*", dateRange, DateOps.UTC) + extends MostRecentGoodSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*", + dateRange, + DateOps.UTC + ) object HourlySuffixTsv { def apply(prefix: String)(implicit dateRange: DateRange) = new HourlySuffixTsv(prefix) } class HourlySuffixTsv(prefix: String)(override implicit val dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with DelimitedScheme + extends HourlySuffixSource(prefix, dateRange) + with DelimitedScheme object HourlySuffixTypedTsv { - def apply[T](prefix: String)(implicit dateRange: DateRange, mf: Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T]) = + def apply[T]( + prefix: String + )(implicit dateRange: DateRange, mf: Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T]) = new HourlySuffixTypedTsv[T](prefix) } -class HourlySuffixTypedTsv[T](prefix: String)(implicit override val dateRange: DateRange, override val mf: Manifest[T], override val conv: TupleConverter[T], - override val tset: TupleSetter[T]) - extends HourlySuffixSource(prefix, dateRange) with TypedDelimited[T] +class HourlySuffixTypedTsv[T](prefix: String)(implicit + override val dateRange: DateRange, + override val mf: Manifest[T], + override val conv: TupleConverter[T], + override val tset: TupleSetter[T] +) extends HourlySuffixSource(prefix, dateRange) + with TypedDelimited[T] object HourlySuffixCsv { def apply(prefix: String)(implicit dateRange: DateRange) = new HourlySuffixCsv(prefix) } class HourlySuffixCsv(prefix: String)(override implicit val dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with DelimitedScheme { + extends HourlySuffixSource(prefix, dateRange) + with DelimitedScheme { override val separator = "," } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala index 5f602c5c66..90e640f820 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.source @@ -21,20 +21,17 @@ import java.util.concurrent.atomic.AtomicInteger // TODO: this should actually increment an read a Hadoop counter class MaxFailuresCheck[T, U](val maxFailures: Int)(implicit override val injection: Injection[T, U]) - extends CheckedInversion[T, U] { + extends CheckedInversion[T, U] { private val failures = new AtomicInteger(0) - def apply(input: U): Option[T] = { + def apply(input: U): Option[T] = try { Some(injection.invert(input).get) } catch { case e: Exception => // TODO: use proper logging e.printStackTrace() - assert( - failures.incrementAndGet <= maxFailures, - "maximum decoding errors exceeded") + assert(failures.incrementAndGet <= maxFailures, "maximum decoding errors exceeded") None } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/NullSink.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/NullSink.scala index 44c5f94d30..4cde1a85fe 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/NullSink.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/NullSink.scala @@ -1,12 +1,11 @@ package com.twitter.scalding.source import com.twitter.scalding.typed.TypedSink -import com.twitter.scalding.{ BaseNullSource, TupleSetter } +import com.twitter.scalding.{BaseNullSource, TupleSetter} /** - * This can be used to cause cascading to run a flow, but discard - * the output. The only place this is likely of use is to do some (non-recommended, - * but sometimes the most expediant way to accomplish some task). + * This can be used to cause cascading to run a flow, but discard the output. The only place this is likely of + * use is to do some (non-recommended, but sometimes the most expediant way to accomplish some task). */ object NullSink extends BaseNullSource with TypedSink[Any] { def setter[U <: Any] = TupleSetter.asSubSetter[Any, U](TupleSetter.singleSetter) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/TypedSequenceFile.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/TypedSequenceFile.scala index a31eb1bc82..f635244f5e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/TypedSequenceFile.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/TypedSequenceFile.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.source @@ -21,20 +21,23 @@ import com.twitter.scalding._ import com.twitter.scalding.SequenceFile /** - * SequenceFile with explicit types. Useful for debugging flows using the Typed API. - * Not to be used for permanent storage: uses Kryo serialization which may not be - * consistent across JVM instances. Use Thrift sources instead. + * SequenceFile with explicit types. Useful for debugging flows using the Typed API. Not to be used for + * permanent storage: uses Kryo serialization which may not be consistent across JVM instances. Use Thrift + * sources instead. 
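// A minimal usage sketch of the debug-tap pattern the scaladoc above describes.
// The job, the pipe contents and the /tmp path are illustrative assumptions; only
// TypedSequenceFile itself is the API in question here.
import com.twitter.scalding._
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.source.TypedSequenceFile

class DebugSnapshotJob(args: Args) extends Job(args) {
  val counts: TypedPipe[(String, Long)] =
    TypedPipe.from(TypedTsv[(String, Long)](args("input")))

  // Dump an intermediate result for inspection. Values are Kryo-serialized,
  // so read the snapshot back only with the same code version; do not treat
  // it as long-term storage.
  counts.write(new TypedSequenceFile[(String, Long)]("/tmp/debug/counts"))
}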
*/ -class TypedSequenceFile[T](val path: String) extends SequenceFile(path, Fields.FIRST) with Mappable[T] with TypedSink[T] { +class TypedSequenceFile[T](val path: String) + extends SequenceFile(path, Fields.FIRST) + with Mappable[T] + with TypedSink[T] { override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](TupleConverter.singleConverter[T]) override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) override def toString: String = "TypedSequenceFile(%s)".format(path) override def equals(that: Any): Boolean = that match { - case null => false + case null => false case t: TypedSequenceFile[_] => t.p == p // horribly named fields in the SequenceFile case class - case _ => false + case _ => false } override def hashCode = path.hashCode } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/TypedText.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/TypedText.scala index 8f5e2da6c4..fb091f15b5 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/TypedText.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/TypedText.scala @@ -1,14 +1,14 @@ package com.twitter.scalding.source import cascading.scheme.Scheme -import cascading.scheme.hadoop.{ TextDelimited => CHTextDelimited } -import cascading.scheme.local.{ TextDelimited => CLTextDelimited } +import cascading.scheme.hadoop.{TextDelimited => CHTextDelimited} +import cascading.scheme.local.{TextDelimited => CLTextDelimited} import com.twitter.scalding._ import com.twitter.scalding.typed.TypedSink /** - * This object gives you easy access to text formats (possibly LZO compressed) by - * using a case class to describe the field names and types. + * This object gives you easy access to text formats (possibly LZO compressed) by using a case class to + * describe the field names and types. 
*/ case class TypedSep(str: String) extends AnyVal @@ -28,51 +28,65 @@ object TypedText { /** * Prefix might be "/logs/awesome" */ - private def hourly[T](sep: TypedSep, prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + private def hourly[T](sep: TypedSep, prefix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = { require(prefix.last != '/', "prefix should not include trailing /") new TimePathTypedText[T](sep, prefix + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*") } - def hourlyTsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + def hourlyTsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = hourly(TAB, prefix) - def hourlyOsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + def hourlyOsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = hourly(ONE, prefix) - def hourlyCsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + def hourlyCsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = hourly(COMMA, prefix) - private def daily[T]( - sep: TypedSep, prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + private def daily[T](sep: TypedSep, prefix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = { require(prefix.last != '/', "prefix should not include trailing /") new TimePathTypedText[T](sep, prefix + TimePathedSource.YEAR_MONTH_DAY + "/*") } - def dailyTsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + def dailyTsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = daily(TAB, prefix) - def dailyOsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + def dailyOsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = daily(ONE, prefix) - def dailyCsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + def dailyCsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = daily(COMMA, prefix) - private def dailyPrefixSuffix[T]( - sep: TypedSep, - prefix: String, - suffix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + private def dailyPrefixSuffix[T](sep: TypedSep, prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = { require(prefix.last != '/', "prefix should not include trailing /") require(suffix.head == '/', "suffix should include a preceding /") new TimePathTypedText[T](sep, prefix + TimePathedSource.YEAR_MONTH_DAY + suffix + "/*") } - def dailyPrefixSuffixTsv[T](prefix: String, suffix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + def dailyPrefixSuffixTsv[T](prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = dailyPrefixSuffix(TAB, prefix, suffix) - def dailyPrefixSuffixOsv[T](prefix: String, suffix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + def dailyPrefixSuffixOsv[T](prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = dailyPrefixSuffix(ONE, prefix, suffix) - def 
dailyPrefixSuffixCsv[T](prefix: String, suffix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + def dailyPrefixSuffixCsv[T](prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = dailyPrefixSuffix(COMMA, prefix, suffix) } @@ -99,30 +113,50 @@ trait TypedTextDelimited[T] extends SchemedSource with Mappable[T] with TypedSin override def sourceFields = typeDescriptor.fields override def localScheme = - new CLTextDelimited(typeDescriptor.fields, false, false, separator.str, strict, null /* quote */ , - typeDescriptor.fields.getTypesClasses, safe) + new CLTextDelimited( + typeDescriptor.fields, + false, + false, + separator.str, + strict, + null /* quote */, + typeDescriptor.fields.getTypesClasses, + safe + ) override def hdfsScheme = - HadoopSchemeInstance(new CHTextDelimited(typeDescriptor.fields, null /* compression */ , false, false, - separator.str, strict, null /* quote */ , - typeDescriptor.fields.getTypesClasses, safe).asInstanceOf[Scheme[_, _, _, _, _]]) + HadoopSchemeInstance( + new CHTextDelimited( + typeDescriptor.fields, + null /* compression */, + false, + false, + separator.str, + strict, + null /* quote */, + typeDescriptor.fields.getTypesClasses, + safe + ).asInstanceOf[Scheme[_, _, _, _, _]] + ) } class TimePathTypedText[T](sep: TypedSep, path: String)(implicit dr: DateRange, td: TypeDescriptor[T]) - extends TimePathedSource(path, dr, DateOps.UTC) with TypedTextDelimited[T] { + extends TimePathedSource(path, dr, DateOps.UTC) + with TypedTextDelimited[T] { override def typeDescriptor = td protected override def separator = sep } class MostRecentTypedText[T](sep: TypedSep, path: String)(implicit dr: DateRange, td: TypeDescriptor[T]) - extends MostRecentGoodSource(path, dr, DateOps.UTC) with TypedTextDelimited[T] { + extends MostRecentGoodSource(path, dr, DateOps.UTC) + with TypedTextDelimited[T] { override def typeDescriptor = td protected override def separator = sep } class FixedTypedText[T](sep: TypedSep, path: String*)(implicit td: TypeDescriptor[T]) - extends FixedPathSource(path: _*) with TypedTextDelimited[T] { + extends FixedPathSource(path: _*) + with TypedTextDelimited[T] { override def typeDescriptor = td protected override def separator = sep } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/BijectedSourceSink.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/BijectedSourceSink.scala index 4bbe1b3807..2794a32f22 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/BijectedSourceSink.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/BijectedSourceSink.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.typed import cascading.flow.FlowDef @@ -24,17 +24,22 @@ import serialization.Externalizer object BijectedSourceSink { type SourceSink[T] = TypedSource[T] with TypedSink[T] - def apply[T, U](parent: SourceSink[T])(implicit transformer: ImplicitBijection[T, U]): BijectedSourceSink[T, U] = + def apply[T, U](parent: SourceSink[T])(implicit + transformer: ImplicitBijection[T, U] + ): BijectedSourceSink[T, U] = new BijectedSourceSink(parent)(transformer) } -class BijectedSourceSink[T, U](parent: BijectedSourceSink.SourceSink[T])(implicit @transient transformer: ImplicitBijection[T, U]) extends TypedSource[U] with TypedSink[U] { +class BijectedSourceSink[T, U](parent: BijectedSourceSink.SourceSink[T])(implicit + @transient transformer: ImplicitBijection[T, U] +) extends TypedSource[U] + with TypedSink[U] { val lockedBij = Externalizer(transformer) def setter[V <: U] = parent.setter.contraMap(lockedBij.get.invert(_)) - override def converter[W >: U] = parent.converter.andThen{ t: T => lockedBij.get(t) }: TupleConverter[W] + override def converter[W >: U] = parent.converter.andThen { t: T => lockedBij.get(t) }: TupleConverter[W] override def read(implicit flowDef: FlowDef, mode: Mode): Pipe = parent.read override def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode) = parent.writeFrom(pipe) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedFlattenGroup.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedFlattenGroup.scala index 458c100dce..ea52d42fc4 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedFlattenGroup.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedFlattenGroup.scala @@ -2,9 +2,8 @@ package com.twitter.scalding.typed /** - * Autogenerated methods for flattening the nested value tuples that result after - * joining many pipes together. These methods can be used directly, or via the - * the joins available in MultiJoin. + * Autogenerated methods for flattening the nested value tuples that result after joining many pipes together. + * These methods can be used directly, or via the the joins available in MultiJoin. 
*/ object FlattenGroup { val pairOfNones = (None, None) @@ -16,153 +15,250 @@ object FlattenGroup { (a, b, c) } - class FlattenLeftJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C](nested: KLL[KEY, ((A, B), C)]) { - def flattenValueTuple: KLL[KEY, (A, B, C)] = nested.mapValues { tup => FlattenGroup.flattenNestedTuple(tup) } + class FlattenLeftJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C]( + nested: KLL[KEY, ((A, B), C)] + ) { + def flattenValueTuple: KLL[KEY, (A, B, C)] = nested.mapValues { tup => + FlattenGroup.flattenNestedTuple(tup) + } } - implicit def toFlattenLeftJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C](nested: KLL[KEY, ((A, B), C)]): FlattenGroup.FlattenLeftJoin3[KEY, KLL, A, B, C] = new FlattenLeftJoin3(nested) + implicit def toFlattenLeftJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C]( + nested: KLL[KEY, ((A, B), C)] + ): FlattenGroup.FlattenLeftJoin3[KEY, KLL, A, B, C] = new FlattenLeftJoin3(nested) def flattenNestedTuple[A, B, C, D](nested: (((A, B), C), D)): (A, B, C, D) = { val (((a, b), c), d) = nested (a, b, c, d) } - class FlattenLeftJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D](nested: KLL[KEY, (((A, B), C), D)]) { - def flattenValueTuple: KLL[KEY, (A, B, C, D)] = nested.mapValues { tup => FlattenGroup.flattenNestedTuple(tup) } + class FlattenLeftJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D]( + nested: KLL[KEY, (((A, B), C), D)] + ) { + def flattenValueTuple: KLL[KEY, (A, B, C, D)] = nested.mapValues { tup => + FlattenGroup.flattenNestedTuple(tup) + } } - implicit def toFlattenLeftJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D](nested: KLL[KEY, (((A, B), C), D)]): FlattenGroup.FlattenLeftJoin4[KEY, KLL, A, B, C, D] = new FlattenLeftJoin4(nested) + implicit def toFlattenLeftJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D]( + nested: KLL[KEY, (((A, B), C), D)] + ): FlattenGroup.FlattenLeftJoin4[KEY, KLL, A, B, C, D] = new FlattenLeftJoin4(nested) def flattenNestedTuple[A, B, C, D, E](nested: ((((A, B), C), D), E)): (A, B, C, D, E) = { val ((((a, b), c), d), e) = nested (a, b, c, d, e) } - class FlattenLeftJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E](nested: KLL[KEY, ((((A, B), C), D), E)]) { - def flattenValueTuple: KLL[KEY, (A, B, C, D, E)] = nested.mapValues { tup => FlattenGroup.flattenNestedTuple(tup) } + class FlattenLeftJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E]( + nested: KLL[KEY, ((((A, B), C), D), E)] + ) { + def flattenValueTuple: KLL[KEY, (A, B, C, D, E)] = nested.mapValues { tup => + FlattenGroup.flattenNestedTuple(tup) + } } - implicit def toFlattenLeftJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E](nested: KLL[KEY, ((((A, B), C), D), E)]): FlattenGroup.FlattenLeftJoin5[KEY, KLL, A, B, C, D, E] = new FlattenLeftJoin5(nested) + implicit def toFlattenLeftJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E]( + nested: KLL[KEY, ((((A, B), C), D), E)] + ): FlattenGroup.FlattenLeftJoin5[KEY, KLL, A, B, C, D, E] = new FlattenLeftJoin5(nested) def flattenNestedTuple[A, B, C, D, E, F](nested: (((((A, B), C), D), E), F)): (A, B, C, D, E, F) = { val (((((a, b), c), d), e), f) = nested (a, b, c, d, e, f) } - class FlattenLeftJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, 
C, D, E, F](nested: KLL[KEY, (((((A, B), C), D), E), F)]) { - def flattenValueTuple: KLL[KEY, (A, B, C, D, E, F)] = nested.mapValues { tup => FlattenGroup.flattenNestedTuple(tup) } + class FlattenLeftJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E, F]( + nested: KLL[KEY, (((((A, B), C), D), E), F)] + ) { + def flattenValueTuple: KLL[KEY, (A, B, C, D, E, F)] = nested.mapValues { tup => + FlattenGroup.flattenNestedTuple(tup) + } } - implicit def toFlattenLeftJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E, F](nested: KLL[KEY, (((((A, B), C), D), E), F)]): FlattenGroup.FlattenLeftJoin6[KEY, KLL, A, B, C, D, E, F] = new FlattenLeftJoin6(nested) + implicit def toFlattenLeftJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[ + KLL_K, + KLL_V, + KLL + ], A, B, C, D, E, F]( + nested: KLL[KEY, (((((A, B), C), D), E), F)] + ): FlattenGroup.FlattenLeftJoin6[KEY, KLL, A, B, C, D, E, F] = new FlattenLeftJoin6(nested) - def flattenNestedTuple[A, B, C, D, E, F, G](nested: ((((((A, B), C), D), E), F), G)): (A, B, C, D, E, F, G) = { + def flattenNestedTuple[A, B, C, D, E, F, G]( + nested: ((((((A, B), C), D), E), F), G) + ): (A, B, C, D, E, F, G) = { val ((((((a, b), c), d), e), f), g) = nested (a, b, c, d, e, f, g) } - def flattenNestedTuple[A, B, C, D, E, F, G, H](nested: (((((((A, B), C), D), E), F), G), H)): (A, B, C, D, E, F, G, H) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H]( + nested: (((((((A, B), C), D), E), F), G), H) + ): (A, B, C, D, E, F, G, H) = { val (((((((a, b), c), d), e), f), g), h) = nested (a, b, c, d, e, f, g, h) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I](nested: ((((((((A, B), C), D), E), F), G), H), I)): (A, B, C, D, E, F, G, H, I) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I]( + nested: ((((((((A, B), C), D), E), F), G), H), I) + ): (A, B, C, D, E, F, G, H, I) = { val ((((((((a, b), c), d), e), f), g), h), i) = nested (a, b, c, d, e, f, g, h, i) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J](nested: (((((((((A, B), C), D), E), F), G), H), I), J)): (A, B, C, D, E, F, G, H, I, J) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J]( + nested: (((((((((A, B), C), D), E), F), G), H), I), J) + ): (A, B, C, D, E, F, G, H, I, J) = { val (((((((((a, b), c), d), e), f), g), h), i), j) = nested (a, b, c, d, e, f, g, h, i, j) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K](nested: ((((((((((A, B), C), D), E), F), G), H), I), J), K)): (A, B, C, D, E, F, G, H, I, J, K) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K]( + nested: ((((((((((A, B), C), D), E), F), G), H), I), J), K) + ): (A, B, C, D, E, F, G, H, I, J, K) = { val ((((((((((a, b), c), d), e), f), g), h), i), j), k) = nested (a, b, c, d, e, f, g, h, i, j, k) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L](nested: (((((((((((A, B), C), D), E), F), G), H), I), J), K), L)): (A, B, C, D, E, F, G, H, I, J, K, L) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L]( + nested: (((((((((((A, B), C), D), E), F), G), H), I), J), K), L) + ): (A, B, C, D, E, F, G, H, I, J, K, L) = { val (((((((((((a, b), c), d), e), f), g), h), i), j), k), l) = nested (a, b, c, d, e, f, g, h, i, j, k, l) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M](nested: ((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M)): (A, B, C, D, E, F, G, H, I, J, K, L, M) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M]( + nested: ((((((((((((A, B), C), D), E), F), G), H), I), J), 
K), L), M) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M) = { val ((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m) = nested (a, b, c, d, e, f, g, h, i, j, k, l, m) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N](nested: (((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N)): (A, B, C, D, E, F, G, H, I, J, K, L, M, N) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + nested: (((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N) = { val (((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n) = nested (a, b, c, d, e, f, g, h, i, j, k, l, m, n) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O](nested: ((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O)): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + nested: ((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) = { val ((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o) = nested (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](nested: (((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P)): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + nested: (((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) = { val (((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p) = nested (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](nested: ((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q)): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + nested: ((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q) = { val ((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q) = nested (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](nested: (((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R)): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + nested: (((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R) = { val (((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r) = nested (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](nested: ((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S)): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S) = { + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + nested: ((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S) = { val 
((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s) = nested (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](nested: (((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S), T)): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T) = { - val (((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s), t) = nested + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + nested: (((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S), T) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T) = { + val (((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s), t) = + nested (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](nested: ((((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S), T), U)): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U) = { - val ((((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s), t), u) = nested + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + nested: ( + (((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S), T), + U + ) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U) = { + val ( + (((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s), t), + u + ) = nested (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u) } - def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](nested: (((((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S), T), U), V)): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V) = { - val (((((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s), t), u), v) = nested + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + nested: ( + ( + ( + ((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S), + T + ), + U + ), + V + ) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V) = { + val ( + ((((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s), t), u), + v + ) = nested (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v) } // methods for flattening results of outerJoin - def flattenNestedOptionTuple[A, B, C](nested: (Option[(Option[A], Option[B])], Option[C])): (Option[A], Option[B], Option[C]) = { + def flattenNestedOptionTuple[A, B, C]( + nested: (Option[(Option[A], Option[B])], Option[C]) + ): (Option[A], Option[B], Option[C]) = { val (rest1, c) = nested val (a, b) = rest1.getOrElse(pairOfNones) (a, b, c) } - class FlattenOuterJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C](nested: KLL[KEY, (Option[(Option[A], Option[B])], Option[C])]) { - def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C])] = nested.mapValues { tup => FlattenGroup.flattenNestedOptionTuple(tup) } + class FlattenOuterJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C]( + nested: KLL[KEY, 
(Option[(Option[A], Option[B])], Option[C])] + ) { + def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C])] = nested.mapValues { tup => + FlattenGroup.flattenNestedOptionTuple(tup) + } } - implicit def toFlattenOuterJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C](nested: KLL[KEY, (Option[(Option[A], Option[B])], Option[C])]): FlattenGroup.FlattenOuterJoin3[KEY, KLL, A, B, C] = new FlattenOuterJoin3(nested) + implicit def toFlattenOuterJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C]( + nested: KLL[KEY, (Option[(Option[A], Option[B])], Option[C])] + ): FlattenGroup.FlattenOuterJoin3[KEY, KLL, A, B, C] = new FlattenOuterJoin3(nested) - def flattenNestedOptionTuple[A, B, C, D](nested: (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])): (Option[A], Option[B], Option[C], Option[D]) = { + def flattenNestedOptionTuple[A, B, C, D]( + nested: (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D]) + ): (Option[A], Option[B], Option[C], Option[D]) = { val (rest1, d) = nested val (rest2, c) = rest1.getOrElse(pairOfNones) val (a, b) = rest2.getOrElse(pairOfNones) (a, b, c, d) } - class FlattenOuterJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D](nested: KLL[KEY, (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])]) { - def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C], Option[D])] = nested.mapValues { tup => FlattenGroup.flattenNestedOptionTuple(tup) } + class FlattenOuterJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D]( + nested: KLL[KEY, (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])] + ) { + def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C], Option[D])] = nested.mapValues { tup => + FlattenGroup.flattenNestedOptionTuple(tup) + } } - implicit def toFlattenOuterJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D](nested: KLL[KEY, (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])]): FlattenGroup.FlattenOuterJoin4[KEY, KLL, A, B, C, D] = new FlattenOuterJoin4(nested) + implicit def toFlattenOuterJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D]( + nested: KLL[KEY, (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])] + ): FlattenGroup.FlattenOuterJoin4[KEY, KLL, A, B, C, D] = new FlattenOuterJoin4(nested) - def flattenNestedOptionTuple[A, B, C, D, E](nested: (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])): (Option[A], Option[B], Option[C], Option[D], Option[E]) = { + def flattenNestedOptionTuple[A, B, C, D, E]( + nested: (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E]) + ): (Option[A], Option[B], Option[C], Option[D], Option[E]) = { val (rest1, e) = nested val (rest2, d) = rest1.getOrElse(pairOfNones) val (rest3, c) = rest2.getOrElse(pairOfNones) @@ -170,13 +266,27 @@ object FlattenGroup { (a, b, c, d, e) } - class FlattenOuterJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E](nested: KLL[KEY, (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])]) { - def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E])] = nested.mapValues { tup => FlattenGroup.flattenNestedOptionTuple(tup) } + class FlattenOuterJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E]( + nested: KLL[KEY, (Option[(Option[(Option[(Option[A], 
Option[B])], Option[C])], Option[D])], Option[E])] + ) { + def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E])] = + nested.mapValues(tup => FlattenGroup.flattenNestedOptionTuple(tup)) } - implicit def toFlattenOuterJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E](nested: KLL[KEY, (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])]): FlattenGroup.FlattenOuterJoin5[KEY, KLL, A, B, C, D, E] = new FlattenOuterJoin5(nested) + implicit def toFlattenOuterJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[ + KLL_K, + KLL_V, + KLL + ], A, B, C, D, E]( + nested: KLL[KEY, (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])] + ): FlattenGroup.FlattenOuterJoin5[KEY, KLL, A, B, C, D, E] = new FlattenOuterJoin5(nested) - def flattenNestedOptionTuple[A, B, C, D, E, F](nested: (Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F]( + nested: ( + Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], + Option[F] + ) + ): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F]) = { val (rest1, f) = nested val (rest2, e) = rest1.getOrElse(pairOfNones) val (rest3, d) = rest2.getOrElse(pairOfNones) @@ -185,13 +295,44 @@ object FlattenGroup { (a, b, c, d, e, f) } - class FlattenOuterJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E, F](nested: KLL[KEY, (Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])]) { - def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F])] = nested.mapValues { tup => FlattenGroup.flattenNestedOptionTuple(tup) } + class FlattenOuterJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E, F]( + nested: KLL[ + KEY, + ( + Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], + Option[F] + ) + ] + ) { + def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F])] = + nested.mapValues(tup => FlattenGroup.flattenNestedOptionTuple(tup)) } - implicit def toFlattenOuterJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E, F](nested: KLL[KEY, (Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])]): FlattenGroup.FlattenOuterJoin6[KEY, KLL, A, B, C, D, E, F] = new FlattenOuterJoin6(nested) + implicit def toFlattenOuterJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[ + KLL_K, + KLL_V, + KLL + ], A, B, C, D, E, F]( + nested: KLL[ + KEY, + ( + Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], + Option[F] + ) + ] + ): FlattenGroup.FlattenOuterJoin6[KEY, KLL, A, B, C, D, E, F] = new FlattenOuterJoin6(nested) - def flattenNestedOptionTuple[A, B, C, D, E, F, G](nested: (Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G]( + nested: ( + Option[ + ( + Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], + Option[F] + ) + ], + Option[G] + ) + ): (Option[A], 
Option[B], Option[C], Option[D], Option[E], Option[F], Option[G]) = { val (rest1, g) = nested val (rest2, f) = rest1.getOrElse(pairOfNones) val (rest3, e) = rest2.getOrElse(pairOfNones) @@ -201,7 +342,24 @@ object FlattenGroup { (a, b, c, d, e, f, g) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E]) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H]) = { val (rest1, h) = nested val (rest2, g) = rest1.getOrElse(pairOfNones) val (rest3, f) = rest2.getOrElse(pairOfNones) @@ -212,7 +370,32 @@ object FlattenGroup { (a, b, c, d, e, f, g, h) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I]) = { val (rest1, i) = nested val (rest2, h) = rest1.getOrElse(pairOfNones) val (rest3, g) = rest2.getOrElse(pairOfNones) @@ -224,7 +407,50 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D]) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J] + ) = { val (rest1, j) = nested val (rest2, i) = rest1.getOrElse(pairOfNones) val (rest3, h) = rest2.getOrElse(pairOfNones) @@ -237,7 +463,59 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K]) = { + def flattenNestedOptionTuple[A, B, 
C, D, E, F, G, H, I, J, K]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[A], Option[B])], Option[C])], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K] + ) = { val (rest1, k) = nested val (rest2, j) = rest1.getOrElse(pairOfNones) val (rest3, i) = rest2.getOrElse(pairOfNones) @@ -251,7 +529,65 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[A], Option[B])], Option[C])], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L] + ) = { val (rest1, l) = nested val (rest2, k) = rest1.getOrElse(pairOfNones) val (rest3, j) = rest2.getOrElse(pairOfNones) @@ -266,7 +602,73 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])], Option[M])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + (Option[(Option[A], Option[B])], Option[C]) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M] + ) = { val (rest1, m) = nested val (rest2, l) = rest1.getOrElse(pairOfNones) val (rest3, k) = rest2.getOrElse(pairOfNones) @@ -282,7 +684,82 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l, m) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], 
Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])], Option[M])], Option[N])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[A], Option[B])], + Option[C] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N] + ) = { val (rest1, n) = nested val (rest2, m) = rest1.getOrElse(pairOfNones) val (rest3, l) = rest2.getOrElse(pairOfNones) @@ -299,7 +776,90 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l, m, n) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])], Option[M])], Option[N])], Option[O])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + (Option[A], Option[B]) + ], + Option[C] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O] + ) = { val (rest1, o) = nested val (rest2, n) = rest1.getOrElse(pairOfNones) val (rest3, m) = rest2.getOrElse(pairOfNones) @@ -317,7 +877,99 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])], Option[M])], Option[N])], Option[O])], Option[P])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + 
Option[ + ( + Option[ + ( + Option[ + ( + Option[A], + Option[B] + ) + ], + Option[C] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P] + ) = { val (rest1, p) = nested val (rest2, o) = rest1.getOrElse(pairOfNones) val (rest3, n) = rest2.getOrElse(pairOfNones) @@ -336,7 +988,105 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])], Option[M])], Option[N])], Option[O])], Option[P])], Option[Q])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[A], + Option[B] + ) + ], + Option[C] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q] + ) = { val (rest1, q) = nested val (rest2, p) = rest1.getOrElse(pairOfNones) val (rest3, o) = rest2.getOrElse(pairOfNones) @@ -356,7 +1106,115 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])], Option[M])], Option[N])], Option[O])], Option[P])], Option[Q])], Option[R])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + A + ], + Option[ + B + ] + ) + ], + Option[C] 
+ ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R] + ) = { val (rest1, r) = nested val (rest2, q) = rest1.getOrElse(pairOfNones) val (rest3, p) = rest2.getOrElse(pairOfNones) @@ -377,7 +1235,119 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])], Option[M])], Option[N])], Option[O])], Option[P])], Option[Q])], Option[R])], Option[S])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[A], + Option[B] + ) + ], + Option[ + C + ] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ], + Option[S] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S] + ) = { val (rest1, s) = nested val (rest2, r) = rest1.getOrElse(pairOfNones) val (rest3, q) = rest2.getOrElse(pairOfNones) @@ -399,7 +1369,120 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])], Option[M])], Option[N])], Option[O])], Option[P])], Option[Q])], Option[R])], Option[S])], Option[T])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S], Option[T]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + nested: ( + Option[ + ( + Option[ + ( + 
Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[A], Option[B])], + Option[C] + ) + ], + Option[ + D + ] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ], + Option[S] + ) + ], + Option[T] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T] + ) = { val (rest1, t) = nested val (rest2, s) = rest1.getOrElse(pairOfNones) val (rest3, r) = rest2.getOrElse(pairOfNones) @@ -422,7 +1505,121 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], Option[J])], Option[K])], Option[L])], Option[M])], Option[N])], Option[O])], Option[P])], Option[Q])], Option[R])], Option[S])], Option[T])], Option[U])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S], Option[T], Option[U]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[A], Option[B])], Option[C])], + Option[D] + ) + ], + Option[ + E + ] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ], + Option[S] + ) + ], + Option[T] + ) + ], + Option[U] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U] + ) = { val (rest1, u) = nested val (rest2, t) = rest1.getOrElse(pairOfNones) val (rest3, s) = rest2.getOrElse(pairOfNones) @@ -446,7 +1643,127 @@ object FlattenGroup { (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u) } - def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](nested: (Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], Option[F])], Option[G])], Option[H])], Option[I])], 
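[Editorial aside, not part of this patch: the flattenNestedOptionTuple overloads being reformatted here exist because chained outer joins build up left-nested pairs of Options, and each arity-N overload unrolls one more level. A minimal plain-Scala sketch of the same unrolling pattern, specialized to three columns; the object and helper names are illustrative only.]

    object FlattenSketch {
      private val pairOfNones = (None, None)

      // same shape as the generated helpers: peel one Option layer per column
      def flatten3[A, B, C](
          nested: (Option[(Option[A], Option[B])], Option[C])
      ): (Option[A], Option[B], Option[C]) = {
        val (rest1, c) = nested
        val (a, b) = rest1.getOrElse(pairOfNones)
        (a, b, c)
      }

      def main(args: Array[String]): Unit = {
        val nested: (Option[(Option[Int], Option[String])], Option[Boolean]) =
          (Some((Some(1), None)), Some(true))
        println(flatten3(nested)) // prints (Some(1),None,Some(true))
      }
    }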
Option[J])], Option[K])], Option[L])], Option[M])], Option[N])], Option[O])], Option[P])], Option[Q])], Option[R])], Option[S])], Option[T])], Option[U])], Option[V])): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S], Option[T], Option[U], Option[V]) = { + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[A], Option[B])], Option[C])], + Option[D] + ) + ], + Option[E] + ) + ], + Option[ + F + ] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ], + Option[S] + ) + ], + Option[T] + ) + ], + Option[U] + ) + ], + Option[V] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U], + Option[V] + ) = { val (rest1, v) = nested val (rest2, u) = rest1.getOrElse(pairOfNones) val (rest3, t) = rest2.getOrElse(pairOfNones) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala index 5df81dbdb7..dc5cb89924 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala @@ -14,79 +14,124 @@ trait TypedSource3[A, B, C] extends TypedSource[Tuple3[A, B, C]] { } trait TypedSource4[A, B, C, D] extends TypedSource[Tuple4[A, B, C, D]] { - def converter[Z >: Tuple4[A, B, C, D]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple4[A, B, C, D]]) + def converter[Z >: Tuple4[A, B, C, D]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple4[A, B, C, D]]) } trait TypedSource5[A, B, C, D, E] extends TypedSource[Tuple5[A, B, C, D, E]] { - def converter[Z >: Tuple5[A, B, C, D, E]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple5[A, B, C, D, E]]) + def converter[Z >: Tuple5[A, B, C, D, E]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple5[A, B, C, D, E]]) } trait TypedSource6[A, B, C, D, E, F] extends TypedSource[Tuple6[A, B, C, D, E, F]] { - def converter[Z >: Tuple6[A, B, C, D, E, F]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple6[A, B, C, D, E, F]]) + def converter[Z >: Tuple6[A, B, C, D, E, F]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple6[A, B, C, D, E, F]]) } trait TypedSource7[A, B, C, D, E, F, G] extends TypedSource[Tuple7[A, B, C, D, E, F, G]] { - def converter[Z >: Tuple7[A, B, C, D, E, F, G]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple7[A, B, C, D, E, F, G]]) + def converter[Z >: Tuple7[A, B, C, D, E, F, G]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple7[A, B, C, D, E, F, G]]) } trait TypedSource8[A, B, C, D, E, F, G, H] extends TypedSource[Tuple8[A, B, C, D, E, F, G, 
H]] { - def converter[Z >: Tuple8[A, B, C, D, E, F, G, H]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple8[A, B, C, D, E, F, G, H]]) + def converter[Z >: Tuple8[A, B, C, D, E, F, G, H]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple8[A, B, C, D, E, F, G, H]]) } trait TypedSource9[A, B, C, D, E, F, G, H, I] extends TypedSource[Tuple9[A, B, C, D, E, F, G, H, I]] { - def converter[Z >: Tuple9[A, B, C, D, E, F, G, H, I]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple9[A, B, C, D, E, F, G, H, I]]) + def converter[Z >: Tuple9[A, B, C, D, E, F, G, H, I]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple9[A, B, C, D, E, F, G, H, I]]) } trait TypedSource10[A, B, C, D, E, F, G, H, I, J] extends TypedSource[Tuple10[A, B, C, D, E, F, G, H, I, J]] { - def converter[Z >: Tuple10[A, B, C, D, E, F, G, H, I, J]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple10[A, B, C, D, E, F, G, H, I, J]]) + def converter[Z >: Tuple10[A, B, C, D, E, F, G, H, I, J]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple10[A, B, C, D, E, F, G, H, I, J]]) } -trait TypedSource11[A, B, C, D, E, F, G, H, I, J, K] extends TypedSource[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { - def converter[Z >: Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple11[A, B, C, D, E, F, G, H, I, J, K]]) +trait TypedSource11[A, B, C, D, E, F, G, H, I, J, K] + extends TypedSource[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { + def converter[Z >: Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple11[A, B, C, D, E, F, G, H, I, J, K]]) } -trait TypedSource12[A, B, C, D, E, F, G, H, I, J, K, L] extends TypedSource[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { - def converter[Z >: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]]) +trait TypedSource12[A, B, C, D, E, F, G, H, I, J, K, L] + extends TypedSource[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { + def converter[Z >: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]]) } -trait TypedSource13[A, B, C, D, E, F, G, H, I, J, K, L, M] extends TypedSource[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { - def converter[Z >: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]]) +trait TypedSource13[A, B, C, D, E, F, G, H, I, J, K, L, M] + extends TypedSource[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { + def converter[Z >: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]]) } -trait TypedSource14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] extends TypedSource[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { - def converter[Z >: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]]) +trait TypedSource14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + extends TypedSource[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { + def converter[Z >: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]]) } -trait TypedSource15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] extends 
TypedSource[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { - def converter[Z >: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]]) +trait TypedSource15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + extends TypedSource[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { + def converter[Z >: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]]) } -trait TypedSource16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] extends TypedSource[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { - def converter[Z >: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]]) +trait TypedSource16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + extends TypedSource[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { + def converter[Z >: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] + ) } -trait TypedSource17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] extends TypedSource[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { - def converter[Z >: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]]) +trait TypedSource17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + extends TypedSource[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { + def converter[Z >: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] + ) } -trait TypedSource18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] extends TypedSource[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { - def converter[Z >: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]]) +trait TypedSource18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + extends TypedSource[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { + def converter[Z >: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] + ) } -trait TypedSource19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] extends TypedSource[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { - def converter[Z >: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]]) +trait TypedSource19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + extends TypedSource[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { + def converter[Z >: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] + ) } -trait TypedSource20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] extends 
TypedSource[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { - def converter[Z >: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]]) +trait TypedSource20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + extends TypedSource[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { + def converter[Z >: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] + ) } -trait TypedSource21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] extends TypedSource[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { - def converter[Z >: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]]) +trait TypedSource21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + extends TypedSource[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { + def converter[Z >: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] + ) } -trait TypedSource22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] extends TypedSource[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { - def converter[Z >: Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]]) +trait TypedSource22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + extends TypedSource[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { + def converter[Z >: Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] + ) } trait TypedSink1[A] extends TypedSink[Tuple1[A]] { @@ -106,75 +151,111 @@ trait TypedSink4[A, B, C, D] extends TypedSink[Tuple4[A, B, C, D]] { } trait TypedSink5[A, B, C, D, E] extends TypedSink[Tuple5[A, B, C, D, E]] { - final def setter[Z <: Tuple5[A, B, C, D, E]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple5[A, B, C, D, E]]) + final def setter[Z <: Tuple5[A, B, C, D, E]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple5[A, B, C, D, E]]) } trait TypedSink6[A, B, C, D, E, F] extends TypedSink[Tuple6[A, B, C, D, E, F]] { - final def setter[Z <: Tuple6[A, B, C, D, E, F]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple6[A, B, C, D, E, F]]) + final def setter[Z <: Tuple6[A, B, C, D, E, F]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple6[A, B, C, D, E, F]]) } trait TypedSink7[A, B, C, D, E, F, G] extends TypedSink[Tuple7[A, B, C, D, E, F, G]] { - final def setter[Z <: Tuple7[A, B, C, D, E, F, G]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple7[A, B, C, D, E, F, G]]) + final def setter[Z <: Tuple7[A, B, C, D, E, F, G]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple7[A, B, C, D, E, F, G]]) } trait TypedSink8[A, B, C, D, E, F, G, H] extends TypedSink[Tuple8[A, B, C, D, E, F, G, H]] { - final def setter[Z <: 
Tuple8[A, B, C, D, E, F, G, H]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple8[A, B, C, D, E, F, G, H]]) + final def setter[Z <: Tuple8[A, B, C, D, E, F, G, H]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple8[A, B, C, D, E, F, G, H]]) } trait TypedSink9[A, B, C, D, E, F, G, H, I] extends TypedSink[Tuple9[A, B, C, D, E, F, G, H, I]] { - final def setter[Z <: Tuple9[A, B, C, D, E, F, G, H, I]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple9[A, B, C, D, E, F, G, H, I]]) + final def setter[Z <: Tuple9[A, B, C, D, E, F, G, H, I]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple9[A, B, C, D, E, F, G, H, I]]) } trait TypedSink10[A, B, C, D, E, F, G, H, I, J] extends TypedSink[Tuple10[A, B, C, D, E, F, G, H, I, J]] { - final def setter[Z <: Tuple10[A, B, C, D, E, F, G, H, I, J]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple10[A, B, C, D, E, F, G, H, I, J]]) + final def setter[Z <: Tuple10[A, B, C, D, E, F, G, H, I, J]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple10[A, B, C, D, E, F, G, H, I, J]]) } -trait TypedSink11[A, B, C, D, E, F, G, H, I, J, K] extends TypedSink[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { - final def setter[Z <: Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple11[A, B, C, D, E, F, G, H, I, J, K]]) +trait TypedSink11[A, B, C, D, E, F, G, H, I, J, K] + extends TypedSink[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { + final def setter[Z <: Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple11[A, B, C, D, E, F, G, H, I, J, K]]) } -trait TypedSink12[A, B, C, D, E, F, G, H, I, J, K, L] extends TypedSink[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { - final def setter[Z <: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]]) +trait TypedSink12[A, B, C, D, E, F, G, H, I, J, K, L] + extends TypedSink[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { + final def setter[Z <: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]]) } -trait TypedSink13[A, B, C, D, E, F, G, H, I, J, K, L, M] extends TypedSink[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { - final def setter[Z <: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]]) +trait TypedSink13[A, B, C, D, E, F, G, H, I, J, K, L, M] + extends TypedSink[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { + final def setter[Z <: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]]) } -trait TypedSink14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] extends TypedSink[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { - final def setter[Z <: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]]) +trait TypedSink14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + extends TypedSink[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { + final def setter[Z <: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]]) } -trait TypedSink15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] extends TypedSink[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { - final def setter[Z <: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = 
TupleSetter.asSubSetter(TupleSetter.of[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]]) +trait TypedSink15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + extends TypedSink[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { + final def setter[Z <: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]]) } -trait TypedSink16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] extends TypedSink[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { - final def setter[Z <: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]]) +trait TypedSink16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + extends TypedSink[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { + final def setter[Z <: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]]) } -trait TypedSink17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] extends TypedSink[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { - final def setter[Z <: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]]) +trait TypedSink17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + extends TypedSink[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { + final def setter[Z <: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]]) } -trait TypedSink18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] extends TypedSink[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { - final def setter[Z <: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]]) +trait TypedSink18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + extends TypedSink[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { + final def setter[Z <: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]]) } -trait TypedSink19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] extends TypedSink[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { - final def setter[Z <: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]]) +trait TypedSink19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + extends TypedSink[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { + final def setter[Z <: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]]) } -trait TypedSink20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] extends TypedSink[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { - final def setter[Z <: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]]) +trait 
TypedSink20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + extends TypedSink[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { + final def setter[Z <: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = + TupleSetter.asSubSetter( + TupleSetter.of[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] + ) } -trait TypedSink21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] extends TypedSink[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { - final def setter[Z <: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]]) +trait TypedSink21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + extends TypedSink[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { + final def setter[Z <: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = + TupleSetter.asSubSetter( + TupleSetter.of[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] + ) } -trait TypedSink22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] extends TypedSink[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { - final def setter[Z <: Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]]) +trait TypedSink22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + extends TypedSink[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { + final def setter[Z <: Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = + TupleSetter.asSubSetter( + TupleSetter.of[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] + ) } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/Grouped.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/Grouped.scala index d8a8eb0940..718c3a37f9 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/Grouped.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/Grouped.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.typed import com.twitter.algebird.Semigroup @@ -24,9 +24,10 @@ import scala.util.hashing.MurmurHash3 import java.io.Serializable object CoGroupable extends Serializable { + /** - * Return true if there is a sum occurring at the end the mapGroup transformations - * If we know this is finally summed, we can make some different optimization choices + * Return true if there is a sum occurring at the end the mapGroup transformations If we know this is + * finally summed, we can make some different optimization choices * * If this is true, we know we have at most one value for each key */ @@ -34,60 +35,58 @@ object CoGroupable extends Serializable { import CoGrouped._ cg match { case Pair(left, right, joinf) => - atMostOneValue(left) && atMostOneValue(right) && ( - joinf match { - case Joiner.InnerJoin() => true - case Joiner.OuterJoin() => true - case Joiner.LeftJoin() => true - case Joiner.RightJoin() => true - case _ => false - }) - case WithReducers(on, _) => atMostOneValue(on) + atMostOneValue(left) && atMostOneValue(right) && (joinf match { + case Joiner.InnerJoin() => true + case Joiner.OuterJoin() => true + case Joiner.LeftJoin() => true + case Joiner.RightJoin() => true + case _ => false + }) + case WithReducers(on, _) => atMostOneValue(on) case WithDescription(on, _) => atMostOneValue(on) - case FilterKeys(on, _) => atMostOneValue(on) + case FilterKeys(on, _) => atMostOneValue(on) case MapGroup(on, fn) => atMostOneFn(fn) || (atMostOneValue(on) && atMostInputSizeFn(fn)) - case IdentityReduce(_, _, _, _, _) => false + case IdentityReduce(_, _, _, _, _) => false case UnsortedIdentityReduce(_, _, _, _, _) => false - case IteratorMappedReduce(_, _, fn, _, _) => atMostOneFn(fn) + case IteratorMappedReduce(_, _, fn, _, _) => atMostOneFn(fn) } } /** - * Returns true if the group mapping function definitely returns 0 or 1 - * element. + * Returns true if the group mapping function definitely returns 0 or 1 element. 
* * in 2.12 this can be tailrec, but the types change on recursion, so 2.11 forbids */ final def atMostOneFn[A, B, C](fn: (A, Iterator[B]) => Iterator[C]): Boolean = fn match { case ComposedMapGroup(_, fn) if atMostOneFn(fn) => true - case ComposedMapGroup(first, second) => atMostOneFn(first) && atMostInputSizeFn(second) - case MapValueStream(SumAll(_)) => true - case MapValueStream(ToList()) => true - case MapValueStream(FoldIterator(_)) => true - case MapValueStream(FoldLeftIterator(_, _)) => true - case FoldWithKeyIterator(_) => true - case EmptyGuard(fn) => atMostOneFn(fn) - case _ => false + case ComposedMapGroup(first, second) => atMostOneFn(first) && atMostInputSizeFn(second) + case MapValueStream(SumAll(_)) => true + case MapValueStream(ToList()) => true + case MapValueStream(FoldIterator(_)) => true + case MapValueStream(FoldLeftIterator(_, _)) => true + case FoldWithKeyIterator(_) => true + case EmptyGuard(fn) => atMostOneFn(fn) + case _ => false } /** - * Returns true if the group mapping function does not increase - * the number of items in the Iterator + * Returns true if the group mapping function does not increase the number of items in the Iterator */ final def atMostInputSizeFn[A, B, C](fn: (A, Iterator[B]) => Iterator[C]): Boolean = fn match { - case MapGroupMapValues(_) => true - case MapValueStream(Drop(_)) => true + case MapGroupMapValues(_) => true + case MapValueStream(Drop(_)) => true case MapValueStream(DropWhile(_)) => true - case MapValueStream(Take(_)) => true + case MapValueStream(Take(_)) => true case MapValueStream(TakeWhile(_)) => true - case FilterGroup(_) => true - case EmptyGuard(fn) if atMostOneFn(fn) => true // since 0 always goes to 0 due to empty guard, and 1 -> 0 or 1 since atMostOne - case EmptyGuard(fn) => atMostInputSizeFn(fn) + case FilterGroup(_) => true + case EmptyGuard(fn) if atMostOneFn(fn) => + true // since 0 always goes to 0 due to empty guard, and 1 -> 0 or 1 since atMostOne + case EmptyGuard(fn) => atMostInputSizeFn(fn) case ComposedMapGroup(first, second) => atMostInputSizeFn(first) && atMostInputSizeFn(second) - case _ => false + case _ => false } } @@ -95,6 +94,7 @@ object CoGroupable extends Serializable { * Represents something than can be CoGrouped with another CoGroupable */ sealed trait CoGroupable[K, +R] extends HasReducers with HasDescription with Serializable { + /** * This is the list of mapped pipes, just before the (reducing) joinFunction is applied */ @@ -103,25 +103,24 @@ sealed trait CoGroupable[K, +R] extends HasReducers with HasDescription with Ser def keyOrdering: Ordering[K] /** - * This function is not type-safe for others to call, but it should - * never have an error. By construction, we never call it with incorrect - * types. - * It would be preferable to have stronger type safety here, but unclear - * how to achieve, and since it is an internal function, not clear it - * would actually help anyone for it to be type-safe + * This function is not type-safe for others to call, but it should never have an error. By construction, we + * never call it with incorrect types. It would be preferable to have stronger type safety here, but unclear + * how to achieve, and since it is an internal function, not clear it would actually help anyone for it to + * be type-safe */ def joinFunction: MultiJoinFunction[K, R] /** - * Smaller is about average values/key not total size (that does not matter, but is - * clearly related). 
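[Editorial aside, an illustrative sketch rather than part of this patch: given the cogroup signature above, the side with fewer values per key should be passed as `smaller`, because the right side arrives as an Iterable that may be re-iterated while the left is streamed once. The pipes below are hypothetical.]

    import com.twitter.scalding.typed.TypedPipe

    object CoGroupSketch {
      val events: TypedPipe[(Long, String)] =
        TypedPipe.from(Seq(1L -> "click", 1L -> "view", 2L -> "view"))
      val profiles: TypedPipe[(Long, Int)] =
        TypedPipe.from(Seq(1L -> 34))

      // profiles has at most one value per key, so it is passed as `smaller`:
      // the Iterable side may be traversed repeatedly, the Iterator side is not
      val joined = events.group.cogroup(profiles.group) { (id, evs, ps) =>
        evs.map(e => (e, ps.headOption))
      }
    }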
+ * Smaller is about average values/key not total size (that does not matter, but is clearly related). * - * Note that from the type signature we see that the right side is iterated (or may be) - * over and over, but the left side is not. That means that you want the side with - * fewer values per key on the right. If both sides are similar, no need to worry. - * If one side is a one-to-one mapping, that should be the "smaller" side. + * Note that from the type signature we see that the right side is iterated (or may be) over and over, but + * the left side is not. That means that you want the side with fewer values per key on the right. If both + * sides are similar, no need to worry. If one side is a one-to-one mapping, that should be the "smaller" + * side. */ - def cogroup[R1, R2](smaller: CoGroupable[K, R1])(fn: (K, Iterator[R], Iterable[R1]) => Iterator[R2]): CoGrouped[K, R2] = + def cogroup[R1, R2](smaller: CoGroupable[K, R1])( + fn: (K, Iterator[R], Iterable[R1]) => Iterator[R2] + ): CoGrouped[K, R2] = CoGrouped.Pair(this, smaller, fn) def join[W](smaller: CoGroupable[K, W]) = @@ -174,15 +173,16 @@ object CoGrouped extends Serializable { optCg.map { cg1 => reds match { case Some(r) if cg1.reducers != reds => CoGrouped.WithReducers(cg1, r) - case _ => cg1 + case _ => cg1 } } } final case class Pair[K, A, B, C]( - larger: CoGroupable[K, A], - smaller: CoGroupable[K, B], - fn: (K, Iterator[A], Iterable[B]) => Iterator[C]) extends CoGrouped[K, C] { + larger: CoGroupable[K, A], + smaller: CoGroupable[K, B], + fn: (K, Iterator[A], Iterable[B]) => Iterator[C] + ) extends CoGrouped[K, C] { // case classes that merge more than one TypedPipe need to memoize the result or // it can be exponential in complexity @@ -190,8 +190,8 @@ object CoGrouped extends Serializable { override def equals(that: Any) = that match { case thatRef: AnyRef if this eq thatRef => true - case Pair(l, s, f) => (fn == f) && (l == larger) && (s == smaller) - case _ => false + case Pair(l, s, f) => (fn == f) && (l == larger) && (s == smaller) + case _ => false } def inputs = larger.inputs ++ smaller.inputs @@ -200,13 +200,13 @@ object CoGrouped extends Serializable { def keyOrdering = smaller.keyOrdering /** - * Avoid capturing anything below as it will need to be serialized and sent to - * all the reducers. + * Avoid capturing anything below as it will need to be serialized and sent to all the reducers. 
*/ def joinFunction = { + /** - * if there is at most one value on the smaller side definitely - * cache the result to avoid repeatedly computing it + * if there is at most one value on the smaller side definitely cache the result to avoid repeatedly + * computing it */ val smallerIsAtMostOne = CoGroupable.atMostOneValue(smaller) if (smallerIsAtMostOne) MultiJoinFunction.PairCachedRight(larger.joinFunction, smaller.joinFunction, fn) @@ -222,9 +222,7 @@ object CoGrouped extends Serializable { def descriptions: Seq[String] = on.descriptions } - final case class WithDescription[K, V]( - on: CoGrouped[K, V], - description: String) extends CoGrouped[K, V] { + final case class WithDescription[K, V](on: CoGrouped[K, V], description: String) extends CoGrouped[K, V] { def inputs = on.inputs def reducers = on.reducers @@ -241,23 +239,23 @@ object CoGrouped extends Serializable { def descriptions: Seq[String] = on.descriptions } - final case class MapGroup[K, V1, V2](on: CoGrouped[K, V1], fn: (K, Iterator[V1]) => Iterator[V2]) extends CoGrouped[K, V2] { + final case class MapGroup[K, V1, V2](on: CoGrouped[K, V1], fn: (K, Iterator[V1]) => Iterator[V2]) + extends CoGrouped[K, V2] { def inputs = on.inputs def reducers = on.reducers def descriptions: Seq[String] = on.descriptions def keyOrdering = on.keyOrdering def joinFunction = - MultiJoinFunction.MapGroup( - on.joinFunction, - fn) + MultiJoinFunction.MapGroup(on.joinFunction, fn) } } -sealed trait CoGrouped[K, +R] extends KeyedListLike[K, R, CoGrouped] - with CoGroupable[K, R] - with WithReducers[CoGrouped[K, R]] - with WithDescription[CoGrouped[K, R]] - with Serializable { +sealed trait CoGrouped[K, +R] + extends KeyedListLike[K, R, CoGrouped] + with CoGroupable[K, R] + with WithReducers[CoGrouped[K, R]] + with WithDescription[CoGrouped[K, R]] + with Serializable { override def withReducers(reds: Int): CoGrouped[K, R] = CoGrouped.WithReducers(this, reds) @@ -266,10 +264,9 @@ sealed trait CoGrouped[K, +R] extends KeyedListLike[K, R, CoGrouped] CoGrouped.WithDescription(this, description) /** - * It seems complex to push a take up to the mappers before a general join. - * For some cases (inner join), we could take at most n from each TypedPipe, - * but it is not clear how to generalize that for general cogrouping functions. - * For now, just do a normal take. + * It seems complex to push a take up to the mappers before a general join. For some cases (inner join), we + * could take at most n from each TypedPipe, but it is not clear how to generalize that for general + * cogrouping functions. For now, just do a normal take. */ override def bufferedTake(n: Int): CoGrouped[K, R] = take(n) @@ -293,10 +290,11 @@ sealed trait CoGrouped[K, +R] extends KeyedListLike[K, R, CoGrouped] /** * If we can HashJoin, then we can CoGroup, but not vice-versa - * i.e., HashJoinable is a strict subset of CoGroupable (CoGrouped, for instance - * is CoGroupable, but not HashJoinable). + * i.e., HashJoinable is a strict subset of CoGroupable (CoGrouped, for instance is CoGroupable, but not + * HashJoinable). 
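[Editorial aside, not from this patch: in user code the HashJoinable/CoGroupable distinction shows up as hashJoin accepting only a HashJoinable (for example a Grouped) on the right, while the result of a join is a CoGrouped and can only take part in further reduce-side joins. A hedged sketch with hypothetical pipes.]

    import com.twitter.scalding.typed.TypedPipe

    object HashJoinSketch {
      val big: TypedPipe[(Long, String)] = TypedPipe.from(Seq(1L -> "click", 2L -> "view"))
      val tiny: TypedPipe[(Long, Int)]   = TypedPipe.from(Seq(1L -> 34, 2L -> 51))

      // tiny.group is a Grouped, hence HashJoinable: it can be replicated to the mappers
      val mapSide = big.hashJoin(tiny.group)

      // a CoGrouped (e.g. the result of join) is CoGroupable but not HashJoinable,
      // so it can only be joined again on the reduce side
      val reduceSide = big.group.join(tiny.group).join(tiny.group)
    }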
*/ sealed trait HashJoinable[K, +V] extends CoGroupable[K, V] with KeyedPipe[K] { + /** A HashJoinable has a single input into to the cogroup */ override def inputs = List(mapped) } @@ -304,60 +302,55 @@ sealed trait HashJoinable[K, +V] extends CoGroupable[K, V] with KeyedPipe[K] { object HashJoinable extends Serializable { def toReduceStep[A, B](hj: HashJoinable[A, B]): ReduceStep[A, _, _ <: B] = hj match { - case step@IdentityReduce(_, _, _, _, _) => step - case step@UnsortedIdentityReduce(_, _, _, _, _) => step - case step@IteratorMappedReduce(_, _, _, _, _) => step + case step @ IdentityReduce(_, _, _, _, _) => step + case step @ UnsortedIdentityReduce(_, _, _, _, _) => step + case step @ IteratorMappedReduce(_, _, _, _, _) => step } def filterKeys[A, B](hj: HashJoinable[A, B], fn: A => Boolean): HashJoinable[A, B] = hj match { - case step@IdentityReduce(_, _, _, _, _) => + case step @ IdentityReduce(_, _, _, _, _) => step.copy(mapped = TypedPipe.FilterKeys(step.mapped, fn)) - case step@UnsortedIdentityReduce(_, _, _, _, _) => + case step @ UnsortedIdentityReduce(_, _, _, _, _) => step.copy(mapped = TypedPipe.FilterKeys(step.mapped, fn)) - case step@IteratorMappedReduce(_, _, _, _, _) => + case step @ IteratorMappedReduce(_, _, _, _, _) => step.copy(mapped = TypedPipe.FilterKeys(step.mapped, fn)) } } + /** - * This encodes the rules that - * 1) sorting is only possible before doing any reduce, - * 2) reversing is only possible after sorting. - * 3) unsorted Groups can be CoGrouped or HashJoined + * This encodes the rules that 1) sorting is only possible before doing any reduce, 2) reversing is only + * possible after sorting. 3) unsorted Groups can be CoGrouped or HashJoined * - * This may appear a complex type, but it makes - * sure that code won't compile if it breaks the rule + * This may appear a complex type, but it makes sure that code won't compile if it breaks the rule */ sealed trait Grouped[K, +V] - extends KeyedListLike[K, V, UnsortedGrouped] - with HashJoinable[K, V] - with Sortable[V, ({ type t[+x] = SortedGrouped[K, x] with Reversable[SortedGrouped[K, x]] })#t] - with WithReducers[Grouped[K, V]] - with WithDescription[Grouped[K, V]] + extends KeyedListLike[K, V, UnsortedGrouped] + with HashJoinable[K, V] + with Sortable[V, ({ type t[+x] = SortedGrouped[K, x] with Reversable[SortedGrouped[K, x]] })#t] + with WithReducers[Grouped[K, V]] + with WithDescription[Grouped[K, V]] /** - * After sorting, we are no longer CoGroupable, and we can only call reverse - * in the initial SortedGrouped created from the Sortable: - * .sortBy(_._2).reverse - * for instance + * After sorting, we are no longer CoGroupable, and we can only call reverse in the initial SortedGrouped + * created from the Sortable: .sortBy(_._2).reverse for instance * * Once we have sorted, we cannot do a HashJoin or a CoGrouping */ sealed trait SortedGrouped[K, +V] - extends KeyedListLike[K, V, SortedGrouped] - with WithReducers[SortedGrouped[K, V]] - with WithDescription[SortedGrouped[K, V]] + extends KeyedListLike[K, V, SortedGrouped] + with WithReducers[SortedGrouped[K, V]] + with WithDescription[SortedGrouped[K, V]] /** - * This is the state after we have done some reducing. It is - * not possible to sort at this phase, but it is possible to - * do a CoGrouping or a HashJoin. + * This is the state after we have done some reducing. It is not possible to sort at this phase, but it is + * possible to do a CoGrouping or a HashJoin. 
*/ sealed trait UnsortedGrouped[K, +V] - extends KeyedListLike[K, V, UnsortedGrouped] - with HashJoinable[K, V] - with WithReducers[UnsortedGrouped[K, V]] - with WithDescription[UnsortedGrouped[K, V]] + extends KeyedListLike[K, V, UnsortedGrouped] + with HashJoinable[K, V] + with WithReducers[UnsortedGrouped[K, V]] + with WithDescription[UnsortedGrouped[K, V]] object Grouped extends Serializable { def apply[K, V](pipe: TypedPipe[(K, V)])(implicit ordering: Ordering[K]): Grouped[K, V] = @@ -365,16 +358,15 @@ object Grouped extends Serializable { def addEmptyGuard[K, V1, V2](fn: (K, Iterator[V1]) => Iterator[V2]): (K, Iterator[V1]) => Iterator[V2] = fn match { - case alreadyGuarded@EmptyGuard(_) => alreadyGuarded + case alreadyGuarded @ EmptyGuard(_) => alreadyGuarded case ami if CoGroupable.atMostInputSizeFn(ami) => ami // already safe - case needGuard => EmptyGuard(needGuard) + case needGuard => EmptyGuard(needGuard) } } /** - * All sorting methods defined here trigger Hadoop secondary sort on key + value. - * Hadoop secondary sort is external sorting. i.e. it won't materialize all values - * of each key in memory on the reducer. + * All sorting methods defined here trigger Hadoop secondary sort on key + value. Hadoop secondary sort is + * external sorting. i.e. it won't materialize all values of each key in memory on the reducer. */ sealed trait Sortable[+T, +Sorted[+_]] { def withSortOrdering[U >: T](so: Ordering[U]): Sorted[U] @@ -396,11 +388,11 @@ sealed trait Reversable[+R] { } /** - * This is a class that models the logical portion of the reduce step. - * details like where this occurs, the number of reducers, etc... are - * left in the Grouped class + * This is a class that models the logical portion of the reduce step. details like where this occurs, the + * number of reducers, etc... are left in the Grouped class */ sealed trait ReduceStep[K, V1, V2] extends KeyedPipe[K] with HasReducers { + /** * Note, this satisfies KeyedPipe.mapped: TypedPipe[(K, Any)] */ @@ -412,18 +404,19 @@ sealed trait ReduceStep[K, V1, V2] extends KeyedPipe[K] with HasReducers { object ReduceStep extends Serializable { /** - * assuming coherent Orderings on the A, in some cases ReduceSteps can be combined. - * Note: we have always assumed coherant orderings in scalding with joins where - * both sides have their own Ordering, so we argue this is not different. + * assuming coherent Orderings on the A, in some cases ReduceSteps can be combined. Note: we have always + * assumed coherant orderings in scalding with joins where both sides have their own Ordering, so we argue + * this is not different. * - * If a user has incoherant Orderings, which are already dangerous, they can - * use .forceToDisk between reduce steps, however, a better strategy is to - * use different key types. + * If a user has incoherant Orderings, which are already dangerous, they can use .forceToDisk between reduce + * steps, however, a better strategy is to use different key types. * - * The only case where they can't is when there are two different value sorts going - * on. + * The only case where they can't is when there are two different value sorts going on. 
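[Editorial aside, a sketch under assumptions rather than part of this patch: the external secondary sort described above is what makes it safe to stream over a key's values in sorted order without buffering them. Here sortBy chooses the value order and mapValueStream then sees each key's values in that order; the Click type and the "first url" logic are hypothetical.]

    import com.twitter.scalding.typed.TypedPipe

    object SecondarySortSketch {
      final case class Click(ts: Long, url: String)

      val clicks: TypedPipe[(Long, Click)] =
        TypedPipe.from(Seq(1L -> Click(10L, "/a"), 1L -> Click(12L, "/b")))

      // values reach the reducer already ordered by timestamp (Hadoop secondary sort),
      // so we can scan them as a stream instead of materializing the whole group
      val firstUrlPerUser: TypedPipe[(Long, String)] =
        clicks.group
          .sortBy(_.ts)
          .mapValueStream(vs => Iterator(vs.next().url))
          .toTypedPipe
    }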
*/ - def maybeCompose[A, B, C, D](rs1: ReduceStep[A, B, C], rs2: ReduceStep[A, C, D]): Option[ReduceStep[A, B, D]] = { + def maybeCompose[A, B, C, D]( + rs1: ReduceStep[A, B, C], + rs2: ReduceStep[A, C, D] + ): Option[ReduceStep[A, B, D]] = { val reds = WithReducers.maybeCombine(rs1.reducers, rs2.reducers) val optRs = (rs1, rs2) match { case (step @ IdentityReduce(_, _, _, _, _), step2) => @@ -444,11 +437,11 @@ object ReduceStep extends Serializable { * All the rest have either two sorts, or a sort after a reduce */ case (IdentityValueSortedReduce(_, _, _, _, _, _), IdentityValueSortedReduce(_, _, _, _, _, _)) => None - case (IdentityValueSortedReduce(_, _, _, _, _, _), ValueSortedReduce(_, _, _, _, _, _)) => None - case (IteratorMappedReduce(_, _, _, _, _), IdentityValueSortedReduce(_, _, _, _, _, _)) => None - case (IteratorMappedReduce(_, _, _, _, _), ValueSortedReduce(_, _, _, _, _, _)) => None - case (ValueSortedReduce(_, _, _, _, _, _), IdentityValueSortedReduce(_, _, _, _, _, _)) => None - case (ValueSortedReduce(_, _, _, _, _, _), ValueSortedReduce(_, _, _, _, _, _)) => None + case (IdentityValueSortedReduce(_, _, _, _, _, _), ValueSortedReduce(_, _, _, _, _, _)) => None + case (IteratorMappedReduce(_, _, _, _, _), IdentityValueSortedReduce(_, _, _, _, _, _)) => None + case (IteratorMappedReduce(_, _, _, _, _), ValueSortedReduce(_, _, _, _, _, _)) => None + case (ValueSortedReduce(_, _, _, _, _, _), IdentityValueSortedReduce(_, _, _, _, _, _)) => None + case (ValueSortedReduce(_, _, _, _, _, _), ValueSortedReduce(_, _, _, _, _, _)) => None } optRs.map { composed => @@ -466,11 +459,13 @@ object ReduceStep extends Serializable { val step = step0.evidence.subst[IR](step0) val revEv = step0.evidence.reverse val res = - IdentityReduce[A, C, C](step.keyOrdering, + IdentityReduce[A, C, C]( + step.keyOrdering, step0.evidence.subst[In](input), step.reducers, step.descriptions, - implicitly) + implicitly + ) // Put the type back to what scala expects ReduceStep[A, B, C] revEv.subst[Res](res) case step0 @ UnsortedIdentityReduce(_, _, _, _, _) => @@ -478,11 +473,13 @@ object ReduceStep extends Serializable { val step = step0.evidence.subst[IR](step0) val revEv = step0.evidence.reverse val res = - UnsortedIdentityReduce[A, C, C](step.keyOrdering, + UnsortedIdentityReduce[A, C, C]( + step.keyOrdering, step0.evidence.subst[In](input), step.reducers, step.descriptions, - implicitly) + implicitly + ) // Put the type back to what scala expects ReduceStep[A, B, C] revEv.subst[Res](res) case step0 @ IdentityValueSortedReduce(_, _, _, _, _, _) => @@ -490,17 +487,25 @@ object ReduceStep extends Serializable { val step = step0.evidence.subst[IVSR](step0) val revEv = step0.evidence.reverse val res = - IdentityValueSortedReduce[A, C, C](step.keyOrdering, + IdentityValueSortedReduce[A, C, C]( + step.keyOrdering, step0.evidence.subst[In](input), step.valueSort, step.reducers, step.descriptions, - implicitly) + implicitly + ) // Put the type back to what scala expects ReduceStep[A, B, C] revEv.subst[Res](res) case step @ ValueSortedReduce(_, _, _, _, _, _) => - ValueSortedReduce[A, B, C](step.keyOrdering, - input, step.valueSort, step.reduceFn, step.reducers, step.descriptions) + ValueSortedReduce[A, B, C]( + step.keyOrdering, + input, + step.valueSort, + step.reduceFn, + step.reducers, + step.descriptions + ) case step @ IteratorMappedReduce(_, _, _, _, _) => def go(imr: IteratorMappedReduce[A, B, C]): IteratorMappedReduce[A, B, C] = imr.copy(mapped = input) @@ -508,7 +513,9 @@ object ReduceStep extends 
Serializable { } } - def mapGroup[A, B, C, D](rs: ReduceStep[A, B, C])(fn: (A, Iterator[C]) => Iterator[D]): ReduceStep[A, B, D] = + def mapGroup[A, B, C, D]( + rs: ReduceStep[A, B, C] + )(fn: (A, Iterator[C]) => Iterator[D]): ReduceStep[A, B, D] = rs match { case step @ IdentityReduce(_, _, _, _, _) => type Res[T] = ReduceStep[A, T, D] @@ -569,13 +576,13 @@ object ReduceStep extends Serializable { } final case class IdentityReduce[K, V1, V2]( - override val keyOrdering: Ordering[K], - override val mapped: TypedPipe[(K, V1)], - override val reducers: Option[Int], - override val descriptions: Seq[String], - evidence: EqTypes[V1, V2]) - extends ReduceStep[K, V1, V2] - with Grouped[K, V2] { + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + override val reducers: Option[Int], + override val descriptions: Seq[String], + evidence: EqTypes[V1, V2] +) extends ReduceStep[K, V1, V2] + with Grouped[K, V2] { /* * Because after mapValues, take, filter, we can no-longer sort, @@ -590,9 +597,8 @@ final case class IdentityReduce[K, V1, V2]( } /** - * This does the partial heap sort followed by take in memory on the mappers - * before sending to the mappers. This is a big help if there are relatively - * few keys and n is relatively small. + * This does the partial heap sort followed by take in memory on the mappers before sending to the mappers. + * This is a big help if there are relatively few keys and n is relatively small. */ override def bufferedTake(n: Int) = toUIR.bufferedTake(n) @@ -609,10 +615,9 @@ final case class IdentityReduce[K, V1, V2]( override def filterKeys(fn: K => Boolean) = toUIR.filterKeys(fn) - override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = { + override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = // Only pass non-Empty iterators to subsequent functions IteratorMappedReduce(keyOrdering, mappedV2, Grouped.addEmptyGuard(fn), reducers, descriptions) - } // It would be nice to return IdentityReduce here, but // the type constraints prevent it currently @@ -625,7 +630,13 @@ final case class IdentityReduce[K, V1, V2]( override def sum[U >: V2](implicit sg: Semigroup[U]) = { // there is no sort, mapValueStream or force to reducers: val upipe: TypedPipe[(K, U)] = mappedV2 // use covariance to set the type - UnsortedIdentityReduce[K, U, U](keyOrdering, upipe.sumByLocalKeys, reducers, descriptions, implicitly).sumLeft + UnsortedIdentityReduce[K, U, U]( + keyOrdering, + upipe.sumByLocalKeys, + reducers, + descriptions, + implicitly + ).sumLeft } /** This is just an identity that casts the result to V2 */ @@ -633,18 +644,17 @@ final case class IdentityReduce[K, V1, V2]( } final case class UnsortedIdentityReduce[K, V1, V2]( - override val keyOrdering: Ordering[K], - override val mapped: TypedPipe[(K, V1)], - override val reducers: Option[Int], - override val descriptions: Seq[String], - evidence: EqTypes[V1, V2]) - extends ReduceStep[K, V1, V2] - with UnsortedGrouped[K, V2] { + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + override val reducers: Option[Int], + override val descriptions: Seq[String], + evidence: EqTypes[V1, V2] +) extends ReduceStep[K, V1, V2] + with UnsortedGrouped[K, V2] { /** - * This does the partial heap sort followed by take in memory on the mappers - * before sending to the reducers. This is a big help if there are relatively - * few keys and n is relatively small. 
+ * This does the partial heap sort followed by take in memory on the mappers before sending to the reducers. + * This is a big help if there are relatively few keys and n is relatively small. */ override def bufferedTake(n: Int) = if (n < 1) { @@ -661,13 +671,19 @@ final case class UnsortedIdentityReduce[K, V1, V2]( val fakeOrdering: Ordering[V1] = Ordering.by { v: V1 => v.hashCode } implicit val mon: PriorityQueueMonoid[V1] = new PriorityQueueMonoid[V1](n)(fakeOrdering) // Do the heap-sort on the mappers: - val pretake: TypedPipe[(K, V1)] = mapped.mapValues { v: V1 => mon.build(v) } + val pretake: TypedPipe[(K, V1)] = mapped + .mapValues { v: V1 => mon.build(v) } .sumByLocalKeys .flatMap { case (k, vs) => vs.iterator.asScala.map((k, _)) } // We have removed the priority queues, so serialization is not greater // Now finish on the reducers - UnsortedIdentityReduce[K, V1, V2](keyOrdering, pretake, reducers, descriptions, evidence) - .forceToReducers // jump to ValueSortedReduce + UnsortedIdentityReduce[K, V1, V2]( + keyOrdering, + pretake, + reducers, + descriptions, + evidence + ).forceToReducers // jump to ValueSortedReduce .take(n) } @@ -697,7 +713,13 @@ final case class UnsortedIdentityReduce[K, V1, V2]( override def sum[U >: V2](implicit sg: Semigroup[U]) = { // there is no sort, mapValueStream or force to reducers: val upipe: TypedPipe[(K, U)] = mappedV2 // use covariance to set the type - UnsortedIdentityReduce[K, U, U](keyOrdering, upipe.sumByLocalKeys, reducers, descriptions, implicitly).sumLeft + UnsortedIdentityReduce[K, U, U]( + keyOrdering, + upipe.sumByLocalKeys, + reducers, + descriptions, + implicitly + ).sumLeft } /** This is just an identity that casts the result to V2 */ @@ -705,40 +727,75 @@ final case class UnsortedIdentityReduce[K, V1, V2]( } final case class IdentityValueSortedReduce[K, V1, V2]( - override val keyOrdering: Ordering[K], - override val mapped: TypedPipe[(K, V1)], - valueSort: Ordering[V1], - override val reducers: Option[Int], - override val descriptions: Seq[String], - evidence: EqTypes[V1, V2]) extends ReduceStep[K, V1, V2] - with SortedGrouped[K, V2] - with Reversable[IdentityValueSortedReduce[K, V1, V2]] { + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + valueSort: Ordering[V1], + override val reducers: Option[Int], + override val descriptions: Seq[String], + evidence: EqTypes[V1, V2] +) extends ReduceStep[K, V1, V2] + with SortedGrouped[K, V2] + with Reversable[IdentityValueSortedReduce[K, V1, V2]] { override def reverse: IdentityValueSortedReduce[K, V1, V2] = - IdentityValueSortedReduce[K, V1, V2](keyOrdering, mapped, valueSort.reverse, reducers, descriptions, evidence) + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped, + valueSort.reverse, + reducers, + descriptions, + evidence + ) override def withReducers(red: Int): IdentityValueSortedReduce[K, V1, V2] = // copy fails to get the types right, :/ - IdentityValueSortedReduce[K, V1, V2](keyOrdering, mapped, valueSort, reducers = Some(red), descriptions, evidence) + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped, + valueSort, + reducers = Some(red), + descriptions, + evidence + ) override def withDescription(description: String): IdentityValueSortedReduce[K, V1, V2] = - IdentityValueSortedReduce[K, V1, V2](keyOrdering, mapped, valueSort, reducers, descriptions = descriptions :+ description, evidence) + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped, + valueSort, + reducers, + descriptions = descriptions :+ description, 
+ evidence + ) override def filterKeys(fn: K => Boolean) = // copy fails to get the types right, :/ - IdentityValueSortedReduce[K, V1, V2](keyOrdering, mapped.filterKeys(fn), valueSort, reducers, descriptions, evidence) + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped.filterKeys(fn), + valueSort, + reducers, + descriptions, + evidence + ) override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = { // Only pass non-Empty iterators to subsequent functions val gfn = Grouped.addEmptyGuard(fn) type TK[V] = TypedPipe[(K, V)] - ValueSortedReduce[K, V2, V3](keyOrdering, evidence.subst[TK](mapped), evidence.subst[Ordering](valueSort), gfn, reducers, descriptions) + ValueSortedReduce[K, V2, V3]( + keyOrdering, + evidence.subst[TK](mapped), + evidence.subst[Ordering](valueSort), + gfn, + reducers, + descriptions + ) } /** - * This does the partial heap sort followed by take in memory on the mappers - * before sending to the reducers. This is a big help if there are relatively - * few keys and n is relatively small. + * This does the partial heap sort followed by take in memory on the mappers before sending to the reducers. + * This is a big help if there are relatively few keys and n is relatively small. */ override def bufferedTake(n: Int): SortedGrouped[K, V2] = if (n <= 0) { @@ -747,20 +804,25 @@ final case class IdentityValueSortedReduce[K, V1, V2]( } else { implicit val mon: PriorityQueueMonoid[V1] = new PriorityQueueMonoid[V1](n)(valueSort) // Do the heap-sort on the mappers: - val pretake: TypedPipe[(K, V1)] = mapped.mapValues { v: V1 => mon.build(v) } + val pretake: TypedPipe[(K, V1)] = mapped + .mapValues { v: V1 => mon.build(v) } .sumByLocalKeys .flatMap { case (k, vs) => vs.iterator.asScala.map((k, _)) } // Now finish on the reducers - IdentityValueSortedReduce[K, V1, V2](keyOrdering, pretake, valueSort, reducers, descriptions, evidence) - .forceToReducers // jump to ValueSortedReduce + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + pretake, + valueSort, + reducers, + descriptions, + evidence + ).forceToReducers // jump to ValueSortedReduce .take(n) } /** - * We are sorting then taking. Optimized for small take values - * If we take <= 1, we use an in-memory-based method. - * To force a memory-based take, use bufferedTake - * Otherwise, we send all the values to the reducers + * We are sorting then taking. Optimized for small take values If we take <= 1, we use an in-memory-based + * method. To force a memory-based take, use bufferedTake Otherwise, we send all the values to the reducers */ override def take(n: Int) = if (n <= 1) bufferedTake(n) @@ -768,52 +830,63 @@ final case class IdentityValueSortedReduce[K, V1, V2]( } final case class ValueSortedReduce[K, V1, V2]( - override val keyOrdering: Ordering[K], - override val mapped: TypedPipe[(K, V1)], - valueSort: Ordering[V1], - reduceFn: (K, Iterator[V1]) => Iterator[V2], - override val reducers: Option[Int], - override val descriptions: Seq[String]) - extends ReduceStep[K, V1, V2] with SortedGrouped[K, V2] { + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + valueSort: Ordering[V1], + reduceFn: (K, Iterator[V1]) => Iterator[V2], + override val reducers: Option[Int], + override val descriptions: Seq[String] +) extends ReduceStep[K, V1, V2] + with SortedGrouped[K, V2] { /** - * After sorting, then reducing, there is no chance - * to operate in the mappers. Just call take. + * After sorting, then reducing, there is no chance to operate in the mappers. Just call take. 
*/ override def bufferedTake(n: Int) = take(n) override def withReducers(red: Int) = // copy infers loose types. :( - ValueSortedReduce[K, V1, V2]( - keyOrdering, mapped, valueSort, reduceFn, Some(red), descriptions) + ValueSortedReduce[K, V1, V2](keyOrdering, mapped, valueSort, reduceFn, Some(red), descriptions) override def withDescription(description: String) = ValueSortedReduce[K, V1, V2]( - keyOrdering, mapped, valueSort, reduceFn, reducers, descriptions :+ description) + keyOrdering, + mapped, + valueSort, + reduceFn, + reducers, + descriptions :+ description + ) override def filterKeys(fn: K => Boolean) = // copy fails to get the types right, :/ - ValueSortedReduce[K, V1, V2](keyOrdering, mapped.filterKeys(fn), valueSort, reduceFn, reducers, descriptions) + ValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped.filterKeys(fn), + valueSort, + reduceFn, + reducers, + descriptions + ) override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = { // we don't need the empty guard here because ComposedMapGroup already does it val newReduce = ComposedMapGroup(reduceFn, fn) - ValueSortedReduce[K, V1, V3]( - keyOrdering, mapped, valueSort, newReduce, reducers, descriptions) + ValueSortedReduce[K, V1, V3](keyOrdering, mapped, valueSort, newReduce, reducers, descriptions) } } final case class IteratorMappedReduce[K, V1, V2]( - override val keyOrdering: Ordering[K], - override val mapped: TypedPipe[(K, V1)], - reduceFn: (K, Iterator[V1]) => Iterator[V2], - override val reducers: Option[Int], - override val descriptions: Seq[String]) - extends ReduceStep[K, V1, V2] with UnsortedGrouped[K, V2] { + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + reduceFn: (K, Iterator[V1]) => Iterator[V2], + override val reducers: Option[Int], + override val descriptions: Seq[String] +) extends ReduceStep[K, V1, V2] + with UnsortedGrouped[K, V2] { /** - * After reducing, we are always - * operating in memory. Just call take. + * After reducing, we are always operating in memory. Just call take. */ override def bufferedTake(n: Int) = take(n) @@ -834,4 +907,3 @@ final case class IteratorMappedReduce[K, V1, V2]( override def joinFunction = MultiJoinFunction.MapCast(reduceFn) } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/HashEqualsArrayWrapper.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/HashEqualsArrayWrapper.scala index c3a7689be1..5c9f30cfc6 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/HashEqualsArrayWrapper.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/HashEqualsArrayWrapper.scala @@ -11,32 +11,31 @@ sealed trait HashEqualsArrayWrapper[T] { object HashEqualsArrayWrapper { /** - * Wraps an Array in an object with a valid equals() and hashCode() - * Uses specialized wrappers for arrays of primitive values. + * Wraps an Array in an object with a valid equals() and hashCode() Uses specialized wrappers for arrays of + * primitive values. */ def wrap[T](a: Array[T]): HashEqualsArrayWrapper[T] = wrapByClassFn[T](a.getClass.asInstanceOf[Class[Array[T]]])(a) /** - * Creates a function that can be used to wrap Arrays into objects - * with valid equals() and hashCode() methods. + * Creates a function that can be used to wrap Arrays into objects with valid equals() and hashCode() + * methods. 
* - * Using this method and applying it to many arrays should be faster - * than using wrap above on each array, because this method uses reflection - * once, and wrap above uses reflection on each individual array. + * Using this method and applying it to many arrays should be faster than using wrap above on each array, + * because this method uses reflection once, and wrap above uses reflection on each individual array. */ def wrapByClassFn[T](clazz: Class[Array[T]]): Array[T] => HashEqualsArrayWrapper[T] = { val fn = clazz match { - case c if classOf[Array[Long]].equals(c) => a: Array[Long] => new HashEqualsLongArrayWrapper(a) - case c if classOf[Array[Int]].equals(c) => a: Array[Int] => new HashEqualsIntArrayWrapper(a) - case c if classOf[Array[Short]].equals(c) => a: Array[Short] => new HashEqualsShortArrayWrapper(a) - case c if classOf[Array[Char]].equals(c) => a: Array[Char] => new HashEqualsCharArrayWrapper(a) - case c if classOf[Array[Byte]].equals(c) => a: Array[Byte] => new HashEqualsByteArrayWrapper(a) + case c if classOf[Array[Long]].equals(c) => a: Array[Long] => new HashEqualsLongArrayWrapper(a) + case c if classOf[Array[Int]].equals(c) => a: Array[Int] => new HashEqualsIntArrayWrapper(a) + case c if classOf[Array[Short]].equals(c) => a: Array[Short] => new HashEqualsShortArrayWrapper(a) + case c if classOf[Array[Char]].equals(c) => a: Array[Char] => new HashEqualsCharArrayWrapper(a) + case c if classOf[Array[Byte]].equals(c) => a: Array[Byte] => new HashEqualsByteArrayWrapper(a) case c if classOf[Array[Boolean]].equals(c) => a: Array[Boolean] => new HashEqualsBooleanArrayWrapper(a) - case c if classOf[Array[Float]].equals(c) => a: Array[Float] => new HashEqualsFloatArrayWrapper(a) - case c if classOf[Array[Double]].equals(c) => a: Array[Double] => new HashEqualsDoubleArrayWrapper(a) - case c => a: Array[T] => new HashEqualsObjectArrayWrapper(a) + case c if classOf[Array[Float]].equals(c) => a: Array[Float] => new HashEqualsFloatArrayWrapper(a) + case c if classOf[Array[Double]].equals(c) => a: Array[Double] => new HashEqualsDoubleArrayWrapper(a) + case c => a: Array[T] => new HashEqualsObjectArrayWrapper(a) } fn.asInstanceOf[(Array[T] => HashEqualsArrayWrapper[T])] @@ -218,53 +217,62 @@ object HashEqualsArrayWrapper { } } - implicit val hashEqualsLongOrdering: Ordering[HashEqualsArrayWrapper[Long]] = new Ordering[HashEqualsArrayWrapper[Long]] { - override def compare(x: HashEqualsArrayWrapper[Long], y: HashEqualsArrayWrapper[Long]): Int = - longArrayOrd.compare(x.wrapped, y.wrapped) - } + implicit val hashEqualsLongOrdering: Ordering[HashEqualsArrayWrapper[Long]] = + new Ordering[HashEqualsArrayWrapper[Long]] { + override def compare(x: HashEqualsArrayWrapper[Long], y: HashEqualsArrayWrapper[Long]): Int = + longArrayOrd.compare(x.wrapped, y.wrapped) + } - implicit val hashEqualsIntOrdering: Ordering[HashEqualsArrayWrapper[Int]] = new Ordering[HashEqualsArrayWrapper[Int]] { - override def compare(x: HashEqualsArrayWrapper[Int], y: HashEqualsArrayWrapper[Int]): Int = - intArrayOrd.compare(x.wrapped, y.wrapped) - } + implicit val hashEqualsIntOrdering: Ordering[HashEqualsArrayWrapper[Int]] = + new Ordering[HashEqualsArrayWrapper[Int]] { + override def compare(x: HashEqualsArrayWrapper[Int], y: HashEqualsArrayWrapper[Int]): Int = + intArrayOrd.compare(x.wrapped, y.wrapped) + } - implicit val hashEqualsShortOrdering: Ordering[HashEqualsArrayWrapper[Short]] = new Ordering[HashEqualsArrayWrapper[Short]] { - override def compare(x: HashEqualsArrayWrapper[Short], y: 
HashEqualsArrayWrapper[Short]): Int = - shortArrayOrd.compare(x.wrapped, y.wrapped) - } + implicit val hashEqualsShortOrdering: Ordering[HashEqualsArrayWrapper[Short]] = + new Ordering[HashEqualsArrayWrapper[Short]] { + override def compare(x: HashEqualsArrayWrapper[Short], y: HashEqualsArrayWrapper[Short]): Int = + shortArrayOrd.compare(x.wrapped, y.wrapped) + } - implicit val hashEqualsCharOrdering: Ordering[HashEqualsArrayWrapper[Char]] = new Ordering[HashEqualsArrayWrapper[Char]] { - override def compare(x: HashEqualsArrayWrapper[Char], y: HashEqualsArrayWrapper[Char]): Int = - charArrayOrd.compare(x.wrapped, y.wrapped) - } + implicit val hashEqualsCharOrdering: Ordering[HashEqualsArrayWrapper[Char]] = + new Ordering[HashEqualsArrayWrapper[Char]] { + override def compare(x: HashEqualsArrayWrapper[Char], y: HashEqualsArrayWrapper[Char]): Int = + charArrayOrd.compare(x.wrapped, y.wrapped) + } - implicit val hashEqualsByteOrdering: Ordering[HashEqualsArrayWrapper[Byte]] = new Ordering[HashEqualsArrayWrapper[Byte]] { - override def compare(x: HashEqualsArrayWrapper[Byte], y: HashEqualsArrayWrapper[Byte]): Int = - byteArrayOrd.compare(x.wrapped, y.wrapped) - } + implicit val hashEqualsByteOrdering: Ordering[HashEqualsArrayWrapper[Byte]] = + new Ordering[HashEqualsArrayWrapper[Byte]] { + override def compare(x: HashEqualsArrayWrapper[Byte], y: HashEqualsArrayWrapper[Byte]): Int = + byteArrayOrd.compare(x.wrapped, y.wrapped) + } - implicit val hashEqualsBooleanOrdering: Ordering[HashEqualsArrayWrapper[Boolean]] = new Ordering[HashEqualsArrayWrapper[Boolean]] { - override def compare(x: HashEqualsArrayWrapper[Boolean], y: HashEqualsArrayWrapper[Boolean]): Int = - booleanArrayOrd.compare(x.wrapped, y.wrapped) - } + implicit val hashEqualsBooleanOrdering: Ordering[HashEqualsArrayWrapper[Boolean]] = + new Ordering[HashEqualsArrayWrapper[Boolean]] { + override def compare(x: HashEqualsArrayWrapper[Boolean], y: HashEqualsArrayWrapper[Boolean]): Int = + booleanArrayOrd.compare(x.wrapped, y.wrapped) + } - implicit val hashEqualsFloatOrdering: Ordering[HashEqualsArrayWrapper[Float]] = new Ordering[HashEqualsArrayWrapper[Float]] { - override def compare(x: HashEqualsArrayWrapper[Float], y: HashEqualsArrayWrapper[Float]): Int = - floatArrayOrd.compare(x.wrapped, y.wrapped) - } + implicit val hashEqualsFloatOrdering: Ordering[HashEqualsArrayWrapper[Float]] = + new Ordering[HashEqualsArrayWrapper[Float]] { + override def compare(x: HashEqualsArrayWrapper[Float], y: HashEqualsArrayWrapper[Float]): Int = + floatArrayOrd.compare(x.wrapped, y.wrapped) + } - implicit val hashEqualsDoubleOrdering: Ordering[HashEqualsArrayWrapper[Double]] = new Ordering[HashEqualsArrayWrapper[Double]] { - override def compare(x: HashEqualsArrayWrapper[Double], y: HashEqualsArrayWrapper[Double]): Int = - doubleArrayOrd.compare(x.wrapped, y.wrapped) - } + implicit val hashEqualsDoubleOrdering: Ordering[HashEqualsArrayWrapper[Double]] = + new Ordering[HashEqualsArrayWrapper[Double]] { + override def compare(x: HashEqualsArrayWrapper[Double], y: HashEqualsArrayWrapper[Double]): Int = + doubleArrayOrd.compare(x.wrapped, y.wrapped) + } } -final class HashEqualsLongArrayWrapper(override val wrapped: Array[Long]) extends HashEqualsArrayWrapper[Long] { +final class HashEqualsLongArrayWrapper(override val wrapped: Array[Long]) + extends HashEqualsArrayWrapper[Long] { override def hashCode(): Int = util.Arrays.hashCode(wrapped) override def equals(obj: scala.Any): Boolean = obj match { case other: HashEqualsLongArrayWrapper => 
util.Arrays.equals(wrapped, other.wrapped) - case _ => false + case _ => false } } @@ -272,64 +280,71 @@ final class HashEqualsIntArrayWrapper(override val wrapped: Array[Int]) extends override def hashCode(): Int = util.Arrays.hashCode(wrapped) override def equals(obj: scala.Any): Boolean = obj match { case other: HashEqualsIntArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) - case _ => false + case _ => false } } -final class HashEqualsShortArrayWrapper(override val wrapped: Array[Short]) extends HashEqualsArrayWrapper[Short] { +final class HashEqualsShortArrayWrapper(override val wrapped: Array[Short]) + extends HashEqualsArrayWrapper[Short] { override def hashCode(): Int = util.Arrays.hashCode(wrapped) override def equals(obj: scala.Any): Boolean = obj match { case other: HashEqualsShortArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) - case _ => false + case _ => false } } -final class HashEqualsCharArrayWrapper(override val wrapped: Array[Char]) extends HashEqualsArrayWrapper[Char] { +final class HashEqualsCharArrayWrapper(override val wrapped: Array[Char]) + extends HashEqualsArrayWrapper[Char] { override def hashCode(): Int = util.Arrays.hashCode(wrapped) override def equals(obj: scala.Any): Boolean = obj match { case other: HashEqualsCharArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) - case _ => false + case _ => false } } -final class HashEqualsByteArrayWrapper(override val wrapped: Array[Byte]) extends HashEqualsArrayWrapper[Byte] { +final class HashEqualsByteArrayWrapper(override val wrapped: Array[Byte]) + extends HashEqualsArrayWrapper[Byte] { override def hashCode(): Int = util.Arrays.hashCode(wrapped) override def equals(obj: scala.Any): Boolean = obj match { case other: HashEqualsByteArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) - case _ => false + case _ => false } } -final class HashEqualsBooleanArrayWrapper(override val wrapped: Array[Boolean]) extends HashEqualsArrayWrapper[Boolean] { +final class HashEqualsBooleanArrayWrapper(override val wrapped: Array[Boolean]) + extends HashEqualsArrayWrapper[Boolean] { override def hashCode(): Int = util.Arrays.hashCode(wrapped) override def equals(obj: scala.Any): Boolean = obj match { case other: HashEqualsBooleanArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) - case _ => false + case _ => false } } -final class HashEqualsFloatArrayWrapper(override val wrapped: Array[Float]) extends HashEqualsArrayWrapper[Float] { +final class HashEqualsFloatArrayWrapper(override val wrapped: Array[Float]) + extends HashEqualsArrayWrapper[Float] { override def hashCode(): Int = util.Arrays.hashCode(wrapped) override def equals(obj: scala.Any): Boolean = obj match { case other: HashEqualsFloatArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) - case _ => false + case _ => false } } -final class HashEqualsDoubleArrayWrapper(override val wrapped: Array[Double]) extends HashEqualsArrayWrapper[Double] { +final class HashEqualsDoubleArrayWrapper(override val wrapped: Array[Double]) + extends HashEqualsArrayWrapper[Double] { override def hashCode(): Int = util.Arrays.hashCode(wrapped) override def equals(obj: scala.Any): Boolean = obj match { case other: HashEqualsDoubleArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) - case _ => false + case _ => false } } -final class HashEqualsObjectArrayWrapper[T](override val wrapped: Array[T]) extends HashEqualsArrayWrapper[T] { +final class HashEqualsObjectArrayWrapper[T](override val wrapped: Array[T]) + extends HashEqualsArrayWrapper[T] 
{ private val wrappedInternal = wrapped.toSeq override def hashCode(): Int = wrappedInternal.hashCode() override def equals(obj: scala.Any): Boolean = obj match { case other: HashEqualsObjectArrayWrapper[T] => wrappedInternal.equals(other.wrappedInternal) - case _ => false + case _ => false } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/Joiner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/Joiner.scala index 3e5d65e7d3..a9c66be55e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/Joiner.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/Joiner.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed object Joiner extends java.io.Serializable { @@ -46,8 +46,7 @@ object Joiner extends java.io.Serializable { RightJoin() /** - * Optimizers want to match on the kinds of joins we are doing. - * This gives them that ability + * Optimizers want to match on the kinds of joins we are doing. This gives them that ability */ sealed abstract class HashJoinFunction[K, V, U, R] extends Function3[K, V, Iterable[U], Iterator[R]] @@ -57,58 +56,73 @@ object Joiner extends java.io.Serializable { final case class HashLeft[K, V, U]() extends HashJoinFunction[K, V, U, (V, Option[U])] { def apply(k: K, v: V, u: Iterable[U]) = asOuter(u.iterator).map((v, _)) } - final case class FilteredHashJoin[K, V1, V2, R](jf: HashJoinFunction[K, V1, V2, R], fn: ((K, R)) => Boolean) extends HashJoinFunction[K, V1, V2, R] { + final case class FilteredHashJoin[K, V1, V2, R](jf: HashJoinFunction[K, V1, V2, R], fn: ((K, R)) => Boolean) + extends HashJoinFunction[K, V1, V2, R] { def apply(k: K, left: V1, right: Iterable[V2]) = - jf.apply(k, left, right).filter { r => fn((k, r)) } + jf.apply(k, left, right).filter(r => fn((k, r))) } - final case class MappedHashJoin[K, V1, V2, R, R1](jf: HashJoinFunction[K, V1, V2, R], fn: R => R1) extends HashJoinFunction[K, V1, V2, R1] { + final case class MappedHashJoin[K, V1, V2, R, R1](jf: HashJoinFunction[K, V1, V2, R], fn: R => R1) + extends HashJoinFunction[K, V1, V2, R1] { def apply(k: K, left: V1, right: Iterable[V2]) = jf.apply(k, left, right).map(fn) } - final case class FlatMappedHashJoin[K, V1, V2, R, R1](jf: HashJoinFunction[K, V1, V2, R], fn: R => TraversableOnce[R1]) extends HashJoinFunction[K, V1, V2, R1] { + final case class FlatMappedHashJoin[K, V1, V2, R, R1]( + jf: HashJoinFunction[K, V1, V2, R], + fn: R => TraversableOnce[R1] + ) extends HashJoinFunction[K, V1, V2, R1] { def apply(k: K, left: V1, right: Iterable[V2]) = jf.apply(k, left, right).flatMap(fn) } - sealed abstract class JoinFunction[K, V1, V2, R] extends Function3[K, Iterator[V1], Iterable[V2], Iterator[R]] + sealed abstract class JoinFunction[K, V1, V2, R] + extends Function3[K, Iterator[V1], Iterable[V2], Iterator[R]] final case class InnerJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (V1, V2)] { def apply(k: K, left: Iterator[V1], right: Iterable[V2]): Iterator[(V1, V2)] = - left.flatMap { v1 => right.iterator.map((v1, _)) } + left.flatMap(v1 => right.iterator.map((v1, _))) } final case class LeftJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (V1, Option[V2])] { def apply(k: K, left: Iterator[V1], right: Iterable[V2]): Iterator[(V1, Option[V2])] = - left.flatMap { v1 => 
asOuter(right.iterator).map((v1, _)) } + left.flatMap(v1 => asOuter(right.iterator).map((v1, _))) } final case class RightJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (Option[V1], V2)] { def apply(k: K, left: Iterator[V1], right: Iterable[V2]): Iterator[(Option[V1], V2)] = - asOuter(left).flatMap { v1 => right.iterator.map((v1, _)) } + asOuter(left).flatMap(v1 => right.iterator.map((v1, _))) } final case class OuterJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (Option[V1], Option[V2])] { def apply(k: K, left: Iterator[V1], right: Iterable[V2]): Iterator[(Option[V1], Option[V2])] = if (left.isEmpty && right.isEmpty) Iterator.empty - else asOuter(left).flatMap { v1 => asOuter(right.iterator).map((v1, _)) } + else asOuter(left).flatMap(v1 => asOuter(right.iterator).map((v1, _))) } - final case class FilteredJoin[K, V1, V2, R](jf: JoinFunction[K, V1, V2, R], fn: ((K, R)) => Boolean) extends JoinFunction[K, V1, V2, R] { + final case class FilteredJoin[K, V1, V2, R](jf: JoinFunction[K, V1, V2, R], fn: ((K, R)) => Boolean) + extends JoinFunction[K, V1, V2, R] { def apply(k: K, left: Iterator[V1], right: Iterable[V2]) = - jf.apply(k, left, right).filter { r => fn((k, r)) } + jf.apply(k, left, right).filter(r => fn((k, r))) } - final case class MappedJoin[K, V1, V2, R, R1](jf: JoinFunction[K, V1, V2, R], fn: R => R1) extends JoinFunction[K, V1, V2, R1] { + final case class MappedJoin[K, V1, V2, R, R1](jf: JoinFunction[K, V1, V2, R], fn: R => R1) + extends JoinFunction[K, V1, V2, R1] { def apply(k: K, left: Iterator[V1], right: Iterable[V2]) = jf.apply(k, left, right).map(fn) } - final case class FlatMappedJoin[K, V1, V2, R, R1](jf: JoinFunction[K, V1, V2, R], fn: R => TraversableOnce[R1]) extends JoinFunction[K, V1, V2, R1] { + final case class FlatMappedJoin[K, V1, V2, R, R1]( + jf: JoinFunction[K, V1, V2, R], + fn: R => TraversableOnce[R1] + ) extends JoinFunction[K, V1, V2, R1] { def apply(k: K, left: Iterator[V1], right: Iterable[V2]) = jf.apply(k, left, right).flatMap(fn) } - final case class MappedGroupJoin[K, V1, V2, R, R1](jf: JoinFunction[K, V1, V2, R], fn: (K, Iterator[R]) => Iterator[R1]) extends JoinFunction[K, V1, V2, R1] { + final case class MappedGroupJoin[K, V1, V2, R, R1]( + jf: JoinFunction[K, V1, V2, R], + fn: (K, Iterator[R]) => Iterator[R1] + ) extends JoinFunction[K, V1, V2, R1] { def apply(k: K, left: Iterator[V1], right: Iterable[V2]) = { val iterr = jf.apply(k, left, right) if (iterr.isEmpty) Iterator.empty // mapGroup operates on non-empty groups else fn(k, iterr) } } - final case class JoinFromHashJoin[K, V1, V2, R](hj: (K, V1, Iterable[V2]) => Iterator[R]) extends JoinFunction[K, V1, V2, R] { + final case class JoinFromHashJoin[K, V1, V2, R](hj: (K, V1, Iterable[V2]) => Iterator[R]) + extends JoinFunction[K, V1, V2, R] { def apply(k: K, itv: Iterator[V1], itu: Iterable[V2]) = itv.flatMap(hj(k, _, itu)) } @@ -116,50 +130,58 @@ object Joiner extends java.io.Serializable { /** * an inner-like join function is empty definitely if either side is empty */ - final def isInnerJoinLike[K, V1, V2, R](jf: (K, Iterator[V1], Iterable[V2]) => Iterator[R]): Option[Boolean] = + final def isInnerJoinLike[K, V1, V2, R]( + jf: (K, Iterator[V1], Iterable[V2]) => Iterator[R] + ): Option[Boolean] = jf match { - case InnerJoin() => Some(true) - case LeftJoin() => Some(false) - case RightJoin() => Some(false) - case OuterJoin() => Some(false) - case JoinFromHashJoin(hj) => isInnerHashJoinLike(hj) - case FilteredJoin(jf, _) => isInnerJoinLike(jf) - case MappedJoin(jf, _) => 
isInnerJoinLike(jf) - case FlatMappedJoin(jf, _) => isInnerJoinLike(jf) + case InnerJoin() => Some(true) + case LeftJoin() => Some(false) + case RightJoin() => Some(false) + case OuterJoin() => Some(false) + case JoinFromHashJoin(hj) => isInnerHashJoinLike(hj) + case FilteredJoin(jf, _) => isInnerJoinLike(jf) + case MappedJoin(jf, _) => isInnerJoinLike(jf) + case FlatMappedJoin(jf, _) => isInnerJoinLike(jf) case MappedGroupJoin(jf, _) => isInnerJoinLike(jf) - case _ => None + case _ => None } + /** * a left-like join function is empty definitely if the left side is empty */ - final def isLeftJoinLike[K, V1, V2, R](jf: (K, Iterator[V1], Iterable[V2]) => Iterator[R]): Option[Boolean] = + final def isLeftJoinLike[K, V1, V2, R]( + jf: (K, Iterator[V1], Iterable[V2]) => Iterator[R] + ): Option[Boolean] = jf match { - case InnerJoin() => Some(true) - case JoinFromHashJoin(hj) => isInnerHashJoinLike(hj) - case LeftJoin() => Some(true) - case RightJoin() => Some(false) - case OuterJoin() => Some(false) - case FilteredJoin(jf, _) => isLeftJoinLike(jf) - case MappedJoin(jf, _) => isLeftJoinLike(jf) - case FlatMappedJoin(jf, _) => isLeftJoinLike(jf) + case InnerJoin() => Some(true) + case JoinFromHashJoin(hj) => isInnerHashJoinLike(hj) + case LeftJoin() => Some(true) + case RightJoin() => Some(false) + case OuterJoin() => Some(false) + case FilteredJoin(jf, _) => isLeftJoinLike(jf) + case MappedJoin(jf, _) => isLeftJoinLike(jf) + case FlatMappedJoin(jf, _) => isLeftJoinLike(jf) case MappedGroupJoin(jf, _) => isLeftJoinLike(jf) - case _ => None + case _ => None } + /** * a right-like join function is empty definitely if the right side is empty */ - final def isRightJoinLike[K, V1, V2, R](jf: (K, Iterator[V1], Iterable[V2]) => Iterator[R]): Option[Boolean] = + final def isRightJoinLike[K, V1, V2, R]( + jf: (K, Iterator[V1], Iterable[V2]) => Iterator[R] + ): Option[Boolean] = jf match { - case InnerJoin() => Some(true) - case JoinFromHashJoin(hj) => isInnerHashJoinLike(hj) - case LeftJoin() => Some(false) - case RightJoin() => Some(true) - case OuterJoin() => Some(false) - case FilteredJoin(jf, _) => isRightJoinLike(jf) - case MappedJoin(jf, _) => isRightJoinLike(jf) - case FlatMappedJoin(jf, _) => isRightJoinLike(jf) + case InnerJoin() => Some(true) + case JoinFromHashJoin(hj) => isInnerHashJoinLike(hj) + case LeftJoin() => Some(false) + case RightJoin() => Some(true) + case OuterJoin() => Some(false) + case FilteredJoin(jf, _) => isRightJoinLike(jf) + case MappedJoin(jf, _) => isRightJoinLike(jf) + case FlatMappedJoin(jf, _) => isRightJoinLike(jf) case MappedGroupJoin(jf, _) => isRightJoinLike(jf) - case _ => None + case _ => None } /** @@ -167,12 +189,11 @@ object Joiner extends java.io.Serializable { */ final def isInnerHashJoinLike[K, V1, V2, R](jf: (K, V1, Iterable[V2]) => Iterator[R]): Option[Boolean] = jf match { - case HashInner() => Some(true) - case HashLeft() => Some(false) - case FilteredHashJoin(jf, _) => isInnerHashJoinLike(jf) - case MappedHashJoin(jf, _) => isInnerHashJoinLike(jf) + case HashInner() => Some(true) + case HashLeft() => Some(false) + case FilteredHashJoin(jf, _) => isInnerHashJoinLike(jf) + case MappedHashJoin(jf, _) => isInnerHashJoinLike(jf) case FlatMappedHashJoin(jf, _) => isInnerHashJoinLike(jf) - case _ => None + case _ => None } } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedList.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedList.scala index cf9f35775e..1319a1c498 100644 --- 
a/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedList.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedList.scala @@ -12,52 +12,54 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed import java.io.Serializable import scala.collection.JavaConverters._ -import com.twitter.algebird.{ Fold, Semigroup, Ring, Aggregator } +import com.twitter.algebird.{Aggregator, Fold, Ring, Semigroup} import com.twitter.algebird.mutable.PriorityQueueMonoid import com.twitter.scalding.typed.functions._ object KeyedListLike { + /** KeyedListLike items are implicitly convertable to TypedPipe */ - implicit def toTypedPipe[K, V, S[K, +V] <: KeyedListLike[K, V, S]](keyed: KeyedListLike[K, V, S]): TypedPipe[(K, V)] = keyed.toTypedPipe + implicit def toTypedPipe[K, V, S[K, +V] <: KeyedListLike[K, V, S]]( + keyed: KeyedListLike[K, V, S] + ): TypedPipe[(K, V)] = keyed.toTypedPipe - implicit def toTypedPipeKeyed[K, V, S[K, +V] <: KeyedListLike[K, V, S]](keyed: KeyedListLike[K, V, S]): TypedPipe.Keyed[K, V] = + implicit def toTypedPipeKeyed[K, V, S[K, +V] <: KeyedListLike[K, V, S]]( + keyed: KeyedListLike[K, V, S] + ): TypedPipe.Keyed[K, V] = new TypedPipe.Keyed(keyed.toTypedPipe) } /** - * This is for the case where you don't want to expose any structure - * but the ability to operate on an iterator of the values + * This is for the case where you don't want to expose any structure but the ability to operate on an iterator + * of the values */ trait KeyedList[K, +T] extends KeyedListLike[K, T, KeyedList] /** - * Represents sharded lists of items of type T - * There are exactly two fundamental operations: - * toTypedPipe: marks the end of the grouped-on-key operations. - * mapValueStream: further transforms all values, in order, one at a time, - * with a function from Iterator to another Iterator + * Represents sharded lists of items of type T There are exactly two fundamental operations: toTypedPipe: + * marks the end of the grouped-on-key operations. mapValueStream: further transforms all values, in order, + * one at a time, with a function from Iterator to another Iterator */ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Serializable { /** - * End of the operations on values. From this point on the keyed structure - * is lost and another shuffle is generally required to reconstruct it + * End of the operations on values. From this point on the keyed structure is lost and another shuffle is + * generally required to reconstruct it */ def toTypedPipe: TypedPipe[(K, T)] /** - * This is like take except that the items are kept in memory - * and we attempt to partially execute on the mappers if possible - * For very large values of n, this could create memory pressure. - * (as you may aggregate n items in a memory heap for each key) - * If you get OOM issues, try to resolve using the method `take` instead. + * This is like take except that the items are kept in memory and we attempt to partially execute on the + * mappers if possible For very large values of n, this could create memory pressure. (as you may aggregate + * n items in a memory heap for each key) If you get OOM issues, try to resolve using the method `take` + * instead. 
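// A minimal usage sketch of the map-side behaviour described above: bufferedTake
// builds a bounded heap per key on the mappers (via PriorityQueueMonoid), so at
// most n values per key are shuffled to the reducers. The `clicks` pipe and its
// contents are hypothetical, invented only for illustration.
import com.twitter.scalding.typed.TypedPipe

object BufferedTakeSketch {
  val clicks: TypedPipe[(String, Long)] =
    TypedPipe.from(Seq(("user1", 3L), ("user1", 7L), ("user2", 1L)))

  // keep at most 100 values per user, partially evaluated on the mappers
  val sampled: TypedPipe[(String, Long)] =
    clicks.group.bufferedTake(100).toTypedPipe
}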
*/ def bufferedTake(n: Int): This[K, T] /* @@ -88,11 +90,10 @@ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Se } } - */ + */ /** - * filter keys on a predicate. More efficient than filter if you are - * only looking at keys + * filter keys on a predicate. More efficient than filter if you are only looking at keys */ def filterKeys(fn: K => Boolean): This[K, T] /* an inefficient implementation is below, but @@ -105,13 +106,11 @@ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Se */ /** - * Operate on an Iterator[T] of all the values for each key at one time. - * Prefer this to toList, when you can avoid accumulating the whole list in memory. - * Prefer sum, which is partially executed map-side by default. + * Operate on an Iterator[T] of all the values for each key at one time. Prefer this to toList, when you can + * avoid accumulating the whole list in memory. Prefer sum, which is partially executed map-side by default. * Use mapValueStream when you don't care about the key for the group. * - * Iterator is always Non-empty. - * Note, any key that has all values removed will not appear in subsequent + * Iterator is always Non-empty. Note, any key that has all values removed will not appear in subsequent * .mapGroup/mapValueStream */ def mapGroup[V](smfn: (K, Iterator[T]) => Iterator[V]): This[K, V] @@ -129,88 +128,76 @@ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Se .mapValues[C](AggPresent(agg)) /** - * .filter(fn).toTypedPipe == .toTypedPipe.filter(fn) - * It is generally better to avoid going back to a TypedPipe - * as long as possible: this minimizes the times we go in - * and out of cascading/hadoop types. + * .filter(fn).toTypedPipe == .toTypedPipe.filter(fn) It is generally better to avoid going back to a + * TypedPipe as long as possible: this minimizes the times we go in and out of cascading/hadoop types. */ def filter(fn: ((K, T)) => Boolean): This[K, T] = mapGroup(FilterGroup(fn)) /** - * flatten the values - * Useful after sortedTake, for instance + * flatten the values Useful after sortedTake, for instance */ def flattenValues[U](implicit ev: T <:< TraversableOnce[U]): This[K, U] = flatMapValues(Widen(SubTypes.fromEv(ev))) /** - * This is just short hand for mapValueStream(identity), it makes sure the - * planner sees that you want to force a shuffle. For expert tuning + * This is just short hand for mapValueStream(identity), it makes sure the planner sees that you want to + * force a shuffle. For expert tuning */ def forceToReducers: This[K, T] = mapValueStream(Identity()) /** - * Use this to get the first value encountered. - * prefer this to take(1). + * Use this to get the first value encountered. prefer this to take(1). */ def head: This[K, T] = sum(HeadSemigroup[T]()) /** - * This is a special case of mapValueStream, but can be optimized because it doesn't need - * all the values for a given key at once. An unoptimized implementation is: - * mapValueStream { _.map { fn } } - * but for Grouped we can avoid resorting to mapValueStream + * This is a special case of mapValueStream, but can be optimized because it doesn't need all the values for + * a given key at once. 
An unoptimized implementation is: mapValueStream { _.map { fn } } but for Grouped we + * can avoid resorting to mapValueStream */ def mapValues[V](fn: T => V): This[K, V] = mapGroup(MapGroupMapValues(fn)) /** - * Similar to mapValues, but works like flatMap, returning a collection of outputs - * for each value input. + * Similar to mapValues, but works like flatMap, returning a collection of outputs for each value input. */ def flatMapValues[V](fn: T => TraversableOnce[V]): This[K, V] = mapGroup(MapGroupFlatMapValues(fn)) /** - * Use this when you don't care about the key for the group, - * otherwise use mapGroup + * Use this when you don't care about the key for the group, otherwise use mapGroup */ def mapValueStream[V](smfn: Iterator[T] => Iterator[V]): This[K, V] = mapGroup(MapValueStream(smfn)) /** - * Add all items according to the implicit Semigroup - * If there is no sorting, we default to assuming the Semigroup is - * commutative. If you don't want that, define an ordering on the Values, - * sort or .forceToReducers. + * Add all items according to the implicit Semigroup If there is no sorting, we default to assuming the + * Semigroup is commutative. If you don't want that, define an ordering on the Values, sort or + * .forceToReducers. * - * Semigroups MAY have a faster implementation of sum for iterators, - * so prefer using sum/sumLeft to reduce + * Semigroups MAY have a faster implementation of sum for iterators, so prefer using sum/sumLeft to reduce */ def sum[U >: T](implicit sg: Semigroup[U]): This[K, U] = sumLeft[U] /** - * reduce with fn which must be associative and commutative. - * Like the above this can be optimized in some Grouped cases. - * If you don't have a commutative operator, use reduceLeft + * reduce with fn which must be associative and commutative. Like the above this can be optimized in some + * Grouped cases. If you don't have a commutative operator, use reduceLeft */ def reduce[U >: T](fn: (U, U) => U): This[K, U] = sum(SemigroupFromFn(fn)) /** - * Take the largest k things according to the implicit ordering. - * Useful for top-k without having to call ord.reverse + * Take the largest k things according to the implicit ordering. Useful for top-k without having to call + * ord.reverse */ def sortedReverseTake[U >: T](k: Int)(implicit ord: Ordering[U]): This[K, Seq[U]] = sortedTake[U](k)(ord.reverse) /** - * This implements bottom-k (smallest k items) on each mapper for each key, then - * sends those to reducers to get the result. This is faster - * than using .take if k * (number of Keys) is small enough - * to fit in memory. + * This implements bottom-k (smallest k items) on each mapper for each key, then sends those to reducers to + * get the result. This is faster than using .take if k * (number of Keys) is small enough to fit in memory. */ def sortedTake[U >: T](k: Int)(implicit ord: Ordering[U]): This[K, Seq[U]] = { val mon = new PriorityQueueMonoid[U](k)(ord) @@ -232,7 +219,7 @@ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Se def count(fn: T => Boolean): This[K, Long] = mapValues(Count(fn)).sum - /** For each key, check to see if a predicate is true for all Values*/ + /** For each key, check to see if a predicate is true for all Values */ def forall(fn: T => Boolean): This[K, Boolean] = mapValues(fn).product @@ -263,16 +250,14 @@ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Se mapValueStream(TakeWhile(p)) /** - * Folds are composable aggregations that make one pass over the data. 
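// A hypothetical sketch of the sortedTake / sortedReverseTake described above:
// per-key bottom-k and top-k computed with a bounded priority queue on the
// mappers and merged on the reducers. The `scores` data is invented.
import com.twitter.scalding.typed.TypedPipe

object SortedTakeSketch {
  val scores: TypedPipe[(String, Double)] =
    TypedPipe.from(Seq(("a", 1.0), ("a", 5.0), ("a", 3.0), ("b", 2.0)))

  // the 2 smallest scores per key
  val bottom2: TypedPipe[(String, Seq[Double])] =
    scores.group.sortedTake(2).toTypedPipe

  // the 2 largest scores per key (sortedTake under the reversed ordering)
  val top2: TypedPipe[(String, Seq[Double])] =
    scores.group.sortedReverseTake(2).toTypedPipe
}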
- * If you need to do several custom folds over the same data, use Fold.join - * and this method + * Folds are composable aggregations that make one pass over the data. If you need to do several custom + * folds over the same data, use Fold.join and this method */ def fold[V](f: Fold[T, V]): This[K, V] = mapValueStream(FoldIterator(f)) /** - * If the fold depends on the key, use this method to construct - * the fold for each key + * If the fold depends on the key, use this method to construct the fold for each key */ def foldWithKey[V](fn: K => Fold[T, V]): This[K, V] = mapGroup(FoldWithKeyIterator(fn)) @@ -286,17 +271,16 @@ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Se mapValueStream(ScanLeftIterator(z, fn)) /** - * Similar to reduce but always on the reduce-side (never optimized to mapside), - * and named for the scala function. fn need not be associative and/or commutative. - * Makes sense when you want to reduce, but in a particular sorted order. - * the old value comes in on the left. + * Similar to reduce but always on the reduce-side (never optimized to mapside), and named for the scala + * function. fn need not be associative and/or commutative. Makes sense when you want to reduce, but in a + * particular sorted order. the old value comes in on the left. */ def reduceLeft[U >: T](fn: (U, U) => U): This[K, U] = sumLeft[U](SemigroupFromFn(fn)) /** - * Semigroups MAY have a faster implementation of sum for iterators, - * so prefer using sum/sumLeft to reduce/reduceLeft + * Semigroups MAY have a faster implementation of sum for iterators, so prefer using sum/sumLeft to + * reduce/reduceLeft */ def sumLeft[U >: T](implicit sg: Semigroup[U]): This[K, U] = mapValueStream[U](SumAll(sg)) @@ -305,71 +289,66 @@ trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Se def size: This[K, Long] = mapValues(Constant(1L)).sum /** - * For each key, give the number of unique values. WARNING: May OOM. - * This assumes the values for each key can fit in memory. + * For each key, give the number of unique values. WARNING: May OOM. This assumes the values for each key + * can fit in memory. */ def distinctSize: This[K, Long] = toSet[T].mapValues(SizeOfSet()) /** - * For each key, remove duplicate values. WARNING: May OOM. - * This assumes the values for each key can fit in memory. + * For each key, remove duplicate values. WARNING: May OOM. This assumes the values for each key can fit in + * memory. */ def distinctValues: This[K, T] = toSet[T].flattenValues /** - * AVOID THIS IF POSSIBLE - * For each key, accumulate all the values into a List. WARNING: May OOM - * Only use this method if you are sure all the values will fit in memory. - * You really should try to ask why you need all the values, and if you - * want to do some custom reduction, do it in mapGroup or mapValueStream + * AVOID THIS IF POSSIBLE For each key, accumulate all the values into a List. WARNING: May OOM Only use + * this method if you are sure all the values will fit in memory. You really should try to ask why you need + * all the values, and if you want to do some custom reduction, do it in mapGroup or mapValueStream * - * This does no map-side aggregation even though it is a Monoid because - * toList does not decrease the size of the data at all, so in practice - * it only wastes effort to try to cache. 
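// An illustrative sketch, under assumed data, of the composable-folds idea noted
// above: two custom folds over the same values combined with Fold.join, so each
// key's values are traversed only once. The `events` pipe is hypothetical.
import com.twitter.algebird.Fold
import com.twitter.scalding.typed.TypedPipe

object FoldSketch {
  val events: TypedPipe[(String, Int)] =
    TypedPipe.from(Seq(("k", 2), ("k", 5), ("j", 1)))

  val sumFold: Fold[Int, Long] = Fold.foldLeft(0L)((acc, v) => acc + v)
  val countFold: Fold[Int, Long] = Fold.foldLeft(0L)((acc, _) => acc + 1L)

  // one pass per key, producing (sum, count)
  val sumAndCount: TypedPipe[(String, (Long, Long))] =
    events.group.fold(sumFold.join(countFold)).toTypedPipe
}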
+ * This does no map-side aggregation even though it is a Monoid because toList does not decrease the size of + * the data at all, so in practice it only wastes effort to try to cache. */ def toList: This[K, List[T]] = mapValueStream(ToList[T]()) + /** - * AVOID THIS IF POSSIBLE - * Same risks apply here as to toList: you may OOM. See toList. - * Note that toSet needs to be parameterized even though toList does not. - * This is because List is covariant in its type parameter in the scala API, - * but Set is invariant. See: + * AVOID THIS IF POSSIBLE Same risks apply here as to toList: you may OOM. See toList. Note that toSet needs + * to be parameterized even though toList does not. This is because List is covariant in its type parameter + * in the scala API, but Set is invariant. See: * http://stackoverflow.com/questions/676615/why-is-scalas-immutable-set-not-covariant-in-its-type */ def toSet[U >: T]: This[K, Set[U]] = mapValues(ToSet[U]()).sum - /** For each key, give the maximum value*/ + /** For each key, give the maximum value */ def max[B >: T](implicit cmp: Ordering[B]): This[K, T] = reduce(MaxOrd[T, B](cmp)) - /** For each key, give the maximum value by some function*/ + /** For each key, give the maximum value by some function */ def maxBy[B](fn: T => B)(implicit cmp: Ordering[B]): This[K, T] = reduce(MaxOrdBy(fn, cmp)) - /** For each key, give the minimum value*/ + /** For each key, give the minimum value */ def min[B >: T](implicit cmp: Ordering[B]): This[K, T] = reduce(MinOrd[T, B](cmp)) - /** For each key, give the minimum value by some function*/ + /** For each key, give the minimum value by some function */ def minBy[B](fn: T => B)(implicit cmp: Ordering[B]): This[K, T] = reduce(MinOrdBy(fn, cmp)) - - /** Use this to error if there is more than 1 value per key - * Using this makes it easier to detect when data does - * not have the shape you expect and to communicate to - * scalding that certain optimizations are safe to do + /** + * Use this to error if there is more than 1 value per key Using this makes it easier to detect when data + * does not have the shape you expect and to communicate to scalding that certain optimizations are safe to + * do * - * Note, this has no effect and is a waste to call - * after sum because it is true by construction at that - * point + * Note, this has no effect and is a waste to call after sum because it is true by construction at that + * point */ def requireSingleValuePerKey: This[K, T] = mapValueStream(SumAll(RequireSingleSemigroup())) /** Convert to a TypedPipe and only keep the keys */ def keys: TypedPipe[K] = toTypedPipe.keys + /** Convert to a TypedPipe and only keep the values */ def values: TypedPipe[T] = toTypedPipe.values } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedPipe.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedPipe.scala index 4f7377f231..f00ccd046a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedPipe.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedPipe.scala @@ -12,13 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed /** - * Represents anything that starts as a TypedPipe of Key Value, where - * the value type has been erased. 
Acts as proof that the K in the tuple - * has an Ordering + * Represents anything that starts as a TypedPipe of Key Value, where the value type has been erased. Acts as + * proof that the K in the tuple has an Ordering */ trait KeyedPipe[K] { def keyOrdering: Ordering[K] diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/LookupJoin.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/LookupJoin.scala index 4df34601a0..4ea0099c13 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/LookupJoin.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/LookupJoin.scala @@ -37,167 +37,156 @@ import com.twitter.algebird.Semigroup */ /** - * lookupJoin simulates the behavior of a realtime system attempting - * to leftJoin (K, V) pairs against some other value type (JoinedV) - * by performing realtime lookups on a key-value Store. + * lookupJoin simulates the behavior of a realtime system attempting to leftJoin (K, V) pairs against some + * other value type (JoinedV) by performing realtime lookups on a key-value Store. * - * An example would join (K, V) pairs of (URL, Username) against a - * service of (URL, ImpressionCount). The result of this join would - * be a pipe of (ShortenedURL, (Username, - * Option[ImpressionCount])). + * An example would join (K, V) pairs of (URL, Username) against a service of (URL, ImpressionCount). The + * result of this join would be a pipe of (ShortenedURL, (Username, Option[ImpressionCount])). * - * To simulate this behavior, lookupJoin accepts pipes of key-value - * pairs with an explicit time value T attached. T must have some - * sensible ordering. The semantics are, if one were to hit the - * right pipe's simulated realtime service at any time between - * T(tuple) T(tuple + 1), one would receive Some((K, + * To simulate this behavior, lookupJoin accepts pipes of key-value pairs with an explicit time value T + * attached. T must have some sensible ordering. The semantics are, if one were to hit the right pipe's + * simulated realtime service at any time between T(tuple) T(tuple + 1), one would receive Some((K, * JoinedV)(tuple)). * - * The entries in the left pipe's tuples have the following - * meaning: + * The entries in the left pipe's tuples have the following meaning: * - * T: The time at which the (K, W) lookup occurred. - * K: the join key. - * W: the current value for the join key. + * T: The time at which the (K, W) lookup occurred. K: the join key. W: the current value for the join key. * * The right pipe's entries have the following meaning: * - * T: The time at which the "service" was fed an update - * K: the join K. - * V: value of the key at time T + * T: The time at which the "service" was fed an update K: the join K. V: value of the key at time T * - * Before the time T in the right pipe's very first entry, the - * simulated "service" will return None. After this time T, the - * right side will return None only if the key is absent, - * else, the service will return Some(joinedV). + * Before the time T in the right pipe's very first entry, the simulated "service" will return None. After + * this time T, the right side will return None only if the key is absent, else, the service will return + * Some(joinedV). 
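// A hypothetical illustration of the semantics described above: each left event
// (time, (key, value)) is joined against the most recent right-side write that
// happened before it; before the first write the join yields None. All data
// below is invented, and withWindow could bound the join to a recent interval.
import com.twitter.scalding.typed.{LookupJoin, TypedPipe}

object LookupJoinSketch {
  val lookups: TypedPipe[(Long, (String, String))] =
    TypedPipe.from(Seq((10L, ("url/a", "alice")), (20L, ("url/a", "bob"))))

  val serviceWrites: TypedPipe[(Long, (String, Int))] =
    TypedPipe.from(Seq((5L, ("url/a", 1)), (15L, ("url/a", 2))))

  // at t=10 alice sees Some(1); at t=20 bob sees Some(2)
  val joined: TypedPipe[(Long, (String, (String, Option[Int])))] =
    LookupJoin(lookups, serviceWrites)
}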
*/ object LookupJoin extends Serializable { /** - * This is the "infinite history" join and always joins regardless of how - * much time is between the left and the right + * This is the "infinite history" join and always joins regardless of how much time is between the left and + * the right */ def apply[T: Ordering, K: Ordering, V, JoinedV]( - left: TypedPipe[(T, (K, V))], - right: TypedPipe[(T, (K, JoinedV))], - reducers: Option[Int] = None): TypedPipe[(T, (K, (V, Option[JoinedV])))] = - + left: TypedPipe[(T, (K, V))], + right: TypedPipe[(T, (K, JoinedV))], + reducers: Option[Int] = None + ): TypedPipe[(T, (K, (V, Option[JoinedV])))] = withWindow(left, right, reducers)((_, _) => true) /** - * In this case, the right pipe is fed through a scanLeft doing a Semigroup.plus - * before joined to the left + * In this case, the right pipe is fed through a scanLeft doing a Semigroup.plus before joined to the left */ - def rightSumming[T: Ordering, K: Ordering, V, JoinedV: Semigroup](left: TypedPipe[(T, (K, V))], - right: TypedPipe[(T, (K, JoinedV))], - reducers: Option[Int] = None): TypedPipe[(T, (K, (V, Option[JoinedV])))] = + def rightSumming[T: Ordering, K: Ordering, V, JoinedV: Semigroup]( + left: TypedPipe[(T, (K, V))], + right: TypedPipe[(T, (K, JoinedV))], + reducers: Option[Int] = None + ): TypedPipe[(T, (K, (V, Option[JoinedV])))] = withWindowRightSumming(left, right, reducers)((_, _) => true) /** - * This ensures that gate(Tleft, Tright) == true, else the None is emitted - * as the joined value. - * Useful for bounding the time of the join to a recent window + * This ensures that gate(Tleft, Tright) == true, else the None is emitted as the joined value. Useful for + * bounding the time of the join to a recent window */ - def withWindow[T: Ordering, K: Ordering, V, JoinedV](left: TypedPipe[(T, (K, V))], - right: TypedPipe[(T, (K, JoinedV))], - reducers: Option[Int] = None)(gate: (T, T) => Boolean): TypedPipe[(T, (K, (V, Option[JoinedV])))] = { + def withWindow[T: Ordering, K: Ordering, V, JoinedV]( + left: TypedPipe[(T, (K, V))], + right: TypedPipe[(T, (K, JoinedV))], + reducers: Option[Int] = None + )(gate: (T, T) => Boolean): TypedPipe[(T, (K, (V, Option[JoinedV])))] = { - implicit val keepNew: Semigroup[JoinedV] = Semigroup.from { (older, newer) => newer } + implicit val keepNew: Semigroup[JoinedV] = Semigroup.from((older, newer) => newer) withWindowRightSumming(left, right, reducers)(gate) } /** - * This ensures that gate(Tleft, Tright) == true, else the None is emitted - * as the joined value, and sums are only done as long as they they come - * within the gate interval as well + * This ensures that gate(Tleft, Tright) == true, else the None is emitted as the joined value, and sums are + * only done as long as they they come within the gate interval as well */ - def withWindowRightSumming[T: Ordering, K: Ordering, V, JoinedV: Semigroup](left: TypedPipe[(T, (K, V))], - right: TypedPipe[(T, (K, JoinedV))], - reducers: Option[Int] = None)(gate: (T, T) => Boolean): TypedPipe[(T, (K, (V, Option[JoinedV])))] = { + def withWindowRightSumming[T: Ordering, K: Ordering, V, JoinedV: Semigroup]( + left: TypedPipe[(T, (K, V))], + right: TypedPipe[(T, (K, JoinedV))], + reducers: Option[Int] = None + )(gate: (T, T) => Boolean): TypedPipe[(T, (K, (V, Option[JoinedV])))] = { + /** - * Implicit ordering on an either that doesn't care about the - * actual container values, puts the lookups before the service writes - * Since we assume it takes non-zero time to do a lookup. 
+ * Implicit ordering on an either that doesn't care about the actual container values, puts the lookups + * before the service writes Since we assume it takes non-zero time to do a lookup. */ implicit def eitherOrd[T, U]: Ordering[Either[T, U]] = new Ordering[Either[T, U]] { def compare(l: Either[T, U], r: Either[T, U]) = (l, r) match { - case (Left(_), Right(_)) => -1 - case (Right(_), Left(_)) => 1 - case (Left(_), Left(_)) => 0 + case (Left(_), Right(_)) => -1 + case (Right(_), Left(_)) => 1 + case (Left(_), Left(_)) => 0 case (Right(_), Right(_)) => 0 } } val joined: TypedPipe[(K, (Option[(T, JoinedV)], Option[(T, V, Option[JoinedV])]))] = - left.map { case (t, (k, v)) => (k, (t, Left(v): Either[V, JoinedV])) } - .++(right.map { - case (t, (k, joinedV)) => - (k, (t, Right(joinedV): Either[V, JoinedV])) + left + .map { case (t, (k, v)) => (k, (t, Left(v): Either[V, JoinedV])) } + .++(right.map { case (t, (k, joinedV)) => + (k, (t, Right(joinedV): Either[V, JoinedV])) }) .group .withReducers(reducers.getOrElse(-1)) // -1 means default in scalding .sorted /** - * Grouping by K leaves values of (T, Either[V, JoinedV]). Sort - * by time and scanLeft. The iterator will now represent pairs of - * T and either new values to join against or updates to the - * simulated "realtime store" described above. + * Grouping by K leaves values of (T, Either[V, JoinedV]). Sort by time and scanLeft. The iterator + * will now represent pairs of T and either new values to join against or updates to the simulated + * "realtime store" described above. */ .scanLeft( /** - * In the simulated realtime store described above, this - * None is the value in the store at the current - * time. Because we sort by time and scan forward, this - * value will be updated with a new value every time a - * Right(delta) shows up in the iterator. + * In the simulated realtime store described above, this None is the value in the store at the + * current time. Because we sort by time and scan forward, this value will be updated with a new + * value every time a Right(delta) shows up in the iterator. * - * The second entry in the pair will be None when the - * JoinedV is updated and Some(newValue) when a (K, V) - * shows up and a new join occurs. + * The second entry in the pair will be None when the JoinedV is updated and Some(newValue) when a + * (K, V) shows up and a new join occurs. */ - (Option.empty[(T, JoinedV)], Option.empty[(T, V, Option[JoinedV])])) { - case ((None, result), (time, Left(v))) => { - // The was no value previously - (None, Some((time, v, None))) - } - - case ((prev @ Some((oldt, jv)), result), (time, Left(v))) => { - // Left(v) means that we have a new value from the left - // pipe that we need to join against the current - // "lastJoined" value sitting in scanLeft's state. This - // is equivalent to a lookup on the data in the right - // pipe at time "thisTime". - val filteredJoined = if (gate(time, oldt)) Some(jv) else None - (prev, Some((time, v, filteredJoined))) - } - - case ((None, result), (time, Right(joined))) => { - // There was no value before, so we just update to joined - (Some((time, joined)), None) - } - - case ((Some((oldt, oldJ)), result), (time, Right(joined))) => { - // Right(joinedV) means that we've received a new value - // to use in the simulated realtime service - // described in the comments above - // did it fall out of cache? 
- val nextJoined = if (gate(time, oldt)) Semigroup.plus(oldJ, joined) else joined - (Some((time, nextJoined)), None) - } - }.toTypedPipe + (Option.empty[(T, JoinedV)], Option.empty[(T, V, Option[JoinedV])]) + ) { + case ((None, result), (time, Left(v))) => { + // The was no value previously + (None, Some((time, v, None))) + } - // Now, get rid of residual state from the scanLeft above: - joined.flatMap { - case (k, (_, optV)) => - // filter out every event that produced a Right(delta) above, - // leaving only the leftJoin events that occurred above: - optV.map { - case (t, v, optJoined) => (t, (k, (v, optJoined))) + case ((prev @ Some((oldt, jv)), result), (time, Left(v))) => { + // Left(v) means that we have a new value from the left + // pipe that we need to join against the current + // "lastJoined" value sitting in scanLeft's state. This + // is equivalent to a lookup on the data in the right + // pipe at time "thisTime". + val filteredJoined = if (gate(time, oldt)) Some(jv) else None + (prev, Some((time, v, filteredJoined))) + } + + case ((None, result), (time, Right(joined))) => { + // There was no value before, so we just update to joined + (Some((time, joined)), None) + } + + case ((Some((oldt, oldJ)), result), (time, Right(joined))) => { + // Right(joinedV) means that we've received a new value + // to use in the simulated realtime service + // described in the comments above + // did it fall out of cache? + val nextJoined = if (gate(time, oldt)) Semigroup.plus(oldJ, joined) else joined + (Some((time, nextJoined)), None) + } } + .toTypedPipe + + // Now, get rid of residual state from the scanLeft above: + joined.flatMap { case (k, (_, optV)) => + // filter out every event that produced a Right(delta) above, + // leaving only the leftJoin events that occurred above: + optV.map { case (t, v, optJoined) => + (t, (k, (v, optJoined))) + } } } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/MemorySink.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/MemorySink.scala index af19568b43..cf8c39b7ec 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/MemorySink.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/MemorySink.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed import com.twitter.scalding._ diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/MultiJoin.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/MultiJoin.scala index 6ac4a59e49..f65803028f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/MultiJoin.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/MultiJoin.scala @@ -2,9 +2,8 @@ package com.twitter.scalding.typed /** - * This is an autogenerated object which gives you easy access to - * doing N-way joins so the types are cleaner. However, it just calls - * the underlying methods on CoGroupable and flattens the resulting tuple + * This is an autogenerated object which gives you easy access to doing N-way joins so the types are cleaner. 
+ * However, it just calls the underlying methods on CoGroupable and flattens the resulting tuple */ object MultiJoin extends java.io.Serializable { import com.twitter.scalding.typed.FlattenGroup._ @@ -12,42 +11,81 @@ object MultiJoin extends java.io.Serializable { def apply[KEY, A, B](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B]): CoGrouped[KEY, (A, B)] = a.join(b) - def apply[KEY, A, B, C](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C]): CoGrouped[KEY, (A, B, C)] = + def apply[KEY, A, B, C]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C] + ): CoGrouped[KEY, (A, B, C)] = a.join(b) .join(c) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D]): CoGrouped[KEY, (A, B, C, D)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D] + ): CoGrouped[KEY, (A, B, C, D)] = a.join(b) .join(c) .join(d) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E]): CoGrouped[KEY, (A, B, C, D, E)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E] + ): CoGrouped[KEY, (A, B, C, D, E)] = a.join(b) .join(c) .join(d) .join(e) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F]): CoGrouped[KEY, (A, B, C, D, E, F)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F] + ): CoGrouped[KEY, (A, B, C, D, E, F)] = a.join(b) .join(c) .join(d) .join(e) .join(f) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G]): CoGrouped[KEY, (A, B, C, D, E, F, G)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G] + ): CoGrouped[KEY, (A, B, C, D, E, F, G)] = a.join(b) .join(c) .join(d) .join(e) .join(f) .join(g) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H]): CoGrouped[KEY, (A, B, C, D, E, F, G, H)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H)] = 
a.join(b) .join(c) .join(d) @@ -55,9 +93,19 @@ object MultiJoin extends java.io.Serializable { .join(f) .join(g) .join(h) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I)] = a.join(b) .join(c) .join(d) @@ -66,9 +114,20 @@ object MultiJoin extends java.io.Serializable { .join(g) .join(h) .join(i) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J)] = a.join(b) .join(c) .join(d) @@ -78,9 +137,21 @@ object MultiJoin extends java.io.Serializable { .join(h) .join(i) .join(j) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K)] = a.join(b) .join(c) .join(d) @@ -91,9 +162,22 @@ object MultiJoin extends java.io.Serializable { .join(i) .join(j) .join(k) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], 
+ f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L)] = a.join(b) .join(c) .join(d) @@ -105,9 +189,23 @@ object MultiJoin extends java.io.Serializable { .join(j) .join(k) .join(l) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M)] = a.join(b) .join(c) .join(d) @@ -120,9 +218,24 @@ object MultiJoin extends java.io.Serializable { .join(k) .join(l) .join(m) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N)] = a.join(b) .join(c) .join(d) @@ -136,9 +249,25 @@ object MultiJoin extends java.io.Serializable { .join(l) .join(m) .join(n) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: 
CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O)] = a.join(b) .join(c) .join(d) @@ -153,9 +282,26 @@ object MultiJoin extends java.io.Serializable { .join(m) .join(n) .join(o) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P)] = a.join(b) .join(c) .join(d) @@ -171,9 +317,27 @@ object MultiJoin extends java.io.Serializable { .join(n) .join(o) .join(p) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q)] = a.join(b) .join(c) .join(d) @@ -190,9 +354,28 @@ object MultiJoin extends java.io.Serializable { .join(o) .join(p) .join(q) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, 
I, J, K, L, M, N, O, P, Q, R)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R)] = a.join(b) .join(c) .join(d) @@ -210,9 +393,29 @@ object MultiJoin extends java.io.Serializable { .join(p) .join(q) .join(r) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S)] = a.join(b) .join(c) .join(d) @@ -231,9 +434,30 @@ object MultiJoin extends java.io.Serializable { .join(q) .join(r) .join(s) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S], t: CoGroupable[KEY, T]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + 
r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T)] = a.join(b) .join(c) .join(d) @@ -253,9 +477,31 @@ object MultiJoin extends java.io.Serializable { .join(r) .join(s) .join(t) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S], t: CoGroupable[KEY, T], u: CoGroupable[KEY, U]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U)] = a.join(b) .join(c) .join(d) @@ -276,9 +522,32 @@ object MultiJoin extends java.io.Serializable { .join(s) .join(t) .join(u) - .mapValues { tup => flattenNestedTuple(tup) } - - def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S], t: CoGroupable[KEY, T], u: CoGroupable[KEY, U], v: CoGroupable[KEY, V]): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V)] = + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U], + v: CoGroupable[KEY, V] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V)] = a.join(b) .join(c) .join(d) @@ -300,47 +569,86 @@ object MultiJoin extends java.io.Serializable { .join(t) .join(u) .join(v) - 
.mapValues { tup => flattenNestedTuple(tup) } + .mapValues(tup => flattenNestedTuple(tup)) def left[KEY, A, B](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B]): CoGrouped[KEY, (A, Option[B])] = a.leftJoin(b) - def left[KEY, A, B, C](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C]): CoGrouped[KEY, (A, Option[B], Option[C])] = + def left[KEY, A, B, C]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C] + ): CoGrouped[KEY, (A, Option[B], Option[C])] = a.leftJoin(b) .leftJoin(c) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D])] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E])] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) .leftJoin(e) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F])] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) .leftJoin(e) .leftJoin(f) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G])] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) .leftJoin(e) .leftJoin(f) .leftJoin(g) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, 
A, B, C, D, E, F, G, H]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H])] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -348,9 +656,22 @@ object MultiJoin extends java.io.Serializable { .leftJoin(f) .leftJoin(g) .leftJoin(h) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I] + ): CoGrouped[ + KEY, + (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I]) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -359,9 +680,23 @@ object MultiJoin extends java.io.Serializable { .leftJoin(g) .leftJoin(h) .leftJoin(i) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J] + ): CoGrouped[ + KEY, + (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J]) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -371,9 +706,36 @@ object MultiJoin extends java.io.Serializable { .leftJoin(h) .leftJoin(i) .leftJoin(j) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], 
+ Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -384,9 +746,38 @@ object MultiJoin extends java.io.Serializable { .leftJoin(i) .leftJoin(j) .leftJoin(k) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -398,9 +789,40 @@ object MultiJoin extends java.io.Serializable { .leftJoin(j) .leftJoin(k) .leftJoin(l) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -413,9 +835,42 @@ object MultiJoin extends java.io.Serializable { .leftJoin(k) .leftJoin(l) .leftJoin(m) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, 
E, F, G, H, I, J, K, L, M, N]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -429,9 +884,44 @@ object MultiJoin extends java.io.Serializable { .leftJoin(l) .leftJoin(m) .leftJoin(n) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -446,9 +936,46 @@ object MultiJoin extends java.io.Serializable { .leftJoin(m) .leftJoin(n) .leftJoin(o) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + 
Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -464,9 +991,48 @@ object MultiJoin extends java.io.Serializable { .leftJoin(n) .leftJoin(o) .leftJoin(p) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -483,9 +1049,50 @@ object MultiJoin extends java.io.Serializable { .leftJoin(o) .leftJoin(p) .leftJoin(q) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + 
Option[P], + Option[Q], + Option[R] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -503,9 +1110,52 @@ object MultiJoin extends java.io.Serializable { .leftJoin(p) .leftJoin(q) .leftJoin(r) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -524,9 +1174,54 @@ object MultiJoin extends java.io.Serializable { .leftJoin(q) .leftJoin(r) .leftJoin(s) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S], t: CoGroupable[KEY, T]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S], Option[T])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T] + ): CoGrouped[ + KEY, + ( + A, + 
Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -546,9 +1241,56 @@ object MultiJoin extends java.io.Serializable { .leftJoin(r) .leftJoin(s) .leftJoin(t) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S], t: CoGroupable[KEY, T], u: CoGroupable[KEY, U]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S], Option[T], Option[U])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -569,9 +1311,58 @@ object MultiJoin extends java.io.Serializable { .leftJoin(s) .leftJoin(t) .leftJoin(u) - .mapValues { tup => flattenNestedTuple(tup) } - - def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S], t: CoGroupable[KEY, T], u: CoGroupable[KEY, U], v: CoGroupable[KEY, V]): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S], Option[T], Option[U], Option[V])] = + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: 
CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U], + v: CoGroupable[KEY, V] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U], + Option[V] + ) + ] = a.leftJoin(b) .leftJoin(c) .leftJoin(d) @@ -593,47 +1384,92 @@ object MultiJoin extends java.io.Serializable { .leftJoin(t) .leftJoin(u) .leftJoin(v) - .mapValues { tup => flattenNestedTuple(tup) } + .mapValues(tup => flattenNestedTuple(tup)) - def outer[KEY, A, B](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B]): CoGrouped[KEY, (Option[A], Option[B])] = + def outer[KEY, A, B]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B] + ): CoGrouped[KEY, (Option[A], Option[B])] = a.outerJoin(b) - def outer[KEY, A, B, C](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C]): CoGrouped[KEY, (Option[A], Option[B], Option[C])] = + def outer[KEY, A, B, C]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C])] = a.outerJoin(b) .outerJoin(c) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D])] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E])] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) .outerJoin(e) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F])] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) .outerJoin(e) .outerJoin(f) - .mapValues { tup => 
flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G])] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) .outerJoin(e) .outerJoin(f) .outerJoin(g) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H] + ): CoGrouped[ + KEY, + (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H]) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -641,9 +1477,22 @@ object MultiJoin extends java.io.Serializable { .outerJoin(f) .outerJoin(g) .outerJoin(h) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I] + ): CoGrouped[ + KEY, + (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I]) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -652,9 +1501,34 @@ object MultiJoin extends java.io.Serializable { .outerJoin(g) .outerJoin(h) .outerJoin(i) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + 
g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -664,9 +1538,36 @@ object MultiJoin extends java.io.Serializable { .outerJoin(h) .outerJoin(i) .outerJoin(j) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -677,9 +1578,38 @@ object MultiJoin extends java.io.Serializable { .outerJoin(i) .outerJoin(j) .outerJoin(k) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -691,9 +1621,40 @@ object MultiJoin extends java.io.Serializable { .outerJoin(j) .outerJoin(k) .outerJoin(l) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M])] 
= + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -706,9 +1667,42 @@ object MultiJoin extends java.io.Serializable { .outerJoin(k) .outerJoin(l) .outerJoin(m) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -722,9 +1716,44 @@ object MultiJoin extends java.io.Serializable { .outerJoin(l) .outerJoin(m) .outerJoin(n) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + 
Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -739,9 +1768,46 @@ object MultiJoin extends java.io.Serializable { .outerJoin(m) .outerJoin(n) .outerJoin(o) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -757,9 +1823,48 @@ object MultiJoin extends java.io.Serializable { .outerJoin(n) .outerJoin(o) .outerJoin(p) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -776,9 +1881,50 @@ 
object MultiJoin extends java.io.Serializable { .outerJoin(o) .outerJoin(p) .outerJoin(q) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -796,9 +1942,52 @@ object MultiJoin extends java.io.Serializable { .outerJoin(p) .outerJoin(q) .outerJoin(r) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], 
+ Option[Q], + Option[R], + Option[S] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -817,9 +2006,54 @@ object MultiJoin extends java.io.Serializable { .outerJoin(q) .outerJoin(r) .outerJoin(s) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S], t: CoGroupable[KEY, T]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S], Option[T])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -839,9 +2073,56 @@ object MultiJoin extends java.io.Serializable { .outerJoin(r) .outerJoin(s) .outerJoin(t) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S], t: CoGroupable[KEY, T], u: CoGroupable[KEY, U]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S], Option[T], Option[U])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: 
CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -862,9 +2143,58 @@ object MultiJoin extends java.io.Serializable { .outerJoin(s) .outerJoin(t) .outerJoin(u) - .mapValues { tup => flattenNestedOptionTuple(tup) } - - def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B], c: CoGroupable[KEY, C], d: CoGroupable[KEY, D], e: CoGroupable[KEY, E], f: CoGroupable[KEY, F], g: CoGroupable[KEY, G], h: CoGroupable[KEY, H], i: CoGroupable[KEY, I], j: CoGroupable[KEY, J], k: CoGroupable[KEY, K], l: CoGroupable[KEY, L], m: CoGroupable[KEY, M], n: CoGroupable[KEY, N], o: CoGroupable[KEY, O], p: CoGroupable[KEY, P], q: CoGroupable[KEY, Q], r: CoGroupable[KEY, R], s: CoGroupable[KEY, S], t: CoGroupable[KEY, T], u: CoGroupable[KEY, U], v: CoGroupable[KEY, V]): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], Option[J], Option[K], Option[L], Option[M], Option[N], Option[O], Option[P], Option[Q], Option[R], Option[S], Option[T], Option[U], Option[V])] = + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U], + v: CoGroupable[KEY, V] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U], + Option[V] + ) + ] = a.outerJoin(b) .outerJoin(c) .outerJoin(d) @@ -886,7 +2216,7 @@ object MultiJoin extends java.io.Serializable { .outerJoin(t) .outerJoin(u) .outerJoin(v) - .mapValues { tup => flattenNestedOptionTuple(tup) } + .mapValues(tup => flattenNestedOptionTuple(tup)) } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/MultiJoinFunction.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/MultiJoinFunction.scala index 53c1b59ccb..71cb096a6a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/MultiJoinFunction.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/MultiJoinFunction.scala @@ -4,20 +4,16 @@ import com.twitter.scalding.serialization.Externalizer import java.io.Serializable /** - * This is a weakly typed multi-way join function. 
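
[Editor's aside, not part of the patch] The MultiJoin.outer overloads reformatted above all follow the same shape: chain outerJoin across the inputs, then flatten the nested Option tuple. A minimal sketch of the smaller-arity form, assuming the scalding-core typed API (TypedPipe.from, .group) and the three-way overload generated earlier in the same file:

  // illustrative sketch only; types and imports are assumptions, not patch content
  import com.twitter.scalding.typed.{MultiJoin, TypedPipe}

  val clicks: TypedPipe[(Long, Int)]    = TypedPipe.from(Seq(1L -> 3, 2L -> 1))
  val views: TypedPipe[(Long, Int)]     = TypedPipe.from(Seq(1L -> 10))
  val buys: TypedPipe[(Long, Double)]   = TypedPipe.from(Seq(2L -> 9.99))

  // outer keeps every key seen on any side; absent sides become None
  val joined: TypedPipe[(Long, (Option[Int], Option[Int], Option[Double]))] =
    MultiJoin.outer(clicks.group, views.group, buys.group).toTypedPipe
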
By construction, - * it should be kept in sync with the types in a Seq[TypedPipe[(K, Any)]] + * This is a weakly typed multi-way join function. By construction, it should be kept in sync with the types + * in a Seq[TypedPipe[(K, Any)]] * - * a more sophisticated typing could use an HList of TypedPipe - * and another more advanced coding here to prove the types line up. - * However, this is somewhat easy to test and only exposed to - * those writing backends, so we are currently satisfied with the - * weak typing in this case + * a more sophisticated typing could use an HList of TypedPipe and another more advanced coding here to prove + * the types line up. However, this is somewhat easy to test and only exposed to those writing backends, so we + * are currently satisfied with the weak typing in this case * - * We use Externalizer internally to independently serialize each function - * in the composition. This, in principle, should allow Externalizer - * to work better since different functions may be serializable with - * Kryo or Java, but currently Externalizer has to use java or kryo - * for the entire object. + * We use Externalizer internally to independently serialize each function in the composition. This, in + * principle, should allow Externalizer to work better since different functions may be serializable with Kryo + * or Java, but currently Externalizer has to use java or kryo for the entire object. */ sealed abstract class MultiJoinFunction[A, +B] extends Serializable { def inputSize: Int @@ -34,9 +30,10 @@ object MultiJoinFunction extends Serializable { } final case class PairCachedRight[K, A, B, C]( - left: MultiJoinFunction[K, A], - right: MultiJoinFunction[K, B], - @transient fn: (K, Iterator[A], Iterable[B]) => Iterator[C]) extends MultiJoinFunction[K, C] { + left: MultiJoinFunction[K, A], + right: MultiJoinFunction[K, B], + @transient fn: (K, Iterator[A], Iterable[B]) => Iterator[C] + ) extends MultiJoinFunction[K, C] { private[this] val fnEx = Externalizer(fn) @@ -54,7 +51,10 @@ object MultiJoinFunction extends Serializable { * correctness due to the weak types that MultiJoinFunction has (non-static size of Seq and * the use of Any) */ - require(rightStreams.size == inputSize - 1, s"expected ${inputSize} inputSize, found ${rightStreams.size + 1}") + require( + rightStreams.size == inputSize - 1, + s"expected $inputSize inputSize, found ${rightStreams.size + 1}" + ) val (leftSeq, rightSeq) = rightStreams.splitAt(leftSeqCount) val joinedLeft = left(key, leftMost, leftSeq) @@ -65,9 +65,10 @@ object MultiJoinFunction extends Serializable { } final case class Pair[K, A, B, C]( - left: MultiJoinFunction[K, A], - right: MultiJoinFunction[K, B], - @transient fn: (K, Iterator[A], Iterable[B]) => Iterator[C]) extends MultiJoinFunction[K, C] { + left: MultiJoinFunction[K, A], + right: MultiJoinFunction[K, B], + @transient fn: (K, Iterator[A], Iterable[B]) => Iterator[C] + ) extends MultiJoinFunction[K, C] { private[this] val fnEx = Externalizer(fn) @@ -85,7 +86,10 @@ object MultiJoinFunction extends Serializable { * correctness due to the weak types that MultiJoinFunction has (non-static size of Seq and * the use of Any) */ - require(rightStreams.size == inputSize - 1, s"expected ${inputSize} inputSize, found ${rightStreams.size + 1}") + require( + rightStreams.size == inputSize - 1, + s"expected $inputSize inputSize, found ${rightStreams.size + 1}" + ) val (leftSeq, rightSeq) = rightStreams.splitAt(leftSeqCount) val joinedLeft = left(key, leftMost, leftSeq) @@ -109,8 +113,9 @@ 
object MultiJoinFunction extends Serializable { * This is used to implement mapGroup on already joined streams */ final case class MapGroup[K, A, B]( - input: MultiJoinFunction[K, A], - @transient mapGroupFn: (K, Iterator[A]) => Iterator[B]) extends MultiJoinFunction[K, B] { + input: MultiJoinFunction[K, A], + @transient mapGroupFn: (K, Iterator[A]) => Iterator[B] + ) extends MultiJoinFunction[K, B] { private[this] val fnEx = Externalizer(mapGroupFn) @@ -123,11 +128,11 @@ object MultiJoinFunction extends Serializable { } /** - * This is used to join IteratorMappedReduce with others. - * We could compose Casting[A] with MapGroup[K, A, B] but since it is common enough we give - * it its own case. + * This is used to join IteratorMappedReduce with others. We could compose Casting[A] with MapGroup[K, A, B] + * but since it is common enough we give it its own case. */ - final case class MapCast[K, A, B](@transient mapGroupFn: (K, Iterator[A]) => Iterator[B]) extends MultiJoinFunction[K, B] { + final case class MapCast[K, A, B](@transient mapGroupFn: (K, Iterator[A]) => Iterator[B]) + extends MultiJoinFunction[K, B] { private[this] val fnEx = Externalizer(mapGroupFn) @@ -139,4 +144,3 @@ object MultiJoinFunction extends Serializable { } } } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/NoStackAndThen.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/NoStackAndThen.scala index b935e1a91d..a7569f34bb 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/NoStackAndThen.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/NoStackAndThen.scala @@ -12,13 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed /** - * This type is used to implement .andThen on a function in a way - * that will never blow up the stack. This is done to prevent - * deep scalding TypedPipe pipelines from blowing the stack + * This type is used to implement .andThen on a function in a way that will never blow up the stack. 
This is + * done to prevent deep scalding TypedPipe pipelines from blowing the stack * * This may be slow, but is used in scalding at planning time */ @@ -28,17 +27,20 @@ sealed trait NoStackAndThen[-A, +B] extends java.io.Serializable { def andThen[C](that: NoStackAndThen[B, C]): NoStackAndThen[A, C] = { import NoStackAndThen._ @annotation.tailrec - def push(front: NoStackAndThen[A, Any], - next: NoStackAndThen[Any, Any], - toAndThen: ReversedStack[Any, C]): NoStackAndThen[A, C] = + def push( + front: NoStackAndThen[A, Any], + next: NoStackAndThen[Any, Any], + toAndThen: ReversedStack[Any, C] + ): NoStackAndThen[A, C] = (next, toAndThen) match { case (NoStackWrap(fn), EmptyStack(fn2)) => NoStackMore(front, fn).andThen(fn2) - case (NoStackWrap(fn), NonEmpty(h, tail)) => push(NoStackMore(front, fn), NoStackAndThen.NoStackWrap(h), tail) + case (NoStackWrap(fn), NonEmpty(h, tail)) => + push(NoStackMore(front, fn), NoStackAndThen.NoStackWrap(h), tail) case (NoStackMore(first, tail), _) => push(front, first, NonEmpty(tail, toAndThen)) - case (WithStackTrace(_, _), _) => sys.error("should be unreachable") + case (WithStackTrace(_, _), _) => sys.error("should be unreachable") } that match { - case NoStackWrap(fn) => andThen(fn) + case NoStackWrap(fn) => andThen(fn) case NoStackMore(head, tail) => // casts needed for the tailrec, they can't cause runtime errors push(this, head.asInstanceOf[NoStackAndThen[Any, Any]], EmptyStack(tail)) @@ -54,9 +56,13 @@ object NoStackAndThen { private sealed trait ReversedStack[-A, +B] private final case class EmptyStack[-A, +B](fn: A => B) extends ReversedStack[A, B] - private final case class NonEmpty[-A, B, +C](head: A => B, rest: ReversedStack[B, C]) extends ReversedStack[A, C] + private final case class NonEmpty[-A, B, +C](head: A => B, rest: ReversedStack[B, C]) + extends ReversedStack[A, C] - private[scalding] final case class WithStackTrace[A, B](inner: NoStackAndThen[A, B], stackEntry: Array[StackTraceElement]) extends NoStackAndThen[A, B] { + private[scalding] final case class WithStackTrace[A, B]( + inner: NoStackAndThen[A, B], + stackEntry: Array[StackTraceElement] + ) extends NoStackAndThen[A, B] { override def apply(a: A): B = inner(a) override def andThen[C](fn: B => C): NoStackAndThen[A, C] = @@ -71,7 +77,8 @@ object NoStackAndThen { def apply(a: A) = fn(a) } // This is the defunctionalized andThen - private final case class NoStackMore[A, B, C](first: NoStackAndThen[A, B], andThenFn: (B) => C) extends NoStackAndThen[A, C] { + private final case class NoStackMore[A, B, C](first: NoStackAndThen[A, B], andThenFn: (B) => C) + extends NoStackAndThen[A, C] { /* * scala cannot optimize tail calls if the types change. * Any call that changes types, we replace that type with Any. 
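
[Editor's aside, not part of the patch] The defunctionalized andThen above exists so that very long composition chains are recorded as data and evaluated with a loop rather than nested calls. A sketch, assuming the companion object exposes an apply that wraps a plain function:

  // illustrative sketch only; NoStackAndThen.apply is assumed, not taken from the patch
  import com.twitter.scalding.typed.NoStackAndThen

  val inc: Int => Int = _ + 1
  // 100000 nested Function1.andThen calls could blow the stack when applied;
  // NoStackAndThen stores the chain and evaluates it iteratively via ReversedStack
  val chained: NoStackAndThen[Int, Int] =
    (1 to 100000).foldLeft(NoStackAndThen(inc))((acc, _) => acc.andThen(inc))

  assert(chained(0) == 100001)
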
These casts @@ -80,21 +87,20 @@ object NoStackAndThen { @annotation.tailrec private def reversed(toPush: NoStackAndThen[A, Any], rest: ReversedStack[Any, C]): ReversedStack[A, C] = toPush match { - case NoStackWrap(fn) => NonEmpty(fn, rest) + case NoStackWrap(fn) => NonEmpty(fn, rest) case NoStackMore(more, fn) => reversed(more, NonEmpty(fn, rest)) - case WithStackTrace(_, _) => sys.error("should be unreachable") + case WithStackTrace(_, _) => sys.error("should be unreachable") } @annotation.tailrec private def call(arg: Any, revstack: ReversedStack[Any, C]): C = revstack match { - case EmptyStack(last) => last(arg) + case EmptyStack(last) => last(arg) case NonEmpty(head, rest) => call(head(arg), rest) } private lazy val revStack: ReversedStack[Any, C] = // casts needed for the tailrec, they can't cause runtime errors - reversed(first, EmptyStack(andThenFn.asInstanceOf[(Any) => (C)])) + reversed(first, EmptyStack(andThenFn.asInstanceOf[(Any) => C])) .asInstanceOf[ReversedStack[Any, C]] def apply(a: A): C = call(a, revStack) } } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/OptimizationPhases.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/OptimizationPhases.scala index 6518fceb32..380e375085 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/OptimizationPhases.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/OptimizationPhases.scala @@ -3,8 +3,7 @@ package com.twitter.scalding.typed import com.stripe.dagon.Rule /** - * This is a class to allow customization - * of how we plan typed pipes + * This is a class to allow customization of how we plan typed pipes */ abstract class OptimizationPhases { def phases: Seq[Rule[TypedPipe]] diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/OptimizationRules.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/OptimizationRules.scala index 43992cf18f..6eda0bee0a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/OptimizationRules.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/OptimizationRules.scala @@ -1,21 +1,38 @@ package com.twitter.scalding.typed import com.twitter.algebird.Monoid -import com.stripe.dagon.{ FunctionK, Memoize, Rule, PartialRule, Dag, Literal } -import com.twitter.scalding.typed.functions.{ FlatMapping, FlatMappedFn, FlatMapValuesToFlatMap, FilterKeysToFilter, FilterGroup, Fill, MapValuesToMap, MapGroupMapValues, MapGroupFlatMapValues, MergeFlatMaps, SumAll, MapValueStream } -import com.twitter.scalding.typed.functions.ComposedFunctions.{ ComposedMapFn, ComposedFilterFn, ComposedOnComplete } +import com.stripe.dagon.{Dag, FunctionK, Literal, Memoize, PartialRule, Rule} +import com.twitter.scalding.typed.functions.{ + Fill, + FilterGroup, + FilterKeysToFilter, + FlatMapValuesToFlatMap, + FlatMappedFn, + FlatMapping, + MapGroupFlatMapValues, + MapGroupMapValues, + MapValueStream, + MapValuesToMap, + MergeFlatMaps, + SumAll +} +import com.twitter.scalding.typed.functions.ComposedFunctions.{ + ComposedFilterFn, + ComposedMapFn, + ComposedOnComplete +} object OptimizationRules { type LiteralPipe[T] = Literal[TypedPipe, T] - import Literal.{ Unary, Binary } + import Literal.{Binary, Unary} import TypedPipe._ /** - * Since our TypedPipe is covariant, but the Literal is not - * this is actually safe in this context, but not in general + * Since our TypedPipe is covariant, but the Literal is not this is actually safe in this context, but not + * in general */ - def widen[T](l: LiteralPipe[_ <: T]): 
LiteralPipe[T] = { + def widen[T](l: LiteralPipe[_ <: T]): LiteralPipe[T] = // to prove this is safe, see that if you have // LiteralPipe[_ <: T] we can call .evaluate to get // TypedPipe[_ <: T] which due to covariance is @@ -25,113 +42,124 @@ object OptimizationRules { // that would be wasteful to apply since the final // result is identity. l.asInstanceOf[LiteralPipe[T]] - } /** - * Convert a TypedPipe[T] to a Literal[TypedPipe, T] for - * use with Dagon + * Convert a TypedPipe[T] to a Literal[TypedPipe, T] for use with Dagon */ def toLiteral: FunctionK[TypedPipe, LiteralPipe] = - Memoize.functionK[TypedPipe, LiteralPipe]( - new Memoize.RecursiveK[TypedPipe, LiteralPipe] { - - def toFunction[A] = { - case (cp: CounterPipe[a], f) => - Unary(f(cp.pipe), CounterPipe(_: TypedPipe[(a, Iterable[((String, String), Long)])])) - case (c: CrossPipe[a, b], f) => - Binary(f(c.left), f(c.right), CrossPipe(_: TypedPipe[a], _: TypedPipe[b])) - case (cv@CrossValue(_, _), f) => - def go[A, B](cv: CrossValue[A, B]): LiteralPipe[(A, B)] = - cv match { - case CrossValue(a, ComputedValue(v)) => - Binary(f(a), f(v), { (a: TypedPipe[A], b: TypedPipe[B]) => + Memoize.functionK[TypedPipe, LiteralPipe](new Memoize.RecursiveK[TypedPipe, LiteralPipe] { + + def toFunction[A] = { + case (cp: CounterPipe[a], f) => + Unary(f(cp.pipe), CounterPipe(_: TypedPipe[(a, Iterable[((String, String), Long)])])) + case (c: CrossPipe[a, b], f) => + Binary(f(c.left), f(c.right), CrossPipe(_: TypedPipe[a], _: TypedPipe[b])) + case (cv @ CrossValue(_, _), f) => + def go[A, B](cv: CrossValue[A, B]): LiteralPipe[(A, B)] = + cv match { + case CrossValue(a, ComputedValue(v)) => + Binary( + f(a), + f(v), + (a: TypedPipe[A], b: TypedPipe[B]) => CrossValue(a, ComputedValue(b)) - }) - case CrossValue(a, v) => - Unary(f(a), CrossValue(_: TypedPipe[A], v)) - } - widen(go(cv)) - case (p: DebugPipe[a], f) => - Unary(f(p.input), DebugPipe(_: TypedPipe[a])) - case (p: FilterKeys[a, b], f) => - widen(Unary(f(p.input), FilterKeys(_: TypedPipe[(a, b)], p.fn))) - case (p: Filter[a], f) => - Unary(f(p.input), Filter(_: TypedPipe[a], p.fn)) - case (p: Fork[a], f) => - Unary(f(p.input), Fork(_: TypedPipe[a])) - case (p: FlatMapValues[a, b, c], f) => - widen(Unary(f(p.input), FlatMapValues(_: TypedPipe[(a, b)], p.fn))) - case (p: FlatMapped[a, b], f) => - Unary(f(p.input), FlatMapped(_: TypedPipe[a], p.fn)) - case (p: ForceToDisk[a], f) => - Unary(f(p.input), ForceToDisk(_: TypedPipe[a])) - case (it@IterablePipe(_), _) => - Literal.Const(it) - case (p: MapValues[a, b, c], f) => - widen(Unary(f(p.input), MapValues(_: TypedPipe[(a, b)], p.fn))) - case (p: Mapped[a, b], f) => - Unary(f(p.input), Mapped(_: TypedPipe[a], p.fn)) - case (p: MergedTypedPipe[a], f) => - Binary(f(p.left), f(p.right), MergedTypedPipe(_: TypedPipe[a], _: TypedPipe[a])) - case (src@SourcePipe(_), _) => - Literal.Const(src) - case (p: SumByLocalKeys[a, b], f) => - widen(Unary(f(p.input), SumByLocalKeys(_: TypedPipe[(a, b)], p.semigroup))) - case (p: TrappedPipe[a], f) => - Unary(f(p.input), TrappedPipe[a](_: TypedPipe[a], p.sink, p.conv)) - case (p: WithDescriptionTypedPipe[a], f) => - Unary(f(p.input), WithDescriptionTypedPipe(_: TypedPipe[a], p.descriptions)) - case (p: WithOnComplete[a], f) => - Unary(f(p.input), WithOnComplete(_: TypedPipe[a], p.fn)) - case (EmptyTypedPipe, _) => - Literal.Const(EmptyTypedPipe) - case (hg: HashCoGroup[a, b, c, d], f) => - widen(handleHashCoGroup(hg, f)) - case (CoGroupedPipe(cg), f) => - widen(handleCoGrouped(cg, f)) - case (ReduceStepPipe(rs), f) 
=> - widen(handleReduceStep(rs, f)) - } - }) - - private def handleReduceStep[K, V1, V2](rs: ReduceStep[K, V1, V2], recurse: FunctionK[TypedPipe, LiteralPipe]): LiteralPipe[(K, V2)] = { - // zero out the input so we can potentially GC it - val emptyRs = ReduceStep.setInput[K, V1, V2](rs, TypedPipe.empty) - - Unary(widen[(K, V1)](recurse(rs.mapped)), { (tp: TypedPipe[(K, V1)]) => + ) + case CrossValue(a, v) => + Unary(f(a), CrossValue(_: TypedPipe[A], v)) + } + widen(go(cv)) + case (p: DebugPipe[a], f) => + Unary(f(p.input), DebugPipe(_: TypedPipe[a])) + case (p: FilterKeys[a, b], f) => + widen(Unary(f(p.input), FilterKeys(_: TypedPipe[(a, b)], p.fn))) + case (p: Filter[a], f) => + Unary(f(p.input), Filter(_: TypedPipe[a], p.fn)) + case (p: Fork[a], f) => + Unary(f(p.input), Fork(_: TypedPipe[a])) + case (p: FlatMapValues[a, b, c], f) => + widen(Unary(f(p.input), FlatMapValues(_: TypedPipe[(a, b)], p.fn))) + case (p: FlatMapped[a, b], f) => + Unary(f(p.input), FlatMapped(_: TypedPipe[a], p.fn)) + case (p: ForceToDisk[a], f) => + Unary(f(p.input), ForceToDisk(_: TypedPipe[a])) + case (it @ IterablePipe(_), _) => + Literal.Const(it) + case (p: MapValues[a, b, c], f) => + widen(Unary(f(p.input), MapValues(_: TypedPipe[(a, b)], p.fn))) + case (p: Mapped[a, b], f) => + Unary(f(p.input), Mapped(_: TypedPipe[a], p.fn)) + case (p: MergedTypedPipe[a], f) => + Binary(f(p.left), f(p.right), MergedTypedPipe(_: TypedPipe[a], _: TypedPipe[a])) + case (src @ SourcePipe(_), _) => + Literal.Const(src) + case (p: SumByLocalKeys[a, b], f) => + widen(Unary(f(p.input), SumByLocalKeys(_: TypedPipe[(a, b)], p.semigroup))) + case (p: TrappedPipe[a], f) => + Unary(f(p.input), TrappedPipe[a](_: TypedPipe[a], p.sink, p.conv)) + case (p: WithDescriptionTypedPipe[a], f) => + Unary(f(p.input), WithDescriptionTypedPipe(_: TypedPipe[a], p.descriptions)) + case (p: WithOnComplete[a], f) => + Unary(f(p.input), WithOnComplete(_: TypedPipe[a], p.fn)) + case (EmptyTypedPipe, _) => + Literal.Const(EmptyTypedPipe) + case (hg: HashCoGroup[a, b, c, d], f) => + widen(handleHashCoGroup(hg, f)) + case (CoGroupedPipe(cg), f) => + widen(handleCoGrouped(cg, f)) + case (ReduceStepPipe(rs), f) => + widen(handleReduceStep(rs, f)) + } + }) + + private def handleReduceStep[K, V1, V2]( + rs: ReduceStep[K, V1, V2], + recurse: FunctionK[TypedPipe, LiteralPipe] + ): LiteralPipe[(K, V2)] = { + // zero out the input so we can potentially GC it + val emptyRs = ReduceStep.setInput[K, V1, V2](rs, TypedPipe.empty) + + Unary( + widen[(K, V1)](recurse(rs.mapped)), + (tp: TypedPipe[(K, V1)]) => ReduceStepPipe(ReduceStep.setInput[K, V1, V2](emptyRs, tp)) - }) - } + ) + } - private def handleCoGrouped[K, V](cg: CoGroupable[K, V], recurse: FunctionK[TypedPipe, LiteralPipe]): LiteralPipe[(K, V)] = { - import CoGrouped._ + private def handleCoGrouped[K, V]( + cg: CoGroupable[K, V], + recurse: FunctionK[TypedPipe, LiteralPipe] + ): LiteralPipe[(K, V)] = { + import CoGrouped._ - def pipeToCG[V1](t: TypedPipe[(K, V1)]): (CoGroupable[K, V1], List[(String, Boolean)]) = - t match { - case ReduceStepPipe(cg: CoGroupable[K @unchecked, V1 @unchecked]) => - // we are relying on the fact that we use Ordering[K] - // as a contravariant type, despite it not being defined - // that way. - (cg, Nil) - case CoGroupedPipe(cg) => - // we are relying on the fact that we use Ordering[K] - // as a contravariant type, despite it not being defined - // that way. 
- (cg.asInstanceOf[CoGroupable[K, V1]], Nil) - case WithDescriptionTypedPipe(pipe, descs) => - val (cg, d1) = pipeToCG(pipe) - (cg, ComposeDescriptions.combine(d1, descs)) - case kvPipe => - (IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly), Nil) - } + def pipeToCG[V1](t: TypedPipe[(K, V1)]): (CoGroupable[K, V1], List[(String, Boolean)]) = + t match { + case ReduceStepPipe(cg: CoGroupable[K @unchecked, V1 @unchecked]) => + // we are relying on the fact that we use Ordering[K] + // as a contravariant type, despite it not being defined + // that way. + (cg, Nil) + case CoGroupedPipe(cg) => + // we are relying on the fact that we use Ordering[K] + // as a contravariant type, despite it not being defined + // that way. + (cg.asInstanceOf[CoGroupable[K, V1]], Nil) + case WithDescriptionTypedPipe(pipe, descs) => + val (cg, d1) = pipeToCG(pipe) + (cg, ComposeDescriptions.combine(d1, descs)) + case kvPipe => + (IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly), Nil) + } - cg match { - case p@Pair(_, _, _) => - def go[A, B, C](pair: Pair[K, A, B, C]): LiteralPipe[(K, C)] = { - val llit = handleCoGrouped(pair.larger, recurse) - val rlit = handleCoGrouped(pair.smaller, recurse) - val fn = pair.fn - Binary(llit, rlit, { (l: TypedPipe[(K, A)], r: TypedPipe[(K, B)]) => + cg match { + case p @ Pair(_, _, _) => + def go[A, B, C](pair: Pair[K, A, B, C]): LiteralPipe[(K, C)] = { + val llit = handleCoGrouped(pair.larger, recurse) + val rlit = handleCoGrouped(pair.smaller, recurse) + val fn = pair.fn + Binary( + llit, + rlit, + { (l: TypedPipe[(K, A)], r: TypedPipe[(K, B)]) => val (left, d1) = pipeToCG(l) val (right, d2) = pipeToCG(r) val d3 = ComposeDescriptions.combine(d1, d2) @@ -140,29 +168,36 @@ object OptimizationRules { p.withDescription(d) } CoGroupedPipe(withD) - }) - } - widen(go(p)) - case wr@WithReducers(_, _) => - def go[V1 <: V](wr: WithReducers[K, V1]): LiteralPipe[(K, V)] = { - val reds = wr.reds - Unary[TypedPipe, (K, V1), (K, V)](handleCoGrouped(wr.on, recurse), { (tp: TypedPipe[(K, V1)]) => + } + ) + } + widen(go(p)) + case wr @ WithReducers(_, _) => + def go[V1 <: V](wr: WithReducers[K, V1]): LiteralPipe[(K, V)] = { + val reds = wr.reds + Unary[TypedPipe, (K, V1), (K, V)]( + handleCoGrouped(wr.on, recurse), + (tp: TypedPipe[(K, V1)]) => tp match { case ReduceStepPipe(rs) => ReduceStepPipe(ReduceStep.withReducers(rs, reds)) case CoGroupedPipe(cg) => CoGroupedPipe(WithReducers(cg, reds)) case kvPipe => - ReduceStepPipe(IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) - .withReducers(reds)) + ReduceStepPipe( + IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) + .withReducers(reds) + ) } - }) - } - go(wr) - case wd@WithDescription(_, _) => - def go[V1 <: V](wd: WithDescription[K, V1]): LiteralPipe[(K, V)] = { - val desc = wd.description - Unary[TypedPipe, (K, V1), (K, V)](handleCoGrouped(wd.on, recurse), { (tp: TypedPipe[(K, V1)]) => + ) + } + go(wr) + case wd @ WithDescription(_, _) => + def go[V1 <: V](wd: WithDescription[K, V1]): LiteralPipe[(K, V)] = { + val desc = wd.description + Unary[TypedPipe, (K, V1), (K, V)]( + handleCoGrouped(wd.on, recurse), + (tp: TypedPipe[(K, V1)]) => tp match { case ReduceStepPipe(rs) => ReduceStepPipe(ReduceStep.withDescription(rs, desc)) @@ -171,13 +206,15 @@ object OptimizationRules { case kvPipe => kvPipe.withDescription(desc) } - }) - } - go(wd) - case fk@FilterKeys(_, _) => - def go[V1 <: V](fk: FilterKeys[K, V1]): LiteralPipe[(K, V)] = { - val fn = fk.fn - 
Unary[TypedPipe, (K, V1), (K, V)](handleCoGrouped(fk.on, recurse), { (tp: TypedPipe[(K, V1)]) => + ) + } + go(wd) + case fk @ FilterKeys(_, _) => + def go[V1 <: V](fk: FilterKeys[K, V1]): LiteralPipe[(K, V)] = { + val fn = fk.fn + Unary[TypedPipe, (K, V1), (K, V)]( + handleCoGrouped(fk.on, recurse), + (tp: TypedPipe[(K, V1)]) => tp match { case ReduceStepPipe(rs) => val mapped = rs.mapped @@ -188,13 +225,15 @@ object OptimizationRules { case kvPipe => TypedPipe.FilterKeys(kvPipe, fn) } - }) - } - go(fk) - case mg@MapGroup(_, _) => - def go[V1, V2 <: V](mg: MapGroup[K, V1, V2]): LiteralPipe[(K, V)] = { - val fn = mg.fn - Unary[TypedPipe, (K, V1), (K, V)](handleCoGrouped(mg.on, recurse), { (tp: TypedPipe[(K, V1)]) => + ) + } + go(fk) + case mg @ MapGroup(_, _) => + def go[V1, V2 <: V](mg: MapGroup[K, V1, V2]): LiteralPipe[(K, V)] = { + val fn = mg.fn + Unary[TypedPipe, (K, V1), (K, V)]( + handleCoGrouped(mg.on, recurse), + (tp: TypedPipe[(K, V1)]) => tp match { case ReduceStepPipe(rs) => ReduceStepPipe(ReduceStep.mapGroup(rs)(fn)) @@ -203,45 +242,50 @@ object OptimizationRules { case kvPipe => ReduceStepPipe( IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) - .mapGroup(fn)) + .mapGroup(fn) + ) } - }) - } - go(mg) - case step@IdentityReduce(_, _, _, _, _) => - widen(handleReduceStep(step, recurse)) - case step@UnsortedIdentityReduce(_, _, _, _, _) => - widen(handleReduceStep(step, recurse)) - case step@IteratorMappedReduce(_, _, _, _, _) => - widen(handleReduceStep(step, recurse)) - } + ) + } + go(mg) + case step @ IdentityReduce(_, _, _, _, _) => + widen(handleReduceStep(step, recurse)) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + widen(handleReduceStep(step, recurse)) + case step @ IteratorMappedReduce(_, _, _, _, _) => + widen(handleReduceStep(step, recurse)) } + } - private def handleHashCoGroup[K, V, V2, R](hj: HashCoGroup[K, V, V2, R], recurse: FunctionK[TypedPipe, LiteralPipe]): LiteralPipe[(K, R)] = { - val rightLit: LiteralPipe[(K, V2)] = { - val rs = HashJoinable.toReduceStep(hj.right) - def go[A, B, C](rs: ReduceStep[A, B, C]): LiteralPipe[(A, C)] = - Unary(recurse(rs.mapped), { tp: TypedPipe[(A, B)] => ReduceStepPipe(ReduceStep.setInput(rs, tp)) }) - widen(go(rs)) - } - - val ordK: Ordering[K] = hj.right.keyOrdering - val joiner = hj.joiner - - Binary(recurse(hj.left), rightLit, - { (ltp: TypedPipe[(K, V)], rtp: TypedPipe[(K, V2)]) => - rtp match { - case ReduceStepPipe(hg: HashJoinable[K @unchecked, V2 @unchecked]) => - HashCoGroup(ltp, hg, joiner) - case otherwise => - HashCoGroup(ltp, IdentityReduce[K, V2, V2](ordK, otherwise, None, Nil, implicitly), joiner) - } - }) + private def handleHashCoGroup[K, V, V2, R]( + hj: HashCoGroup[K, V, V2, R], + recurse: FunctionK[TypedPipe, LiteralPipe] + ): LiteralPipe[(K, R)] = { + val rightLit: LiteralPipe[(K, V2)] = { + val rs = HashJoinable.toReduceStep(hj.right) + def go[A, B, C](rs: ReduceStep[A, B, C]): LiteralPipe[(A, C)] = + Unary(recurse(rs.mapped), { tp: TypedPipe[(A, B)] => ReduceStepPipe(ReduceStep.setInput(rs, tp)) }) + widen(go(rs)) } + val ordK: Ordering[K] = hj.right.keyOrdering + val joiner = hj.joiner + + Binary( + recurse(hj.left), + rightLit, + (ltp: TypedPipe[(K, V)], rtp: TypedPipe[(K, V2)]) => + rtp match { + case ReduceStepPipe(hg: HashJoinable[K @unchecked, V2 @unchecked]) => + HashCoGroup(ltp, hg, joiner) + case otherwise => + HashCoGroup(ltp, IdentityReduce[K, V2, V2](ordK, otherwise, None, Nil, implicitly), joiner) + } + ) + } + /** - * Unroll a set of merges up to the first 
non-merge node, dropping - * an EmptyTypedPipe from the list + * Unroll a set of merges up to the first non-merge node, dropping an EmptyTypedPipe from the list */ def unrollMerge[A](t: TypedPipe[A]): List[TypedPipe[A]] = { @annotation.tailrec @@ -250,18 +294,18 @@ object OptimizationRules { case MergedTypedPipe(l, r) => loop(l, r :: todo, acc) case EmptyTypedPipe => todo match { - case Nil => acc.reverse + case Nil => acc.reverse case h :: tail => loop(h, tail, acc) } case IterablePipe(as) if as.isEmpty => todo match { - case Nil => acc.reverse + case Nil => acc.reverse case h :: tail => loop(h, tail, acc) } case notMerge => val acc1 = notMerge :: acc todo match { - case Nil => acc1.reverse + case Nil => acc1.reverse case h :: tail => loop(h, tail, acc1) } } @@ -270,17 +314,15 @@ object OptimizationRules { } /** - * Make sure each returned item is unique. Any duplicates - * are merged using flatMap(Iterator.fill(size)(_)) + * Make sure each returned item is unique. Any duplicates are merged using flatMap(Iterator.fill(size)(_)) * - * TODO: this could be more precise by combining more - * complex mapping operations into one large flatMap + * TODO: this could be more precise by combining more complex mapping operations into one large flatMap */ def dedupMerge[A](as: List[TypedPipe[A]]): List[TypedPipe[A]] = - as.groupBy { tp => tp } + as.groupBy(tp => tp) .iterator .map { - case (p, Nil) => sys.error(s"unreachable: $p has no values") + case (p, Nil) => sys.error(s"unreachable: $p has no values") case (p, _ :: Nil) => p // just once case (p, repeated) => val rsize = repeated.size @@ -295,22 +337,21 @@ object OptimizationRules { ///////////////////////////// /** - * It is easier for planning if all fanouts are made explicit. - * This rule adds a Fork node every time there is a fanout + * It is easier for planning if all fanouts are made explicit. This rule adds a Fork node every time there + * is a fanout * - * This rule applied first makes it easier to match in subsequent - * rules without constantly checking for fanout nodes. + * This rule applied first makes it easier to match in subsequent rules without constantly checking for + * fanout nodes. 
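
[Editor's aside, not part of the patch] The kind of fanout AddExplicitForks makes explicit, sketched against the typed API (names below are assumptions for illustration):

  // illustrative sketch only
  import com.twitter.scalding.typed.TypedPipe

  val expensive: TypedPipe[(String, Int)] =
    TypedPipe.from(Seq("a" -> 1, "b" -> 2)).map { case (k, v) => (k, v * v) }

  // two dependents on `expensive`, i.e. a fanout; the rule inserts a Fork here so
  // later rules never have to re-check whether a node is consumed more than once
  val branch1 = expensive.filter { case (_, v) => v > 1 }
  val branch2 = expensive.mapValues(_ + 1)
  val out = branch1 ++ branch2
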
* - * This can increase the number of map-reduce steps compared - * to simply recomputing on both sides of a fork + * This can increase the number of map-reduce steps compared to simply recomputing on both sides of a fork */ object AddExplicitForks extends Rule[TypedPipe] { def maybeFork[A](on: Dag[TypedPipe], t: TypedPipe[A]): Option[TypedPipe[A]] = t match { - case ForceToDisk(_) => None - case Fork(t) if on.contains(ForceToDisk(t)) => Some(ForceToDisk(t)) - case Fork(_) => None + case ForceToDisk(_) => None + case Fork(t) if on.contains(ForceToDisk(t)) => Some(ForceToDisk(t)) + case Fork(_) => None case EmptyTypedPipe | IterablePipe(_) | SourcePipe(_) => None case other if !on.hasSingleDependent(other) => Some { @@ -344,68 +385,75 @@ object OptimizationRules { forkCoGroup(on, right).map { Pair(left, _, jf) } - case Pair(_, _, _) => None // neither side needs a fork + case Pair(_, _, _) => None // neither side needs a fork case WithDescription(cg, d) => forkCoGroup(on, cg).map(WithDescription(_, d)) - case WithReducers(cg, r) => forkCoGroup(on, cg).map(WithReducers(_, r)) - case MapGroup(cg, fn) => forkCoGroup(on, cg).map(MapGroup(_, fn)) - case FilterKeys(cg, fn) => forkCoGroup(on, cg).map(FilterKeys(_, fn)) + case WithReducers(cg, r) => forkCoGroup(on, cg).map(WithReducers(_, r)) + case MapGroup(cg, fn) => forkCoGroup(on, cg).map(MapGroup(_, fn)) + case FilterKeys(cg, fn) => forkCoGroup(on, cg).map(FilterKeys(_, fn)) } } /** - * The casts in here are safe, but scala loses track of the types in these kinds of - * pattern matches. - * We can fix it by changing the types on the identity reduces to use EqTypes[V1, V2] - * in case class and leaving the V2 parameter. + * The casts in here are safe, but scala loses track of the types in these kinds of pattern matches. We + * can fix it by changing the types on the identity reduces to use EqTypes[V1, V2] in case class and + * leaving the V2 parameter. 
*/ - private def forkReduceStep[A, B, C](on: Dag[TypedPipe], rs: ReduceStep[A, B, C]): Option[ReduceStep[A, B, C]] = + private def forkReduceStep[A, B, C]( + on: Dag[TypedPipe], + rs: ReduceStep[A, B, C] + ): Option[ReduceStep[A, B, C]] = maybeFork(on, rs.mapped).map(ReduceStep.setInput(rs, _)) - private def forkHashJoinable[K, V](on: Dag[TypedPipe], hj: HashJoinable[K, V]): Option[HashJoinable[K, V]] = + private def forkHashJoinable[K, V]( + on: Dag[TypedPipe], + hj: HashJoinable[K, V] + ): Option[HashJoinable[K, V]] = hj match { - case step@IdentityReduce(_, _, _, _, _) => - maybeFork(on, step.mapped).map { p => step.copy(mapped = p) } - case step@UnsortedIdentityReduce(_, _, _, _, _) => - maybeFork(on, step.mapped).map { p => step.copy(mapped = p) } - case step@IteratorMappedReduce(_, _, _, _, _) => - maybeFork(on, step.mapped).map { p => step.copy(mapped = p) } + case step @ IdentityReduce(_, _, _, _, _) => + maybeFork(on, step.mapped).map(p => step.copy(mapped = p)) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + maybeFork(on, step.mapped).map(p => step.copy(mapped = p)) + case step @ IteratorMappedReduce(_, _, _, _, _) => + maybeFork(on, step.mapped).map(p => step.copy(mapped = p)) } def apply[T](on: Dag[TypedPipe]) = { - case CounterPipe(a) if needsFork(on, a) => maybeFork(on, a).map(CounterPipe(_)) - case CrossPipe(a, b) if needsFork(on, a) => maybeFork(on, a).map(CrossPipe(_, b)) - case CrossPipe(a, b) if needsFork(on, b) => maybeFork(on, b).map(CrossPipe(a, _)) + case CounterPipe(a) if needsFork(on, a) => maybeFork(on, a).map(CounterPipe(_)) + case CrossPipe(a, b) if needsFork(on, a) => maybeFork(on, a).map(CrossPipe(_, b)) + case CrossPipe(a, b) if needsFork(on, b) => maybeFork(on, b).map(CrossPipe(a, _)) case CrossValue(a, b) if needsFork(on, a) => maybeFork(on, a).map(CrossValue(_, b)) - case CrossValue(a, ComputedValue(b)) if needsFork(on, b) => maybeFork(on, b).map { fb => CrossValue(a, ComputedValue(fb)) } - case DebugPipe(p) => maybeFork(on, p).map(DebugPipe(_)) + case CrossValue(a, ComputedValue(b)) if needsFork(on, b) => + maybeFork(on, b).map(fb => CrossValue(a, ComputedValue(fb))) + case DebugPipe(p) => maybeFork(on, p).map(DebugPipe(_)) case FilterKeys(p, fn) => maybeFork(on, p).map(FilterKeys(_, fn)) - case f@Filter(_, _) => + case f @ Filter(_, _) => def go[A](f: Filter[A]): Option[TypedPipe[A]] = { val Filter(p, fn) = f maybeFork(on, p).map(Filter(_, fn)) } go(f) - case FlatMapValues(p, fn) => maybeFork(on, p).map(FlatMapValues(_, fn)) - case FlatMapped(p, fn) => maybeFork(on, p).map(FlatMapped(_, fn)) + case FlatMapValues(p, fn) => maybeFork(on, p).map(FlatMapValues(_, fn)) + case FlatMapped(p, fn) => maybeFork(on, p).map(FlatMapped(_, fn)) case ForceToDisk(_) | Fork(_) => None // already has a barrier - case HashCoGroup(left, right, jf) if needsFork(on, left) => maybeFork(on, left).map(HashCoGroup(_, right, jf)) + case HashCoGroup(left, right, jf) if needsFork(on, left) => + maybeFork(on, left).map(HashCoGroup(_, right, jf)) case HashCoGroup(left, right, jf) => forkHashJoinable(on, right).map(HashCoGroup(left, _, jf)) - case MapValues(p, fn) => maybeFork(on, p).map(MapValues(_, fn)) - case Mapped(p, fn) => maybeFork(on, p).map(Mapped(_, fn)) + case MapValues(p, fn) => maybeFork(on, p).map(MapValues(_, fn)) + case Mapped(p, fn) => maybeFork(on, p).map(Mapped(_, fn)) case MergedTypedPipe(a, b) if needsFork(on, a) => maybeFork(on, a).map(MergedTypedPipe(_, b)) case MergedTypedPipe(a, b) if needsFork(on, b) => maybeFork(on, b).map(MergedTypedPipe(a, _)) - 
case ReduceStepPipe(rs) => forkReduceStep(on, rs).map(ReduceStepPipe(_)) - case SumByLocalKeys(p, sg) => maybeFork(on, p).map(SumByLocalKeys(_, sg)) - case t@TrappedPipe(_, _, _) => + case ReduceStepPipe(rs) => forkReduceStep(on, rs).map(ReduceStepPipe(_)) + case SumByLocalKeys(p, sg) => maybeFork(on, p).map(SumByLocalKeys(_, sg)) + case t @ TrappedPipe(_, _, _) => def go[A](t: TrappedPipe[A]): Option[TypedPipe[A]] = { val TrappedPipe(p, sink, conv) = t maybeFork(on, p).map(TrappedPipe(_, sink, conv)) } go(t) - case CoGroupedPipe(cgp) => forkCoGroup(on, cgp).map(CoGroupedPipe(_)) - case WithOnComplete(p, fn) => maybeFork(on, p).map(WithOnComplete(_, fn)) + case CoGroupedPipe(cgp) => forkCoGroup(on, cgp).map(CoGroupedPipe(_)) + case WithOnComplete(p, fn) => maybeFork(on, p).map(WithOnComplete(_, fn)) case WithDescriptionTypedPipe(p, ds) => maybeFork(on, p).map(WithDescriptionTypedPipe(_, ds)) - case _ => None + case _ => None } } @@ -436,15 +484,15 @@ object OptimizationRules { /** * a.filter(f).filter(g) == a.filter { x => f(x) && g(x) } * - * also if a filterKeys follows a filter, we might as well - * compose because we can't push the filterKeys up higher + * also if a filterKeys follows a filter, we might as well compose because we can't push the filterKeys up + * higher */ object ComposeFilter extends Rule[TypedPipe] { def apply[T](on: Dag[TypedPipe]) = { // scala can't type check this, so we hold its hand: // case Filter(Filter(in, fn0), fn1) => // Some(Filter(in, ComposedFilterFn(fn0, fn1))) - case f@Filter(_, _) => + case f @ Filter(_, _) => def go[A](f: Filter[A]): Option[TypedPipe[A]] = f.input match { case f1: Filter[a] => @@ -464,8 +512,8 @@ object OptimizationRules { } /** - * If we assume that Orderings are coherent, which we do generally in - * scalding in joins for instance, we can compose two reduce steps + * If we assume that Orderings are coherent, which we do generally in scalding in joins for instance, we can + * compose two reduce steps */ object ComposeReduceSteps extends Rule[TypedPipe] { def apply[A](on: Dag[TypedPipe]) = { @@ -493,17 +541,16 @@ object OptimizationRules { * a.onComplete(f).onComplete(g) == a.onComplete { () => f(); g() } */ object ComposeWithOnComplete extends PartialRule[TypedPipe] { - def applyWhere[T](on: Dag[TypedPipe]) = { - case WithOnComplete(WithOnComplete(pipe, fn0), fn1) => - WithOnComplete(pipe, ComposedOnComplete(fn0, fn1)) + def applyWhere[T](on: Dag[TypedPipe]) = { case WithOnComplete(WithOnComplete(pipe, fn0), fn1) => + WithOnComplete(pipe, ComposedOnComplete(fn0, fn1)) } } + /** * a.map(f).flatMap(g) == a.flatMap { x => g(f(x)) } * a.flatMap(f).map(g) == a.flatMap { x => f(x).map(g) } * - * This is a rule you may want to apply after having - * composed all the maps first + * This is a rule you may want to apply after having composed all the maps first */ object ComposeMapFlatMap extends PartialRule[TypedPipe] { def applyWhere[T](on: Dag[TypedPipe]) = { @@ -518,13 +565,11 @@ object OptimizationRules { } } - /** * a.filter(f).flatMap(g) == a.flatMap { x => if (f(x)) g(x) else Iterator.empty } * a.flatMap(f).filter(g) == a.flatMap { x => f(x).filter(g) } * - * This is a rule you may want to apply after having - * composed all the filters first + * This is a rule you may want to apply after having composed all the filters first */ object ComposeFilterFlatMap extends Rule[TypedPipe] { def apply[T](on: Dag[TypedPipe]) = { @@ -540,15 +585,15 @@ object OptimizationRules { None } } + /** * a.filter(f).map(g) == a.flatMap { x => if (f(x)) 
Iterator.single(g(x)) else Iterator.empty } * a.map(f).filter(g) == a.flatMap { x => val y = f(x); if (g(y)) Iterator.single(y) else Iterator.empty } * - * This is a rule you may want to apply after having - * composed all the filters first + * This is a rule you may want to apply after having composed all the filters first * - * This may be a deoptimization on some platforms that have native filters since - * you could avoid the Iterator boxing in that case. + * This may be a deoptimization on some platforms that have native filters since you could avoid the + * Iterator boxing in that case. */ object ComposeFilterMap extends Rule[TypedPipe] { def apply[T](on: Dag[TypedPipe]) = { @@ -557,7 +602,12 @@ object OptimizationRules { case filter: Filter[b] => filter.input match { case fm: Mapped[a, b] => - Some(FlatMapped[a, b](fm.input, FlatMappedFn.fromMap(fm.fn).combine(FlatMappedFn.fromFilter(filter.fn)))) + Some( + FlatMapped[a, b]( + fm.input, + FlatMappedFn.fromMap(fm.fn).combine(FlatMappedFn.fromFilter(filter.fn)) + ) + ) case _ => None } case _ => @@ -566,23 +616,25 @@ object OptimizationRules { } /** - * This rule is important in that it allows us to reduce - * the number of nodes in the graph, which is helpful to speed up rule application + * This rule is important in that it allows us to reduce the number of nodes in the graph, which is helpful + * to speed up rule application */ object ComposeDescriptions extends PartialRule[TypedPipe] { def combine(descs1: List[(String, Boolean)], descs2: List[(String, Boolean)]): List[(String, Boolean)] = { val combined = descs1 ::: descs2 - combined.foldLeft((Set.empty[String], List.empty[(String, Boolean)])) { - case (state@(s, acc), item@(m, true)) => - if (s(m)) state - else (s + m, item :: acc) - case ((s, acc), item) => - (s, item :: acc) - }._2.reverse + combined + .foldLeft((Set.empty[String], List.empty[(String, Boolean)])) { + case (state @ (s, acc), item @ (m, true)) => + if (s(m)) state + else (s + m, item :: acc) + case ((s, acc), item) => + (s, item :: acc) + } + ._2 + .reverse } - def applyWhere[T](on: Dag[TypedPipe]) = { case WithDescriptionTypedPipe(WithDescriptionTypedPipe(input, descs1), descs2) => WithDescriptionTypedPipe(input, combine(descs1, descs2)) @@ -590,8 +642,8 @@ object OptimizationRules { } /** - * In scalding 0.17 and earlier, descriptions were automatically pushdown below - * merges and flatMaps/map/etc.. + * In scalding 0.17 and earlier, descriptions were automatically pushdown below merges and + * flatMaps/map/etc.. 
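
[Editor's aside, not part of the patch] The rewrite that ComposeFilterMap above encodes is the collection law quoted in its comment, easy to sanity-check on plain Scala collections:

  // illustrative sketch only
  val xs = List(1, 2, 3, 4)
  val f: Int => Int = _ * 2
  val g: Int => Boolean = _ > 4

  // a.map(f).filter(g) == a.flatMap { x => val y = f(x); if (g(y)) single(y) else empty }
  val lhs = xs.map(f).filter(g)
  val rhs = xs.flatMap { x => val y = f(x); if (g(y)) List(y) else Nil }
  assert(lhs == rhs)
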
*/ object DescribeLater extends PartialRule[TypedPipe] { def applyWhere[T](on: Dag[TypedPipe]) = { @@ -603,7 +655,7 @@ object OptimizationRules { WithDescriptionTypedPipe(FlatMapped(in, fn), descs) case FlatMapValues(WithDescriptionTypedPipe(in, descs), fn) => WithDescriptionTypedPipe(FlatMapValues(in, fn), descs) - case f@Filter(WithDescriptionTypedPipe(_, _), _) => + case f @ Filter(WithDescriptionTypedPipe(_, _), _) => def go[A](f: Filter[A]): TypedPipe[A] = f match { case Filter(WithDescriptionTypedPipe(in, descs), fn) => @@ -627,12 +679,12 @@ object OptimizationRules { } /** - * (a ++ a) == a.flatMap { t => List(t, t) } - * This is a very simple rule that is subsumed by DeDiamondMappers below + * (a ++ a) == a.flatMap { t => List(t, t) } This is a very simple rule that is subsumed by DeDiamondMappers + * below */ object DiamondToFlatMap extends Rule[TypedPipe] { def apply[T](on: Dag[TypedPipe]) = { - case m@MergedTypedPipe(_, _) => + case m @ MergedTypedPipe(_, _) => val pipes = unrollMerge(m) val flatMapped = dedupMerge(pipes) @@ -649,12 +701,10 @@ object OptimizationRules { } /** - * This is a more expensive, but more general version of the - * previous rule: we can merge trailing mapping operations - * that originate at a common node. + * This is a more expensive, but more general version of the previous rule: we can merge trailing mapping + * operations that originate at a common node. * - * After this rule, the only diamonds that exist have at least - * one non-mapping operation on the path. + * After this rule, the only diamonds that exist have at least one non-mapping operation on the path. */ object DeDiamondMappers extends Rule[TypedPipe] { sealed abstract class Mapper[+A] { @@ -667,8 +717,7 @@ object OptimizationRules { Mapper(input, fn.combine(fn2), descriptions) def withDescriptions(desc: List[(String, Boolean)]): Mapper.Aux[Init, A] = - Mapper(input, fn, - ComposeDescriptions.combine(descriptions, desc)) + Mapper(input, fn, ComposeDescriptions.combine(descriptions, desc)) def toTypedPipe: TypedPipe[A] = { val pipe = FlatMappedFn.asId(fn) match { @@ -703,7 +752,11 @@ object OptimizationRules { } } - def apply[A, B](p: TypedPipe[A], fn0: FlatMappedFn[A, B], desc: List[(String, Boolean)]): Mapper[B] { type Init = A } = + def apply[A, B]( + p: TypedPipe[A], + fn0: FlatMappedFn[A, B], + desc: List[(String, Boolean)] + ): Mapper[B] { type Init = A } = new Mapper[B] { type Init = A val input = p @@ -718,13 +771,13 @@ object OptimizationRules { def toMappers[A](tp: TypedPipe[A]): List[Mapper[A]] = tp match { // First, these are non-mapped pipes. 
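
[Editor's aside, not part of the patch] The equivalence DiamondToFlatMap relies on, checked on plain collections: merging a pipe with itself is the same as emitting every element twice.

  // illustrative sketch only
  val a = List(1, 2, 3)
  assert((a ++ a).sorted == a.flatMap(t => List(t, t)).sorted)
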
- case EmptyTypedPipe | IterablePipe(_) | SourcePipe(_) | ReduceStepPipe(_) | - CoGroupedPipe(_) | CrossPipe(_, _) | CounterPipe(_) | CrossValue(_, _) | DebugPipe(_) | - ForceToDisk(_) | Fork(_) | HashCoGroup(_, _, _) | SumByLocalKeys(_, _) | TrappedPipe(_, _, _) | WithOnComplete(_, _) => + case EmptyTypedPipe | IterablePipe(_) | SourcePipe(_) | ReduceStepPipe(_) | CoGroupedPipe(_) | + CrossPipe(_, _) | CounterPipe(_) | CrossValue(_, _) | DebugPipe(_) | ForceToDisk(_) | Fork(_) | + HashCoGroup(_, _, _) | SumByLocalKeys(_, _) | TrappedPipe(_, _, _) | WithOnComplete(_, _) => Mapper.unmapped(tp) :: Nil case FilterKeys(p, fn) => toMappers(p).map(_.combine(FlatMappedFn.fromFilter(FilterKeysToFilter(fn)))) - case f@Filter(_, _) => + case f @ Filter(_, _) => // type inference needs hand holding on this one def go[A1 <: A](p: TypedPipe[A1], fn: A1 => Boolean): List[Mapper[A]] = { val fn1: FlatMappedFn[A1, A] = @@ -750,11 +803,11 @@ object OptimizationRules { // that the init is the same private def merge[A](ms: Iterable[Mapper[A]]): TypedPipe[A] = ms.toList match { - case Nil => EmptyTypedPipe + case Nil => EmptyTypedPipe case h :: Nil => // there is only one Mapper, just convert back to a TypedPipe h.toTypedPipe - case all@(h :: t) => + case all @ (h :: t) => // we have several merged back from a common point // we don't know what that previous type was, but // we cast it to Any, we know the values in there @@ -769,12 +822,12 @@ object OptimizationRules { def apply[A](on: Dag[TypedPipe]) = { // here are the trailing mappers of this pipe - case fork@Fork(inner) if on.hasSingleDependent(fork) => + case fork @ Fork(inner) if on.hasSingleDependent(fork) => // due to a previous application of this rule, // this fork may have been reduced to only one // downstream, in that case, we can remove the fork Some(inner) - case m@MergedTypedPipe(_, _) => + case m @ MergedTypedPipe(_, _) => // this rule only applies to merged pipes // let's see if we have any duplicated inputs: val mapperGroups = toMappers(m).groupBy(_.input) @@ -791,35 +844,33 @@ object OptimizationRules { } /** - * After a forceToDisk there is no need to immediately fork. - * Calling forceToDisk twice in a row is the same as once. - * Calling fork twice in a row is the same as once. + * After a forceToDisk there is no need to immediately fork. Calling forceToDisk twice in a row is the same + * as once. Calling fork twice in a row is the same as once. */ object RemoveDuplicateForceFork extends PartialRule[TypedPipe] { def applyWhere[T](on: Dag[TypedPipe]) = { - case ForceToDisk(ForceToDisk(t)) => ForceToDisk(t) + case ForceToDisk(ForceToDisk(t)) => ForceToDisk(t) case ForceToDisk(WithDescriptionTypedPipe(ForceToDisk(t), desc)) => // we might as well only do one force to disk in this case WithDescriptionTypedPipe(ForceToDisk(t), desc) - case ForceToDisk(Fork(t)) => ForceToDisk(t) - case Fork(Fork(t)) => Fork(t) - case Fork(ForceToDisk(t)) => ForceToDisk(t) + case ForceToDisk(Fork(t)) => ForceToDisk(t) + case Fork(Fork(t)) => Fork(t) + case Fork(ForceToDisk(t)) => ForceToDisk(t) case Fork(t) if on.contains(ForceToDisk(t)) => ForceToDisk(t) } } /** - * If a fork has no fan-out when planned, it serves no purpose - * and is safe to remove. Likewise, there is no reason - * to put a forceToDisk immediatle after a source + * If a fork has no fan-out when planned, it serves no purpose and is safe to remove. 
Likewise, there is no + * reason to put a forceToDisk immediatle after a source */ object RemoveUselessFork extends PartialRule[TypedPipe] { def applyWhere[T](on: Dag[TypedPipe]) = { - case fork@Fork(t) if on.hasSingleDependent(fork) => t - case Fork(src@SourcePipe(_)) => src - case Fork(iter@IterablePipe(_)) => iter - case ForceToDisk(src@SourcePipe(_)) => src - case ForceToDisk(iter@IterablePipe(_)) => iter + case fork @ Fork(t) if on.hasSingleDependent(fork) => t + case Fork(src @ SourcePipe(_)) => src + case Fork(iter @ IterablePipe(_)) => iter + case ForceToDisk(src @ SourcePipe(_)) => src + case ForceToDisk(iter @ IterablePipe(_)) => iter } } @@ -837,8 +888,8 @@ object OptimizationRules { /** * We ignore .group if there are is no setting of reducers * - * This is arguably not a great idea, but scalding has always - * done it to minimize accidental map-reduce steps + * This is arguably not a great idea, but scalding has always done it to minimize accidental map-reduce + * steps */ object IgnoreNoOpGroup extends PartialRule[TypedPipe] { def applyWhere[T](on: Dag[TypedPipe]) = { @@ -851,11 +902,9 @@ object OptimizationRules { /** * In map-reduce settings, Merge is almost free in two contexts: - * 1. the final write - * 2. at the point we are doing a shuffle anyway. + * 1. the final write 2. at the point we are doing a shuffle anyway. * - * By defering merge as long as possible, we hope to find more such - * cases + * By defering merge as long as possible, we hope to find more such cases */ object DeferMerge extends PartialRule[TypedPipe] { private def handleFilter[A]: PartialFunction[Filter[A], TypedPipe[A]] = { @@ -871,32 +920,36 @@ object OptimizationRules { MergedTypedPipe(MapValues(a, fn), MapValues(b, fn)) case FlatMapValues(MergedTypedPipe(a, b), fn) => MergedTypedPipe(FlatMapValues(a, fn), FlatMapValues(b, fn)) - case f@Filter(_, _) if handleFilter.isDefinedAt(f) => handleFilter(f) + case f @ Filter(_, _) if handleFilter.isDefinedAt(f) => handleFilter(f) case FilterKeys(MergedTypedPipe(a, b), fn) => MergedTypedPipe(FilterKeys(a, fn), FilterKeys(b, fn)) } } /** - * Push filterKeys up as early as possible. This can happen before - * a shuffle, which can be a major win. This allows you to write - * generic methods that return all the data, but if downstream someone - * only wants certain keys they don't pay to compute everything. + * Push filterKeys up as early as possible. This can happen before a shuffle, which can be a major win. This + * allows you to write generic methods that return all the data, but if downstream someone only wants + * certain keys they don't pay to compute everything. 
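// A tiny plain-Scala check (ordinary Lists standing in for pipes, values made up) of the
// algebra the DeferMerge rule above relies on: map, flatMap and filter all distribute over a
// merge, so pushing them into the two branches and merging later yields the same data.
val a = List(1, 2, 3)
val b = List(4, 5)
val f: Int => Int = _ * 10

assert((a ++ b).map(f) == a.map(f) ++ b.map(f))
assert((a ++ b).filter(_ % 2 == 0) == a.filter(_ % 2 == 0) ++ b.filter(_ % 2 == 0))
assert((a ++ b).flatMap(x => List(x, x)) == a.flatMap(x => List(x, x)) ++ b.flatMap(x => List(x, x)))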
* - * This is an optimization we didn't do in scalding 0.17 and earlier - * because .toTypedPipe on the group totally hid the structure from - * us + * This is an optimization we didn't do in scalding 0.17 and earlier because .toTypedPipe on the group + * totally hid the structure from us */ object FilterKeysEarly extends Rule[TypedPipe] { - private def filterReduceStep[K, V1, V2](rs: ReduceStep[K, V1, V2], fn: K => Boolean): ReduceStep[K, V1, V2] = + private def filterReduceStep[K, V1, V2]( + rs: ReduceStep[K, V1, V2], + fn: K => Boolean + ): ReduceStep[K, V1, V2] = ReduceStep.setInput(rs, FilterKeys(rs.mapped, fn)) private def filterCoGroupable[K, V](rs: CoGroupable[K, V], fn: K => Boolean): CoGroupable[K, V] = rs match { case rs: ReduceStep[K @unchecked, v1, V @unchecked] => - ReduceStep.toHashJoinable(filterReduceStep(rs, fn)) + ReduceStep + .toHashJoinable(filterReduceStep(rs, fn)) .getOrElse { - sys.error("unreachable: filterReduceStep returns the same type, and this input type was CoGroupable") + sys.error( + "unreachable: filterReduceStep returns the same type, and this input type was CoGroupable" + ) } case cg: CoGrouped[K @unchecked, V @unchecked] => filterCoGroup(cg, fn) } @@ -932,9 +985,8 @@ object OptimizationRules { } /** - * EmptyTypedPipe is kind of zero of most of these operations - * We go ahead and simplify as much as possible if we see - * an EmptyTypedPipe + * EmptyTypedPipe is kind of zero of most of these operations We go ahead and simplify as much as possible + * if we see an EmptyTypedPipe */ object EmptyIsOftenNoOp extends PartialRule[TypedPipe] { @@ -943,17 +995,19 @@ object OptimizationRules { def empty(t: TypedPipe[Any]): Boolean = t match { case EmptyTypedPipe => true - case _ => false + case _ => false } cg match { - case Pair(left, _, jf) if left.inputs.forall(empty) && (Joiner.isLeftJoinLike(jf) == Some(true)) => true - case Pair(_, right, jf) if right.inputs.forall(empty) && (Joiner.isRightJoinLike(jf) == Some(true)) => true + case Pair(left, _, jf) if left.inputs.forall(empty) && (Joiner.isLeftJoinLike(jf) == Some(true)) => + true + case Pair(_, right, jf) if right.inputs.forall(empty) && (Joiner.isRightJoinLike(jf) == Some(true)) => + true case Pair(left, right, _) if left.inputs.forall(empty) && right.inputs.forall(empty) => true - case Pair(_, _, _) => false + case Pair(_, _, _) => false case WithDescription(cg, _) => emptyCogroup(cg) - case WithReducers(cg, _) => emptyCogroup(cg) - case MapGroup(cg, _) => emptyCogroup(cg) - case FilterKeys(cg, _) => emptyCogroup(cg) + case WithReducers(cg, _) => emptyCogroup(cg) + case MapGroup(cg, _) => emptyCogroup(cg) + case FilterKeys(cg, _) => emptyCogroup(cg) } } @@ -961,29 +1015,33 @@ object OptimizationRules { HashJoinable.toReduceStep(hj).mapped == EmptyTypedPipe def applyWhere[T](on: Dag[TypedPipe]) = { - case CrossPipe(EmptyTypedPipe, _) => EmptyTypedPipe - case CrossPipe(_, EmptyTypedPipe) => EmptyTypedPipe - case CrossValue(EmptyTypedPipe, _) => EmptyTypedPipe + case CrossPipe(EmptyTypedPipe, _) => EmptyTypedPipe + case CrossPipe(_, EmptyTypedPipe) => EmptyTypedPipe + case CrossValue(EmptyTypedPipe, _) => EmptyTypedPipe case CrossValue(_, ComputedValue(EmptyTypedPipe)) => EmptyTypedPipe - case CrossValue(_, EmptyValue) => EmptyTypedPipe - case DebugPipe(EmptyTypedPipe) => EmptyTypedPipe - case FilterKeys(EmptyTypedPipe, _) => EmptyTypedPipe - case Filter(EmptyTypedPipe, _) => EmptyTypedPipe - case FlatMapValues(EmptyTypedPipe, _) => EmptyTypedPipe - case FlatMapped(EmptyTypedPipe, _) => EmptyTypedPipe - 
case ForceToDisk(EmptyTypedPipe) => EmptyTypedPipe - case HashCoGroup(EmptyTypedPipe, _, _) => EmptyTypedPipe - case HashCoGroup(_, right, hjf) if emptyHashJoinable(right) && Joiner.isInnerHashJoinLike(hjf) == Some(true) => EmptyTypedPipe - case MapValues(EmptyTypedPipe, _) => EmptyTypedPipe - case Mapped(EmptyTypedPipe, _) => EmptyTypedPipe - case MergedTypedPipe(EmptyTypedPipe, a) => a - case MergedTypedPipe(a, EmptyTypedPipe) => a + case CrossValue(_, EmptyValue) => EmptyTypedPipe + case DebugPipe(EmptyTypedPipe) => EmptyTypedPipe + case FilterKeys(EmptyTypedPipe, _) => EmptyTypedPipe + case Filter(EmptyTypedPipe, _) => EmptyTypedPipe + case FlatMapValues(EmptyTypedPipe, _) => EmptyTypedPipe + case FlatMapped(EmptyTypedPipe, _) => EmptyTypedPipe + case ForceToDisk(EmptyTypedPipe) => EmptyTypedPipe + case HashCoGroup(EmptyTypedPipe, _, _) => EmptyTypedPipe + case HashCoGroup(_, right, hjf) + if emptyHashJoinable(right) && Joiner.isInnerHashJoinLike(hjf) == Some(true) => + EmptyTypedPipe + case MapValues(EmptyTypedPipe, _) => EmptyTypedPipe + case Mapped(EmptyTypedPipe, _) => EmptyTypedPipe + case MergedTypedPipe(EmptyTypedPipe, a) => a + case MergedTypedPipe(a, EmptyTypedPipe) => a case ReduceStepPipe(rs: ReduceStep[_, _, _]) if rs.mapped == EmptyTypedPipe => EmptyTypedPipe - case SumByLocalKeys(EmptyTypedPipe, _) => EmptyTypedPipe - case TrappedPipe(EmptyTypedPipe, _, _) => EmptyTypedPipe - case CoGroupedPipe(cgp) if emptyCogroup(cgp) => EmptyTypedPipe - case WithOnComplete(EmptyTypedPipe, _) => EmptyTypedPipe // there is nothing to do, so we never have workers complete - case WithDescriptionTypedPipe(EmptyTypedPipe, _) => EmptyTypedPipe // descriptions apply to tasks, but empty has no tasks + case SumByLocalKeys(EmptyTypedPipe, _) => EmptyTypedPipe + case TrappedPipe(EmptyTypedPipe, _, _) => EmptyTypedPipe + case CoGroupedPipe(cgp) if emptyCogroup(cgp) => EmptyTypedPipe + case WithOnComplete(EmptyTypedPipe, _) => + EmptyTypedPipe // there is nothing to do, so we never have workers complete + case WithDescriptionTypedPipe(EmptyTypedPipe, _) => + EmptyTypedPipe // descriptions apply to tasks, but empty has no tasks // This rule is tempting, but dangerous since if used in combination // with AddExplicitForks it would create an infinite loop @@ -1001,13 +1059,12 @@ object OptimizationRules { } /** - * This is useful on map-reduce like systems to avoid - * serializing data into the system that you are going - * to then filter + * This is useful on map-reduce like systems to avoid serializing data into the system that you are going to + * then filter */ object FilterLocally extends Rule[TypedPipe] { def apply[T](on: Dag[TypedPipe]) = { - case f@Filter(_, _) => + case f @ Filter(_, _) => def go[T1 <: T](f: Filter[T1]): Option[TypedPipe[T]] = f match { case Filter(IterablePipe(iter), fn) => @@ -1015,7 +1072,7 @@ object OptimizationRules { case _ => None } go(f) - case f@FilterKeys(_, _) => + case f @ FilterKeys(_, _) => def go[K, V, T >: (K, V)](f: FilterKeys[K, V]): Option[TypedPipe[T]] = f match { case FilterKeys(IterablePipe(iter), fn) => @@ -1026,9 +1083,9 @@ object OptimizationRules { case _ => None } } + /** - * ForceToDisk before hashJoin, this makes sure any filters - * have been applied + * ForceToDisk before hashJoin, this makes sure any filters have been applied */ object ForceToDiskBeforeHashJoin extends Rule[TypedPipe] { // A set of operations naturally have barriers after them, @@ -1044,7 +1101,10 @@ object OptimizationRules { // this is a no-op reduce that will be removed, so we 
may need to add a force maybeForce(input) case SourcePipe(_) | IterablePipe(_) | CoGroupedPipe(_) | ReduceStepPipe(_) | ForceToDisk(_) => t - case WithOnComplete(pipe, fn) => // TODO it is not clear this is safe in cascading 3, since oncomplete is an each + case WithOnComplete( + pipe, + fn + ) => // TODO it is not clear this is safe in cascading 3, since oncomplete is an each WithOnComplete(maybeForce(pipe), fn) case WithDescriptionTypedPipe(pipe, descs) => WithDescriptionTypedPipe(maybeForce(pipe), descs) @@ -1054,18 +1114,18 @@ object OptimizationRules { def apply[T](on: Dag[TypedPipe]) = { case HashCoGroup(left, right: HashJoinable[a, b], joiner) => val newRight: HashJoinable[a, b] = right match { - case step@IdentityReduce(_, _, _, _, _) => + case step @ IdentityReduce(_, _, _, _, _) => step.copy(mapped = maybeForce(step.mapped)) - case step@UnsortedIdentityReduce(_, _, _, _, _) => + case step @ UnsortedIdentityReduce(_, _, _, _, _) => step.copy(mapped = maybeForce(step.mapped)) - case step@IteratorMappedReduce(_, _, _, _, _) => + case step @ IteratorMappedReduce(_, _, _, _, _) => step.copy(mapped = maybeForce(step.mapped)) } if (newRight != right) Some(HashCoGroup(left, newRight, joiner)) else None - case (cp@CrossPipe(_, _)) => Some(cp.viaHashJoin) - case (cv@CrossValue(_, _)) => Some(cv.viaHashJoin) - case _ => None + case (cp @ CrossPipe(_, _)) => Some(cp.viaHashJoin) + case (cv @ CrossValue(_, _)) => Some(cv.viaHashJoin) + case _ => None } } @@ -1078,16 +1138,14 @@ object OptimizationRules { val leftg = Grouped(left)(right.keyOrdering) val joiner2 = Joiner.toCogroupJoiner2(joiner) Some(CoGroupedPipe(CoGrouped.Pair(leftg, right, joiner2))) - case (cp@CrossPipe(_, _)) => Some(cp.viaHashJoin) - case (cv@CrossValue(_, _)) => Some(cv.viaHashJoin) - case _ => None + case (cp @ CrossPipe(_, _)) => Some(cp.viaHashJoin) + case (cv @ CrossValue(_, _)) => Some(cv.viaHashJoin) + case _ => None } } - /** - * Prefer to do mapValues/flatMapValues in a Reduce/Join - * so we can avoid some boxing in-and-out of cascading + * Prefer to do mapValues/flatMapValues in a Reduce/Join so we can avoid some boxing in-and-out of cascading */ object MapValuesInReducers extends PartialRule[TypedPipe] { @@ -1109,7 +1167,8 @@ object OptimizationRules { CoGroupedPipe(CoGrouped.MapGroup(cg, MapGroupMapValues(fn))) case FlatMapValues(CoGroupedPipe(cg), fn) => CoGroupedPipe(CoGrouped.MapGroup(cg, MapGroupFlatMapValues(fn))) - case f@Filter(_, _) if handleFilter(f).isDefined => handleFilter(f).getOrElse(sys.error("unreachable: already checked isDefined")) + case f @ Filter(_, _) if handleFilter(f).isDefined => + handleFilter(f).getOrElse(sys.error("unreachable: already checked isDefined")) case SumByLocalKeys(ReduceStepPipe(rs), sg) => ReduceStepPipe(ReduceStep.mapGroup(rs)(MapValueStream(SumAll(sg)))) case SumByLocalKeys(CoGroupedPipe(cg), sg) => @@ -1122,39 +1181,27 @@ object OptimizationRules { ////// /** - * Like kinds can be composed .map(f).map(g), - * filter(f).filter(g) etc... + * Like kinds can be composed .map(f).map(g), filter(f).filter(g) etc... 
*/ val composeSame: Rule[TypedPipe] = - Rule.orElse( - List( - ComposeMap, - ComposeFilter, - ComposeFlatMap, - ComposeWithOnComplete)) + Rule.orElse(List(ComposeMap, ComposeFilter, ComposeFlatMap, ComposeWithOnComplete)) + /** - * If you are going to do a flatMap, following it or preceding it with map/filter - * you might as well compose into the flatMap + * If you are going to do a flatMap, following it or preceding it with map/filter you might as well compose + * into the flatMap */ val composeIntoFlatMap: Rule[TypedPipe] = - Rule.orElse( - List( - ComposeMapFlatMap, - ComposeFilterFlatMap, - ComposeFlatMap)) + Rule.orElse(List(ComposeMapFlatMap, ComposeFilterFlatMap, ComposeFlatMap)) val simplifyEmpty: Rule[TypedPipe] = - EmptyIsOftenNoOp.orElse( - EmptyIterableIsEmpty) + EmptyIsOftenNoOp.orElse(EmptyIterableIsEmpty) /** - * These are a list of rules to be applied in order (Dag.applySeq) - * that should generally always improve things on Map/Reduce-like - * platforms. + * These are a list of rules to be applied in order (Dag.applySeq) that should generally always improve + * things on Map/Reduce-like platforms. * - * These are rules we should apply to any TypedPipe before handing - * to cascading. These should be a bit conservative in that they - * should be highly likely to improve the graph. + * These are rules we should apply to any TypedPipe before handing to cascading. These should be a bit + * conservative in that they should be highly likely to improve the graph. */ val standardMapReduceRules: List[Rule[TypedPipe]] = List( @@ -1162,7 +1209,8 @@ object OptimizationRules { AddExplicitForks, RemoveUselessFork, // phase 1, compose flatMap/map, move descriptions down, defer merge, filter pushup etc... - IgnoreNoOpGroup.orElse(composeSame) + IgnoreNoOpGroup + .orElse(composeSame) .orElse(DescribeLater) .orElse(DeferMerge), // phase 2, combine different kinds of mapping operations into flatMaps, including redundant merges @@ -1178,7 +1226,8 @@ object OptimizationRules { MapValuesInReducers .orElse(FilterKeysEarly), // phase 4, remove duplicates forces/forks (e.g. .fork.fork or .forceToDisk.fork, ....) - RemoveDuplicateForceFork) + RemoveDuplicateForceFork + ) /** * a Convenience function to avoid needing to pass toLiteral diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionSchemed.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionSchemed.scala index 1f57ccbc9e..1abd31d573 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionSchemed.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionSchemed.scala @@ -16,18 +16,22 @@ package com.twitter.scalding package typed import cascading.tap.hadoop.PartitionTap -import cascading.tap.local.{ FileTap, PartitionTap => LocalPartitionTap } -import cascading.tap.{ SinkMode, Tap } +import cascading.tap.local.{FileTap, PartitionTap => LocalPartitionTap} +import cascading.tap.{SinkMode, Tap} import cascading.tuple.Fields /** * Trait to assist with creating partitioned sources. * - * Apart from the abstract members below, `hdfsScheme` and `localScheme` also need to be set. - * Note that for both of them the sink fields need to be set to only include the actual fields - * that should be written to file and not the partition fields. + * Apart from the abstract members below, `hdfsScheme` and `localScheme` also need to be set. 
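// A plain-Scala illustration (made-up values) of the identities behind the composeSame and
// composeIntoFlatMap rule lists in the OptimizationRules hunk above: consecutive maps fuse
// into one map, consecutive filters into one filter, and a map feeding a flatMap fuses into a
// single flatMap.
val xs = List(1, 2, 3, 4)
val f: Int => Int = _ + 1
val g: Int => Int = _ * 2
val p: Int => Boolean = _ > 2
val q: Int => Boolean = _ % 2 == 0
val fm: Int => List[Int] = x => List(x, -x)

assert(xs.map(f).map(g) == xs.map(f.andThen(g)))               // ComposeMap
assert(xs.filter(p).filter(q) == xs.filter(x => p(x) && q(x))) // ComposeFilter
assert(xs.map(f).flatMap(fm) == xs.flatMap(x => fm(f(x))))     // ComposeMapFlatMap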
Note that for + * both of them the sink fields need to be set to only include the actual fields that should be written to + * file and not the partition fields. */ -trait PartitionSchemed[P, T] extends SchemedSource with TypedSink[(P, T)] with Mappable[(P, T)] with HfsTapProvider { +trait PartitionSchemed[P, T] + extends SchemedSource + with TypedSink[(P, T)] + with Mappable[(P, T)] + with HfsTapProvider { def path: String def template: String def valueSetter: TupleSetter[T] @@ -48,17 +52,17 @@ trait PartitionSchemed[P, T] extends SchemedSource with TypedSink[(P, T)] with M override def sinkFields: Fields = fields.append(partitionFields) /** - * Combine both the partition and value converter to extract the data from a flat cascading tuple - * into a pair of `P` and `T`. + * Combine both the partition and value converter to extract the data from a flat cascading tuple into a + * pair of `P` and `T`. */ override def converter[U >: (P, T)] = PartitionUtil.converter[P, T, U](valueConverter, partitionConverter) - /** Flatten a pair of `P` and `T` into a cascading tuple.*/ + /** Flatten a pair of `P` and `T` into a cascading tuple. */ override def setter[U <: (P, T)] = PartitionUtil.setter[P, T, U](valueSetter, partitionSetter) - /** Creates the taps for local and hdfs mode.*/ + /** Creates the taps for local and hdfs mode. */ override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = mode match { case Local(_) => { diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionUtil.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionUtil.scala index 1302eb01f6..a53135ff89 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionUtil.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionUtil.scala @@ -15,7 +15,7 @@ package com.twitter.scalding package typed -import cascading.tuple.{ Fields, Tuple, TupleEntry } +import cascading.tuple.{Fields, Tuple, TupleEntry} /** Utility functions to assist with creating partitioned sourced. */ object PartitionUtil { @@ -24,8 +24,8 @@ object PartitionUtil { def toFields(start: Int, end: Int): Fields = Dsl.strFields((start until end).map(_.toString)) - /** A tuple converter that splits a cascading tuple into a pair of types.*/ - def converter[P, T, U >: (P, T)](valueConverter: TupleConverter[T], partitionConverter: TupleConverter[P]) = { + /** A tuple converter that splits a cascading tuple into a pair of types. */ + def converter[P, T, U >: (P, T)](valueConverter: TupleConverter[T], partitionConverter: TupleConverter[P]) = TupleConverter.asSuperConverter[(P, T), U](new TupleConverter[(P, T)] { val arity = valueConverter.arity + partitionConverter.arity @@ -43,10 +43,12 @@ object PartitionUtil { (partitionConverter(partitionTE), valueConverter(valueTE)) } }) - } - /** A tuple setter for a pair of types which are flattened into a cascading tuple.*/ - def setter[P, T, U <: (P, T)](valueSetter: TupleSetter[T], partitionSetter: TupleSetter[P]): TupleSetter[U] = + /** A tuple setter for a pair of types which are flattened into a cascading tuple. 
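// A plain-Scala sketch (Seq[Any] standing in for a cascading Tuple, made-up values) of the
// index arithmetic PartitionUtil uses in this hunk: the value's fields occupy positions
// 0 until valueArity in the flat row and the partition's fields are appended after them, so
// converting back is just a split at valueArity.
val value: Seq[Any] = Seq("i", 1)       // the T side, arity 2
val partition: Seq[Any] = Seq("a", "x") // the P side, arity 2

val flatRow: Seq[Any] = value ++ partition // what the setter emits, arity 2 + 2

val (valueBack, partitionBack) = flatRow.splitAt(value.size) // what the converter undoes
assert(valueBack == value && partitionBack == partition)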
*/ + def setter[P, T, U <: (P, T)]( + valueSetter: TupleSetter[T], + partitionSetter: TupleSetter[P] + ): TupleSetter[U] = TupleSetter.asSubSetter[(P, T), U](new TupleSetter[(P, T)] { val arity = valueSetter.arity + partitionSetter.arity @@ -56,8 +58,7 @@ object PartitionUtil { val output = Tuple.size(partition.size + value.size) (0 until value.size).foreach(idx => output.set(idx, value.getObject(idx))) - (0 until partition.size).foreach(idx => - output.set(idx + value.size, partition.getObject(idx))) + (0 until partition.size).foreach(idx => output.set(idx + value.size, partition.getObject(idx))) output } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedDelimitedSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedDelimitedSource.scala index 516b8dd86a..4f8616f2c6 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedDelimitedSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedDelimitedSource.scala @@ -16,18 +16,18 @@ package com.twitter.scalding package typed import java.util.Properties -import java.io.{ InputStream, OutputStream, Serializable } +import java.io.{InputStream, OutputStream, Serializable} import cascading.scheme.Scheme import cascading.scheme.hadoop.TextDelimited -import cascading.scheme.local.{ TextDelimited => LocalTextDelimited } +import cascading.scheme.local.{TextDelimited => LocalTextDelimited} import cascading.tuple.Fields /** * Scalding source to read or write partitioned delimited text. * - * For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and - * `T` is the output to write out. Below is an example. + * For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and `T` is the + * output to write out. Below is an example. * {{{ * val data = List( * (("a", "x"), ("i", 1)), @@ -38,24 +38,39 @@ import cascading.tuple.Fields * .write(PartitionedDelimited[(String, String), (String, Int)](args("out"), "col1=%s/col2=%s")) * }}} * - * For reading it produces a pair `(P, T)` where `P` is the partition data and `T` is data in the - * files. Below is an example. + * For reading it produces a pair `(P, T)` where `P` is the partition data and `T` is data in the files. Below + * is an example. 
* {{{ * val in: TypedPipe[((String, String), (String, Int))] = PartitionedDelimited[(String, String), (String, Int)](args("in"), "col1=%s/col2=%s") * }}} */ case class PartitionedDelimitedSource[P, T]( - path: String, template: String, separator: String, fields: Fields, skipHeader: Boolean = false, - writeHeader: Boolean = false, quote: String = "\"", strict: Boolean = true, safe: Boolean = true)(implicit mt: Manifest[T], val valueSetter: TupleSetter[T], val valueConverter: TupleConverter[T], - val partitionSetter: TupleSetter[P], val partitionConverter: TupleConverter[P]) extends PartitionSchemed[P, T] with Serializable { + path: String, + template: String, + separator: String, + fields: Fields, + skipHeader: Boolean = false, + writeHeader: Boolean = false, + quote: String = "\"", + strict: Boolean = true, + safe: Boolean = true +)(implicit + mt: Manifest[T], + val valueSetter: TupleSetter[T], + val valueConverter: TupleConverter[T], + val partitionSetter: TupleSetter[P], + val partitionConverter: TupleConverter[P] +) extends PartitionSchemed[P, T] + with Serializable { assert( fields.size == valueSetter.arity, - "The number of fields needs to be the same as the arity of the value setter") + "The number of fields needs to be the same as the arity of the value setter" + ) val types: Array[Class[_]] = { if (classOf[scala.Product].isAssignableFrom(mt.runtimeClass)) { //Assume this is a Tuple: - mt.typeArguments.map { _.runtimeClass }.toArray + mt.typeArguments.map(_.runtimeClass).toArray } else { //Assume there is only a single item Array(mt.runtimeClass) @@ -66,8 +81,10 @@ case class PartitionedDelimitedSource[P, T]( // see sinkFields in PartitionSchemed for other half of this work around. override def hdfsScheme = { val scheme = - HadoopSchemeInstance(new TextDelimited(fields, null, skipHeader, writeHeader, separator, strict, quote, types, safe) - .asInstanceOf[Scheme[_, _, _, _, _]]) + HadoopSchemeInstance( + new TextDelimited(fields, null, skipHeader, writeHeader, separator, strict, quote, types, safe) + .asInstanceOf[Scheme[_, _, _, _, _]] + ) scheme.setSinkFields(fields) scheme } @@ -84,35 +101,47 @@ case class PartitionedDelimitedSource[P, T]( } /** - * Trait to assist with creating objects such as [[PartitionedTsv]] to read from separated files. - * Override separator, skipHeader, writeHeader as needed. + * Trait to assist with creating objects such as [[PartitionedTsv]] to read from separated files. Override + * separator, skipHeader, writeHeader as needed. 
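// A hedged usage sketch of the PartitionedTsv helper defined just below, written the way it
// would appear inside a scalding Job (so the implicit FlowDef and Mode that .write needs are
// in scope). The job name, output path and partition template are made up for illustration.
import com.twitter.scalding._
import com.twitter.scalding.typed.{PartitionedTsv, TypedPipe}

class PartitionedWriteJob(args: Args) extends Job(args) {
  val data: TypedPipe[((String, String), (String, Int))] =
    TypedPipe.from(List((("a", "x"), ("i", 1)), (("a", "y"), ("j", 2))))

  // rows ("i", 1) and ("j", 2) end up under directories like col1=a/col2=x/ and col1=a/col2=y/
  data.write(PartitionedTsv[(String, String), (String, Int)]("output", "col1=%s/col2=%s"))
}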
*/ trait PartitionedDelimited extends Serializable { def separator: String - def apply[P: Manifest: TupleConverter: TupleSetter, T: Manifest: TupleConverter: TupleSetter](path: String, template: String): PartitionedDelimitedSource[P, T] = - PartitionedDelimitedSource(path, template, separator, PartitionUtil.toFields(0, implicitly[TupleSetter[T]].arity)) + def apply[P: Manifest: TupleConverter: TupleSetter, T: Manifest: TupleConverter: TupleSetter]( + path: String, + template: String + ): PartitionedDelimitedSource[P, T] = + PartitionedDelimitedSource( + path, + template, + separator, + PartitionUtil.toFields(0, implicitly[TupleSetter[T]].arity) + ) - def apply[P: Manifest: TupleConverter: TupleSetter, T: Manifest: TupleConverter: TupleSetter](path: String, template: String, fields: Fields): PartitionedDelimitedSource[P, T] = + def apply[P: Manifest: TupleConverter: TupleSetter, T: Manifest: TupleConverter: TupleSetter]( + path: String, + template: String, + fields: Fields + ): PartitionedDelimitedSource[P, T] = PartitionedDelimitedSource(path, template, separator, fields) } -/** Partitioned typed tab separated source.*/ +/** Partitioned typed tab separated source. */ object PartitionedTsv extends PartitionedDelimited { val separator = "\t" } -/** Partitioned typed commma separated source.*/ +/** Partitioned typed commma separated source. */ object PartitionedCsv extends PartitionedDelimited { val separator = "," } -/** Partitioned typed pipe separated source.*/ +/** Partitioned typed pipe separated source. */ object PartitionedPsv extends PartitionedDelimited { val separator = "|" } -/** Partitioned typed `\1` separated source (commonly used by Pig).*/ +/** Partitioned typed `\1` separated source (commonly used by Pig). */ object PartitionedOsv extends PartitionedDelimited { val separator = "\u0001" } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedTextLine.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedTextLine.scala index c35a1c6908..dece665146 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedTextLine.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedTextLine.scala @@ -16,21 +16,21 @@ package com.twitter.scalding package typed import java.util.Properties -import java.io.{ InputStream, OutputStream } +import java.io.{InputStream, OutputStream} import cascading.scheme.Scheme import cascading.scheme.hadoop.TextLine -import cascading.scheme.local.{ TextLine => LocalTextLine } -import cascading.tap.{ Tap, SinkMode } +import cascading.scheme.local.{TextLine => LocalTextLine} +import cascading.tap.{SinkMode, Tap} import cascading.tap.hadoop.PartitionTap -import cascading.tap.local.{ FileTap, PartitionTap => LocalPartitionTap } +import cascading.tap.local.{FileTap, PartitionTap => LocalPartitionTap} import cascading.tuple.Fields /** * Scalding source to read or write partitioned text. * - * For writing it expects a pair of `(P, String)`, where `P` is the data used for partitioning and - * `String` is the output to write out. Below is an example. + * For writing it expects a pair of `(P, String)`, where `P` is the data used for partitioning and `String` is + * the output to write out. Below is an example. 
* {{{ * val data = List( * (("a", "x"), "line1"), @@ -41,20 +41,33 @@ import cascading.tuple.Fields * .write(PartitionTextLine[(String, String)](args("out"), "col1=%s/col2=%s")) * }}} * - * For reading it produces a pair `(P, (Long, String))` where `P` is the partition data, `Long` - * is the offset into the file and `String` is a line from the file. Below is an example. + * For reading it produces a pair `(P, (Long, String))` where `P` is the partition data, `Long` is the offset + * into the file and `String` is a line from the file. Below is an example. * {{{ * val in: TypedPipe[((String, String), (Long, String))] = PartitionTextLine[(String, String)](args("in"), "col1=%s/col2=%s") * }}} * - * @param path Base path of the partitioned directory - * @param template Template for the partitioned path - * @param encoding Text encoding of the file content + * @param path + * Base path of the partitioned directory + * @param template + * Template for the partitioned path + * @param encoding + * Text encoding of the file content */ case class PartitionedTextLine[P]( - path: String, template: String, encoding: String = TextLine.DEFAULT_CHARSET)(implicit val valueSetter: TupleSetter[String], val valueConverter: TupleConverter[(Long, String)], - val partitionSetter: TupleSetter[P], val partitionConverter: TupleConverter[P]) extends SchemedSource with TypedSink[(P, String)] with Mappable[(P, (Long, String))] with HfsTapProvider - with java.io.Serializable { + path: String, + template: String, + encoding: String = TextLine.DEFAULT_CHARSET +)(implicit + val valueSetter: TupleSetter[String], + val valueConverter: TupleConverter[(Long, String)], + val partitionSetter: TupleSetter[P], + val partitionConverter: TupleConverter[P] +) extends SchemedSource + with TypedSink[(P, String)] + with Mappable[(P, (Long, String))] + with HfsTapProvider + with java.io.Serializable { // The partition fields, offset by the value arity. val partitionFields = @@ -64,8 +77,10 @@ case class PartitionedTextLine[P]( // see sinkFields in PartitionSchemed for other half of this work around. override def hdfsScheme = { val scheme = - HadoopSchemeInstance(new TextLine(TextLine.DEFAULT_SOURCE_FIELDS, encoding) - .asInstanceOf[Scheme[_, _, _, _, _]]) + HadoopSchemeInstance( + new TextLine(TextLine.DEFAULT_SOURCE_FIELDS, encoding) + .asInstanceOf[Scheme[_, _, _, _, _]] + ) scheme.setSinkFields(PartitionUtil.toFields(0, valueSetter.arity)) scheme } @@ -88,7 +103,7 @@ case class PartitionedTextLine[P]( override def sinkFields: Fields = PartitionUtil.toFields(0, valueSetter.arity + partitionSetter.arity) - /** Creates the taps for local and hdfs mode.*/ + /** Creates the taps for local and hdfs mode. */ override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = mode match { case Local(_) => { @@ -110,13 +125,13 @@ case class PartitionedTextLine[P]( } /** - * Combine both the partition and value converter to extract the data from a flat cascading tuple - * into a pair of `P` and `(offset, line)`. + * Combine both the partition and value converter to extract the data from a flat cascading tuple into a + * pair of `P` and `(offset, line)`. */ override def converter[U >: (P, (Long, String))] = PartitionUtil.converter[P, (Long, String), U](valueConverter, partitionConverter) - /** Flatten a pair of `P` and `line` into a cascading tuple.*/ + /** Flatten a pair of `P` and `line` into a cascading tuple. 
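// A hedged sketch of reading with the PartitionedTextLine source wired up in this hunk: each
// element is (partition, (byteOffset, line)). The input path and partition template are made
// up for illustration; writing the result out would need the usual Job or Execution context.
import com.twitter.scalding.typed.{PartitionedTextLine, TypedPipe}

val lines: TypedPipe[((String, String), (Long, String))] =
  TypedPipe.from(PartitionedTextLine[(String, String)]("input", "col1=%s/col2=%s"))

// keep the partition pair and the text, dropping the byte offset
val partitionAndText: TypedPipe[((String, String), String)] =
  lines.map { case (part, (_, line)) => (part, line) }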
*/ override def setter[U <: (P, String)] = PartitionUtil.setter[P, String, U](valueSetter, partitionSetter) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/Resolver.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/Resolver.scala index 4e0cdd96db..d790c73333 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/Resolver.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/Resolver.scala @@ -5,8 +5,7 @@ import java.io.Serializable import scala.util.hashing.MurmurHash3 /** - * This class is an like a higher kinded PartialFunction - * which we use to look up sources and sinks in a safe + * This class is an like a higher kinded PartialFunction which we use to look up sources and sinks in a safe * way */ abstract class Resolver[I[_], O[_]] extends Serializable { @@ -26,7 +25,8 @@ object Resolver extends Serializable { def apply[A](i: I[A]): Option[O[A]] = toHMap.get(i) } - private case class OrElse[I[_], O[_]](first: Resolver[I, O], second: Resolver[I, O]) extends Resolver[I, O] { + private case class OrElse[I[_], O[_]](first: Resolver[I, O], second: Resolver[I, O]) + extends Resolver[I, O] { override val hashCode: Int = MurmurHash3.productHash(this) def apply[A](i: I[A]): Option[O[A]] = { @@ -51,7 +51,8 @@ object Resolver extends Serializable { } } - private case class AndThen[X[_], Y[_], Z[_]](first: Resolver[X, Y], second: Resolver[Y, Z]) extends Resolver[X, Z] { + private case class AndThen[X[_], Y[_], Z[_]](first: Resolver[X, Y], second: Resolver[Y, Z]) + extends Resolver[X, Z] { override val hashCode: Int = MurmurHash3.productHash(this) def apply[A](i: X[A]): Option[Z[A]] = @@ -75,9 +76,8 @@ object Resolver extends Serializable { case HMapResolver(shm) => // dagon does not have a ++ :( val merged = fhm.keySet.foldLeft(shm) { (hmap, k) => - def addKey[A](k: I[A]): HMap[I, O] = { + def addKey[A](k: I[A]): HMap[I, O] = hmap + (k -> fhm(k)) - } addKey(k) } HMapResolver(merged) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/Sketched.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/Sketched.scala index de74afd394..be5902b8b6 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/Sketched.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/Sketched.scala @@ -12,27 +12,24 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
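// A self-contained toy model (an illustrative Lookup trait, not scalding's actual Resolver)
// of the "higher kinded PartialFunction" idea from the Resolver hunk above: a lookup from
// I[A] to O[A] that may fail, plus an orElse combinator mirroring the OrElse case shown there.
import scala.language.higherKinds

trait Lookup[I[_], O[_]] { self =>
  def apply[A](i: I[A]): Option[O[A]]

  // try this lookup first, fall back to `that` when this one has no answer
  def orElse(that: Lookup[I, O]): Lookup[I, O] =
    new Lookup[I, O] {
      def apply[A](i: I[A]): Option[O[A]] = self(i).orElse(that(i))
    }
}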
-*/ + */ package com.twitter.scalding.typed -import com.twitter.algebird.{ Bytes, CMS, Batched } +import com.twitter.algebird.{Batched, Bytes, CMS} import com.twitter.scalding.serialization.macros.impl.BinaryOrdering._ -import com.twitter.scalding.serialization.{ OrderedSerialization, OrderedSerialization2 } +import com.twitter.scalding.serialization.{OrderedSerialization, OrderedSerialization2} import com.twitter.algebird.CMSMonoid // This was a bad design choice, we should have just put these in the CMSHasher object /** - * This class is generally only created by users - * with the TypedPipe.sketch method + * This class is generally only created by users with the TypedPipe.sketch method */ -case class Sketched[K, V](pipe: TypedPipe[(K, V)], - numReducers: Int, - delta: Double, - eps: Double, - seed: Int)(implicit val serialization: K => Array[Byte], - ordering: Ordering[K]) - extends MustHaveReducers { +case class Sketched[K, V](pipe: TypedPipe[(K, V)], numReducers: Int, delta: Double, eps: Double, seed: Int)( + implicit + val serialization: K => Array[Byte], + ordering: Ordering[K] +) extends MustHaveReducers { def reducers = Some(numReducers) @@ -43,13 +40,14 @@ case class Sketched[K, V](pipe: TypedPipe[(K, V)], lazy implicit val cms: CMSMonoid[Bytes] = CMS.monoid[Bytes](leps, ldelta, lseed) // every 10k items, compact into a CMS to prevent very slow mappers - lazy implicit val batchedSG: com.twitter.algebird.Semigroup[Batched[CMS[Bytes]]] = Batched.compactingSemigroup[CMS[Bytes]](10000) + lazy implicit val batchedSG: com.twitter.algebird.Semigroup[Batched[CMS[Bytes]]] = + Batched.compactingSemigroup[CMS[Bytes]](10000) pipe .map { case (k, _) => ((), Batched(cms.create(Bytes(localSer(k))))) } .sumByLocalKeys - .map { - case (_, batched) => batched.sum + .map { case (_, batched) => + batched.sum } // remove the Batched before going to the reducers .groupAll .sum @@ -58,30 +56,33 @@ case class Sketched[K, V](pipe: TypedPipe[(K, V)], } /** - * Like a hashJoin, this joiner does not see all the values V at one time, only one at a time. - * This is sufficient to implement join and leftJoin + * Like a hashJoin, this joiner does not see all the values V at one time, only one at a time. 
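// A hedged usage sketch of the skew-handling join that Sketched and SketchJoined implement.
// .sketch needs an implicit K => Array[Byte]; the UTF-8 encoding, the reducer count of 100
// and the tiny example pipes are all made up for illustration.
import com.twitter.scalding.typed.TypedPipe

val follows: TypedPipe[(String, String)] =
  TypedPipe.from(List(("celebrity", "fan1"), ("celebrity", "fan2"), ("niche", "fan3")))
val profiles: TypedPipe[(String, Int)] =
  TypedPipe.from(List(("celebrity", 1970), ("niche", 1990)))

implicit val keyBytes: String => Array[Byte] = _.getBytes("UTF-8")

// heavy keys like "celebrity" get spread over several reducers instead of landing on one
val joined: TypedPipe[(String, (String, Int))] =
  follows.sketch(100).join(profiles)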
This is + * sufficient to implement join and leftJoin */ - def cogroup[V2, R](right: TypedPipe[(K, V2)])(joiner: (K, V, Iterable[V2]) => Iterator[R]): SketchJoined[K, V, V2, R] = + def cogroup[V2, R](right: TypedPipe[(K, V2)])( + joiner: (K, V, Iterable[V2]) => Iterator[R] + ): SketchJoined[K, V, V2, R] = new SketchJoined(this, right, numReducers)(joiner) /** - * Does a logical inner join but replicates the heavy keys of the left hand side - * across the reducers + * Does a logical inner join but replicates the heavy keys of the left hand side across the reducers */ def join[V2](right: TypedPipe[(K, V2)]): SketchJoined[K, V, V2, (V, V2)] = cogroup(right)(Joiner.hashInner2) + /** - * Does a logical left join but replicates the heavy keys of the left hand side - * across the reducers + * Does a logical left join but replicates the heavy keys of the left hand side across the reducers */ def leftJoin[V2](right: TypedPipe[(K, V2)]): SketchJoined[K, V, V2, (V, Option[V2])] = cogroup(right)(Joiner.hashLeft2) } -case class SketchJoined[K: Ordering, V, V2, R](left: Sketched[K, V], - right: TypedPipe[(K, V2)], - numReducers: Int)(joiner: (K, V, Iterable[V2]) => Iterator[R]) - extends MustHaveReducers { +case class SketchJoined[K: Ordering, V, V2, R]( + left: Sketched[K, V], + right: TypedPipe[(K, V2)], + numReducers: Int +)(joiner: (K, V, Iterable[V2]) => Iterator[R]) + extends MustHaveReducers { def reducers = Some(numReducers) @@ -94,34 +95,33 @@ case class SketchJoined[K: Ordering, V, V2, R](left: Sketched[K, V], val localNumReducers = numReducers val localMaxReducerFraction = maxReducerFraction - pipe.cross(left.sketch).flatMap{ - case ((k, w), cms) => - val maxPerReducer = ((cms.totalCount * localMaxReducerFraction) / localNumReducers) + 1 - val maxReplicas = (cms.frequency(Bytes(localSer(k))).estimate.toDouble / maxPerReducer) - //if the frequency is 0, maxReplicas.ceil will be 0 so we will filter out this key entirely - //if it's < maxPerReducer, the ceil will round maxReplicas up to 1 to ensure we still see it - val replicas = fn(maxReplicas.ceil.toInt.min(localNumReducers)) - replicas.map{ i => (i, k) -> w } + pipe.cross(left.sketch).flatMap { case ((k, w), cms) => + val maxPerReducer = ((cms.totalCount * localMaxReducerFraction) / localNumReducers) + 1 + val maxReplicas = cms.frequency(Bytes(localSer(k))).estimate.toDouble / maxPerReducer + //if the frequency is 0, maxReplicas.ceil will be 0 so we will filter out this key entirely + //if it's < maxPerReducer, the ceil will round maxReplicas up to 1 to ensure we still see it + val replicas = fn(maxReplicas.ceil.toInt.min(localNumReducers)) + replicas.map(i => (i, k) -> w) } } val toTypedPipe: TypedPipe[(K, R)] = { lazy val rand = new scala.util.Random(left.seed) - val lhs = flatMapWithReplicas(left.pipe){ n => (rand.nextInt(n) + 1) :: Nil } - val rhs = flatMapWithReplicas(right){ n => 1.to(n) } + val lhs = flatMapWithReplicas(left.pipe)(n => (rand.nextInt(n) + 1) :: Nil) + val rhs = flatMapWithReplicas(right)(n => 1.to(n)) - lhs - .group - .cogroup(rhs.group){ (k, itv, itu) => itv.flatMap{ v => joiner(k._2, v, itu) } } + lhs.group + .cogroup(rhs.group)((k, itv, itu) => itv.flatMap(v => joiner(k._2, v, itu))) .withReducers(numReducers) - .map{ case ((r, k), v) => (k, v) } + .map { case ((r, k), v) => (k, v) } } private implicit def intKeyOrd: Ordering[(Int, K)] = { val kord = implicitly[Ordering[K]] kord match { - case kos: OrderedSerialization[_] => new OrderedSerialization2(ordSer[Int], kos.asInstanceOf[OrderedSerialization[K]]) + case 
kos: OrderedSerialization[_] => + new OrderedSerialization2(ordSer[Int], kos.asInstanceOf[OrderedSerialization[K]]) case _ => Ordering.Tuple2[Int, K] } } @@ -129,7 +129,8 @@ case class SketchJoined[K: Ordering, V, V2, R](left: Sketched[K, V], } object SketchJoined { - implicit def toTypedPipe[K, V, V2, R](joined: SketchJoined[K, V, V2, R]): TypedPipe[(K, R)] = joined.toTypedPipe + implicit def toTypedPipe[K, V, V2, R](joined: SketchJoined[K, V, V2, R]): TypedPipe[(K, R)] = + joined.toTypedPipe implicit def toTypedPipeKeyed[K, V, V2, R](joined: SketchJoined[K, V, V2, R]): TypedPipe.Keyed[K, R] = new TypedPipe.Keyed(joined.toTypedPipe) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TDsl.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TDsl.scala index 445f3690cb..333a5f116a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TDsl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/TDsl.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed import java.io.Serializable @@ -24,10 +24,9 @@ import cascading.tuple.Fields import com.twitter.scalding._ /** - * implicits for the type-safe DSL - * import TDsl._ to get the implicit conversions from Grouping/CoGrouping to Pipe, - * to get the .toTypedPipe method on standard cascading Pipes. - * to get automatic conversion of Mappable[T] to TypedPipe[T] + * implicits for the type-safe DSL import TDsl._ to get the implicit conversions from Grouping/CoGrouping to + * Pipe, to get the .toTypedPipe method on standard cascading Pipes. to get automatic conversion of + * Mappable[T] to TypedPipe[T] */ object TDsl extends Serializable with GeneratedTupleAdders { implicit def pipeTExtensions(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): PipeTExtensions = @@ -60,7 +59,9 @@ class PipeTExtensions(pipe: Pipe, flowDef: FlowDef, mode: Mode) extends Serializ * } * The above sums all the tuples and returns a TypedPipe[Int] which has the total sum. */ - def typed[T, U](fielddef: (Fields, Fields))(fn: TypedPipe[T] => TypedPipe[U])(implicit conv: TupleConverter[T], setter: TupleSetter[U]): Pipe = + def typed[T, U](fielddef: (Fields, Fields))( + fn: TypedPipe[T] => TypedPipe[U] + )(implicit conv: TupleConverter[T], setter: TupleSetter[U]): Pipe = fn(TypedPipe.from(pipe, fielddef._1)(flowDef, mode, conv)).toPipe(fielddef._2)(flowDef, mode, setter) def toTypedPipe[T](fields: Fields)(implicit conv: TupleConverter[T]): TypedPipe[T] = diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TemplatePartition.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TemplatePartition.scala index da620f5808..1a8bd50bd0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TemplatePartition.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/TemplatePartition.scala @@ -18,7 +18,7 @@ package typed import scala.collection.JavaConverters._ import cascading.tap.partition.Partition -import cascading.tuple.{ Fields, TupleEntry } +import cascading.tuple.{Fields, TupleEntry} /** * Creates a partition using the given template string. 
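// A hedged sketch of the fields-API to typed-API bridge that TDsl (a few hunks above)
// provides, as used inside a Job where the implicit FlowDef and Mode it needs are in scope.
// The source, field names and output path are made up for illustration.
import com.twitter.scalding._
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.typed.TDsl._

class BridgeJob(args: Args) extends Job(args) {
  val raw: cascading.pipe.Pipe = Tsv("scores.tsv", ('user, 'score)).read

  // lift the two fields into the typed API and stay typed from here on
  val totals: TypedPipe[(String, Long)] =
    raw.toTypedPipe[(String, Long)](('user, 'score)).sumByKey.toTypedPipe

  totals.write(TypedTsv[(String, Long)]("totals.tsv"))
}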
@@ -28,9 +28,10 @@ import cascading.tuple.{ Fields, TupleEntry } case class TemplatePartition(partitionFields: Fields, template: String) extends Partition { assert( partitionFields.size == "%s".r.findAllIn(template).length, - "Number of partition fields %s does not correspond to template (%s)".format(partitionFields, template)) + "Number of partition fields %s does not correspond to template (%s)".format(partitionFields, template) + ) - /** Regex pattern created from the template to extract the partition values from a path.*/ + /** Regex pattern created from the template to extract the partition values from a path. */ lazy val pattern = template.replaceAll("%s", "(.*)").r.pattern /** Returns the path depth. In this case the number of partition fields. */ @@ -40,8 +41,7 @@ case class TemplatePartition(partitionFields: Fields, template: String) extends override def getPartitionFields(): Fields = partitionFields /** - * Converts the given partition string to field values and populates the supplied tuple entry - * with it. + * Converts the given partition string to field values and populates the supplied tuple entry with it. */ override def toTuple(partition: String, tupleEntry: TupleEntry): Unit = { val m = pattern.matcher(partition) @@ -51,8 +51,7 @@ case class TemplatePartition(partitionFields: Fields, template: String) extends } /** - * Given the specified tuple entry fill in the supplied template entry to create the partition - * path. + * Given the specified tuple entry fill in the supplied template entry to create the partition path. */ override def toPartition(tupleEntry: TupleEntry): String = { val fields = tupleEntry.asIterableOf(classOf[String]).asScala.toList diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala index a49269f0d4..994330761a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala @@ -10,19 +10,36 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
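// A plain-Scala illustration (template and values made up) of the two directions
// TemplatePartition implements above: String.format fills the template to build a partition
// path (toPartition), and the regex derived from the template pulls the values back out of a
// path (toTuple). The arity assert mirrors the one in the case class.
val template = "col1=%s/col2=%s"
assert("%s".r.findAllIn(template).length == 2)

// toPartition direction: fill the template from the field values
val path = template.format("a", "x") // "col1=a/col2=x"

// toTuple direction: recover the values from the path with the derived pattern
val pattern = template.replaceAll("%s", "(.*)").r
val pattern(c1, c2) = path
assert(c1 == "a" && c2 == "x")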
-*/ + */ package com.twitter.scalding.typed -import java.io.{ OutputStream, InputStream, Serializable } +import java.io.{InputStream, OutputStream, Serializable} import cascading.flow.FlowDef import cascading.pipe.Pipe import cascading.tuple.Fields -import com.twitter.algebird.{ Aggregator, Batched, Monoid, Semigroup } +import com.twitter.algebird.{Aggregator, Batched, Monoid, Semigroup} import com.twitter.scalding.TupleConverter.singleConverter import com.twitter.scalding._ -import com.twitter.scalding.typed.functions.{ AsLeft, AsRight, Constant, ConstantKey, DropValue1, Identity, MakeKey, GetKey, GetValue, RandomFilter, RandomNextInt, Swap, TuplizeFunction, WithConstant, PartialFunctionToFilter, SubTypes } -import com.twitter.scalding.serialization.{ EquivSerialization, OrderedSerialization, UnitOrderedSerialization } +import com.twitter.scalding.typed.functions.{ + AsLeft, + AsRight, + Constant, + ConstantKey, + DropValue1, + GetKey, + GetValue, + Identity, + MakeKey, + PartialFunctionToFilter, + RandomFilter, + RandomNextInt, + SubTypes, + Swap, + TuplizeFunction, + WithConstant +} +import com.twitter.scalding.serialization.{EquivSerialization, OrderedSerialization, UnitOrderedSerialization} import com.twitter.scalding.serialization.OrderedSerialization.Result import com.twitter.scalding.serialization.macros.impl.BinaryOrdering import com.twitter.scalding.serialization.macros.impl.BinaryOrdering._ @@ -32,27 +49,33 @@ import scala.util.Try import scala.util.hashing.MurmurHash3 /** - * factory methods for TypedPipe, which is the typed representation of distributed lists in scalding. - * This object is here rather than in the typed package because a lot of code was written using - * the functions in the object, which we do not see how to hide with package object tricks. + * factory methods for TypedPipe, which is the typed representation of distributed lists in scalding. This + * object is here rather than in the typed package because a lot of code was written using the functions in + * the object, which we do not see how to hide with package object tricks. */ object TypedPipe extends Serializable { /** - * Create a TypedPipe from a cascading Pipe, some Fields and the type T - * Avoid this if you can. Prefer from(TypedSource). + * Create a TypedPipe from a cascading Pipe, some Fields and the type T Avoid this if you can. Prefer + * from(TypedSource). */ - def from[T](pipe: Pipe, fields: Fields)(implicit flowDef: FlowDef, mode: Mode, conv: TupleConverter[T]): TypedPipe[T] = { + def from[T](pipe: Pipe, fields: Fields)(implicit + flowDef: FlowDef, + mode: Mode, + conv: TupleConverter[T] + ): TypedPipe[T] = { /* * This could be in TypedSource, but we don't want to encourage users * to work directly with Pipe */ - case class WrappingSource[T](pipe: Pipe, - fields: Fields, - @transient localFlow: FlowDef, // FlowDef is not serializable. We shouldn't need to, but being paranoid - mode: Mode, - conv: TupleConverter[T]) extends TypedSource[T] { + case class WrappingSource[T]( + pipe: Pipe, + fields: Fields, + @transient localFlow: FlowDef, // FlowDef is not serializable. We shouldn't need to, but being paranoid + mode: Mode, + conv: TupleConverter[T] + ) extends TypedSource[T] { def converter[U >: T]: TupleConverter[U] = TupleConverter.asSuperConverter[T, U](conv) @@ -60,8 +83,10 @@ object TypedPipe extends Serializable { def read(implicit fd: FlowDef, m: Mode): Pipe = { // This check is not likely to fail unless someone does something really strange. 
// for historical reasons, it is not checked by the typed system - require(m == mode, - s"Cannot switch Mode between TypedPipe.from and toPipe calls. Pipe: $pipe, pipe mode: $m, outer mode: $mode") + require( + m == mode, + s"Cannot switch Mode between TypedPipe.from and toPipe calls. Pipe: $pipe, pipe mode: $m, outer mode: $mode" + ) Dsl.flowDefToRichFlowDef(fd).mergeFrom(localFlow) pipe } @@ -86,25 +111,22 @@ object TypedPipe extends Serializable { if (iter.isEmpty) empty else IterablePipe[T](iter) /** - * Input must be a Pipe with exactly one Field - * Avoid this method and prefer from(TypedSource) if possible + * Input must be a Pipe with exactly one Field Avoid this method and prefer from(TypedSource) if possible */ def fromSingleField[T](pipe: Pipe)(implicit fd: FlowDef, mode: Mode): TypedPipe[T] = from(pipe, new Fields(0))(fd, mode, singleConverter[T]) /** - * Create an empty TypedPipe. This is sometimes useful when a method must return - * a TypedPipe, but sometimes at runtime we can check a condition and see that - * it should be empty. - * This is the zero of the Monoid[TypedPipe] + * Create an empty TypedPipe. This is sometimes useful when a method must return a TypedPipe, but sometimes + * at runtime we can check a condition and see that it should be empty. This is the zero of the + * Monoid[TypedPipe] */ def empty: TypedPipe[Nothing] = EmptyTypedPipe /** - * This enables pipe.hashJoin(that) or pipe.join(that) syntax - * This is a safe enrichment because hashJoinable and CoGroupable are - * only used in the argument position or to give cogroup, join, leftJoin, rightJoin, outerJoin - * methods. Since those methods are unlikely to be used on TypedPipe in the future, this + * This enables pipe.hashJoin(that) or pipe.join(that) syntax This is a safe enrichment because hashJoinable + * and CoGroupable are only used in the argument position or to give cogroup, join, leftJoin, rightJoin, + * outerJoin methods. Since those methods are unlikely to be used on TypedPipe in the future, this * enrichment seems safe. * * This method is the Vitaly-was-right method. 
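// A hedged sketch of the factory methods in this hunk: TypedPipe.from for in-memory data and
// TypedPipe.empty as the zero of merging, so a helper can return "no data" without callers
// special-casing it. The values and the helper are made up for illustration.
import com.twitter.scalding.typed.TypedPipe

def maybeExtras(include: Boolean): TypedPipe[Int] =
  if (include) TypedPipe.from(List(100, 200)) else TypedPipe.empty

// merging with empty contributes nothing, and EmptyIsOftenNoOp later removes it from the plan
val all: TypedPipe[Int] = TypedPipe.from(List(1, 2, 3)) ++ maybeExtras(include = false)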
@@ -134,8 +156,8 @@ object TypedPipe extends Serializable { if (sz == 0) TypedPipe.empty else if (sz == 1) ps(0) else { - val left = combine(ps.take(sz/2)) - val right = combine(ps.drop(sz/2)) + val left = combine(ps.take(sz / 2)) + val right = combine(ps.drop(sz / 2)) left ++ right } } @@ -145,7 +167,7 @@ object TypedPipe extends Serializable { private case object IdentityOrdering extends OrderedSerialization[Int] with EquivSerialization[Int] { val delegate = BinaryOrdering.ordSer[Int] - + override def compareBinary(a: InputStream, b: InputStream): Result = delegate.compareBinary(a, b) override def compare(x: Int, y: Int): Int = delegate.compare(x, y) override def dynamicSize(t: Int): Option[Int] = delegate.dynamicSize(t) @@ -155,100 +177,115 @@ object TypedPipe extends Serializable { override def hash(x: Int): Int = x } - final case class CoGroupedPipe[K, V](@transient cogrouped: CoGrouped[K, V]) extends TypedPipe[(K, V)] - final case class CounterPipe[A](pipe: TypedPipe[(A, Iterable[((String, String), Long)])]) extends TypedPipe[A] - final case class CrossPipe[T, U](left: TypedPipe[T], right: TypedPipe[U]) extends TypedPipe[(T, U)] { - def viaHashJoin: TypedPipe[(T, U)] = - left.withKey(()).hashJoin(right.withKey(())).values - } - final case class CrossValue[T, U](left: TypedPipe[T], right: ValuePipe[U]) extends TypedPipe[(T, U)] { - def viaHashJoin: TypedPipe[(T, U)] = - right match { - case EmptyValue => - EmptyTypedPipe - case LiteralValue(v) => - left.map(WithConstant(v)) - case ComputedValue(pipe) => - CrossPipe(left, pipe) - } - } - final case class DebugPipe[T](input: TypedPipe[T]) extends TypedPipe[T] - final case class FilterKeys[K, V](input: TypedPipe[(K, V)], @transient fn: K => Boolean) extends TypedPipe[(K, V)] - final case class Filter[T](input: TypedPipe[T], @transient fn: T => Boolean) extends TypedPipe[T] - final case class FlatMapValues[K, V, U](input: TypedPipe[(K, V)], @transient fn: V => TraversableOnce[U]) extends TypedPipe[(K, U)] - final case class FlatMapped[T, U](input: TypedPipe[T], @transient fn: T => TraversableOnce[U]) extends TypedPipe[U] - final case class ForceToDisk[T](input: TypedPipe[T]) extends TypedPipe[T] - final case class Fork[T](input: TypedPipe[T]) extends TypedPipe[T] - final case class HashCoGroup[K, V, W, R](left: TypedPipe[(K, V)], @transient right: HashJoinable[K, W], @transient joiner: (K, V, Iterable[W]) => Iterator[R]) extends TypedPipe[(K, R)] - final case class IterablePipe[T](iterable: Iterable[T]) extends TypedPipe[T] - final case class MapValues[K, V, U](input: TypedPipe[(K, V)], @transient fn: V => U) extends TypedPipe[(K, U)] - final case class Mapped[T, U](input: TypedPipe[T], @transient fn: T => U) extends TypedPipe[U] - final case class MergedTypedPipe[T](left: TypedPipe[T], right: TypedPipe[T]) extends TypedPipe[T] - final case class ReduceStepPipe[K, V1, V2](@transient reduce: ReduceStep[K, V1, V2]) extends TypedPipe[(K, V2)] - final case class SourcePipe[T](@transient source: TypedSource[T]) extends TypedPipe[T] - final case class SumByLocalKeys[K, V](input: TypedPipe[(K, V)], @transient semigroup: Semigroup[V]) extends TypedPipe[(K, V)] - final case class TrappedPipe[T](input: TypedPipe[T], @transient sink: Source with TypedSink[T], @transient conv: TupleConverter[T]) extends TypedPipe[T] - /** - * descriptions carry a boolean that is true if we should deduplicate the message. 
- * This is used for line numbers which are otherwise often duplicated - */ - final case class WithDescriptionTypedPipe[T](input: TypedPipe[T], descriptions: List[(String, Boolean)]) extends TypedPipe[T] - final case class WithOnComplete[T](input: TypedPipe[T], @transient fn: () => Unit) extends TypedPipe[T] - - case object EmptyTypedPipe extends TypedPipe[Nothing] { - // we can't let the default TypedPipe == go here, it will stack overflow on a pattern match - override def equals(that: Any): Boolean = - that match { - case e: EmptyTypedPipe.type => true - case _ => false - } - } - - implicit class InvariantTypedPipe[T](val pipe: TypedPipe[T]) extends AnyVal { - /** - * Returns the set of distinct elements in the TypedPipe - * This is the same as: .map((_, ())).group.sum.keys - * If you want a distinct while joining, consider: - * instead of: - * {@code - * a.join(b.distinct.asKeys) - * } - * manually do the distinct: - * {@code - * a.join(b.asKeys.sum) - * } - * The latter creates 1 map/reduce phase rather than 2 - */ - @annotation.implicitNotFound(msg = "For distinct method to work, the type in TypedPipe must have an Ordering.") - def distinct(implicit ord: Ordering[T]): TypedPipe[T] = - pipe.asKeys.sum.keys - - /** - * If any errors happen below this line, but before a groupBy, write to a TypedSink - */ - @deprecated("semantics of addTrap are hard to follow, prefer to use Either and manually write out error branchs", "0.18.0") - def addTrap(trapSink: Source with TypedSink[T])(implicit conv: TupleConverter[T]): TypedPipe[T] = - TypedPipe.TrappedPipe[T](pipe, trapSink, conv).withLine - } - - /** - * This is where all the methods that require TypedPipe[(K, V)] live. - * - * previously, these were directly on TypedPipe with the use of T <:< (K, V) - * however that complicates type inference on many functions. 
- */ + final case class CoGroupedPipe[K, V](@transient cogrouped: CoGrouped[K, V]) extends TypedPipe[(K, V)] + final case class CounterPipe[A](pipe: TypedPipe[(A, Iterable[((String, String), Long)])]) + extends TypedPipe[A] + final case class CrossPipe[T, U](left: TypedPipe[T], right: TypedPipe[U]) extends TypedPipe[(T, U)] { + def viaHashJoin: TypedPipe[(T, U)] = + left.withKey(()).hashJoin(right.withKey(())).values + } + final case class CrossValue[T, U](left: TypedPipe[T], right: ValuePipe[U]) extends TypedPipe[(T, U)] { + def viaHashJoin: TypedPipe[(T, U)] = + right match { + case EmptyValue => + EmptyTypedPipe + case LiteralValue(v) => + left.map(WithConstant(v)) + case ComputedValue(pipe) => + CrossPipe(left, pipe) + } + } + final case class DebugPipe[T](input: TypedPipe[T]) extends TypedPipe[T] + final case class FilterKeys[K, V](input: TypedPipe[(K, V)], @transient fn: K => Boolean) + extends TypedPipe[(K, V)] + final case class Filter[T](input: TypedPipe[T], @transient fn: T => Boolean) extends TypedPipe[T] + final case class FlatMapValues[K, V, U](input: TypedPipe[(K, V)], @transient fn: V => TraversableOnce[U]) + extends TypedPipe[(K, U)] + final case class FlatMapped[T, U](input: TypedPipe[T], @transient fn: T => TraversableOnce[U]) + extends TypedPipe[U] + final case class ForceToDisk[T](input: TypedPipe[T]) extends TypedPipe[T] + final case class Fork[T](input: TypedPipe[T]) extends TypedPipe[T] + final case class HashCoGroup[K, V, W, R]( + left: TypedPipe[(K, V)], + @transient right: HashJoinable[K, W], + @transient joiner: (K, V, Iterable[W]) => Iterator[R] + ) extends TypedPipe[(K, R)] + final case class IterablePipe[T](iterable: Iterable[T]) extends TypedPipe[T] + final case class MapValues[K, V, U](input: TypedPipe[(K, V)], @transient fn: V => U) + extends TypedPipe[(K, U)] + final case class Mapped[T, U](input: TypedPipe[T], @transient fn: T => U) extends TypedPipe[U] + final case class MergedTypedPipe[T](left: TypedPipe[T], right: TypedPipe[T]) extends TypedPipe[T] + final case class ReduceStepPipe[K, V1, V2](@transient reduce: ReduceStep[K, V1, V2]) + extends TypedPipe[(K, V2)] + final case class SourcePipe[T](@transient source: TypedSource[T]) extends TypedPipe[T] + final case class SumByLocalKeys[K, V](input: TypedPipe[(K, V)], @transient semigroup: Semigroup[V]) + extends TypedPipe[(K, V)] + final case class TrappedPipe[T]( + input: TypedPipe[T], + @transient sink: Source with TypedSink[T], + @transient conv: TupleConverter[T] + ) extends TypedPipe[T] + + /** + * descriptions carry a boolean that is true if we should deduplicate the message. 
This is used for line + * numbers which are otherwise often duplicated + */ + final case class WithDescriptionTypedPipe[T](input: TypedPipe[T], descriptions: List[(String, Boolean)]) + extends TypedPipe[T] + final case class WithOnComplete[T](input: TypedPipe[T], @transient fn: () => Unit) extends TypedPipe[T] + + case object EmptyTypedPipe extends TypedPipe[Nothing] { + // we can't let the default TypedPipe == go here, it will stack overflow on a pattern match + override def equals(that: Any): Boolean = + that match { + case e: EmptyTypedPipe.type => true + case _ => false + } + } + + implicit class InvariantTypedPipe[T](val pipe: TypedPipe[T]) extends AnyVal { + + /** + * Returns the set of distinct elements in the TypedPipe This is the same as: .map((_, ())).group.sum.keys + * If you want a distinct while joining, consider: instead of: {@code + * a.join(b.distinct.asKeys) } manually do the distinct: {@code + * a.join(b.asKeys.sum) } The latter creates 1 map/reduce phase rather than 2 + */ + @annotation.implicitNotFound( + msg = "For distinct method to work, the type in TypedPipe must have an Ordering." + ) + def distinct(implicit ord: Ordering[T]): TypedPipe[T] = + pipe.asKeys.sum.keys + + /** + * If any errors happen below this line, but before a groupBy, write to a TypedSink + */ + @deprecated( + "semantics of addTrap are hard to follow, prefer to use Either and manually write out error branchs", + "0.18.0" + ) + def addTrap(trapSink: Source with TypedSink[T])(implicit conv: TupleConverter[T]): TypedPipe[T] = + TypedPipe.TrappedPipe[T](pipe, trapSink, conv).withLine + } + + /** + * This is where all the methods that require TypedPipe[(K, V)] live. + * + * previously, these were directly on TypedPipe with the use of T <:< (K, V) however that complicates type + * inference on many functions. + */ implicit class Keyed[K, V](val kvpipe: TypedPipe[(K, V)]) extends AnyVal { /** - * Sometimes useful for implementing custom joins with groupBy + mapValueStream when you know - * that the value/key can fit in memory. Beware. + * Sometimes useful for implementing custom joins with groupBy + mapValueStream when you know that the + * value/key can fit in memory. Beware. */ def eitherValues[R](that: TypedPipe[(K, R)]): TypedPipe[(K, Either[V, R])] = mapValues(AsLeft[V, R]()) ++ (that.mapValues(AsRight[V, R]())) /** - * If T is a (K, V) for some V, then we can use this function to filter. - * Prefer to use this if your filter only touches the key. + * If T is a (K, V) for some V, then we can use this function to filter. Prefer to use this if your filter + * only touches the key. * * This is here to match the function in KeyedListLike, where it is optimized */ @@ -260,17 +297,19 @@ object TypedPipe extends Serializable { TypedPipe.FlatMapValues(kvpipe, f).withLine /** - * flatten just the values - * This is more useful on KeyedListLike, but added here to reduce assymmetry in the APIs + * flatten just the values This is more useful on KeyedListLike, but added here to reduce assymmetry in + * the APIs */ def flattenValues[U](implicit ev: V <:< TraversableOnce[U]): TypedPipe[(K, U)] = { val st = SubTypes.tuple2_2[K, V, TraversableOnce[U]](SubTypes.fromEv(ev)) - kvpipe.widen(st.toEv) + kvpipe + .widen(st.toEv) .flatMapValues[U](Identity[TraversableOnce[U]]()) } /** - * This is the default means of grouping all pairs with the same key. Generally this triggers 1 Map/Reduce transition + * This is the default means of grouping all pairs with the same key. 
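// A hedged sketch of the default `group` path described above, as a tiny word count;
// the input data is made up.
import com.twitter.scalding.typed.TypedPipe

object GroupSumSketch {
  val words: TypedPipe[String] = TypedPipe.from(List("to", "be", "or", "not", "to", "be"))
  val counts: TypedPipe[(String, Long)] =
    words
      .map(w => (w, 1L)) // key every word
      .group             // Grouped[String, Long]: one map/reduce step
      .sum               // add up the 1L counts per key
      .toTypedPipe
}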
Generally this triggers 1 Map/Reduce + * transition */ def group(implicit ord: Ordering[K]): Grouped[K, V] = Grouped(kvpipe.withLine) @@ -279,16 +318,16 @@ object TypedPipe extends Serializable { def groupWith(ord: Ordering[K]): Grouped[K, V] = group(ord) /** - * These operations look like joins, but they do not force any communication - * of the current TypedPipe. They are mapping operations where this pipe is streamed - * through one item at a time. + * These operations look like joins, but they do not force any communication of the current TypedPipe. + * They are mapping operations where this pipe is streamed through one item at a time. * - * WARNING These behave semantically very differently than cogroup. - * This is because we handle (K,V) tuples on the left as we see them. - * The iterable on the right is over all elements with a matching key K, and it may be empty - * if there are no values for this key K. + * WARNING These behave semantically very differently than cogroup. This is because we handle (K,V) tuples + * on the left as we see them. The iterable on the right is over all elements with a matching key K, and + * it may be empty if there are no values for this key K. */ - def hashCogroup[K1 >: K, W, R](smaller: HashJoinable[K1, W])(joiner: (K1, V, Iterable[W]) => Iterator[R]): TypedPipe[(K1, R)] = + def hashCogroup[K1 >: K, W, R](smaller: HashJoinable[K1, W])( + joiner: (K1, V, Iterable[W]) => Iterator[R] + ): TypedPipe[(K1, R)] = TypedPipe.HashCoGroup(kvpipe.widen[(K1, V)], smaller, joiner).withLine /** Do an inner-join without shuffling this TypedPipe, but replicating argument to all tasks */ @@ -308,24 +347,23 @@ object TypedPipe extends Serializable { TypedPipe.MapValues(kvpipe, f).withLine /** - * Enables joining when this TypedPipe has some keys with many many values and - * but many with very few values. For instance, a graph where some nodes have - * millions of neighbors, but most have only a few. + * Enables joining when this TypedPipe has some keys with many many values and but many with very few + * values. For instance, a graph where some nodes have millions of neighbors, but most have only a few. * - * We build a (count-min) sketch of each key's frequency, and we use that - * to shard the heavy keys across many reducers. - * This increases communication cost in order to reduce the maximum time needed - * to complete the join. + * We build a (count-min) sketch of each key's frequency, and we use that to shard the heavy keys across + * many reducers. This increases communication cost in order to reduce the maximum time needed to complete + * the join. * - * {@code pipe.sketch(100).join(thatPipe) } - * will add an extra map/reduce job over a standard join to create the count-min-sketch. - * This will generally only be beneficial if you have really heavy skew, where without - * this you have 1 or 2 reducers taking hours longer than the rest. + * {@code pipe.sketch(100).join(thatPipe) } will add an extra map/reduce job over a standard join to + * create the count-min-sketch. This will generally only be beneficial if you have really heavy skew, + * where without this you have 1 or 2 reducers taking hours longer than the rest. 
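// A hedged sketch of the sketch-based skewed join documented above; the pipes, the
// reducer count, and the String-to-bytes serialization are assumptions.
import com.twitter.scalding.typed.TypedPipe

object SketchJoinSketch {
  val follows: TypedPipe[(String, String)] =
    TypedPipe.from(List(("celebrity", "fan1"), ("celebrity", "fan2"), ("niche", "fan3")))
  val profiles: TypedPipe[(String, Int)] =
    TypedPipe.from(List(("celebrity", 99), ("niche", 1)))

  // the count-min sketch needs a way to turn keys into bytes
  implicit val utf8: String => Array[Byte] = _.getBytes("UTF-8")

  // adds one extra map/reduce step to build the sketch, then shards the hot keys
  val joined = follows.sketch(reducers = 100).join(profiles)
}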
*/ - def sketch(reducers: Int, - eps: Double = 1.0E-5, //272k width = 1MB per row - delta: Double = 0.01, //5 rows (= 5 hashes) - seed: Int = 12345)(implicit serialization: K => Array[Byte], ordering: Ordering[K]): Sketched[K, V] = + def sketch( + reducers: Int, + eps: Double = 1.0e-5, //272k width = 1MB per row + delta: Double = 0.01, //5 rows (= 5 hashes) + seed: Int = 12345 + )(implicit serialization: K => Array[Byte], ordering: Ordering[K]): Sketched[K, V] = Sketched(kvpipe, reducers, delta, eps, seed) /** @@ -335,18 +373,15 @@ object TypedPipe extends Serializable { group.sum[V] /** - * This does a sum of values WITHOUT triggering a shuffle. - * the contract is, if followed by a group.sum the result is the same - * with or without this present, and it never increases the number of - * items. BUT due to the cost of caching, it might not be faster if - * there is poor key locality. + * This does a sum of values WITHOUT triggering a shuffle. the contract is, if followed by a group.sum the + * result is the same with or without this present, and it never increases the number of items. BUT due to + * the cost of caching, it might not be faster if there is poor key locality. * - * It is only useful for expert tuning, - * and best avoided unless you are struggling with performance problems. - * If you are not sure you need this, you probably don't. + * It is only useful for expert tuning, and best avoided unless you are struggling with performance + * problems. If you are not sure you need this, you probably don't. * - * The main use case is to reduce the values down before a key expansion - * such as is often done in a data cube. + * The main use case is to reduce the values down before a key expansion such as is often done in a data + * cube. */ def sumByLocalKeys(implicit sg: Semigroup[V]): TypedPipe[(K, V)] = TypedPipe.SumByLocalKeys(kvpipe, sg).withLine @@ -360,36 +395,45 @@ object TypedPipe extends Serializable { kvpipe.map(GetValue()) } - private case class TallyByFn[A](group: String, fn: A => String) extends Function1[A, (A, Iterable[((String, String), Long)])] { + private case class TallyByFn[A](group: String, fn: A => String) + extends Function1[A, (A, Iterable[((String, String), Long)])] { def apply(a: A) = (a, (((group, fn(a)), 1L)) :: Nil) } - private case class TallyFn[A](group: String, counter: String) extends Function1[A, (A, Iterable[((String, String), Long)])] { + private case class TallyFn[A](group: String, counter: String) + extends Function1[A, (A, Iterable[((String, String), Long)])] { private[this] val inc = ((group, counter), 1L) :: Nil def apply(a: A) = (a, inc) } - private case class TallyLeft[A, B](group: String, fn: A => Either[String, B]) extends Function1[A, (List[B], Iterable[((String, String), Long)])] { + private case class TallyLeft[A, B](group: String, fn: A => Either[String, B]) + extends Function1[A, (List[B], Iterable[((String, String), Long)])] { def apply(a: A) = fn(a) match { - case Right(b) => (b :: Nil, Nil) + case Right(b) => (b :: Nil, Nil) case Left(cnt) => (Nil, ((group, cnt), 1L) :: Nil) } } - implicit class TallyEnrichment[A, B <: Iterable[((String, String), Long)]](val pipe: TypedPipe[(A, B)]) extends AnyVal { + implicit class TallyEnrichment[A, B <: Iterable[((String, String), Long)]](val pipe: TypedPipe[(A, B)]) + extends AnyVal { + /** - * Increment hadoop counters with a (group, counter) by the amount in the second - * part of the tuple, and remove that second part + * Increment hadoop counters with a (group, counter) by the amount in the 
second part of the tuple, and + * remove that second part */ def tally: TypedPipe[A] = CounterPipe(pipe) } /** - * This is a def because it allocates a new memo on each call. This is - * important to avoid growing a memo indefinitely + * This is a def because it allocates a new memo on each call. This is important to avoid growing a memo + * indefinitely */ private def eqFn: RefPair[TypedPipe[Any], TypedPipe[Any]] => Boolean = { - def eqCoGroupable(left: CoGroupable[_, _], right: CoGroupable[_, _], rec: RefPair[TypedPipe[_], TypedPipe[_]] => Boolean): Boolean = { + def eqCoGroupable( + left: CoGroupable[_, _], + right: CoGroupable[_, _], + rec: RefPair[TypedPipe[_], TypedPipe[_]] => Boolean + ): Boolean = { import CoGrouped._ (left, right) match { case (Pair(la, lb, lfn), Pair(ra, rb, rfn)) => @@ -408,13 +452,21 @@ object TypedPipe extends Serializable { } } - def eqHashJoinable(left: HashJoinable[_, _], right: HashJoinable[_, _], rec: RefPair[TypedPipe[_], TypedPipe[_]] => Boolean): Boolean = + def eqHashJoinable( + left: HashJoinable[_, _], + right: HashJoinable[_, _], + rec: RefPair[TypedPipe[_], TypedPipe[_]] => Boolean + ): Boolean = (left, right) match { case (lrs: ReduceStep[_, _, _], rrs: ReduceStep[_, _, _]) => eqReduceStep(lrs, rrs, rec) } - def eqReduceStep(left: ReduceStep[_, _, _], right: ReduceStep[_, _, _], rec: RefPair[TypedPipe[_], TypedPipe[_]] => Boolean): Boolean = { + def eqReduceStep( + left: ReduceStep[_, _, _], + right: ReduceStep[_, _, _], + rec: RefPair[TypedPipe[_], TypedPipe[_]] => Boolean + ): Boolean = { val zeroLeft = ReduceStep.setInput(left, EmptyTypedPipe) val zeroRight = ReduceStep.setInput(right, EmptyTypedPipe) @@ -433,7 +485,7 @@ object TypedPipe extends Serializable { // have to deconstruct values val valEq = (valueA, valueB) match { case (ComputedValue(pA), ComputedValue(pB)) => rec(RefPair(pA, pB)) - case (l, r) => l == r + case (l, r) => l == r } valEq && rec(RefPair(pipeA, pipeB)) case (RefPair(DebugPipe(left), DebugPipe(right)), rec) => @@ -456,7 +508,7 @@ object TypedPipe extends Serializable { rec(RefPair(left, right)) case (RefPair(HashCoGroup(leftA, rightA, fnA), HashCoGroup(leftB, rightB, fnB)), rec) => (fnA == fnB) && rec(RefPair(leftA, leftB)) && eqHashJoinable(rightA, rightB, rec) - case (RefPair(IterablePipe(itA), IterablePipe(itB)), _) => itA == itB + case (RefPair(IterablePipe(itA), IterablePipe(itB)), _) => itA == itB case (RefPair(MapValues(leftIn, leftF), MapValues(rightIn, rightF)), rec) => // check the non-pipes first: (leftF == rightF) && rec(RefPair(leftIn, rightIn)) @@ -472,24 +524,27 @@ object TypedPipe extends Serializable { (leftSg == rightSg) && rec(RefPair(leftIn, rightIn)) case (RefPair(TrappedPipe(inA, sinkA, convA), TrappedPipe(inB, sinkB, convB)), rec) => (sinkA == sinkB) && (convA == convB) && rec(RefPair(inA, inB)) - case (RefPair(WithDescriptionTypedPipe(leftIn, leftDesc), WithDescriptionTypedPipe(rightIn, rightDesc)), rec) => + case ( + RefPair(WithDescriptionTypedPipe(leftIn, leftDesc), WithDescriptionTypedPipe(rightIn, rightDesc)), + rec + ) => // check the non-pipes first: (leftDesc == rightDesc) && rec(RefPair(leftIn, rightIn)) case (RefPair(WithOnComplete(leftIn, leftF), WithOnComplete(rightIn, rightF)), rec) => // check the non-pipes first: (leftF == rightF) && rec(RefPair(leftIn, rightIn)) case (RefPair(EmptyTypedPipe, EmptyTypedPipe), _) => true - case _ => false // we don't match on which subtype we are + case _ => false // we don't match on which subtype we are } } } /** - * Think of a TypedPipe as a 
distributed unordered list that may or may not yet - * have been materialized in memory or disk. + * Think of a TypedPipe as a distributed unordered list that may or may not yet have been materialized in + * memory or disk. * - * Represents a phase in a distributed computation on an input data source - * Wraps a cascading Pipe object, and holds the transformation done up until that point + * Represents a phase in a distributed computation on an input data source Wraps a cascading Pipe object, and + * holds the transformation done up until that point */ sealed abstract class TypedPipe[+T] extends Serializable with Product { @@ -521,9 +576,8 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { } /** - * Increment diagnostic counters by 1 for each item in the pipe. - * The counter group will be the same for each item, the counter name - * is determined by the result of the `fn` passed in. + * Increment diagnostic counters by 1 for each item in the pipe. The counter group will be the same for each + * item, the counter name is determined by the result of the `fn` passed in. */ def tallyBy(group: String)(fn: T => String): TypedPipe[T] = map(TypedPipe.TallyByFn(group, fn)).tally @@ -537,52 +591,46 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { map(TypedPipe.TallyFn(group, counter)).tally /** - * Increment a diagnostic counter for each failure. This is like map, - * where the `fn` should return a `Right[U]` for each successful transformation - * and a `Left[String]` for each failure, with the String describing the failure. - * Each failure will be counted, and the result is just the successes. + * Increment a diagnostic counter for each failure. This is like map, where the `fn` should return a + * `Right[U]` for each successful transformation and a `Left[String]` for each failure, with the String + * describing the failure. Each failure will be counted, and the result is just the successes. */ def tallyLeft[B](group: String)(fn: T => Either[String, B]): TypedPipe[B] = map(TypedPipe.TallyLeft(group, fn)).tally.flatten /** - * Implements a cross product. The right side should be tiny - * This gives the same results as - * {code for { l <- list1; l2 <- list2 } yield (l, l2) } + * Implements a cross product. The right side should be tiny This gives the same results as {code for { l <- + * list1; l2 <- list2 } yield (l, l2) } */ def cross[U](tiny: TypedPipe[U]): TypedPipe[(T, U)] = TypedPipe.CrossPipe(this, tiny).withLine /** - * This is the fundamental mapper operation. - * It behaves in a way similar to List.flatMap, which means that each - * item is fed to the input function, which can return 0, 1, or many outputs - * (as a TraversableOnce) per input. The returned results will be iterated through once - * and then flattened into a single TypedPipe which is passed to the next step in the - * pipeline. + * This is the fundamental mapper operation. It behaves in a way similar to List.flatMap, which means that + * each item is fed to the input function, which can return 0, 1, or many outputs (as a TraversableOnce) per + * input. The returned results will be iterated through once and then flattened into a single TypedPipe + * which is passed to the next step in the pipeline. 
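// A hedged sketch of tallyLeft as documented above: count failures under a
// hypothetical "parse" counter group while keeping only the successes.
import com.twitter.scalding.typed.TypedPipe

object TallyLeftSketch {
  val raw: TypedPipe[String] = TypedPipe.from(List("1", "two", "3"))
  val parsed: TypedPipe[Int] =
    raw.tallyLeft("parse") { s =>
      try Right(s.toInt)
      catch { case _: NumberFormatException => Left("not_an_int") }
    }
}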
* - * This behavior makes it a powerful operator -- it can be used to filter records - * (by returning 0 items for a given input), it can be used the way map is used - * (by returning 1 item per input), it can be used to explode 1 input into many outputs, - * or even a combination of all of the above at once. + * This behavior makes it a powerful operator -- it can be used to filter records (by returning 0 items for + * a given input), it can be used the way map is used (by returning 1 item per input), it can be used to + * explode 1 input into many outputs, or even a combination of all of the above at once. */ def flatMap[U](f: T => TraversableOnce[U]): TypedPipe[U] = TypedPipe.FlatMapped(this, f).withLine /** - * Export back to a raw cascading Pipe. useful for interop with the scalding - * Fields API or with Cascading code. - * Avoid this if possible. Prefer to write to TypedSink. + * Export back to a raw cascading Pipe. useful for interop with the scalding Fields API or with Cascading + * code. Avoid this if possible. Prefer to write to TypedSink. */ - final def toPipe[U >: T](fieldNames: Fields)(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = + final def toPipe[U >: T]( + fieldNames: Fields + )(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = // we have to be cafeful to pass the setter we want since a low priority implicit can always be // found :( cascading_backend.CascadingBackend.toPipe[U](withLine, fieldNames)(flowDef, mode, setter) /** - * Merge two TypedPipes (no order is guaranteed) - * This is only realized when a group (or join) is - * performed. + * Merge two TypedPipes (no order is guaranteed) This is only realized when a group (or join) is performed. */ def ++[U >: T](other: TypedPipe[U]): TypedPipe[U] = TypedPipe.MergedTypedPipe(this, other).withLine @@ -590,8 +638,8 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { /** * Aggregate all items in this pipe into a single ValuePipe * - * Aggregators are composable reductions that allow you to glue together - * several reductions and process them in one pass. + * Aggregators are composable reductions that allow you to glue together several reductions and process them + * in one pass. * * Same as groupAll.aggregate.values */ @@ -599,10 +647,12 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { ComputedValue(groupAll.aggregate(agg).values) /** - * Put the items in this into the keys, and unit as the value in a Group - * in some sense, this is the dual of groupAll + * Put the items in this into the keys, and unit as the value in a Group in some sense, this is the dual of + * groupAll */ - @annotation.implicitNotFound(msg = "For asKeys method to work, the type in TypedPipe must have an Ordering.") + @annotation.implicitNotFound( + msg = "For asKeys method to work, the type in TypedPipe must have an Ordering." + ) def asKeys[U >: T](implicit ord: Ordering[U]): Grouped[U, Unit] = widen[U] .withValue(()) @@ -627,10 +677,7 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { SubTypes.fromEv(ev).liftCo[TypedPipe](this) /** - * Filter and map. See scala.collection.List.collect. - * {@code - * collect { case Some(x) => fn(x) } - * } + * Filter and map. See scala.collection.List.collect. 
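// A hedged sketch of collect as filter-plus-map, per the comment above; the
// Option-valued input is made up.
import com.twitter.scalding.typed.TypedPipe

object CollectSketch {
  val maybeIds: TypedPipe[Option[Int]] = TypedPipe.from(List(Some(1), None, Some(3)))
  val ids: TypedPipe[Int] = maybeIds.collect { case Some(id) => id }
}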
{@code collect { case Some(x) => fn(x) } } */ def collect[U](fn: PartialFunction[T, U]): TypedPipe[U] = filter(PartialFunctionToFilter(fn)).map(fn) @@ -652,12 +699,14 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { /** * Returns the set of distinct elements identified by a given lambda extractor in the TypedPipe */ - @annotation.implicitNotFound(msg = "For distinctBy method to work, the type to distinct on in the TypedPipe must have an Ordering.") + @annotation.implicitNotFound( + msg = "For distinctBy method to work, the type to distinct on in the TypedPipe must have an Ordering." + ) def distinctBy[U](fn: T => U, numReducers: Option[Int] = None)(implicit ord: Ordering[U]): TypedPipe[T] = { val op = groupBy(fn).head val reduced = numReducers match { case Some(red) => op.withReducers(red) - case None => op + case None => op } reduced.map(GetValue()) } @@ -667,12 +716,9 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { map(AsLeft()) ++ (that.map(AsRight())) /** - * If you are going to create two branches or forks, - * it may be more efficient to call this method first - * which will create a node in the cascading graph. - * Without this, both full branches of the fork will be - * put into separate cascading pipes, which can, in some cases, - * be slower. + * If you are going to create two branches or forks, it may be more efficient to call this method first + * which will create a node in the cascading graph. Without this, both full branches of the fork will be put + * into separate cascading pipes, which can, in some cases, be slower. * * Ideally the planner would see this */ @@ -698,8 +744,8 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { def withFilter(f: T => Boolean): TypedPipe[T] = filter(f) /** - * Keep only items that don't satisfy the predicate. - * `filterNot` is the same as `filter` with a negated predicate. + * Keep only items that don't satisfy the predicate. `filterNot` is the same as `filter` with a negated + * predicate. */ def filterNot(f: T => Boolean): TypedPipe[T] = filter(!f(_)) @@ -709,15 +755,13 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { widen[TraversableOnce[U]].flatMap(Identity[TraversableOnce[U]]()) /** - * Force a materialization of this pipe prior to the next operation. - * This is useful if you filter almost everything before a hashJoin, for instance. - * This is useful for experts who see some heuristic of the planner causing - * slower performance. + * Force a materialization of this pipe prior to the next operation. This is useful if you filter almost + * everything before a hashJoin, for instance. This is useful for experts who see some heuristic of the + * planner causing slower performance. */ def forceToDisk: TypedPipe[T] = TypedPipe.ForceToDisk(this).withLine - /** Send all items to a single reducer */ def groupAll: Grouped[Unit, T] = groupBy(Constant(()))(UnitOrderedSerialization).withReducers(1) @@ -727,11 +771,10 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { map(MakeKey(g)).group /** - * Forces a shuffle by randomly assigning each item into one - * of the partitions. + * Forces a shuffle by randomly assigning each item into one of the partitions. * - * This is for the case where you mappers take a long time, and - * it is faster to shuffle them to more reducers and then operate. + * This is for the case where you mappers take a long time, and it is faster to shuffle them to more + * reducers and then operate. 
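// A hedged sketch of distinctBy from above: deduplicate made-up records on a single
// field that has an Ordering.
import com.twitter.scalding.typed.TypedPipe

object DistinctBySketch {
  case class User(id: Long, name: String)

  val users: TypedPipe[User] =
    TypedPipe.from(List(User(1L, "ann"), User(1L, "ann again"), User(2L, "bob")))

  // keeps one arbitrary record per id; pass numReducers = Some(n) to size the reduce step
  val oneUserPerId: TypedPipe[User] = users.distinctBy(_.id)
}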
* * You probably want shard if you are just forcing a shuffle. */ @@ -750,17 +793,16 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { } private[this] def defaultSeed: Long = System.identityHashCode(this) * 2654435761L ^ System.currentTimeMillis + /** - * Sample a fraction (between 0 and 1) uniformly independently at random each element of the pipe - * does not require a reduce step. - * This method makes sure to fix the seed, otherwise restarts cause subtle errors. + * Sample a fraction (between 0 and 1) uniformly independently at random each element of the pipe does not + * require a reduce step. This method makes sure to fix the seed, otherwise restarts cause subtle errors. */ def sample(fraction: Double): TypedPipe[T] = sample(fraction, defaultSeed) /** - * Sample a fraction (between 0 and 1) uniformly independently at random each element of the pipe with - * a given seed. - * Does not require a reduce step. + * Sample a fraction (between 0 and 1) uniformly independently at random each element of the pipe with a + * given seed. Does not require a reduce step. */ def sample(fraction: Double, seed: Long): TypedPipe[T] = { require(0.0 <= fraction && fraction <= 1.0, s"got $fraction which is an invalid fraction") @@ -768,74 +810,72 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { } /** - * Used to force a shuffle into a given size of nodes. - * Only use this if your mappers are taking far longer than - * the time to shuffle. + * Used to force a shuffle into a given size of nodes. Only use this if your mappers are taking far longer + * than the time to shuffle. */ def shard(partitions: Int): TypedPipe[T] = groupRandomly(partitions).forceToReducers.values /** - * Reasonably common shortcut for cases of total associative/commutative reduction - * returns a ValuePipe with only one element if there is any input, otherwise EmptyValue. + * Reasonably common shortcut for cases of total associative/commutative reduction returns a ValuePipe with + * only one element if there is any input, otherwise EmptyValue. */ def sum[U >: T](implicit plus: Semigroup[U]): ValuePipe[U] = { // every 1000 items, compact. lazy implicit val batchedSG: Semigroup[Batched[U]] = Batched.compactingSemigroup[U](1000) // TODO: literals like this defeat caching in the planner - ComputedValue(map { t => ((), Batched[U](t)) } - .sumByLocalKeys - // remove the Batched before going to the reducers - // TODO: literals like this defeat caching in the planner - .map { case (_, batched) => batched.sum } - .groupAll - .forceToReducers - .sum - .values) + ComputedValue( + map(t => ((), Batched[U](t))).sumByLocalKeys + // remove the Batched before going to the reducers + // TODO: literals like this defeat caching in the planner + .map { case (_, batched) => batched.sum } + .groupAll + .forceToReducers + .sum + .values + ) } /** - * This is used when you are working with Execution[T] to create loops. - * You might do this to checkpoint and then flatMap Execution to continue - * from there. Probably only useful if you need to flatMap it twice to fan - * out the data into two children jobs. + * This is used when you are working with Execution[T] to create loops. You might do this to checkpoint and + * then flatMap Execution to continue from there. Probably only useful if you need to flatMap it twice to + * fan out the data into two children jobs. 
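// A hedged sketch of the checkpoint-then-fan-out pattern described above; the
// TypedTsv paths and the transformations are assumptions.
import com.twitter.scalding.{Execution, TypedTsv}
import com.twitter.scalding.typed.TypedPipe

object ForceToDiskSketch {
  def fanOut(input: TypedPipe[Int]): Execution[Unit] =
    input.map(_ * 2).forceToDiskExecution.flatMap { cached =>
      // `cached` re-reads the checkpoint, so the map above runs only once
      val threes = cached.filter(_ % 3 == 0).writeExecution(TypedTsv[Int]("out/threes"))
      val fives = cached.filter(_ % 5 == 0).writeExecution(TypedTsv[Int]("out/fives"))
      threes.zip(fives).unit
    }
}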
* - * This writes the current TypedPipe into a temporary file - * and then opens it after complete so that you can continue from that point + * This writes the current TypedPipe into a temporary file and then opens it after complete so that you can + * continue from that point */ def forceToDiskExecution: Execution[TypedPipe[T]] = Execution.forceToDisk(this) /** - * This gives an Execution that when run evaluates the TypedPipe, - * writes it to disk, and then gives you an Iterable that reads from - * disk on the submit node each time .iterator is called. - * Because of how scala Iterables work, mapping/flatMapping/filtering - * the Iterable forces a read of the entire thing. If you need it to - * be lazy, call .iterator and use the Iterator inside instead. + * This gives an Execution that when run evaluates the TypedPipe, writes it to disk, and then gives you an + * Iterable that reads from disk on the submit node each time .iterator is called. Because of how scala + * Iterables work, mapping/flatMapping/filtering the Iterable forces a read of the entire thing. If you need + * it to be lazy, call .iterator and use the Iterator inside instead. */ def toIterableExecution: Execution[Iterable[T]] = Execution.toIterable(this) /** use a TupleUnpacker to flatten U out into a cascading Tuple */ - def unpackToPipe[U >: T](fieldNames: Fields)(implicit fd: FlowDef, mode: Mode, up: TupleUnpacker[U]): Pipe = { + def unpackToPipe[U >: T]( + fieldNames: Fields + )(implicit fd: FlowDef, mode: Mode, up: TupleUnpacker[U]): Pipe = { val setter = up.newSetter(fieldNames) toPipe[U](fieldNames)(fd, mode, setter) } /** - * This attaches a function that is called at the end of the map phase on - * EACH of the tasks that are executing. - * This is for expert use only. You probably won't ever need it. Try hard - * to avoid it. Execution also has onComplete that can run when an Execution - * has completed. + * This attaches a function that is called at the end of the map phase on EACH of the tasks that are + * executing. This is for expert use only. You probably won't ever need it. Try hard to avoid it. Execution + * also has onComplete that can run when an Execution has completed. */ def onComplete(fn: () => Unit): TypedPipe[T] = TypedPipe.WithOnComplete[T](this, fn).withLine /** - * Safely write to a TypedSink[T]. If you want to write to a Source (not a Sink) - * you need to do something like: toPipe(fieldNames).write(dest) - * @return a pipe equivalent to the current pipe. + * Safely write to a TypedSink[T]. If you want to write to a Source (not a Sink) you need to do something + * like: toPipe(fieldNames).write(dest) + * @return + * a pipe equivalent to the current pipe. */ def write(dest: TypedSink[T])(implicit flowDef: FlowDef, mode: Mode): TypedPipe[T] = { // We do want to record the line number that this occurred at @@ -845,23 +885,21 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { } /** - * This is the functionally pure approach to building jobs. Note, - * that you have to call run on the result or flatMap/zip it - * into an Execution that is run for anything to happen here. + * This is the functionally pure approach to building jobs. Note, that you have to call run on the result or + * flatMap/zip it into an Execution that is run for anything to happen here. */ def writeExecution(dest: TypedSink[T]): Execution[Unit] = Execution.write(this, dest) /** - * If you want to write to a specific location, and then read from - * that location going forward, use this. 
+ * If you want to write to a specific location, and then read from that location going forward, use this. */ def writeThrough[U >: T](dest: TypedSink[T] with TypedSource[U]): Execution[TypedPipe[U]] = Execution.write(this, dest, TypedPipe.from(dest)) /** - * If you want to writeThrough to a specific file if it doesn't already exist, - * and otherwise just read from it going forward, use this. + * If you want to writeThrough to a specific file if it doesn't already exist, and otherwise just read from + * it going forward, use this. */ def make[U >: T](dest: Source with TypedSink[T] with TypedSource[U]): Execution[TypedPipe[U]] = Execution.getMode.flatMap { mode => @@ -873,15 +911,14 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { } } - /** - * ValuePipe may be empty, so, this attaches it as an Option - * cross is the same as leftCross(p).collect { case (t, Some(v)) => (t, v) } + * ValuePipe may be empty, so, this attaches it as an Option cross is the same as leftCross(p).collect { + * case (t, Some(v)) => (t, v) } */ def leftCross[V](p: ValuePipe[V]): TypedPipe[(T, Option[V])] = p match { - case EmptyValue => map(WithConstant(None)) - case LiteralValue(v) => map(WithConstant(Some(v))) + case EmptyValue => map(WithConstant(None)) + case LiteralValue(v) => map(WithConstant(Some(v))) case ComputedValue(pipe) => leftCross(pipe) } @@ -890,48 +927,29 @@ sealed abstract class TypedPipe[+T] extends Serializable with Product { withKey(()).hashLeftJoin(thatPipe.withKey(())).values /** - * common pattern of attaching a value and then map - * recommended style: - * {@code - * mapWithValue(vpu) { - * case (t, Some(u)) => op(t, u) - * case (t, None) => // if you never expect this: - * sys.error("unexpected empty value pipe") - * } - * } + * common pattern of attaching a value and then map recommended style: {@code mapWithValue(vpu) { case (t, + * Some(u)) => op(t, u) case (t, None) => // if you never expect this: sys.error("unexpected empty value + * pipe") } } */ def mapWithValue[U, V](value: ValuePipe[U])(f: (T, Option[U]) => V): TypedPipe[V] = leftCross(value).map(TuplizeFunction(f)) /** - * common pattern of attaching a value and then flatMap - * recommended style: - * {@code - * flatMapWithValue(vpu) { - * case (t, Some(u)) => op(t, u) - * case (t, None) => // if you never expect this: - * sys.error("unexpected empty value pipe") - * } - * } + * common pattern of attaching a value and then flatMap recommended style: {@code flatMapWithValue(vpu) { + * case (t, Some(u)) => op(t, u) case (t, None) => // if you never expect this: sys.error("unexpected empty + * value pipe") } } */ def flatMapWithValue[U, V](value: ValuePipe[U])(f: (T, Option[U]) => TraversableOnce[V]): TypedPipe[V] = leftCross(value).flatMap(TuplizeFunction(f)) /** - * common pattern of attaching a value and then filter - * recommended style: - * {@code - * filterWithValue(vpu) { - * case (t, Some(u)) => op(t, u) - * case (t, None) => // if you never expect this: - * sys.error("unexpected empty value pipe") - * } - * } + * common pattern of attaching a value and then filter recommended style: {@code filterWithValue(vpu) { case + * (t, Some(u)) => op(t, u) case (t, None) => // if you never expect this: sys.error("unexpected empty value + * pipe") } } */ def filterWithValue[U](value: ValuePipe[U])(f: (T, Option[U]) => Boolean): TypedPipe[T] = leftCross(value).filter(TuplizeFunction(f)).map(GetKey()) - /** * For each element, do a map-side (hash) left join to look up a value */ @@ -944,21 +962,29 @@ sealed 
abstract class TypedPipe[+T] extends Serializable with Product { } /** - * This class is for the syntax enrichment enabling - * .joinBy on TypedPipes. To access this, do - * import Syntax.joinOnMappablePipe + * This class is for the syntax enrichment enabling .joinBy on TypedPipes. To access this, do import + * Syntax.joinOnMappablePipe */ class MappablePipeJoinEnrichment[T](pipe: TypedPipe[T]) { - def joinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit ord: Ordering[K]): CoGrouped[K, (T, U)] = pipe.groupBy(g).withReducers(reducers).join(smaller.groupBy(h)) - def leftJoinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit ord: Ordering[K]): CoGrouped[K, (T, Option[U])] = pipe.groupBy(g).withReducers(reducers).leftJoin(smaller.groupBy(h)) - def rightJoinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit ord: Ordering[K]): CoGrouped[K, (Option[T], U)] = pipe.groupBy(g).withReducers(reducers).rightJoin(smaller.groupBy(h)) - def outerJoinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit ord: Ordering[K]): CoGrouped[K, (Option[T], Option[U])] = pipe.groupBy(g).withReducers(reducers).outerJoin(smaller.groupBy(h)) + def joinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit + ord: Ordering[K] + ): CoGrouped[K, (T, U)] = pipe.groupBy(g).withReducers(reducers).join(smaller.groupBy(h)) + def leftJoinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit + ord: Ordering[K] + ): CoGrouped[K, (T, Option[U])] = pipe.groupBy(g).withReducers(reducers).leftJoin(smaller.groupBy(h)) + def rightJoinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit + ord: Ordering[K] + ): CoGrouped[K, (Option[T], U)] = pipe.groupBy(g).withReducers(reducers).rightJoin(smaller.groupBy(h)) + def outerJoinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit + ord: Ordering[K] + ): CoGrouped[K, (Option[T], Option[U])] = + pipe.groupBy(g).withReducers(reducers).outerJoin(smaller.groupBy(h)) } /** - * These are named syntax extensions that users can optionally import. - * Avoid import Syntax._ + * These are named syntax extensions that users can optionally import. Avoid import Syntax._ */ object Syntax { - implicit def joinOnMappablePipe[T](p: TypedPipe[T]): MappablePipeJoinEnrichment[T] = new MappablePipeJoinEnrichment(p) + implicit def joinOnMappablePipe[T](p: TypedPipe[T]): MappablePipeJoinEnrichment[T] = + new MappablePipeJoinEnrichment(p) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipeDiff.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipeDiff.scala index 0c648d1a53..727479276e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipeDiff.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipeDiff.scala @@ -5,40 +5,47 @@ import scala.reflect.ClassTag /** * Some methods for comparing two typed pipes and finding out the difference between them. * - * Has support for the normal case where the typed pipes are pipes of objects usable as keys - * in scalding (have an ordering, proper equals and hashCode), as well as some special cases - * for dealing with Arrays and thrift objects. 
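// A hedged sketch of the optional joinBy syntax described above; the import path and
// the record types are assumptions.
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.typed.Syntax.joinOnMappablePipe

object JoinBySketch {
  case class Order(userId: Long, amount: Double)
  case class User(id: Long, name: String)

  val orders: TypedPipe[Order] = TypedPipe.from(List(Order(1L, 9.99)))
  val users: TypedPipe[User] = TypedPipe.from(List(User(1L, "ann")))

  // joins on the extracted keys: order.userId == user.id
  val byUser = orders.joinBy(users)(_.userId, _.id)
}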
+ * Has support for the normal case where the typed pipes are pipes of objects usable as keys in scalding (have + * an ordering, proper equals and hashCode), as well as some special cases for dealing with Arrays and thrift + * objects. * - * See diffByHashCode for comparing typed pipes of objects that have no ordering but a stable hash code - * (such as Scrooge thrift). + * See diffByHashCode for comparing typed pipes of objects that have no ordering but a stable hash code (such + * as Scrooge thrift). * * See diffByGroup for comparing typed pipes of objects that have no ordering *and* an unstable hash code. */ object TypedPipeDiff { /** - * Returns a mapping from T to a count of the occurrences of T in the left and right pipes, - * only for cases where the counts are not equal. + * Returns a mapping from T to a count of the occurrences of T in the left and right pipes, only for cases + * where the counts are not equal. * - * Requires that T have an ordering and a hashCode and equals that is stable across JVMs (not reference based). - * See diffArrayPipes for diffing pipes of arrays, since arrays do not meet these requirements by default. + * Requires that T have an ordering and a hashCode and equals that is stable across JVMs (not reference + * based). See diffArrayPipes for diffing pipes of arrays, since arrays do not meet these requirements by + * default. */ - def diff[T: Ordering](left: TypedPipe[T], right: TypedPipe[T], reducers: Option[Int] = None): UnsortedGrouped[T, (Long, Long)] = { - val lefts = left.map { x => (x, (1L, 0L)) } - val rights = right.map { x => (x, (0L, 1L)) } + def diff[T: Ordering]( + left: TypedPipe[T], + right: TypedPipe[T], + reducers: Option[Int] = None + ): UnsortedGrouped[T, (Long, Long)] = { + val lefts = left.map(x => (x, (1L, 0L))) + val rights = right.map(x => (x, (0L, 1L))) val counts = (lefts ++ rights).sumByKey val diff = counts.filter { case (key, (lCount, rCount)) => lCount != rCount } reducers.map(diff.withReducers).getOrElse(diff) } /** - * Same as diffByHashCode, but takes care to wrap the Array[T] in a wrapper, - * which has the correct hashCode and equals needed. This does not involve - * copying the arrays, just wrapping them, and is specialized for primitive arrays. + * Same as diffByHashCode, but takes care to wrap the Array[T] in a wrapper, which has the correct hashCode + * and equals needed. This does not involve copying the arrays, just wrapping them, and is specialized for + * primitive arrays. */ - def diffArrayPipes[T: ClassTag](left: TypedPipe[Array[T]], - right: TypedPipe[Array[T]], - reducers: Option[Int] = None): TypedPipe[(Array[T], (Long, Long))] = { + def diffArrayPipes[T: ClassTag]( + left: TypedPipe[Array[T]], + right: TypedPipe[Array[T]], + reducers: Option[Int] = None + ): TypedPipe[(Array[T], (Long, Long))] = { // cache this instead of reflecting on every single array val wrapFn = HashEqualsArrayWrapper.wrapByClassTagFn[T] @@ -50,37 +57,32 @@ object TypedPipeDiff { /** * NOTE: Prefer diff over this method if you can find or construct an Ordering[T]. * - * Returns a mapping from T to a count of the occurrences of T in the left and right pipes, - * only for cases where the counts are not equal. + * Returns a mapping from T to a count of the occurrences of T in the left and right pipes, only for cases + * where the counts are not equal. * - * This implementation does not require an ordering on T, but does require a function (groupByFn) - * that extracts a value of type K (which has an ordering) from a record of type T. 
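// A hedged sketch of TypedPipeDiff.diff as documented above; the two pipes are made up.
import com.twitter.scalding.typed.{TypedPipe, TypedPipeDiff}

object DiffSketch {
  val before: TypedPipe[String] = TypedPipe.from(List("a", "a", "b"))
  val after: TypedPipe[String] = TypedPipe.from(List("a", "b", "b"))

  // yields ("a", (2L, 1L)) and ("b", (1L, 2L)); keys with equal counts are dropped
  val changed = TypedPipeDiff.diff(before, after)
}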
+ * This implementation does not require an ordering on T, but does require a function (groupByFn) that + * extracts a value of type K (which has an ordering) from a record of type T. * - * The groupByFn should be something that partitions records as evenly as possible, - * because all unique records that result in the same groupByFn value will be materialized into an in memory map. + * The groupByFn should be something that partitions records as evenly as possible, because all unique + * records that result in the same groupByFn value will be materialized into an in memory map. * - * groupByFn must be a pure function, such that: - * x == y implies that groupByFn(x) == groupByFn(y) + * groupByFn must be a pure function, such that: x == y implies that groupByFn(x) == groupByFn(y) * - * T must have a hash code suitable for use in a hash map on a single JVM (doesn't have to be stable cross JVM) - * K must have a hash code this *is* stable across JVMs. - * K must have an ordering. + * T must have a hash code suitable for use in a hash map on a single JVM (doesn't have to be stable cross + * JVM) K must have a hash code this *is* stable across JVMs. K must have an ordering. * - * Example groupByFns would be x => x.hashCode, assuming x's hashCode is stable across jvms, - * or maybe x => x.timestamp, if x's hashCode is not stable, assuming there's shouldn't be too - * many records with the same timestamp. + * Example groupByFns would be x => x.hashCode, assuming x's hashCode is stable across jvms, or maybe x => + * x.timestamp, if x's hashCode is not stable, assuming there's shouldn't be too many records with the same + * timestamp. */ - def diffByGroup[T, K: Ordering]( - left: TypedPipe[T], - right: TypedPipe[T], - reducers: Option[Int] = None)(groupByFn: T => K): TypedPipe[(T, (Long, Long))] = { + def diffByGroup[T, K: Ordering](left: TypedPipe[T], right: TypedPipe[T], reducers: Option[Int] = None)( + groupByFn: T => K + ): TypedPipe[(T, (Long, Long))] = { - val lefts = left.map { t => (groupByFn(t), Map(t -> (1L, 0L))) } - val rights = right.map { t => (groupByFn(t), Map(t -> (0L, 1L))) } + val lefts = left.map(t => (groupByFn(t), Map(t -> (1L, 0L)))) + val rights = right.map(t => (groupByFn(t), Map(t -> (0L, 1L)))) - val diff = (lefts ++ rights) - .sumByKey - .flattenValues + val diff = (lefts ++ rights).sumByKey.flattenValues .filter { case (k, (t, (lCount, rCount))) => lCount != rCount } reducers.map(diff.withReducers).getOrElse(diff).values @@ -94,26 +96,34 @@ object TypedPipeDiff { * This method does an exact diff, it does not use the hashCode as a proxy for equality. 
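// A hedged sketch of diffByHashCode for types with no Ordering, per the note above;
// the record type stands in for e.g. a Scrooge thrift struct.
import com.twitter.scalding.typed.{TypedPipe, TypedPipeDiff}

object DiffByHashCodeSketch {
  case class Event(id: Long, kind: String)

  val left: TypedPipe[Event] = TypedPipe.from(List(Event(1L, "click")))
  val right: TypedPipe[Event] = TypedPipe.from(List(Event(1L, "view")))

  val changed: TypedPipe[(Event, (Long, Long))] = TypedPipeDiff.diffByHashCode(left, right)
}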
*/ def diffByHashCode[T]( - left: TypedPipe[T], - right: TypedPipe[T], - reducers: Option[Int] = None): TypedPipe[(T, (Long, Long))] = diffByGroup(left, right, reducers)(_.hashCode) + left: TypedPipe[T], + right: TypedPipe[T], + reducers: Option[Int] = None + ): TypedPipe[(T, (Long, Long))] = diffByGroup(left, right, reducers)(_.hashCode) object Enrichments { implicit class Diff[T](val left: TypedPipe[T]) extends AnyVal { - def diff(right: TypedPipe[T], reducers: Option[Int] = None)(implicit ev: Ordering[T]): UnsortedGrouped[T, (Long, Long)] = + def diff(right: TypedPipe[T], reducers: Option[Int] = None)(implicit + ev: Ordering[T] + ): UnsortedGrouped[T, (Long, Long)] = TypedPipeDiff.diff(left, right, reducers) - def diffByGroup[K: Ordering](right: TypedPipe[T], reducers: Option[Int] = None)(groupByFn: T => K): TypedPipe[(T, (Long, Long))] = + def diffByGroup[K: Ordering](right: TypedPipe[T], reducers: Option[Int] = None)( + groupByFn: T => K + ): TypedPipe[(T, (Long, Long))] = TypedPipeDiff.diffByGroup(left, right, reducers)(groupByFn) - def diffByHashCode(right: TypedPipe[T], reducers: Option[Int] = None): TypedPipe[(T, (Long, Long))] = TypedPipeDiff.diffByHashCode(left, right, reducers) + def diffByHashCode(right: TypedPipe[T], reducers: Option[Int] = None): TypedPipe[(T, (Long, Long))] = + TypedPipeDiff.diffByHashCode(left, right, reducers) } implicit class DiffArray[T](val left: TypedPipe[Array[T]]) extends AnyVal { - def diffArrayPipes(right: TypedPipe[Array[T]], reducers: Option[Int] = None)(implicit ev: ClassTag[T]): TypedPipe[(Array[T], (Long, Long))] = + def diffArrayPipes(right: TypedPipe[Array[T]], reducers: Option[Int] = None)(implicit + ev: ClassTag[T] + ): TypedPipe[(Array[T], (Long, Long))] = TypedPipeDiff.diffArrayPipes(left, right, reducers) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSink.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSink.scala index d00b7cfde6..89b4df844b 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSink.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSink.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed @@ -23,10 +23,10 @@ import cascading.pipe.Pipe import cascading.tuple.Fields object TypedSink extends java.io.Serializable { + /** - * Build a TypedSink by declaring a concrete type for the Source - * Here because of the late addition of TypedSink to scalding to make it - * easier to port legacy code + * Build a TypedSink by declaring a concrete type for the Source Here because of the late addition of + * TypedSink to scalding to make it easier to port legacy code */ def apply[T](s: Source)(implicit tset: TupleSetter[T]): TypedSink[T] = new TypedSink[T] { @@ -45,8 +45,8 @@ trait TypedSink[-T] extends java.io.Serializable { def sinkFields: Fields = Dsl.intFields(0 until setter.arity) /** - * pipe is assumed to have the schema above, otherwise an error may occur - * The exact same pipe is returned to match the legacy Source API. + * pipe is assumed to have the schema above, otherwise an error may occur The exact same pipe is returned to + * match the legacy Source API. 
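// A hedged sketch of TypedSink.apply wrapping a legacy fields-based Source, as
// described above; Tsv and the output path are assumptions.
import com.twitter.scalding.Tsv
import com.twitter.scalding.typed.{TypedPipe, TypedSink}

object LegacySinkSketch {
  val counts: TypedPipe[(String, Long)] = TypedPipe.from(List(("a", 2L)))
  val sink: TypedSink[(String, Long)] = TypedSink[(String, Long)](Tsv("out/counts"))
  // the adapted sink can be used anywhere a TypedSink is expected, e.g. counts.writeExecution(sink)
}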
*/ def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe @@ -63,4 +63,3 @@ trait TypedSink[-T] extends java.io.Serializable { } } } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSource.scala index 1c6409b594..57fbc5f46c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSource.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed @@ -23,11 +23,10 @@ import cascading.pipe.Pipe import cascading.tuple.Fields trait TypedSource[+T] extends java.io.Serializable { + /** - * Because TupleConverter cannot be covariant, we need to jump through this hoop. - * A typical implementation might be: - * (implicit conv: TupleConverter[T]) - * and then: + * Because TupleConverter cannot be covariant, we need to jump through this hoop. A typical implementation + * might be: (implicit conv: TupleConverter[T]) and then: * * override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv) */ @@ -37,8 +36,8 @@ trait TypedSource[+T] extends java.io.Serializable { def sourceFields: Fields = Dsl.intFields(0 until converter.arity) /** - * Transform this TypedSource into another by mapping after. - * We don't call this map because of conflicts with Mappable, unfortunately + * Transform this TypedSource into another by mapping after. We don't call this map because of conflicts + * with Mappable, unfortunately */ def andThen[U](fn: T => U): TypedSource[U] = { val self = this // compiler generated self can cause problems with serialization diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala index 0f1e9c8c48..70b78b9d8e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed import com.twitter.scalding.Execution @@ -28,13 +28,12 @@ object ValuePipe extends java.io.Serializable { } /** - * ValuePipe is special case of a TypedPipe of just a optional single element. - * It is like a distribute Option type - * It allows to perform scalar based operations on pipes like normalization. + * ValuePipe is special case of a TypedPipe of just a optional single element. It is like a distribute Option + * type It allows to perform scalar based operations on pipes like normalization. */ sealed trait ValuePipe[+T] extends java.io.Serializable { def leftCross[U](that: ValuePipe[U]): ValuePipe[(T, Option[U])] = that match { - case EmptyValue => map((_, None)) + case EmptyValue => map((_, None)) case LiteralValue(v2) => map((_, Some(v2))) // We don't know if a computed value is empty or not. 
We need to run the MR job: case _ => ComputedValue(toTypedPipe.leftCross(that)) @@ -44,11 +43,10 @@ sealed trait ValuePipe[+T] extends java.io.Serializable { def map[U](fn: T => U): ValuePipe[U] def filter(fn: T => Boolean): ValuePipe[T] + /** - * Identical to toOptionExecution.map(_.get) - * The result will be an exception if there is no value. - * The name here follows the convention of adding - * Execution to the name so in the repl in is removed + * Identical to toOptionExecution.map(_.get) The result will be an exception if there is no value. The name + * here follows the convention of adding Execution to the name so in the repl in is removed */ def getExecution: Execution[T] = toOptionExecution.flatMap { case Some(t) => Execution.from(t) @@ -58,26 +56,22 @@ sealed trait ValuePipe[+T] extends java.io.Serializable { } /** - * Like the above, but with a lazy parameter that is evaluated - * if the value pipe is empty - * The name here follows the convention of adding - * Execution to the name so in the repl in is removed + * Like the above, but with a lazy parameter that is evaluated if the value pipe is empty The name here + * follows the convention of adding Execution to the name so in the repl in is removed */ def getOrElseExecution[U >: T](t: => U): Execution[U] = toOptionExecution.map(_.getOrElse(t)) def toTypedPipe: TypedPipe[T] /** - * Convert this value to an Option. It is an error if somehow - * this is not either empty or has one value. - * The name here follows the convention of adding - * Execution to the name so in the repl in is removed + * Convert this value to an Option. It is an error if somehow this is not either empty or has one value. The + * name here follows the convention of adding Execution to the name so in the repl in is removed */ def toOptionExecution: Execution[Option[T]] = toTypedPipe.toIterableExecution.map { it => it.iterator.take(2).toList match { - case Nil => None + case Nil => None case h :: Nil => Some(h) - case items => sys.error("More than 1 item in an ValuePipe: " + items.toString) + case items => sys.error("More than 1 item in an ValuePipe: " + items.toString) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/WithDescription.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/WithDescription.scala index 816c068739..e8d4ee8f4a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/WithDescription.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/WithDescription.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed /** @@ -26,12 +26,13 @@ trait HasDescription { * Used for objects that may _set_ a description to be used in .dot and MR step names. */ trait WithDescription[+This <: WithDescription[This]] extends HasDescription { self: This => + /** never mutates this, instead returns a new item. 
*/ def withDescription(description: String): This def withDescription(descriptionOpt: Option[String]): This = descriptionOpt match { case Some(description) => withDescription(description) - case None => self + case None => self } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/WithReducers.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/WithReducers.scala index 6efe25d947..ebff2c078f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/WithReducers.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/WithReducers.scala @@ -12,31 +12,31 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed import java.io.Serializable /** - * used for types that may know how many reducers they need - * e.g. CoGrouped, Grouped, SortedGrouped, UnsortedGrouped + * used for types that may know how many reducers they need e.g. CoGrouped, Grouped, SortedGrouped, + * UnsortedGrouped */ trait HasReducers { def reducers: Option[Int] } /** - * used for types that must know how many reducers they need - * e.g. Sketched + * used for types that must know how many reducers they need e.g. Sketched */ trait MustHaveReducers extends HasReducers { def reducers: Some[Int] } /** - * used for objects that may _set_ how many reducers they need - * e.g. CoGrouped, Grouped, SortedGrouped, UnsortedGrouped + * used for objects that may _set_ how many reducers they need e.g. CoGrouped, Grouped, SortedGrouped, + * UnsortedGrouped */ trait WithReducers[+This <: WithReducers[This]] extends HasReducers { + /** never mutates this, instead returns a new item. */ def withReducers(reds: Int): This } @@ -49,7 +49,7 @@ object WithReducers extends Serializable { def maybeWithReducers[W <: WithReducers[W]](w: W, reds: Option[Int]): W = reds match { - case None => w + case None => w case Some(r) => w.withReducers(r) } @@ -58,9 +58,9 @@ object WithReducers extends Serializable { */ def maybeCombine(optR1: Option[Int], optR2: Option[Int]): Option[Int] = (optR1, optR2) match { - case (None, other) => other - case (other, None) => other - case (Some(r1), Some(r2)) => Some(r1 max r2) + case (None, other) => other + case (other, None) => other + case (Some(r1), Some(r2)) => Some(r1.max(r2)) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/WritePartitioner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/WritePartitioner.scala index 2755c5c1ae..241f8e5b8f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/WritePartitioner.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/WritePartitioner.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.typed -import com.stripe.dagon.{ Dag, Id, Rule, Memoize, FunctionK } +import com.stripe.dagon.{Dag, FunctionK, Id, Memoize, Rule} import com.twitter.scalding.Execution import com.twitter.scalding.typed.functions.EqTypes import org.slf4j.LoggerFactory @@ -12,21 +12,18 @@ object WritePartitioner { type PairK[F[_], G[_], T] = (F[T], G[T]) /** - * This breaks a job at all the places it explicitly fans out, - * (and currently after each reduce/join). + * This breaks a job at all the places it explicitly fans out, (and currently after each reduce/join). 
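// A hedged sketch of setting and combining reducer hints as described above; the pipe
// and the counts are made up.
import com.twitter.scalding.typed.{TypedPipe, WithReducers}

object ReducerHintSketch {
  val grouped = TypedPipe.from(List((1, "a"), (1, "b"), (2, "c"))).group.withReducers(8)

  // when two hints meet (e.g. the two sides of a join), the larger one wins
  val hint: Option[Int] = WithReducers.maybeCombine(Some(4), Some(16)) // Some(16)
}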
*/ def breakAtForks[M[+_]](ws: List[PairK[TypedPipe, TypedSink, _]])(implicit M: Materializer[M]): M[Unit] = { - val rules = List( - OptimizationRules.AddExplicitForks, - OptimizationRules.RemoveDuplicateForceFork) + val rules = List(OptimizationRules.AddExplicitForks, OptimizationRules.RemoveDuplicateForceFork) materialize[M](rules, ws) } /** * Partition a single TypedPipe. * - * This is really only useful for jobs with single final outputs since you want - * to partition the entire job, not a portion of it. + * This is really only useful for jobs with single final outputs since you want to partition the entire job, + * not a portion of it. */ def partitionSingle[A](phases: Seq[Rule[TypedPipe]], pipe: TypedPipe[A]): Execution[TypedPipe[A]] = { type Const[B] = EqTypes[B, A] @@ -42,14 +39,12 @@ object WritePartitioner { } /** - * This enables us to write the partitioning in terms of this - * applicative type that is equipped with two extra operations: - * materialized and write, but not a general flatMap + * This enables us to write the partitioning in terms of this applicative type that is equipped with two + * extra operations: materialized and write, but not a general flatMap * * so the only sequencing power we have is to materialize * - * This allows us to test the properties we want without - * having to deal with Execution, which is a black box + * This allows us to test the properties we want without having to deal with Execution, which is a black box * concerned with actually running jobs */ trait Materializer[M[+_]] { @@ -69,28 +64,31 @@ object WritePartitioner { def pure[A](a: A) = Execution.from(a) def map[A, B](ma: Execution[A])(fn: A => B) = ma.map(fn) def zip[A, B](ma: Execution[A], mb: Execution[B]): Execution[(A, B)] = ma.zip(mb) - def materialize[A](t: Execution[TypedPipe[A]]): Execution[TypedPipe[A]] = t.flatMap(_.forceToDiskExecution) + def materialize[A](t: Execution[TypedPipe[A]]): Execution[TypedPipe[A]] = + t.flatMap(_.forceToDiskExecution) def write[A](tp: Execution[TypedPipe[A]], sink: TypedSink[A]): Execution[Unit] = tp.flatMap(_.writeExecution(sink)) def sequence_[A](as: Seq[Execution[A]]): Execution[Unit] = Execution.sequence(as).unit } } - def materialize[M[+_]](phases: Seq[Rule[TypedPipe]], ws: List[PairK[TypedPipe, TypedSink, _]])(implicit mat: Materializer[M]): M[Unit] = { + def materialize[M[+_]](phases: Seq[Rule[TypedPipe]], ws: List[PairK[TypedPipe, TypedSink, _]])(implicit + mat: Materializer[M] + ): M[Unit] = { val writes = materialize1[M, TypedSink](phases, ws)(mat) val toSeq = writes.map { case (mt, sink) => mat.write(mt, sink) } mat.sequence_(toSeq) } - def materialize1[M[+_], S[_]](phases: Seq[Rule[TypedPipe]], - ws: List[PairK[TypedPipe, S, _]])(implicit mat: Materializer[M]): List[PairK[mat.TP, S, _]] = { + def materialize1[M[+_], S[_]](phases: Seq[Rule[TypedPipe]], ws: List[PairK[TypedPipe, S, _]])(implicit + mat: Materializer[M] + ): List[PairK[mat.TP, S, _]] = { val e = Dag.empty(OptimizationRules.toLiteral) logger.info(s"converting ${ws.size} writes into several parts") - val (finalDag, writeIds) = ws.foldLeft((e, List.empty[PairK[Id, S, _]])) { - case ((dag, writes), pair) => - val (dag1, id) = dag.addRoot(pair._1) - (dag1, (id, pair._2) :: writes) + val (finalDag, writeIds) = ws.foldLeft((e, List.empty[PairK[Id, S, _]])) { case ((dag, writes), pair) => + val (dag1, id) = dag.addRoot(pair._1) + (dag1, (id, pair._2) :: writes) } // Now apply the rules: logger.info(s"applying rules to graph of size: ${finalDag.allNodes.size}") @@ -99,24 +97,33 
@@ object WritePartitioner { import TypedPipe.{ReduceStepPipe, HashCoGroup} - def handleHashCoGroup[K, V, V2, R](hj: HashCoGroup[K, V, V2, R], recurse: FunctionK[TypedPipe, mat.TP]): mat.TP[(K, R)] = { + def handleHashCoGroup[K, V, V2, R]( + hj: HashCoGroup[K, V, V2, R], + recurse: FunctionK[TypedPipe, mat.TP] + ): mat.TP[(K, R)] = { import TypedPipe._ val exright: M[HashJoinable[K, V2]] = hj.right match { - case step@IdentityReduce(_, _, _, _, _) => + case step @ IdentityReduce(_, _, _, _, _) => type TK[+Z] = TypedPipe[(K, Z)] val mappedV2 = step.evidence.subst[TK](step.mapped) mat.map(recurse(mappedV2)) { (tp: TypedPipe[(K, V2)]) => IdentityReduce[K, V2, V2](step.keyOrdering, tp, step.reducers, step.descriptions, implicitly) } - case step@UnsortedIdentityReduce(_, _, _, _, _) => + case step @ UnsortedIdentityReduce(_, _, _, _, _) => type TK[+Z] = TypedPipe[(K, Z)] val mappedV2 = step.evidence.subst[TK](step.mapped) mat.map(recurse(mappedV2)) { (tp: TypedPipe[(K, V2)]) => - UnsortedIdentityReduce[K, V2, V2](step.keyOrdering, tp, step.reducers, step.descriptions, implicitly) + UnsortedIdentityReduce[K, V2, V2]( + step.keyOrdering, + tp, + step.reducers, + step.descriptions, + implicitly + ) } - case step@IteratorMappedReduce(_, _, _, _, _) => + case step @ IteratorMappedReduce(_, _, _, _, _) => def go[A, B, C](imr: IteratorMappedReduce[A, B, C]) = - mat.map(recurse(imr.mapped)) { (tp: TypedPipe[(A, B)]) => imr.copy(mapped = tp) } + mat.map(recurse(imr.mapped))((tp: TypedPipe[(A, B)]) => imr.copy(mapped = tp)) go(step) } @@ -129,10 +136,18 @@ object WritePartitioner { def widen[A, B <: A](exb: M[B]): M[A] = exb - def handleReduceStep[K, V1, V2](rs: ReduceStep[K, V1, V2], recurse: FunctionK[TypedPipe, mat.TP]): mat.TP[(K, V2)] = - mat.map(recurse(rs.mapped)) { pipe => TypedPipe.ReduceStepPipe(ReduceStep.setInput[K, V1, V2](rs, pipe)) } + def handleReduceStep[K, V1, V2]( + rs: ReduceStep[K, V1, V2], + recurse: FunctionK[TypedPipe, mat.TP] + ): mat.TP[(K, V2)] = + mat.map(recurse(rs.mapped)) { pipe => + TypedPipe.ReduceStepPipe(ReduceStep.setInput[K, V1, V2](rs, pipe)) + } - def handleCoGrouped[K, V](cg: CoGroupable[K, V], recurse: FunctionK[TypedPipe, mat.TP]): mat.TP[(K, V)] = { + def handleCoGrouped[K, V]( + cg: CoGroupable[K, V], + recurse: FunctionK[TypedPipe, mat.TP] + ): mat.TP[(K, V)] = { import CoGrouped._ import TypedPipe._ @@ -152,7 +167,7 @@ object WritePartitioner { } cg match { - case p@Pair(_, _, _) => + case p @ Pair(_, _, _) => def go[A, B, C](pair: Pair[K, A, B, C]): mat.TP[(K, C)] = { val mleft = handleCoGrouped(pair.larger, recurse) val mright = handleCoGrouped(pair.smaller, recurse) @@ -162,7 +177,7 @@ object WritePartitioner { } } widen(go(p)) - case wr@WithReducers(_, _) => + case wr @ WithReducers(_, _) => def go[V1 <: V](wr: WithReducers[K, V1]): mat.TP[(K, V)] = { val reds = wr.reds mat.map(handleCoGrouped(wr.on, recurse)) { (tp: TypedPipe[(K, V1)]) => @@ -172,13 +187,15 @@ object WritePartitioner { case CoGroupedPipe(cg) => CoGroupedPipe(WithReducers(cg, reds)) case kvPipe => - ReduceStepPipe(IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) - .withReducers(reds)) + ReduceStepPipe( + IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) + .withReducers(reds) + ) } } } go(wr) - case wd@WithDescription(_, _) => + case wd @ WithDescription(_, _) => def go[V1 <: V](wd: WithDescription[K, V1]): mat.TP[(K, V)] = { val desc = wd.description mat.map(handleCoGrouped(wd.on, recurse)) { (tp: TypedPipe[(K, V1)]) => @@ -193,7 +210,7 @@ 
object WritePartitioner { } } go(wd) - case fk@CoGrouped.FilterKeys(_, _) => + case fk @ CoGrouped.FilterKeys(_, _) => def go[V1 <: V](fk: CoGrouped.FilterKeys[K, V1]): mat.TP[(K, V)] = { val fn = fk.fn mat.map(handleCoGrouped(fk.on, recurse)) { (tp: TypedPipe[(K, V1)]) => @@ -210,7 +227,7 @@ object WritePartitioner { } } go(fk) - case mg@MapGroup(_, _) => + case mg @ MapGroup(_, _) => def go[V1, V2 <: V](mg: MapGroup[K, V1, V2]): mat.TP[(K, V)] = { val fn = mg.fn mat.map(handleCoGrouped(mg.on, recurse)) { (tp: TypedPipe[(K, V1)]) => @@ -226,63 +243,62 @@ object WritePartitioner { } } go(mg) - case step@IdentityReduce(_, _, _, _, _) => + case step @ IdentityReduce(_, _, _, _, _) => widen(handleReduceStep(step, recurse)) // the widen trick sidesteps GADT bugs - case step@UnsortedIdentityReduce(_, _, _, _, _) => + case step @ UnsortedIdentityReduce(_, _, _, _, _) => widen(handleReduceStep(step, recurse)) - case step@IteratorMappedReduce(_, _, _, _, _) => + case step @ IteratorMappedReduce(_, _, _, _, _) => widen(handleReduceStep(step, recurse)) } } /** - * If cascading would consider the current pipe as a Logical reduce - * we can avoid some forces below. This method returns true - * if the pipe is ending on a reduce (not potentially a map-only job) + * If cascading would consider the current pipe as a Logical reduce we can avoid some forces below. This + * method returns true if the pipe is ending on a reduce (not potentially a map-only job) */ @annotation.tailrec def isLogicalReduce(tp: TypedPipe[Any]): Boolean = { import TypedPipe._ tp match { case EmptyTypedPipe | IterablePipe(_) | SourcePipe(_) => false - case CounterPipe(a) => isLogicalReduce(a) - case cp@CrossPipe(_, _) => isLogicalReduce(cp.viaHashJoin) - case cp@CrossValue(_, _) => isLogicalReduce(cp.viaHashJoin) - case DebugPipe(p) => isLogicalReduce(p) - case FilterKeys(p, _) => isLogicalReduce(p) - case Filter(p, _) => isLogicalReduce(p) - case FlatMapValues(p, _) => isLogicalReduce(p) - case FlatMapped(p, _) => isLogicalReduce(p) - case ForceToDisk(_) => false // not reducers for sure, could be a map-only job - case Fork(_) => false // TODO, not super clear - case HashCoGroup(left, _, _) => isLogicalReduce(left) - case MapValues(p, _) => isLogicalReduce(p) - case Mapped(p, _) => isLogicalReduce(p) - case MergedTypedPipe(_, _) => false - case ReduceStepPipe(_) => true - case SumByLocalKeys(p, _) => isLogicalReduce(p) - case TrappedPipe(p, _, _) => isLogicalReduce(p) - case CoGroupedPipe(_) => true - case WithOnComplete(p, _) => isLogicalReduce(p) + case CounterPipe(a) => isLogicalReduce(a) + case cp @ CrossPipe(_, _) => isLogicalReduce(cp.viaHashJoin) + case cp @ CrossValue(_, _) => isLogicalReduce(cp.viaHashJoin) + case DebugPipe(p) => isLogicalReduce(p) + case FilterKeys(p, _) => isLogicalReduce(p) + case Filter(p, _) => isLogicalReduce(p) + case FlatMapValues(p, _) => isLogicalReduce(p) + case FlatMapped(p, _) => isLogicalReduce(p) + case ForceToDisk(_) => false // not reducers for sure, could be a map-only job + case Fork(_) => false // TODO, not super clear + case HashCoGroup(left, _, _) => isLogicalReduce(left) + case MapValues(p, _) => isLogicalReduce(p) + case Mapped(p, _) => isLogicalReduce(p) + case MergedTypedPipe(_, _) => false + case ReduceStepPipe(_) => true + case SumByLocalKeys(p, _) => isLogicalReduce(p) + case TrappedPipe(p, _, _) => isLogicalReduce(p) + case CoGroupedPipe(_) => true + case WithOnComplete(p, _) => isLogicalReduce(p) case WithDescriptionTypedPipe(p, _) => isLogicalReduce(p) } } /** - * We use 
this state to track where we are as we recurse up the graph. - * Since we know at the very end we will write, we can avoid, for instance - * forcing a reduce operation that is followed only by a map and a write. + * We use this state to track where we are as we recurse up the graph. Since we know at the very end we + * will write, we can avoid, for instance forcing a reduce operation that is followed only by a map and a + * write. * - * Coupled with the isLogicalReduce above, we can emulate the behavior - * of the cascading planner as we recurse up. + * Coupled with the isLogicalReduce above, we can emulate the behavior of the cascading planner as we + * recurse up. */ sealed abstract class BelowState { def |(that: BelowState): BelowState = (this, that) match { - case (BelowState.Write, later) => later + case (BelowState.Write, later) => later case (BelowState.OnlyMapping, BelowState.Write) => BelowState.OnlyMapping - case (BelowState.OnlyMapping, mapOrMater) => mapOrMater - case (BelowState.Materialized, _) => BelowState.Materialized + case (BelowState.OnlyMapping, mapOrMater) => mapOrMater + case (BelowState.Materialized, _) => BelowState.Materialized } } object BelowState { @@ -291,112 +307,111 @@ object WritePartitioner { case object Materialized extends BelowState } type P[a] = (TypedPipe[a], BelowState) + /** - * Given a pipe, and the state below it, return the materialized - * version of that pipe. This should cause no more materializations - * than cascading would do, and indeed we test for this property + * Given a pipe, and the state below it, return the materialized version of that pipe. This should cause + * no more materializations than cascading would do, and indeed we test for this property */ - val fn = Memoize.functionK[P, mat.TP]( - new Memoize.RecursiveK[P, mat.TP] { - import TypedPipe._ - import BelowState._ + val fn = Memoize.functionK[P, mat.TP](new Memoize.RecursiveK[P, mat.TP] { + import TypedPipe._ + import BelowState._ - def toFunction[A] = { - case ((cp: CounterPipe[a], bs), rec) => - mat.map(rec((cp.pipe, bs)))(CounterPipe(_: TypedPipe[(a, Iterable[((String, String), Long)])])) - case ((c: CrossPipe[a, b], bs), rec) => - rec((c.viaHashJoin, bs)) - case ((cv@CrossValue(_, _), bs), rec) => - rec((cv.viaHashJoin, bs)) - case ((p: DebugPipe[a], bs), rec) => - mat.map(rec((p.input, bs)))(DebugPipe(_: TypedPipe[a])) - case ((p: FilterKeys[a, b], bs), rec) => - mat.map(rec((p.input, bs | OnlyMapping)))(FilterKeys(_: TypedPipe[(a, b)], p.fn)) - case ((p: Filter[a], bs), rec) => - mat.map(rec((p.input, bs | OnlyMapping)))(Filter(_: TypedPipe[a], p.fn)) - case ((Fork(of), bs), rec) => - // Treat forks as forceToDisk after - // optimizations (which should have removed unneeded forks - rec((ForceToDisk(of), bs)) - case ((p: FlatMapValues[a, b, c], bs), rec) => - mat.map(rec((p.input, bs | OnlyMapping)))(FlatMapValues(_: TypedPipe[(a, b)], p.fn)) - case ((p: FlatMapped[a, b], bs), rec) => - mat.map(rec((p.input, bs | OnlyMapping)))(FlatMapped(_: TypedPipe[a], p.fn)) - case ((ForceToDisk(src@IterablePipe(_)), bs), rec) => - // no need to put a checkpoint here: - rec((src, bs)) - case ((ForceToDisk(src@SourcePipe(_)), bs), rec) => - // no need to put a checkpoint here: - rec((src, bs)) - case ((p: ForceToDisk[a], bs), rec) => - val newBs = - if (isLogicalReduce(p.input)) OnlyMapping - else Materialized - val matP = rec((p.input, newBs)) - bs match { - case Write => - // there is no need force to disk immediately before a write - matP - case _ => mat.materialize(matP) - } - 
case ((it@IterablePipe(_), _), _) => - mat.pure(it) - case ((p: MapValues[a, b, c], bs), rec) => - mat.map(rec((p.input, bs | OnlyMapping)))(MapValues(_: TypedPipe[(a, b)], p.fn)) - case ((p: Mapped[a, b], bs), rec) => - mat.map(rec((p.input, bs | OnlyMapping)))(Mapped(_: TypedPipe[a], p.fn)) - case ((p: MergedTypedPipe[a], bs), rec) => - val mleft = rec((p.left, bs)) - val mright = rec((p.right, bs)) - val both = mat.zip(mleft, mright) - mat.map(both) { case (l, r) => MergedTypedPipe(l, r) } - case ((src@SourcePipe(_), _), _) => - mat.pure(src) - case ((p: SumByLocalKeys[a, b], bs), rec) => - mat.map(rec((p.input, bs | OnlyMapping)))(SumByLocalKeys(_: TypedPipe[(a, b)], p.semigroup)) - case ((p: TrappedPipe[a], bs), rec) => - // TODO: it is a bit unclear if a trap is allowed on the back of a reduce? - mat.map(rec((p.input, bs)))(TrappedPipe[a](_: TypedPipe[a], p.sink, p.conv)) - case ((p: WithDescriptionTypedPipe[a], bs), rec) => - mat.map(rec((p.input, bs)))(WithDescriptionTypedPipe(_: TypedPipe[a], p.descriptions)) - case ((p: WithOnComplete[a], bs), rec) => - mat.map(rec((p.input, bs)))(WithOnComplete(_: TypedPipe[a], p.fn)) - case ((EmptyTypedPipe, _), _) => - mat.pure(EmptyTypedPipe) - case ((hg: HashCoGroup[a, b, c, d], bs), rec) => - val withBs = new FunctionK[TypedPipe, P] { - def toFunction[A] = { tp => (tp, bs | OnlyMapping) } - } - // TODO: hashJoins may not be allowed in a reduce step in cascading, - // not clear - val recHG = FunctionK.andThen[TypedPipe, P, mat.TP](withBs, rec) - handleHashCoGroup(hg, recHG) - case ((CoGroupedPipe(cg), bs), rec) => - val withBs = new FunctionK[TypedPipe, P] { - def toFunction[A] = { tp => (tp, bs | Materialized) } - } - // TODO: hashJoins may not be allowed in a reduce step in cascading, - // not clear - val recHG = FunctionK.andThen[TypedPipe, P, mat.TP](withBs, rec) - val hcg = handleCoGrouped(cg, recHG) - bs match { - case BelowState.Materialized => mat.materialize(hcg) - case _ => hcg - } - case ((ReduceStepPipe(rs), bs), rec) => - val withBs = new FunctionK[TypedPipe, P] { - def toFunction[A] = { tp => (tp, bs | BelowState.Materialized) } - } - // TODO: hashJoins may not be allowed in a reduce step in cascading, - // not clear - val recHG = FunctionK.andThen[TypedPipe, P, mat.TP](withBs, rec) - val hrs = handleReduceStep(rs, recHG) - bs match { - case BelowState.Materialized => mat.materialize(hrs) - case _ => hrs - } - } - }) + def toFunction[A] = { + case ((cp: CounterPipe[a], bs), rec) => + mat.map(rec((cp.pipe, bs)))(CounterPipe(_: TypedPipe[(a, Iterable[((String, String), Long)])])) + case ((c: CrossPipe[a, b], bs), rec) => + rec((c.viaHashJoin, bs)) + case ((cv @ CrossValue(_, _), bs), rec) => + rec((cv.viaHashJoin, bs)) + case ((p: DebugPipe[a], bs), rec) => + mat.map(rec((p.input, bs)))(DebugPipe(_: TypedPipe[a])) + case ((p: FilterKeys[a, b], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(FilterKeys(_: TypedPipe[(a, b)], p.fn)) + case ((p: Filter[a], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(Filter(_: TypedPipe[a], p.fn)) + case ((Fork(of), bs), rec) => + // Treat forks as forceToDisk after + // optimizations (which should have removed unneeded forks + rec((ForceToDisk(of), bs)) + case ((p: FlatMapValues[a, b, c], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(FlatMapValues(_: TypedPipe[(a, b)], p.fn)) + case ((p: FlatMapped[a, b], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(FlatMapped(_: TypedPipe[a], p.fn)) + case ((ForceToDisk(src @ IterablePipe(_)), bs), rec) => + // no need to 
put a checkpoint here: + rec((src, bs)) + case ((ForceToDisk(src @ SourcePipe(_)), bs), rec) => + // no need to put a checkpoint here: + rec((src, bs)) + case ((p: ForceToDisk[a], bs), rec) => + val newBs = + if (isLogicalReduce(p.input)) OnlyMapping + else Materialized + val matP = rec((p.input, newBs)) + bs match { + case Write => + // there is no need force to disk immediately before a write + matP + case _ => mat.materialize(matP) + } + case ((it @ IterablePipe(_), _), _) => + mat.pure(it) + case ((p: MapValues[a, b, c], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(MapValues(_: TypedPipe[(a, b)], p.fn)) + case ((p: Mapped[a, b], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(Mapped(_: TypedPipe[a], p.fn)) + case ((p: MergedTypedPipe[a], bs), rec) => + val mleft = rec((p.left, bs)) + val mright = rec((p.right, bs)) + val both = mat.zip(mleft, mright) + mat.map(both) { case (l, r) => MergedTypedPipe(l, r) } + case ((src @ SourcePipe(_), _), _) => + mat.pure(src) + case ((p: SumByLocalKeys[a, b], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(SumByLocalKeys(_: TypedPipe[(a, b)], p.semigroup)) + case ((p: TrappedPipe[a], bs), rec) => + // TODO: it is a bit unclear if a trap is allowed on the back of a reduce? + mat.map(rec((p.input, bs)))(TrappedPipe[a](_: TypedPipe[a], p.sink, p.conv)) + case ((p: WithDescriptionTypedPipe[a], bs), rec) => + mat.map(rec((p.input, bs)))(WithDescriptionTypedPipe(_: TypedPipe[a], p.descriptions)) + case ((p: WithOnComplete[a], bs), rec) => + mat.map(rec((p.input, bs)))(WithOnComplete(_: TypedPipe[a], p.fn)) + case ((EmptyTypedPipe, _), _) => + mat.pure(EmptyTypedPipe) + case ((hg: HashCoGroup[a, b, c, d], bs), rec) => + val withBs = new FunctionK[TypedPipe, P] { + def toFunction[A] = { tp => (tp, bs | OnlyMapping) } + } + // TODO: hashJoins may not be allowed in a reduce step in cascading, + // not clear + val recHG = FunctionK.andThen[TypedPipe, P, mat.TP](withBs, rec) + handleHashCoGroup(hg, recHG) + case ((CoGroupedPipe(cg), bs), rec) => + val withBs = new FunctionK[TypedPipe, P] { + def toFunction[A] = { tp => (tp, bs | Materialized) } + } + // TODO: hashJoins may not be allowed in a reduce step in cascading, + // not clear + val recHG = FunctionK.andThen[TypedPipe, P, mat.TP](withBs, rec) + val hcg = handleCoGrouped(cg, recHG) + bs match { + case BelowState.Materialized => mat.materialize(hcg) + case _ => hcg + } + case ((ReduceStepPipe(rs), bs), rec) => + val withBs = new FunctionK[TypedPipe, P] { + def toFunction[A] = { tp => (tp, bs | BelowState.Materialized) } + } + // TODO: hashJoins may not be allowed in a reduce step in cascading, + // not clear + val recHG = FunctionK.andThen[TypedPipe, P, mat.TP](withBs, rec) + val hrs = handleReduceStep(rs, recHG) + bs match { + case BelowState.Materialized => mat.materialize(hrs) + case _ => hrs + } + } + }) def write[A](p: PairK[Id, S, A]): (M[TypedPipe[A]], S[A]) = { val materialized: M[TypedPipe[A]] = fn((optDag.evaluate(p._1), BelowState.Write)) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/AsyncFlowDefRunner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/AsyncFlowDefRunner.scala index 66bd90e0ad..a4d47a672e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/AsyncFlowDefRunner.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/AsyncFlowDefRunner.scala @@ -1,7 +1,9 @@ package com.twitter.scalding.typed.cascading_backend -import 
cascading.flow.{ FlowDef, Flow } +import cascading.flow.{Flow, FlowDef} import com.twitter.scalding.{ + source, + typed, CascadingLocal, CascadingMode, Config, @@ -12,33 +14,31 @@ import com.twitter.scalding.{ HadoopMode, JobStats, Mappable, - TypedPipe, - source, - typed + TypedPipe } -import com.twitter.scalding.{ CancellationHandler, CFuture, CPromise } +import com.twitter.scalding.{CFuture, CPromise, CancellationHandler} import com.twitter.scalding.typed.TypedSink import com.twitter.scalding.cascading_interop.FlowListenerPromise -import com.stripe.dagon.{ Rule, HMap } +import com.stripe.dagon.{HMap, Rule} import java.util.UUID import java.util.concurrent.LinkedBlockingQueue import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{ FileSystem, Path } +import org.apache.hadoop.fs.{FileSystem, Path} import org.slf4j.LoggerFactory -import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext, Promise } +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} import scala.util.control.NonFatal -import scala.util.{ Failure, Success, Try } +import scala.util.{Failure, Success, Try} -import Execution.{ Writer, ToWrite } +import Execution.{ToWrite, Writer} object AsyncFlowDefRunner { + /** * We send messages from other threads into the submit thread here */ private sealed trait FlowDefAction - private final case class RunFlowDef(conf: Config, - fd: FlowDef, - result: CPromise[(Long, JobStats)]) extends FlowDefAction + private final case class RunFlowDef(conf: Config, fd: FlowDef, result: CPromise[(Long, JobStats)]) + extends FlowDefAction private final case class StopFlow(flow: Flow[_], result: Promise[Unit]) extends FlowDefAction private case object Stop extends FlowDefAction @@ -54,7 +54,7 @@ object AsyncFlowDefRunner { override def run(): Unit = { val fs = mode match { case localMode: CascadingLocal => FileSystem.getLocal(new Configuration) - case hdfsMode: HadoopMode => FileSystem.get(hdfsMode.jobConf) + case hdfsMode: HadoopMode => FileSystem.get(hdfsMode.jobConf) } filesToCleanup.foreach { file: String => @@ -74,10 +74,8 @@ object AsyncFlowDefRunner { } /** - * This holds an internal thread to run - * This holds an internal thread to submit run - * a Config, Mode, FlowDef and return a Future holding the - * JobStats + * This holds an internal thread to run This holds an internal thread to submit run a Config, Mode, FlowDef + * and return a Future holding the JobStats */ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { import AsyncFlowDefRunner._ @@ -95,17 +93,21 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { private object FilesToCleanUp { def empty: FilesToCleanUp = FilesToCleanUp(Set.empty, Set.empty) } + /** - * @param filesToCleanup temporary files created by forceToDiskExecution - * @param initToOpt this is the mapping between user's TypedPipes and their optimized versions - * which are actually run. - * @param forcedPipes these are all the side effecting forcing of TypedPipes into simple - * SourcePipes or IterablePipes. These are for both toIterableExecution and forceToDiskExecution + * @param filesToCleanup + * temporary files created by forceToDiskExecution + * @param initToOpt + * this is the mapping between user's TypedPipes and their optimized versions which are actually run. + * @param forcedPipes + * these are all the side effecting forcing of TypedPipes into simple SourcePipes or IterablePipes. 
These + * are for both toIterableExecution and forceToDiskExecution */ private case class State( - filesToCleanup: FilesToCleanUp, - initToOpt: HMap[StateKey, TypedPipe], - forcedPipes: HMap[StateKey, WorkVal]) { + filesToCleanup: FilesToCleanUp, + initToOpt: HMap[StateKey, TypedPipe], + forcedPipes: HMap[StateKey, WorkVal] + ) { def addFilesToCleanup(conf: Config, s: Option[String]): State = s match { @@ -116,24 +118,26 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { } /** - * Returns true if we actually add this optimized pipe. We do this - * because we don't want to take the side effect twice. + * Returns true if we actually add this optimized pipe. We do this because we don't want to take the side + * effect twice. */ - def addForce[T](c: Config, - init: TypedPipe[T], - opt: TypedPipe[T], - p: Future[TypedPipe[T]]): (State, Boolean) = - + def addForce[T]( + c: Config, + init: TypedPipe[T], + opt: TypedPipe[T], + p: Future[TypedPipe[T]] + ): (State, Boolean) = forcedPipes.get((c, opt)) match { case None => - (copy(forcedPipes = forcedPipes + ((c, opt) -> p), - initToOpt = initToOpt + ((c, init) -> opt)), true) + ( + copy(forcedPipes = forcedPipes + ((c, opt) -> p), initToOpt = initToOpt + ((c, init) -> opt)), + true + ) case Some(_) => (copy(initToOpt = initToOpt + ((c, init) -> opt)), false) } def getForce[T](c: Config, init: TypedPipe[T]): Option[Future[TypedPipe[T]]] = - initToOpt.get((c, init)).map { opt => forcedPipes.get((c, opt)) match { case None => @@ -154,14 +158,14 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { s } private def getState: State = - updateState { s => (s, s) } + updateState(s => (s, s)) private val messageQueue: LinkedBlockingQueue[AsyncFlowDefRunner.FlowDefAction] = new LinkedBlockingQueue[AsyncFlowDefRunner.FlowDefAction]() /** - * Hadoop and/or cascading has some issues, it seems, with starting jobs - * from multiple threads. This thread does all the Flow starting. + * Hadoop and/or cascading has some issues, it seems, with starting jobs from multiple threads. This thread + * does all the Flow starting. 
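// ----------------------------------------------------------------------------
// Hedged sketch (not part of the diff above) of the single-submitter pattern
// the comment above describes: callers enqueue work plus a Promise, and only
// one internal thread ever starts it. All names here are hypothetical; this is
// the general pattern, not the Scalding internals.
import java.util.concurrent.LinkedBlockingQueue
import scala.concurrent.Promise

object SingleSubmitterSketch extends App {
  sealed trait Msg
  final case class Submit(work: () => Long, done: Promise[Long]) extends Msg
  case object Stop extends Msg

  val queue = new LinkedBlockingQueue[Msg]()
  val runner = new Thread(new Runnable {
    def run(): Unit = {
      var live = true
      while (live) queue.take() match {
        case Submit(work, done) => done.trySuccess(work()) // only this thread runs the work
        case Stop               => live = false
      }
    }
  })
  runner.setDaemon(true)
  runner.start()

  val result = Promise[Long]()
  queue.put(Submit(() => 42L, result)) // any caller thread just enqueues
  queue.put(Stop)
  runner.join()
}
// ----------------------------------------------------------------------------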
*/ private lazy val thread = new Thread(new Runnable { def run(): Unit = { @@ -251,11 +255,11 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { } /** - * This evaluates the fn in a Try, validates the sources - * calls runFlowDef, then clears the FlowStateMap + * This evaluates the fn in a Try, validates the sources calls runFlowDef, then clears the FlowStateMap */ - def validateAndRun(conf: Config)(fn: Config => FlowDef)( - implicit cec: ConcurrentExecutionContext): CFuture[(Long, ExecutionCounters)] = { + def validateAndRun( + conf: Config + )(fn: Config => FlowDef)(implicit cec: ConcurrentExecutionContext): CFuture[(Long, ExecutionCounters)] = { val tFlowDef = Try(fn(conf)).map { flowDef => FlowStateMap.validateSources(flowDef, mode) flowDef @@ -263,19 +267,18 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { tFlowDef match { case Success(flowDef) => - runFlowDef(conf, flowDef).map { - case (id, jobStats) => - FlowStateMap.clear(flowDef) - (id, ExecutionCounters.fromJobStats(jobStats)) + runFlowDef(conf, flowDef).map { case (id, jobStats) => + FlowStateMap.clear(flowDef) + (id, ExecutionCounters.fromJobStats(jobStats)) } case Failure(e) => CFuture.failed(e) } } - def execute( - conf: Config, - writes: List[ToWrite[_]])(implicit cec: ConcurrentExecutionContext): CFuture[(Long, ExecutionCounters)] = { + def execute(conf: Config, writes: List[ToWrite[_]])(implicit + cec: ConcurrentExecutionContext + ): CFuture[(Long, ExecutionCounters)] = { import Execution.ToWrite._ @@ -310,19 +313,18 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { } } - sinkOpt.foreach { - case (sink, fp) => - // We write the optimized pipe - write(opt, sink) - val pipeFut = done.future.map(_ => fp()) - pipePromise.completeWith(pipeFut) + sinkOpt.foreach { case (sink, fp) => + // We write the optimized pipe + write(opt, sink) + val pipeFut = done.future.map(_ => fp()) + pipePromise.completeWith(pipeFut) } } def addIter[A](init: TypedPipe[A], optimized: Either[Iterable[A], Mappable[A]]): Unit = { val result = optimized match { case Left(iter) if iter.isEmpty => TypedPipe.EmptyTypedPipe - case Left(iter) => TypedPipe.IterablePipe(iter) - case Right(mappable) => TypedPipe.SourcePipe(mappable) + case Left(iter) => TypedPipe.IterablePipe(iter) + case Right(mappable) => TypedPipe.SourcePipe(mappable) } val fut = Future.successful(result) updateState(_.addForce(conf, init, result, fut)) @@ -332,18 +334,17 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { case OptimizedWrite(init, Force(opt)) => force(init, opt) case OptimizedWrite(init, ToIterable(opt)) => - def step[A](init: TypedPipe[A], opt: TypedPipe[A]): Unit = { + def step[A](init: TypedPipe[A], opt: TypedPipe[A]): Unit = opt match { - case TypedPipe.EmptyTypedPipe => addIter(init, Left(Nil)) - case TypedPipe.IterablePipe(as) => addIter(init, Left(as)) + case TypedPipe.EmptyTypedPipe => addIter(init, Left(Nil)) + case TypedPipe.IterablePipe(as) => addIter(init, Left(as)) case TypedPipe.SourcePipe(src: Mappable[A]) => addIter(init, Right(src)) - case other => + case other => // we need to write the pipe out first. 
force(init, opt) // now, when we go to check for the pipe later, it // will be a SourcePipe of a Mappable by construction } - } step(init, opt) case OptimizedWrite(_, SimpleWrite(pipe, sink)) => @@ -360,10 +361,9 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { cfuture } - def getForced[T]( - conf: Config, - initial: TypedPipe[T])(implicit cec: ConcurrentExecutionContext): Future[TypedPipe[T]] = - + def getForced[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ConcurrentExecutionContext + ): Future[TypedPipe[T]] = getState.getForce(conf, initial) match { case Some(fut) => fut case None => @@ -372,18 +372,16 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { Future.failed(new IllegalStateException(msg)) } - def getIterable[T]( - conf: Config, - initial: TypedPipe[T])(implicit cec: ConcurrentExecutionContext): Future[Iterable[T]] = - + def getIterable[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ConcurrentExecutionContext + ): Future[Iterable[T]] = getForced(conf, initial).flatMap { - case TypedPipe.EmptyTypedPipe => Future.successful(Nil) + case TypedPipe.EmptyTypedPipe => Future.successful(Nil) case TypedPipe.IterablePipe(iter) => Future.successful(iter) case TypedPipe.SourcePipe(src: Mappable[T]) => - Future.successful( - new Iterable[T] { - def iterator = src.toIterator(conf, mode) - }) + Future.successful(new Iterable[T] { + def iterator = src.toIterator(conf, mode) + }) case other => val msg = s"logic error: expected an Iterable pipe. ($conf, $initial) -> $other is not iterable" @@ -391,23 +389,23 @@ class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { } private def forceToDisk[T]( // linter:disable:UnusedParameter - uuid: UUID, - conf: Config, - pipe: TypedPipe[T] // note, we don't use this, but it fixes the type T - ): (typed.TypedSink[T], () => TypedPipe[T], Option[String]) = - + uuid: UUID, + conf: Config, + pipe: TypedPipe[T] // note, we don't use this, but it fixes the type T + ): (typed.TypedSink[T], () => TypedPipe[T], Option[String]) = mode match { case _: CascadingLocal => // Local or Test mode val inMemoryDest = new typed.MemorySink[T] + /** - * This is a bit tricky. readResults has to be called after the job has - * run, so we need to do this inside the function which will - * be called after the job has run + * This is a bit tricky. 
readResults has to be called after the job has run, so we need to do this + * inside the function which will be called after the job has run */ (inMemoryDest, () => TypedPipe.from(inMemoryDest.readResults), None) case _: HadoopMode => val temporaryPath: String = { - val tmpDir = conf.get("hadoop.tmp.dir") + val tmpDir = conf + .get("hadoop.tmp.dir") .orElse(conf.get("cascading.tmp.dir")) .getOrElse("/tmp") diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingBackend.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingBackend.scala index 894fe7d4d2..9556c2c714 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingBackend.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingBackend.scala @@ -2,18 +2,36 @@ package com.twitter.scalding.typed.cascading_backend import cascading.flow.FlowDef import cascading.operation.Debug -import cascading.pipe.{ CoGroup, Each, Pipe, HashJoin } -import cascading.tuple.{ Fields, Tuple => CTuple } -import com.stripe.dagon.{ FunctionK, HCache, Id, Rule, Dag } -import com.twitter.scalding.TupleConverter.{ singleConverter, tuple2Converter } -import com.twitter.scalding.TupleSetter.{ singleSetter, tup2Setter } +import cascading.pipe.{CoGroup, Each, HashJoin, Pipe} +import cascading.tuple.{Fields, Tuple => CTuple} +import com.stripe.dagon.{Dag, FunctionK, HCache, Id, Rule} +import com.twitter.scalding.TupleConverter.{singleConverter, tuple2Converter} +import com.twitter.scalding.TupleSetter.{singleSetter, tup2Setter} import com.twitter.scalding.{ - CleanupIdentityFunction, Config, Dsl, Execution, Field, FlowState, FlowStateMap, GroupBuilder, - HadoopMode, IncrementCounters, IterableSource, MapsideReduce, Mode, RichFlowDef, - RichPipe, TupleConverter, TupleGetter, TupleSetter, TypedBufferOp, WrappedJoiner, Write + CleanupIdentityFunction, + Config, + Dsl, + Execution, + Field, + FlowState, + FlowStateMap, + GroupBuilder, + HadoopMode, + IncrementCounters, + IterableSource, + MapsideReduce, + Mode, + RichFlowDef, + RichPipe, + TupleConverter, + TupleGetter, + TupleSetter, + TypedBufferOp, + WrappedJoiner, + Write } import com.twitter.scalding.typed._ -import com.twitter.scalding.typed.functions.{ FilterKeysToFilter, MapValuesToMap, FlatMapValuesToFlatMap } +import com.twitter.scalding.typed.functions.{FilterKeysToFilter, FlatMapValuesToFlatMap, MapValuesToMap} import com.twitter.scalding.serialization.{ Boxed, BoxedOrderedSerialization, @@ -30,9 +48,13 @@ object CascadingBackend { def areDefiniteInverse[A, B](t: TupleConverter[A], s: TupleSetter[B]): Boolean = (t, s) match { - case (TupleConverter.Single(TupleGetter.Casting()), TupleSetter.Single()) => true + case (TupleConverter.Single(TupleGetter.Casting()), TupleSetter.Single()) => true case (TupleConverter.TupleConverter1(TupleGetter.Casting()), TupleSetter.TupleSetter1()) => true - case (TupleConverter.TupleConverter2(TupleGetter.Casting(), TupleGetter.Casting()), TupleSetter.TupleSetter2()) => true + case ( + TupleConverter.TupleConverter2(TupleGetter.Casting(), TupleGetter.Casting()), + TupleSetter.TupleSetter2() + ) => + true // TODO we could add more, but we only use single and 2 actually case _ => false } @@ -53,11 +75,13 @@ object CascadingBackend { } private def valueConverter[V](optOrd: Option[Ordering[V]]): TupleConverter[V] = - optOrd.map { - case _: OrderedSerialization[_] => - TupleConverter.singleConverter[Boxed[V]].andThen(_.get) - case _ 
=> TupleConverter.singleConverter[V] - }.getOrElse(TupleConverter.singleConverter[V]) + optOrd + .map { + case _: OrderedSerialization[_] => + TupleConverter.singleConverter[Boxed[V]].andThen(_.get) + case _ => TupleConverter.singleConverter[V] + } + .getOrElse(TupleConverter.singleConverter[V]) private def keyConverter[K](ord: Ordering[K]): TupleConverter[K] = ord match { @@ -76,27 +100,34 @@ object CascadingBackend { } /** - * If we are using OrderedComparable, we need to box the key - * to prevent other serializers from handling the key + * If we are using OrderedComparable, we need to box the key to prevent other serializers from handling the + * key */ - private def getBoxFnAndOrder[K](ordser: OrderedSerialization[K], flowDef: FlowDef): (K => Boxed[K], BoxedOrderedSerialization[K]) = { + private def getBoxFnAndOrder[K]( + ordser: OrderedSerialization[K], + flowDef: FlowDef + ): (K => Boxed[K], BoxedOrderedSerialization[K]) = { // We can only supply a cacheKey if the equals and hashcode are known sane - val (boxfn, cls) = Boxed.nextCached[K](if (ordser.isInstanceOf[EquivSerialization[_]]) Some(ordser) else None) + val (boxfn, cls) = + Boxed.nextCached[K](if (ordser.isInstanceOf[EquivSerialization[_]]) Some(ordser) else None) val boxordSer = BoxedOrderedSerialization(boxfn, ordser) - WrappedSerialization.rawSetBinary(List((cls, boxordSer)), - { - case (k: String, v: String) => - FlowStateMap.merge(flowDef, FlowState.withConfigSetting(k + cls, v)) - }) + WrappedSerialization.rawSetBinary( + List((cls, boxordSer)), + { case (k: String, v: String) => + FlowStateMap.merge(flowDef, FlowState.withConfigSetting(k + cls, v)) + } + ) (boxfn, boxordSer) } /** - * Check if the Ordering is an OrderedSerialization, if so box in a Boxed so hadoop and cascading - * can dispatch the right serialization + * Check if the Ordering is an OrderedSerialization, if so box in a Boxed so hadoop and cascading can + * dispatch the right serialization */ - private def maybeBox[K, V](ord: Ordering[K], flowDef: FlowDef)(op: (TupleSetter[(K, V)], Fields) => Pipe): Pipe = + private def maybeBox[K, V](ord: Ordering[K], flowDef: FlowDef)( + op: (TupleSetter[(K, V)], Fields) => Pipe + ): Pipe = ord match { case ordser: OrderedSerialization[K] => val (boxfn, boxordSer) = getBoxFnAndOrder[K](ordser, flowDef) @@ -115,14 +146,15 @@ object CascadingBackend { // the toPipe function directly, so we don't actually create the pipe until // the TupleSetter comes in. With this, we can make sure to use the right // TupleSetter on the final pipe - private case class CascadingPipe[+T](pipe: Pipe, - fields: Fields, - @transient localFlowDef: FlowDef, // not serializable. - converter: TupleConverter[_ <: T]) { + private case class CascadingPipe[+T]( + pipe: Pipe, + fields: Fields, + @transient localFlowDef: FlowDef, // not serializable. + converter: TupleConverter[_ <: T] + ) { /** - * merge the flowDef into this new flowdef an make sure the tuples - * have the structure defined by setter + * merge the flowDef into this new flowdef an make sure the tuples have the structure defined by setter */ def toPipe[U >: T](f: Fields, fd: FlowDef, setter: TupleSetter[U]): Pipe = { val resFd = new RichFlowDef(fd) @@ -159,11 +191,9 @@ object CascadingBackend { } /** - * we want to cache renderings of some TypedPipe to Pipe so cascading - * will see them as the same. Without this, it is very easy to have - * a lot of recomputation. 
Ideally we would plan an entire graph - * at once, and not need a static cache here, but currently we still - * plan one TypedPipe at a time. + * we want to cache renderings of some TypedPipe to Pipe so cascading will see them as the same. Without + * this, it is very easy to have a lot of recomputation. Ideally we would plan an entire graph at once, and + * not need a static cache here, but currently we still plan one TypedPipe at a time. */ private class CompilerCache { @@ -185,18 +215,21 @@ object CascadingBackend { /** * Method to compile scalding's `TypedPipe`s to cascading's `Pipe`s. * - * Since equal `TypedPipe`s define same computation we would like to compile them into referentially the same cascading's `Pipe` instance. - * This logic breaks if one typed pipe is really big and has two forked different computations both of which significantly decrease size of the data. - * If we will cache common part of this two computations in the same cascading's `Pipe` instance we end up with common part being materialized. - * Therefore for some kind of `TypedPipe`s we want to avoid their caching. + * Since equal `TypedPipe`s define same computation we would like to compile them into referentially the + * same cascading's `Pipe` instance. This logic breaks if one typed pipe is really big and has two forked + * different computations both of which significantly decrease size of the data. If we will cache common + * part of this two computations in the same cascading's `Pipe` instance we end up with common part being + * materialized. Therefore for some kind of `TypedPipe`s we want to avoid their caching. * - * `.cross` `TypedPipe` is one of examples of such `TypedPipe`s we never want to materialize and, therefore, cache. + * `.cross` `TypedPipe` is one of examples of such `TypedPipe`s we never want to materialize and, therefore, + * cache. * * `compile` logic is separated into next functions: - * - `transform` which defines main transformation logic, without any caching applied. - * This method accepts `rec` parameter which is being called to transform children pipes. - * - `withCachePolicy` which defines transformation logic with caching applied. - * - `notCached` to support use case with `.cross` pipe, where pipe itself shouldn't be cached but `left` and `right` sides of it should be. + * - `transform` which defines main transformation logic, without any caching applied. This method accepts + * `rec` parameter which is being called to transform children pipes. + * - `withCachePolicy` which defines transformation logic with caching applied. + * - `notCached` to support use case with `.cross` pipe, where pipe itself shouldn't be cached but `left` + * and `right` sides of it should be. 
*/ private def compile(mode: Mode): FunctionK[TypedPipe, CascadingPipe] = new FunctionK[TypedPipe, CascadingPipe] { @@ -207,13 +240,13 @@ object CascadingBackend { private def withCachePolicy[U]: TypedPipe[U] => CascadingPipe[U] = { // Don't cache `CrossPipe`, but cache `left` and `right` side of it - case cp@CrossPipe(left, right) => + case cp @ CrossPipe(left, right) => notCached(excludes = Set(left, right))(cp) // Don't cache `Fork` and `WithDescriptionTypedPipe` // since if we do cache them `CrossPipe` will end up being cached as well - case tp@Fork(_) => + case tp @ Fork(_) => transform(tp, this) - case tp@WithDescriptionTypedPipe(_, _) => + case tp @ WithDescriptionTypedPipe(_, _) => transform(tp, this) // Cache all other typed pipes case tp => @@ -228,22 +261,27 @@ object CascadingBackend { } private def transform[T]( - pipe: TypedPipe[T], - rec: FunctionK[TypedPipe, CascadingPipe] + pipe: TypedPipe[T], + rec: FunctionK[TypedPipe, CascadingPipe] ): CascadingPipe[T] = pipe match { - case cp@CounterPipe(_) => + case cp @ CounterPipe(_) => def go[A](cp: CounterPipe[A]): CascadingPipe[A] = { val CascadingPipe(pipe0, initF, fd, conv) = rec(cp.pipe) val cpipe = RichPipe(pipe0) - .eachTo(initF -> f0)(new IncrementCounters[A](_, TupleConverter - .asSuperConverter(conv))) + .eachTo(initF -> f0)( + new IncrementCounters[A]( + _, + TupleConverter + .asSuperConverter(conv) + ) + ) CascadingPipe.single[A](cpipe, fd) } go(cp) - case cp@CrossPipe(_, _) => + case cp @ CrossPipe(_, _) => rec(cp.viaHashJoin) - case cv@CrossValue(_, _) => + case cv @ CrossValue(_, _) => rec(cv.viaHashJoin) case DebugPipe(p) => val inner = rec(p) @@ -251,14 +289,14 @@ object CascadingBackend { case EmptyTypedPipe => // just use an empty iterable pipe. rec(IterablePipe(List.empty[T])) - case fk@FilterKeys(_, _) => + case fk @ FilterKeys(_, _) => def go[K, V](node: FilterKeys[K, V]): CascadingPipe[(K, V)] = { val rewrite = Filter[(K, V)](node.input, FilterKeysToFilter(node.fn)) rec(rewrite) } go(fk) - case f@Filter(_, _) => + case f @ Filter(_, _) => // hand holding for type inference def go[T1 <: T](f: Filter[T1]): CascadingPipe[T] = { val Filter(input, fn) = f @@ -269,17 +307,20 @@ object CascadingBackend { } go(f) - case f@FlatMapValues(_, _) => + case f @ FlatMapValues(_, _) => def go[K, V, U](node: FlatMapValues[K, V, U]): CascadingPipe[T] = rec(FlatMapped[(K, V), (K, U)](node.input, FlatMapValuesToFlatMap(node.fn))) go(f) - case fm@FlatMapped(_, _) => + case fm @ FlatMapped(_, _) => // TODO we can optimize a flatmapped input directly and skip some tupleconverters def go[A, B <: T](fm: FlatMapped[A, B]): CascadingPipe[T] = { val CascadingPipe(pipe, initF, fd, conv) = rec(fm.input) - val fmpipe = RichPipe(pipe).flatMapTo[A, T](initF -> f0)(fm.fn)(TupleConverter - .asSuperConverter(conv), singleSetter) + val fmpipe = RichPipe(pipe).flatMapTo[A, T](initF -> f0)(fm.fn)( + TupleConverter + .asSuperConverter(conv), + singleSetter + ) CascadingPipe.single[B](fmpipe, fd) } @@ -295,25 +336,28 @@ object CascadingBackend { val fd = new FlowDef val pipe = IterableSource[T](iter, f0)(singleSetter, singleConverter).read(fd, mode) CascadingPipe.single[T](pipe, fd) - case f@MapValues(_, _) => + case f @ MapValues(_, _) => def go[K, A, B](fn: MapValues[K, A, B]): CascadingPipe[_ <: (K, B)] = rec(Mapped[(K, A), (K, B)](fn.input, MapValuesToMap(fn.fn))) go(f) - case m@Mapped(_, _) => + case m @ Mapped(_, _) => def go[A, B <: T](m: Mapped[A, B]): CascadingPipe[T] = { val Mapped(input, fn) = m val CascadingPipe(pipe, initF, fd, conv) = 
rec(input) - val fmpipe = RichPipe(pipe).mapTo[A, T](initF -> f0)(fn)(TupleConverter - .asSuperConverter(conv), singleSetter) + val fmpipe = RichPipe(pipe).mapTo[A, T](initF -> f0)(fn)( + TupleConverter + .asSuperConverter(conv), + singleSetter + ) CascadingPipe.single[B](fmpipe, fd) } go(m) - case m@MergedTypedPipe(_, _) => + case m @ MergedTypedPipe(_, _) => OptimizationRules.unrollMerge(m) match { - case Nil => rec(EmptyTypedPipe) + case Nil => rec(EmptyTypedPipe) case h :: Nil => rec(h) case nonEmpty => // TODO: a better optimization is to not materialize this @@ -323,7 +367,7 @@ object CascadingBackend { val flowDef = new FlowDef // if all of the converters are the same, we could skip some work // here, but need to be able to see that correctly - val pipes = nonEmpty.map { p => rec(p).toPipe(f0, flowDef, singleSetter) } + val pipes = nonEmpty.map(p => rec(p).toPipe(f0, flowDef, singleSetter)) val merged = new cascading.pipe.Merge(pipes.map(RichPipe.assignName): _*) CascadingPipe.single[T](merged, flowDef) } @@ -331,14 +375,16 @@ object CascadingBackend { val fd = new FlowDef val pipe = typedSrc.read(fd, mode) CascadingPipe[T](pipe, typedSrc.sourceFields, fd, typedSrc.converter[T]) - case sblk@SumByLocalKeys(_, _) => + case sblk @ SumByLocalKeys(_, _) => def go[K, V](sblk: SumByLocalKeys[K, V]): CascadingPipe[(K, V)] = { val cp = rec(sblk.input) val localFD = new FlowDef val cpKV: Pipe = cp.toPipe(kvFields, localFD, tup2Setter) - val msr = new MapsideReduce(sblk - .semigroup, new Fields("key"), valueField, None)(singleConverter[V], singleSetter[V]) - val kvpipe = RichPipe(cpKV).eachTo(kvFields -> kvFields) { _ => msr } + val msr = new MapsideReduce(sblk.semigroup, new Fields("key"), valueField, None)( + singleConverter[V], + singleSetter[V] + ) + val kvpipe = RichPipe(cpKV).eachTo(kvFields -> kvFields)(_ => msr) CascadingPipe(kvpipe, kvFields, localFD, tuple2Converter[K, V]) } @@ -360,11 +406,10 @@ object CascadingBackend { fd.addTrap(pipe, sink.createTap(Write)(mode)) CascadingPipe[u](pipe, sink.sinkFields, fd, conv) case WithDescriptionTypedPipe(input, descs) => - @annotation.tailrec def loop[A]( - t: TypedPipe[A], - acc: List[(String, Boolean)] + t: TypedPipe[A], + acc: List[(String, Boolean)] ): (TypedPipe[A], List[(String, Boolean)]) = t match { case WithDescriptionTypedPipe(i, descs) => @@ -381,12 +426,9 @@ object CascadingBackend { val next = new Each(cp.pipe, Fields.ALL, new CleanupIdentityFunction(fn)) cp.copy(pipe = next) - case hcg@HashCoGroup(_, _, _) => + case hcg @ HashCoGroup(_, _, _) => def go[K, V1, V2, R](hcg: HashCoGroup[K, V1, V2, R]): CascadingPipe[(K, R)] = - planHashJoin(hcg.left, - hcg.right, - hcg.joiner, - rec) + planHashJoin(hcg.left, hcg.right, hcg.joiner, rec) go(hcg) case ReduceStepPipe(rs) => @@ -405,22 +447,19 @@ object CascadingBackend { } /** - * These are rules we should apply to any TypedPipe before handing - * to cascading. These should be a bit conservative in that they - * should be highly likely to improve the graph. + * These are rules we should apply to any TypedPipe before handing to cascading. These should be a bit + * conservative in that they should be highly likely to improve the graph. 
*/ def defaultOptimizationRules(config: Config): Seq[Rule[TypedPipe]] = { def std(forceHash: Rule[TypedPipe]) = - (OptimizationRules.standardMapReduceRules ::: + OptimizationRules.standardMapReduceRules ::: List( OptimizationRules.FilterLocally, // after filtering, we may have filtered to nothing, lets see OptimizationRules.simplifyEmpty, // add any explicit forces to the optimized graph - Rule.orElse(List( - forceHash, - OptimizationRules.RemoveDuplicateForceFork) - ))) + Rule.orElse(List(forceHash, OptimizationRules.RemoveDuplicateForceFork)) + ) config.getOptimizationPhases match { case Some(tryPhases) => tryPhases.get.phases @@ -432,13 +471,16 @@ object CascadingBackend { } } - final def toPipe[U](p: TypedPipe[U], fieldNames: Fields)(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = { + final def toPipe[U](p: TypedPipe[U], fieldNames: Fields)(implicit + flowDef: FlowDef, + mode: Mode, + setter: TupleSetter[U] + ): Pipe = { - val phases = defaultOptimizationRules( - mode match { - case h: HadoopMode => Config.fromHadoop(h.jobConf) - case _ => Config.empty - }) + val phases = defaultOptimizationRules(mode match { + case h: HadoopMode => Config.fromHadoop(h.jobConf) + case _ => Config.empty + }) val (d, id) = Dag(p, OptimizationRules.toLiteral) val d1 = d.applySeq(phases) val p1 = d1.evaluate(id) @@ -448,15 +490,13 @@ object CascadingBackend { } /** - * This needs to be called, to plan any writes from the - * TypedPipe API. This is here to globally optimize the entire - * flow, and not optimize on a write-by-write basis. + * This needs to be called, to plan any writes from the TypedPipe API. This is here to globally optimize the + * entire flow, and not optimize on a write-by-write basis. * * This uses the FlowStateMap this method is idempotent. * - * It is called by default in ExecutionContext at the last - * step before building the Flow. Job also needs to call this - * method in validate to make sure validation works. + * It is called by default in ExecutionContext at the last step before building the Flow. Job also needs to + * call this method in validate to make sure validation works. */ def planTypedWrites(fd: FlowDef, mode: Mode): Unit = { def doWrites(writes: List[FlowStateMap.TypedWrite[_]]): Unit = { @@ -467,11 +507,10 @@ object CascadingBackend { require(tw.mode == mode, s"${tw.mode} should be equal to $mode") (nextDag, (id, tw.sink) :: items) } - val phases = defaultOptimizationRules( - mode match { - case h: HadoopMode => Config.fromHadoop(h.jobConf) - case _ => Config.empty - }) + val phases = defaultOptimizationRules(mode match { + case h: HadoopMode => Config.fromHadoop(h.jobConf) + case _ => Config.empty + }) val optDag = rootedDag.applySeq(phases) def doWrite[A](pair: (Id[A], TypedSink[A])): Unit = { val optPipe = optDag.evaluate(pair._1) @@ -487,8 +526,10 @@ object CascadingBackend { // We do writes twice because customer can use Typed API in their TypedSink implementation. val pendingWritesAfterPlan = FlowStateMap.removeWrites(fd).pendingTypedWrites if (pendingWritesAfterPlan.nonEmpty) { - logger.warn("Using Typed API in TypedSink implementation is prohibited and " + - "might be removed in later releases of Scalding.") + logger.warn( + "Using Typed API in TypedSink implementation is prohibited and " + + "might be removed in later releases of Scalding." + ) doWrites(pendingWritesAfterPlan) } @@ -500,70 +541,84 @@ object CascadingBackend { /** * Convert a cascading FlowDef to an Option[Execution[Unit]] * - * Return None if the Execution is empty. 
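// ----------------------------------------------------------------------------
// Illustrative sketch (not part of the diff above), mirroring the Dag plumbing
// in toPipe above: apply the backend's default optimization phases to a
// TypedPipe and read the optimized pipe back out. The wrapper object is
// hypothetical.
import com.stripe.dagon.Dag
import com.twitter.scalding.Config
import com.twitter.scalding.typed.{OptimizationRules, TypedPipe}
import com.twitter.scalding.typed.cascading_backend.CascadingBackend

object OptimizeSketch {
  def optimizeForCascading[A](pipe: TypedPipe[A], config: Config): TypedPipe[A] = {
    val phases = CascadingBackend.defaultOptimizationRules(config)
    val (dag, id) = Dag(pipe, OptimizationRules.toLiteral)
    dag.applySeq(phases).evaluate(id) // same Dag / applySeq / evaluate steps as toPipe
  }
}
// ----------------------------------------------------------------------------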
If the FlowDef includes - * things other than TypedPipe.writes, this will return Some - * failed Execution. + * Return None if the Execution is empty. If the FlowDef includes things other than TypedPipe.writes, this + * will return Some failed Execution. * - * This method is useful for people who have used the Typed-API - * with FlowDefs, but not Executions and want to convert to - * an Execution without rewriting all the code. An example - * of this is summingbird which constructs a single FlowDef - * for the entire plan it makes. For large plans from summingbird - * you may want to use write partitioning. + * This method is useful for people who have used the Typed-API with FlowDefs, but not Executions and want + * to convert to an Execution without rewriting all the code. An example of this is summingbird which + * constructs a single FlowDef for the entire plan it makes. For large plans from summingbird you may want + * to use write partitioning. */ - def flowDefToExecution(fd: FlowDef, partitionOptimizations: Option[Seq[Rule[TypedPipe]]]): Option[Execution[Unit]] = { + def flowDefToExecution( + fd: FlowDef, + partitionOptimizations: Option[Seq[Rule[TypedPipe]]] + ): Option[Execution[Unit]] = { val rfd = new RichFlowDef(fd) // TypedPipe jobs should not have modified // the FlowDef yet, only the FlowState should // be updated if (rfd.isEmpty) { - FlowStateMap.get(fd).flatMap { // Note, we want this to be a pure function so we don't mutate the FlowStateMap - case FlowState(srcs, confs, writes) if srcs.isEmpty && confs.isEmpty => - writes match { - case Nil => None - case nonEmpty => - partitionOptimizations match { - case None => - def write[A](w: FlowStateMap.TypedWrite[A]): Execution[Unit] = - w.pipe.writeExecution(w.sink) - - Some(Execution.sequence(nonEmpty.map(write(_))).unit) - case Some(rules) => - def toPair[A](f: FlowStateMap.TypedWrite[A]): WritePartitioner.PairK[TypedPipe, TypedSink, A] = - (f.pipe, f.sink) - - val pairs: List[WritePartitioner.PairK[TypedPipe, TypedSink, _]] = nonEmpty.map(toPair(_)) - Some(WritePartitioner.materialize[Execution](rules, pairs)) - } - } - case fs => - // we can't handle if there have been anything other than TypedPipe.write on the - // TypedPipe - Some(Execution.failed(new Exception(s"expected empty FlowState other than TypedWrites, found: $fs"))) - } - } - else Some(Execution.failed(new Exception(s"We can only convert Typed-API Jobs to Execution. 
Found non-empty FlowDef: $fd"))) + FlowStateMap + .get(fd) + .flatMap { // Note, we want this to be a pure function so we don't mutate the FlowStateMap + case FlowState(srcs, confs, writes) if srcs.isEmpty && confs.isEmpty => + writes match { + case Nil => None + case nonEmpty => + partitionOptimizations match { + case None => + def write[A](w: FlowStateMap.TypedWrite[A]): Execution[Unit] = + w.pipe.writeExecution(w.sink) + + Some(Execution.sequence(nonEmpty.map(write(_))).unit) + case Some(rules) => + def toPair[A]( + f: FlowStateMap.TypedWrite[A] + ): WritePartitioner.PairK[TypedPipe, TypedSink, A] = + (f.pipe, f.sink) + + val pairs: List[WritePartitioner.PairK[TypedPipe, TypedSink, _]] = nonEmpty.map(toPair(_)) + Some(WritePartitioner.materialize[Execution](rules, pairs)) + } + } + case fs => + // we can't handle if there have been anything other than TypedPipe.write on the + // TypedPipe + Some( + Execution.failed(new Exception(s"expected empty FlowState other than TypedWrites, found: $fs")) + ) + } + } else + Some( + Execution.failed( + new Exception(s"We can only convert Typed-API Jobs to Execution. Found non-empty FlowDef: $fd") + ) + ) } /** - * This converts the TypedPipe to a cascading Pipe doing the most direct - * possible translation we can. This is useful for testing or for expert - * cases where you want more direct control of the TypedPipe than - * the default method gives you. + * This converts the TypedPipe to a cascading Pipe doing the most direct possible translation we can. This + * is useful for testing or for expert cases where you want more direct control of the TypedPipe than the + * default method gives you. */ - final def toPipeUnoptimized[U](input: TypedPipe[U], - fieldNames: Fields)(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = { + final def toPipeUnoptimized[U](input: TypedPipe[U], fieldNames: Fields)(implicit + flowDef: FlowDef, + mode: Mode, + setter: TupleSetter[U] + ): Pipe = { val compiler = cache.get(flowDef, mode) /** - * These rules are not optimizations, but actually required for Cascading to not - * throw. Cascading requires certain shapes of the graphs + * These rules are not optimizations, but actually required for Cascading to not throw. Cascading requires + * certain shapes of the graphs */ - val p = OptimizationRules(input, + val p = OptimizationRules( + input, OptimizationRules.DescribeLater .orElse(OptimizationRules.DeferMerge) - .orElse(OptimizationRules.DiamondToFlatMap)) + .orElse(OptimizationRules.DiamondToFlatMap) + ) val cp: CascadingPipe[U] = compiler(p) @@ -574,7 +629,10 @@ object CascadingBackend { .applyFlowConfigProperties(flowDef) } - private def planCoGroup[K, R](cg: CoGrouped[K, R], rec: FunctionK[TypedPipe, CascadingPipe]): CascadingPipe[(K, R)] = { + private def planCoGroup[K, R]( + cg: CoGrouped[K, R], + rec: FunctionK[TypedPipe, CascadingPipe] + ): CascadingPipe[(K, R)] = { // This has the side effect of planning all inputs now // before we need to call them below @@ -605,26 +663,28 @@ object CascadingBackend { val newPipe = maybeBox[K, Any](ord, flowDef) { (tupset, ordKeyField) => if (firstCount == inputs.size) { + /** - * This is a self-join - * Cascading handles this by sending the data only once, spilling to disk if - * the groups don't fit in RAM, then doing the join on this one set of data. - * This is fundamentally different than the case where the first item is - * not repeated. 
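// ----------------------------------------------------------------------------
// Hedged usage sketch (not part of the diff above) for flowDefToExecution as
// documented in this file's hunk: convert a FlowDef that only carries
// TypedPipe writes into an Execution, with a no-op fallback when there is
// nothing to run. The wrapper object is hypothetical.
import cascading.flow.FlowDef
import com.twitter.scalding.Execution
import com.twitter.scalding.typed.cascading_backend.CascadingBackend

object FlowDefSketch {
  def asExecution(fd: FlowDef): Execution[Unit] =
    CascadingBackend
      .flowDefToExecution(fd, partitionOptimizations = None)
      .getOrElse(Execution.from(())) // None means the FlowDef had no typed writes
}
// ----------------------------------------------------------------------------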
That case is below + * This is a self-join Cascading handles this by sending the data only once, spilling to disk if the + * groups don't fit in RAM, then doing the join on this one set of data. This is fundamentally + * different than the case where the first item is not repeated. That case is below */ val NUM_OF_SELF_JOINS = firstCount - 1 - new CoGroup(assignName(toPipe[K, Any](inputs.head, kvFields, tupset)), + new CoGroup( + assignName(toPipe[K, Any](inputs.head, kvFields, tupset)), ordKeyField, NUM_OF_SELF_JOINS, outFields(firstCount), - WrappedJoiner(new DistinctCoGroupJoiner(firstCount, keyGetter(ord), joinFunction))) + WrappedJoiner(new DistinctCoGroupJoiner(firstCount, keyGetter(ord), joinFunction)) + ) } else if (firstCount == 1) { def keyId(idx: Int): String = "key%d".format(idx) + /** - * As long as the first one appears only once, we can handle self joins on the others: - * Cascading does this by maybe spilling all the streams other than the first item. - * This is handled by a different CoGroup constructor than the above case. + * As long as the first one appears only once, we can handle self joins on the others: Cascading does + * this by maybe spilling all the streams other than the first item. This is handled by a different + * CoGroup constructor than the above case. */ def renamePipe(idx: Int, p: TypedPipe[(K, Any)]): Pipe = toPipe[K, Any](p, List(keyId(idx), "value%d".format(idx)), tupset) @@ -646,16 +706,14 @@ object CascadingBackend { .map(makeFields) .toArray - val pipes: Array[Pipe] = distincts - .zipWithIndex - .map { case (item, idx) => assignName(renamePipe(idx, item)) } - .toArray + val pipes: Array[Pipe] = distincts.zipWithIndex.map { case (item, idx) => + assignName(renamePipe(idx, item)) + }.toArray val cjoiner = if (isize != dsize) { // avoid capturing anything other than the mapping ints: - val mapping: Map[Int, Int] = inputs.zipWithIndex.map { - case (item, idx) => - idx -> distincts.indexWhere(_ == item) + val mapping: Map[Int, Int] = inputs.zipWithIndex.map { case (item, idx) => + idx -> distincts.indexWhere(_ == item) }.toMap new CoGroupedJoiner(isize, keyGetter(ord), joinFunction) { @@ -668,45 +726,43 @@ object CascadingBackend { new CoGroup(pipes, groupFields, outFields(dsize), WrappedJoiner(cjoiner)) } else { + /** - * This is non-trivial to encode in the type system, so we throw this exception - * at the planning phase. + * This is non-trivial to encode in the type system, so we throw this exception at the planning phase. */ - sys.error("Except for self joins, where you are joining something with only itself,\n" + - "left-most pipe can only appear once. Firsts: " + - inputs.collect { case x if x == inputs.head => x }.toString) + sys.error( + "Except for self joins, where you are joining something with only itself,\n" + + "left-most pipe can only appear once. Firsts: " + + inputs.collect { case x if x == inputs.head => x }.toString + ) } } /* - * the CoGrouped only populates the first two fields, the second two - * are null. We then project out at the end of the method. - */ + * the CoGrouped only populates the first two fields, the second two + * are null. We then project out at the end of the method. 
+ */ val pipeWithRedAndDescriptions = { RichPipe.setReducers(newPipe, cg.reducers.getOrElse(-1)) RichPipe.setPipeDescriptions(newPipe, cg.descriptions) newPipe.project(kvFields) } - CascadingPipe[(K, R)]( - pipeWithRedAndDescriptions, - kvFields, - flowDef, - tuple2Converter[K, R]) + CascadingPipe[(K, R)](pipeWithRedAndDescriptions, kvFields, flowDef, tuple2Converter[K, R]) } /** - * TODO: most of the complexity of this method should be rewritten - * as an optimization rule that works on the scalding typed AST. - * the code in here gets pretty complex and depending on the details - * of cascading and also how we compile to cascading. + * TODO: most of the complexity of this method should be rewritten as an optimization rule that works on the + * scalding typed AST. the code in here gets pretty complex and depending on the details of cascading and + * also how we compile to cascading. * - * But the optimization is somewhat general: we often want a checkpoint - * before a hashjoin is replicated + * But the optimization is somewhat general: we often want a checkpoint before a hashjoin is replicated */ - private def planHashJoin[K, V1, V2, R](left: TypedPipe[(K, V1)], - right: HashJoinable[K, V2], - joiner: (K, V1, Iterable[V2]) => Iterator[R], - rec: FunctionK[TypedPipe, CascadingPipe]): CascadingPipe[(K, R)] = { + private def planHashJoin[K, V1, V2, R]( + left: TypedPipe[(K, V1)], + right: HashJoinable[K, V2], + joiner: (K, V1, Iterable[V2]) => Iterator[R], + rec: FunctionK[TypedPipe, CascadingPipe] + ): CascadingPipe[(K, R)] = { val fd = new FlowDef val leftPipe = rec(left).toPipe(kvFields, fd, tup2Setter) @@ -719,42 +775,46 @@ object CascadingBackend { Field.singleOrdered("key")(keyOrdering), mappedPipe, Field.singleOrdered("key1")(keyOrdering), - WrappedJoiner(new HashJoiner(singleValuePerRightKey, right.joinFunction, joiner))) + WrappedJoiner(new HashJoiner(singleValuePerRightKey, right.joinFunction, joiner)) + ) - CascadingPipe[(K, R)]( - RichPipe(hashPipe).project(kvFields), - kvFields, - fd, - tuple2Converter[K, R]) + CascadingPipe[(K, R)](RichPipe(hashPipe).project(kvFields), kvFields, fd, tuple2Converter[K, R]) } private def planReduceStep[K, V1, V2]( - rs: ReduceStep[K, V1, V2], - rec: FunctionK[TypedPipe, CascadingPipe]): CascadingPipe[(K, V2)] = { + rs: ReduceStep[K, V1, V2], + rec: FunctionK[TypedPipe, CascadingPipe] + ): CascadingPipe[(K, V2)] = { val mapped = rec(rs.mapped) def groupOp(gb: GroupBuilder => GroupBuilder): CascadingPipe[_ <: (K, V2)] = groupOpWithValueSort(None)(gb) - def groupOpWithValueSort(valueSort: Option[Ordering[V1]])(gb: GroupBuilder => GroupBuilder): CascadingPipe[_ <: (K, V2)] = { + def groupOpWithValueSort( + valueSort: Option[Ordering[V1]] + )(gb: GroupBuilder => GroupBuilder): CascadingPipe[_ <: (K, V2)] = { val flowDef = new FlowDef val pipe = maybeBox[K, V1](rs.keyOrdering, flowDef) { (tupleSetter, fields) => - val (sortOpt, ts) = valueSort.map { - case ordser: OrderedSerialization[V1 @unchecked] => - // We get in here when we do a secondary sort - // and that sort is an ordered serialization - // We now need a boxed serializer for this type - // Then we set the comparator on the field, and finally we box the value with our tupleSetter - val (boxfn, boxordSer) = getBoxFnAndOrder[V1](ordser, flowDef) - val valueF = new Fields("value") - valueF.setComparator("value", new CascadingBinaryComparator(boxordSer)) - val ts2 = tupleSetter.asInstanceOf[TupleSetter[(K, Boxed[V1])]].contraMap { kv1: (K, V1) => (kv1._1, boxfn(kv1._2)) } - (Some(valueF), ts2) - 
case vs => - val vord = Field.singleOrdered("value")(vs) - (Some(vord), tupleSetter) - }.getOrElse((None, tupleSetter)) + val (sortOpt, ts) = valueSort + .map { + case ordser: OrderedSerialization[V1 @unchecked] => + // We get in here when we do a secondary sort + // and that sort is an ordered serialization + // We now need a boxed serializer for this type + // Then we set the comparator on the field, and finally we box the value with our tupleSetter + val (boxfn, boxordSer) = getBoxFnAndOrder[V1](ordser, flowDef) + val valueF = new Fields("value") + valueF.setComparator("value", new CascadingBinaryComparator(boxordSer)) + val ts2 = tupleSetter.asInstanceOf[TupleSetter[(K, Boxed[V1])]].contraMap { kv1: (K, V1) => + (kv1._1, boxfn(kv1._2)) + } + (Some(valueF), ts2) + case vs => + val vord = Field.singleOrdered("value")(vs) + (Some(vord), tupleSetter) + } + .getOrElse((None, tupleSetter)) val p = mapped.toPipe(kvFields, flowDef, TupleSetter.asSubSetter(ts)) @@ -769,20 +829,20 @@ object CascadingBackend { } rs match { - case ir@IdentityReduce(_, _, None, descriptions, _) => + case ir @ IdentityReduce(_, _, None, descriptions, _) => type CP[V] = CascadingPipe[_ <: (K, V)] // Not doing anything ir.evidence.subst[CP](mapped.copy(pipe = RichPipe.setPipeDescriptions(mapped.pipe, descriptions))) - case uir@UnsortedIdentityReduce(_, _, None, descriptions, _) => + case uir @ UnsortedIdentityReduce(_, _, None, descriptions, _) => type CP[V] = CascadingPipe[_ <: (K, V)] // Not doing anything uir.evidence.subst[CP](mapped.copy(pipe = RichPipe.setPipeDescriptions(mapped.pipe, descriptions))) case IdentityReduce(_, _, Some(reds), descriptions, _) => - groupOp { _.reducers(reds).setDescriptions(descriptions) } + groupOp(_.reducers(reds).setDescriptions(descriptions)) case UnsortedIdentityReduce(_, _, Some(reds), descriptions, _) => // This is weird, but it is sometimes used to force a partition - groupOp { _.reducers(reds).setDescriptions(descriptions) } - case ivsr@IdentityValueSortedReduce(_, _, _, _, _, _) => + groupOp(_.reducers(reds).setDescriptions(descriptions)) + case ivsr @ IdentityValueSortedReduce(_, _, _, _, _, _) => groupOpWithValueSort(Some(ivsr.valueSort)) { gb => // If its an ordered serialization we need to unbox val mappedGB = @@ -797,27 +857,45 @@ object CascadingBackend { .reducers(ivsr.reducers.getOrElse(-1)) .setDescriptions(ivsr.descriptions) } - case vsr@ValueSortedReduce(_, _, _, _, _, _) => + case vsr @ ValueSortedReduce(_, _, _, _, _, _) => val optVOrdering = Some(vsr.valueSort) groupOpWithValueSort(optVOrdering) { // If its an ordered serialization we need to unbox // the value before handing it to the users operation - _.every(new cascading.pipe.Every(_, valueField, - new TypedBufferOp[K, V1, V2](keyConverter(vsr.keyOrdering), - valueConverter(optVOrdering), - vsr.reduceFn, - valueField), Fields.REPLACE)) + _.every( + new cascading.pipe.Every( + _, + valueField, + new TypedBufferOp[K, V1, V2]( + keyConverter(vsr.keyOrdering), + valueConverter(optVOrdering), + vsr.reduceFn, + valueField + ), + Fields.REPLACE + ) + ) .reducers(vsr.reducers.getOrElse(-1)) .setDescriptions(vsr.descriptions) } - case imr@IteratorMappedReduce(_, _, _, _, _) => + case imr @ IteratorMappedReduce(_, _, _, _, _) => groupOp { - _.every(new cascading.pipe.Every(_, valueField, - new TypedBufferOp(keyConverter(imr.keyOrdering), TupleConverter.singleConverter[V1], imr.reduceFn, valueField), Fields.REPLACE)) + _.every( + new cascading.pipe.Every( + _, + valueField, + new TypedBufferOp( + 
keyConverter(imr.keyOrdering), + TupleConverter.singleConverter[V1], + imr.reduceFn, + valueField + ), + Fields.REPLACE + ) + ) .reducers(imr.reducers.getOrElse(-1)) .setDescriptions(imr.descriptions) } } } } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CoGroupJoiner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CoGroupJoiner.scala index 828aac8312..5d2c744bc1 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CoGroupJoiner.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CoGroupJoiner.scala @@ -1,19 +1,21 @@ package com.twitter.scalding.typed.cascading_backend -import cascading.pipe.joiner.{ Joiner => CJoiner, JoinerClosure } -import cascading.tuple.{ Tuple => CTuple } -import com.twitter.scalding.{ TupleGetter } +import cascading.pipe.joiner.{Joiner => CJoiner, JoinerClosure} +import cascading.tuple.{Tuple => CTuple} +import com.twitter.scalding.TupleGetter import com.twitter.scalding.serialization.Externalizer import scala.collection.JavaConverters._ import com.twitter.scalding.typed.MultiJoinFunction -abstract class CoGroupedJoiner[K](inputSize: Int, - getter: TupleGetter[K], - @transient inJoinFunction: MultiJoinFunction[K, Any]) extends CJoiner { +abstract class CoGroupedJoiner[K]( + inputSize: Int, + getter: TupleGetter[K], + @transient inJoinFunction: MultiJoinFunction[K, Any] +) extends CJoiner { /** - * We have a test that should fail if Externalizer is not used here. - * you can test failure of that test by replacing Externalizer with Some + * We have a test that should fail if Externalizer is not used here. you can test failure of that test by + * replacing Externalizer with Some */ val joinFunction = Externalizer(inJoinFunction) val distinctSize: Int @@ -27,12 +29,11 @@ abstract class CoGroupedJoiner[K](inputSize: Int, } override def getIterator(jc: JoinerClosure) = { - val iters = (0 until distinctSize).map { jc.getIterator(_).asScala.buffered } + val iters = (0 until distinctSize).map(jc.getIterator(_).asScala.buffered) // This use of `_.get` is safe, but difficult to prove in the types. 
@SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - val keyTuple = iters - .collectFirst { case iter if iter.nonEmpty => iter.head } - .get // One of these must have a key + val keyTuple = + iters.collectFirst { case iter if iter.nonEmpty => iter.head }.get // One of these must have a key val key = getter.get(keyTuple, 0) def unbox(it: Iterator[CTuple]): Iterator[Any] = @@ -46,14 +47,17 @@ abstract class CoGroupedJoiner[K](inputSize: Int, } val rest = restIndices.map(toIterable(_)) - joinFunction.get(key, leftMost, rest).map { rval => - // There always has to be the same number of resulting fields as input - // or otherwise the flow planner will throw - val res = CTuple.size(distinctSize) - res.set(0, key) - res.set(1, rval) - res - }.asJava + joinFunction + .get(key, leftMost, rest) + .map { rval => + // There always has to be the same number of resulting fields as input + // or otherwise the flow planner will throw + val res = CTuple.size(distinctSize) + res.set(0, key) + res.set(1, rval) + res + } + .asJava } override def numJoins = distinctSize - 1 diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/DistinctCoGroupJoiner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/DistinctCoGroupJoiner.scala index 4d420af669..f1335b562e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/DistinctCoGroupJoiner.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/DistinctCoGroupJoiner.scala @@ -4,10 +4,11 @@ import com.twitter.scalding.TupleGetter import com.twitter.scalding.typed.MultiJoinFunction // If all the input pipes are unique, this works: -class DistinctCoGroupJoiner[K](count: Int, - getter: TupleGetter[K], - @transient joinF: MultiJoinFunction[K, Any]) - extends CoGroupedJoiner[K](count, getter, joinF) { +class DistinctCoGroupJoiner[K]( + count: Int, + getter: TupleGetter[K], + @transient joinF: MultiJoinFunction[K, Any] +) extends CoGroupedJoiner[K](count, getter, joinF) { val distinctSize = count def distinctIndexOf(idx: Int) = idx } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/HashJoiner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/HashJoiner.scala index 8638120632..36de0bfa77 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/HashJoiner.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/HashJoiner.scala @@ -12,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.typed.cascading_backend -import cascading.pipe.joiner.{ Joiner => CJoiner, JoinerClosure } -import cascading.tuple.{ Tuple => CTuple } +import cascading.pipe.joiner.{Joiner => CJoiner, JoinerClosure} +import cascading.tuple.{Tuple => CTuple} import com.twitter.scalding.serialization.Externalizer import com.twitter.scalding.typed.MultiJoinFunction @@ -27,9 +27,10 @@ import scala.collection.JavaConverters._ * Only intended to be use to implement the hashCogroup on TypedPipe/Grouped */ class HashJoiner[K, V, W, R]( - rightHasSingleValue: Boolean, - rightGetter: MultiJoinFunction[K, W], - joiner: (K, V, Iterable[W]) => Iterator[R]) extends CJoiner { + rightHasSingleValue: Boolean, + rightGetter: MultiJoinFunction[K, W], + joiner: (K, V, Iterable[W]) => Iterator[R] +) extends CJoiner { private[this] val joinEx = Externalizer(joiner) @@ -60,7 +61,8 @@ class HashJoiner[K, V, W, R]( left.flatMap { kv => val leftV = kv.getObject(1).asInstanceOf[V] // get just the Vs - joinEx.get(key, leftV, rightIterable) + joinEx + .get(key, leftV, rightIterable) .map { rval => // There always has to be four resulting fields // or otherwise the flow planner will throw diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/EqTypes.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/EqTypes.scala index 11c54fe7b5..c4350294ce 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/EqTypes.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/EqTypes.scala @@ -1,8 +1,7 @@ package com.twitter.scalding.typed.functions /** - * This is a more powerful version of =:= that can allow - * us to remove casts and also not have any runtime cost + * This is a more powerful version of =:= that can allow us to remove casts and also not have any runtime cost * for our function calls in some cases of trivial functions */ sealed abstract class EqTypes[A, B] extends java.io.Serializable { @@ -34,4 +33,3 @@ object EqTypes extends java.io.Serializable { // in scala 2.13, this won't need a cast, but the cast is safe reflexive[A].asInstanceOf[EqTypes[A, B]] } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/FlatMappedFn.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/FlatMappedFn.scala index ba858e3bb0..de87cb2fe6 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/FlatMappedFn.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/FlatMappedFn.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.typed.functions import java.io.Serializable @@ -29,12 +29,13 @@ sealed trait FlatMappedFn[-A, +B] extends (A => TraversableOnce[B]) with Seriali case Single(FlatMapping.Identity(ev)) => type F[T] = FlatMapping[Z, T] Single(ev.subst[F](fn)) - case notId => fn match { - case FlatMapping.Identity(ev) => - type F[T] = FlatMappedFn[T, B] - ev.reverse.subst[F](this) - case notIdFn => Series(notIdFn, notId) // only make a Series without either side being identity - } + case notId => + fn match { + case FlatMapping.Identity(ev) => + type F[T] = FlatMappedFn[T, B] + ev.reverse.subst[F](this) + case notIdFn => Series(notIdFn, notId) // only make a Series without either side being identity + } } final def combine[C](next: FlatMappedFn[B, C]): FlatMappedFn[A, C] = { @@ -71,7 +72,7 @@ sealed trait FlatMappedFn[-A, +B] extends (A => TraversableOnce[B]) with Seriali val filter: A1 => TraversableOnce[A1] = FlatMapFunctions.FromFilter(f) type F[T] = A1 => TraversableOnce[T] ev.subst[F](filter) - case Single(Map(f)) => FlatMapFunctions.FromMap(f) + case Single(Map(f)) => FlatMapFunctions.FromMap(f) case Single(FlatM(f)) => f case Series(Identity(ev), rest) => type F[T] = T => TraversableOnce[B1] @@ -99,18 +100,18 @@ object FlatMappedFn extends Serializable { def asId[A, B](f: FlatMappedFn[A, B]): Option[EqTypes[_ >: A, _ <: B]] = f match { case Single(FlatMapping.Identity(ev)) => Some(ev) - case _ => None + case _ => None } def asFilter[A, B](f: FlatMappedFn[A, B]): Option[(A => Boolean, EqTypes[(_ >: A), (_ <: B)])] = f match { - case Single(filter@FlatMapping.Filter(_, _)) => Some((filter.fn, filter.ev)) - case _ => None + case Single(filter @ FlatMapping.Filter(_, _)) => Some((filter.fn, filter.ev)) + case _ => None } def apply[A, B](fn: A => TraversableOnce[B]): FlatMappedFn[A, B] = fn match { case fmf: FlatMappedFn[A, B] => fmf - case rawfn => Single(FlatMapping.FlatM(rawfn)) + case rawfn => Single(FlatMapping.FlatM(rawfn)) } def identity[T]: FlatMappedFn[T, T] = Single(FlatMapping.Identity[T, T](EqTypes.reflexive[T])) @@ -122,5 +123,6 @@ object FlatMappedFn extends Serializable { Single(FlatMapping.Map(fn)) final case class Single[A, B](fn: FlatMapping[A, B]) extends FlatMappedFn[A, B] - final case class Series[A, B, C](first: FlatMapping[A, B], next: FlatMappedFn[B, C]) extends FlatMappedFn[A, C] + final case class Series[A, B, C](first: FlatMapping[A, B], next: FlatMappedFn[B, C]) + extends FlatMappedFn[A, C] } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/FlatMapping.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/FlatMapping.scala index 279825aeb2..8e11763268 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/FlatMapping.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/FlatMapping.scala @@ -3,11 +3,7 @@ package com.twitter.scalding.typed.functions import java.io.Serializable /** - * This is one of 4 core, non composed operations: - * identity - * filter - * map - * flatMap + * This is one of 4 core, non composed operations: identity filter map flatMap */ sealed abstract class FlatMapping[-A, +B] extends Serializable @@ -23,4 +19,3 @@ object FlatMapping extends Serializable { final case class Map[A, B](fn: A => B) extends FlatMapping[A, B] final case class FlatM[A, B](fn: A => TraversableOnce[B]) extends FlatMapping[A, B] } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/Functions.scala 
b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/Functions.scala index 5f999de777..06d9df895e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/Functions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/Functions.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.typed.functions -import com.twitter.algebird.{ Aggregator, Ring, Semigroup, Fold } +import com.twitter.algebird.{Aggregator, Fold, Ring, Semigroup} import java.util.Random import java.io.Serializable @@ -29,7 +29,7 @@ case class MakeKey[K, V](fn: V => K) extends Function1[V, (K, V)] { case class MapOptionToFlatMap[A, B](fn: A => Option[B]) extends Function1[A, List[B]] { def apply(a: A) = fn(a) match { - case None => Nil + case None => Nil case Some(a) => a :: Nil } } @@ -38,7 +38,8 @@ case class PartialFunctionToFilter[A, B](fn: PartialFunction[A, B]) extends Func def apply(a: A) = fn.isDefinedAt(a) } -case class MapValueStream[A, B](fn: Iterator[A] => Iterator[B]) extends Function2[Any, Iterator[A], Iterator[B]] { +case class MapValueStream[A, B](fn: Iterator[A] => Iterator[B]) + extends Function2[Any, Iterator[A], Iterator[B]] { def apply(k: Any, vs: Iterator[A]) = fn(vs) } @@ -109,7 +110,8 @@ case class FoldIterator[A, B](fold: Fold[A, B]) extends Function1[Iterator[A], I def apply(as: Iterator[A]) = Iterator.single(fold.overTraversable(as)) } -case class FoldWithKeyIterator[K, A, B](foldfn: K => Fold[A, B]) extends Function2[K, Iterator[A], Iterator[B]] { +case class FoldWithKeyIterator[K, A, B](foldfn: K => Fold[A, B]) + extends Function2[K, Iterator[A], Iterator[B]] { def apply(k: K, as: Iterator[A]) = Iterator.single(foldfn(k).overTraversable(as)) } @@ -168,9 +170,8 @@ case class SemigroupFromProduct[T](ring: Ring[T]) extends Semigroup[T] { } /** - * This is a semigroup that throws IllegalArgumentException if - * there is more than one item. This is used to trigger optimizations - * where the user knows there is at most one value per key. + * This is a semigroup that throws IllegalArgumentException if there is more than one item. This is used to + * trigger optimizations where the user knows there is at most one value per key. 
*/ case class RequireSingleSemigroup[T]() extends Semigroup[T] { def plus(a: T, b: T) = throw new IllegalArgumentException(s"expected only one item, calling plus($a, $b)") @@ -221,22 +222,25 @@ case class FilterKeysToFilter[K](fn: K => Boolean) extends Function1[(K, Any), B def apply(kv: (K, Any)) = fn(kv._1) } -case class FlatMapValuesToFlatMap[K, A, B](fn: A => TraversableOnce[B]) extends Function1[(K, A), TraversableOnce[(K, B)]] { +case class FlatMapValuesToFlatMap[K, A, B](fn: A => TraversableOnce[B]) + extends Function1[(K, A), TraversableOnce[(K, B)]] { def apply(ka: (K, A)) = { val k = ka._1 fn(ka._2).map((k, _)) } } -case class MergeFlatMaps[A, B](fns: Iterable[A => TraversableOnce[B]]) extends Function1[A, TraversableOnce[B]] { - def apply(a: A) = fns.iterator.flatMap { fn => fn(a) } +case class MergeFlatMaps[A, B](fns: Iterable[A => TraversableOnce[B]]) + extends Function1[A, TraversableOnce[B]] { + def apply(a: A) = fns.iterator.flatMap(fn => fn(a)) } case class MapValuesToMap[K, A, B](fn: A => B) extends Function1[(K, A), (K, B)] { def apply(ka: (K, A)) = (ka._1, fn(ka._2)) } -case class EmptyGuard[K, A, B](fn: (K, Iterator[A]) => Iterator[B]) extends Function2[K, Iterator[A], Iterator[B]] { +case class EmptyGuard[K, A, B](fn: (K, Iterator[A]) => Iterator[B]) + extends Function2[K, Iterator[A], Iterator[B]] { def apply(k: K, as: Iterator[A]) = if (as.nonEmpty) fn(k, as) else Iterator.empty } @@ -249,7 +253,8 @@ case class MapGroupMapValues[A, B, C](fn: B => C) extends Function2[A, Iterator[ def apply(a: A, bs: Iterator[B]) = bs.map(fn) } -case class MapGroupFlatMapValues[A, B, C](fn: B => TraversableOnce[C]) extends Function2[A, Iterator[B], Iterator[C]] { +case class MapGroupFlatMapValues[A, B, C](fn: B => TraversableOnce[C]) + extends Function2[A, Iterator[B], Iterator[C]] { def apply(a: A, bs: Iterator[B]) = bs.flatMap(fn) } @@ -263,13 +268,16 @@ object FlatMapFunctions extends Serializable { case class FromMap[A, B](fn: A => B) extends Function1[A, Iterator[B]] { def apply(a: A) = Iterator.single(fn(a)) } - case class FromFilterCompose[A, B](fn: A => Boolean, next: A => TraversableOnce[B]) extends Function1[A, TraversableOnce[B]] { + case class FromFilterCompose[A, B](fn: A => Boolean, next: A => TraversableOnce[B]) + extends Function1[A, TraversableOnce[B]] { def apply(a: A) = if (fn(a)) next(a) else Iterator.empty } - case class FromMapCompose[A, B, C](fn: A => B, next: B => TraversableOnce[C]) extends Function1[A, TraversableOnce[C]] { + case class FromMapCompose[A, B, C](fn: A => B, next: B => TraversableOnce[C]) + extends Function1[A, TraversableOnce[C]] { def apply(a: A) = next(fn(a)) } - case class FromFlatMapCompose[A, B, C](fn: A => TraversableOnce[B], next: B => TraversableOnce[C]) extends Function1[A, TraversableOnce[C]] { + case class FromFlatMapCompose[A, B, C](fn: A => TraversableOnce[B], next: B => TraversableOnce[C]) + extends Function1[A, TraversableOnce[C]] { def apply(a: A) = fn(a).flatMap(next) } } @@ -282,9 +290,10 @@ object ComposedFunctions extends Serializable { case class ComposedFilterFn[-A](fn0: A => Boolean, fn1: A => Boolean) extends Function1[A, Boolean] { def apply(a: A) = fn0(a) && fn1(a) } + /** - * This is only called at the end of a task, so might as well make it stack safe since a little - * extra runtime cost won't matter + * This is only called at the end of a task, so might as well make it stack safe since a little extra + * runtime cost won't matter */ case class ComposedOnComplete(fn0: () => Unit, fn1: () => Unit) extends 
Function0[Unit] { def apply(): Unit = { @@ -296,7 +305,7 @@ object ComposedFunctions extends Serializable { notComposed() stack match { case h :: tail => loop(h, tail) - case Nil => () + case Nil => () } } @@ -305,8 +314,9 @@ object ComposedFunctions extends Serializable { } case class ComposedMapGroup[A, B, C, D]( - f: (A, Iterator[B]) => Iterator[C], - g: (A, Iterator[C]) => Iterator[D]) extends Function2[A, Iterator[B], Iterator[D]] { + f: (A, Iterator[B]) => Iterator[C], + g: (A, Iterator[C]) => Iterator[D] + ) extends Function2[A, Iterator[B], Iterator[D]] { def apply(a: A, bs: Iterator[B]) = { val cs = f(a, bs) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/SubTypes.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/SubTypes.scala index e1b8a8ad6f..a3dd42471a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/SubTypes.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/functions/SubTypes.scala @@ -1,8 +1,7 @@ package com.twitter.scalding.typed.functions /** - * This is a more powerful version of <:< that can allow - * us to remove casts and also not have any runtime cost + * This is a more powerful version of <:< that can allow us to remove casts and also not have any runtime cost * for our function calls in some cases of trivial functions */ sealed abstract class SubTypes[-A, +B] extends java.io.Serializable { @@ -19,7 +18,9 @@ sealed abstract class SubTypes[-A, +B] extends java.io.Serializable { type G[-T] = SubTypes[F[T], F[B]] subst[G](SubTypes.fromSubType[F[B], F[B]]) } - /** create a new evidence for a contravariant type F[_] + + /** + * create a new evidence for a contravariant type F[_] */ def liftContra[F[-_]]: SubTypes[F[B], F[A]] = { type G[-T] = SubTypes[F[B], F[T]] @@ -56,8 +57,7 @@ object SubTypes extends java.io.Serializable { } def compose[A, B, C](sub0: SubTypes[A, B], sub1: SubTypes[B, C]): SubTypes[A, C] = { - type SubC[-X] = SubTypes[X, C] - sub0.subst[SubC](sub1) + type SubC[-X] = SubTypes[X, C] + sub0.subst[SubC](sub1) } } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/AtomicBox.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/AtomicBox.scala index 2052418d38..80c1a605a0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/AtomicBox.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/AtomicBox.scala @@ -15,8 +15,7 @@ class AtomicBox[T <: AnyRef](init: T) { ref.getAndSet(t) /** - * use a pure function to update the state. - * fn may be called more than once + * use a pure function to update the state. 
fn may be called more than once */ def update[R](fn: T => (T, R)): R = { @@ -33,4 +32,3 @@ class AtomicBox[T <: AnyRef](init: T) { def get(): T = ref.get } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryBackend.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryBackend.scala index 8b8971fa08..a75cc1fc1f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryBackend.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryBackend.scala @@ -4,7 +4,7 @@ import cascading.flow.FlowDef import cascading.pipe.Pipe import com.twitter.scalding.typed._ import com.twitter.scalding.Mode -import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext, Promise } +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} import scala.util.{Failure, Success} trait MemorySource[A] { @@ -19,10 +19,15 @@ object MemorySource { def read()(implicit ec: ConcurrentExecutionContext) = toFn(ec) } - def readOption[T](optSrc: Option[MemorySource[T]], name: String)(implicit ec: ConcurrentExecutionContext): Future[Iterator[T]] = + def readOption[T](optSrc: Option[MemorySource[T]], name: String)(implicit + ec: ConcurrentExecutionContext + ): Future[Iterator[T]] = optSrc match { case Some(src) => src.read() - case None => Future.failed(new Exception(s"Source: $name not wired. Please provide an input with MemoryMode.addSource")) + case None => + Future.failed( + new Exception(s"Source: $name not wired. Please provide an input with MemoryMode.addSource") + ) } } @@ -32,25 +37,23 @@ trait MemorySink[A] { } object MemorySink { + /** - * This is a sink that writes into local memory which you can read out - * by a future + * This is a sink that writes into local memory which you can read out by a future * - * this needs to be reset between each write (so it only works for a single - * write per Execution) + * this needs to be reset between each write (so it only works for a single write per Execution) */ class LocalVar[A] extends MemorySink[A] { private[this] val box: AtomicBox[Promise[Iterable[A]]] = new AtomicBox(Promise[Iterable[A]]()) /** - * This is a future that completes when a write comes. If no write - * happens before a reset, the future fails + * This is a future that completes when a write comes. If no write happens before a reset, the future + * fails */ def read(): Future[Iterable[A]] = box.get().future /** - * This takes the current future and resets the promise - * making it safe for another write. + * This takes the current future and resets the promise making it safe for another write. 
*/ def reset(): Option[Iterable[A]] = { val current = box.swap(Promise[Iterable[A]]()) @@ -62,7 +65,10 @@ object MemorySink { case Some(Success(res)) => Some(res) case Some(Failure(err)) => - throw new IllegalStateException("We should never reach this because, we only complete with failure below", err) + throw new IllegalStateException( + "We should never reach this because, we only complete with failure below", + err + ) case None => // make sure we complete the original future so readers don't block forever current.failure(new Exception(s"sink never written to before reset() called $this")) @@ -72,40 +78,38 @@ object MemorySink { def write(data: Iterable[A])(implicit ec: ConcurrentExecutionContext): Future[Unit] = Future { - box.update { p => (p.success(data), ()) } + box.update(p => (p.success(data), ())) } } } /** - * These are just used as type markers which are connected - * to inputs via the MemoryMode + * These are just used as type markers which are connected to inputs via the MemoryMode */ case class SourceT[T](ident: String) extends TypedSource[T] { + /** * note that ??? in scala is the same as not implemented * - * These methods are not needed for use with the Execution API, and indeed - * don't make sense outside of cascading, but backwards compatibility - * currently requires them on TypedSource. Ideally we will find another solution - * to this in the future + * These methods are not needed for use with the Execution API, and indeed don't make sense outside of + * cascading, but backwards compatibility currently requires them on TypedSource. Ideally we will find + * another solution to this in the future */ def converter[U >: T] = ??? def read(implicit flowDef: FlowDef, mode: Mode): Pipe = ??? } /** - * These are just used as type markers which are connected - * to outputs via the MemoryMode + * These are just used as type markers which are connected to outputs via the MemoryMode */ case class SinkT[T](indent: String) extends TypedSink[T] { + /** * note that ??? in scala is the same as not implemented * - * These methods are not needed for use with the Execution API, and indeed - * don't make sense outside of cascading, but backwards compatibility - * currently requires them on TypedSink. Ideally we will find another solution - * to this in the future + * These methods are not needed for use with the Execution API, and indeed don't make sense outside of + * cascading, but backwards compatibility currently requires them on TypedSink. Ideally we will find another + * solution to this in the future */ def setter[U <: T] = ??? def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = ??? 
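Reviewer note (illustrative only, not part of this patch): the memory-backend pieces reformatted above (MemorySource, MemorySink.LocalVar, and the SourceT/SinkT type markers) are easiest to follow with a small usage sketch. The sketch below assumes the addSourceIterable/addSink registration helpers on MemoryMode mirror the addSource call visible in the next hunk, and the identifiers "nums" and "out" are made up for the example.

import com.twitter.scalding.{Config, Execution}
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.typed.memory_backend.{MemoryMode, MemorySink, SinkT, SourceT}

// Type markers that MemoryMode resolves to in-memory data at plan time.
val src  = SourceT[Int]("nums")
val out  = SinkT[Int]("out")
val sink = new MemorySink.LocalVar[Int]

// Wire the marker source to an Iterable and the marker sink to a LocalVar.
val mode = MemoryMode.empty
  .addSourceIterable(src, List(1, 2, 3))
  .addSink(out, sink)

// Run a trivial pipeline entirely in memory via the Execution API.
val exec: Execution[Unit] = TypedPipe.from(src).map(_ * 2).writeExecution(out)
exec.waitFor(Config.unitTestDefault, mode).get

// sink.read() is a Future that now completes with Iterable(2, 4, 6);
// calling sink.reset() hands back the written data and re-arms the promise.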
diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryMode.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryMode.scala index 56d3c4f3f4..47932fcfba 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryMode.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryMode.scala @@ -1,18 +1,18 @@ package com.twitter.scalding.typed.memory_backend -import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext } -import com.twitter.scalding.{ Execution, Mode } +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} +import com.twitter.scalding.{Execution, Mode} import com.twitter.scalding.typed._ import Execution.Writer -final case class MemoryMode(srcs: Resolver[TypedSource, MemorySource], sinks: Resolver[TypedSink, MemorySink]) extends Mode { +final case class MemoryMode(srcs: Resolver[TypedSource, MemorySource], sinks: Resolver[TypedSink, MemorySink]) + extends Mode { def newWriter(): Writer = new MemoryWriter(this) /** - * Add a new source resolver whose sources take precedence over any currently registered - * sources + * Add a new source resolver whose sources take precedence over any currently registered sources */ def addSourceResolver(res: Resolver[TypedSource, MemorySource]): MemoryMode = MemoryMode(res.orElse(srcs), sinks) @@ -27,8 +27,7 @@ final case class MemoryMode(srcs: Resolver[TypedSource, MemorySource], sinks: Re addSource(src, MemorySource.FromIterable(iter)) /** - * Add a new sink resolver whose sinks take precedence over any currently registered - * sinks + * Add a new sink resolver whose sinks take precedence over any currently registered sinks */ def addSinkResolver(res: Resolver[TypedSink, MemorySink]): MemoryMode = MemoryMode(srcs, res.orElse(sinks)) @@ -39,10 +38,17 @@ final case class MemoryMode(srcs: Resolver[TypedSource, MemorySource], sinks: Re /** * This has a side effect of mutating the corresponding MemorySink */ - def writeSink[T](t: TypedSink[T], iter: Iterable[T])(implicit ec: ConcurrentExecutionContext): Future[Unit] = + def writeSink[T](t: TypedSink[T], iter: Iterable[T])(implicit + ec: ConcurrentExecutionContext + ): Future[Unit] = sinks(t) match { case Some(sink) => sink.write(iter) - case None => Future.failed(new Exception(s"missing sink for $t, with first 10 values to write: ${iter.take(10).toList.toString}...")) + case None => + Future.failed( + new Exception( + s"missing sink for $t, with first 10 values to write: ${iter.take(10).toList.toString}..." 
+ ) + ) } } @@ -50,4 +56,3 @@ object MemoryMode { def empty: MemoryMode = apply(Resolver.empty[TypedSource, MemorySource], Resolver.empty[TypedSink, MemorySink]) } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryPlanner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryPlanner.scala index d3e0315848..bd7401e481 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryPlanner.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryPlanner.scala @@ -1,19 +1,17 @@ package com.twitter.scalding.typed.memory_backend import scala.collection.mutable.ArrayBuffer -import com.stripe.dagon.{ Memoize, FunctionK } +import com.stripe.dagon.{FunctionK, Memoize} import com.twitter.scalding.typed._ import com.twitter.scalding.Config object MemoryPlanner { + /** - * This builds an new memoizing planner - * that reads from the given MemoryMode. + * This builds an new memoizing planner that reads from the given MemoryMode. * - * Note, this assumes all forks are made explicit - * in the graph, so it is up to any caller - * to make sure that optimization rule has first - * been applied + * Note, this assumes all forks are made explicit in the graph, so it is up to any caller to make sure that + * optimization rule has first been applied */ def planner(conf: Config, srcs: Resolver[TypedSource, MemorySource]): FunctionK[TypedPipe, Op] = Memoize.functionK(new Memoize.RecursiveK[TypedPipe, Op] { @@ -42,7 +40,10 @@ object MemoryPlanner { case (fk @ FilterKeys(_, _), rec) => def go[K, V](node: FilterKeys[K, V]): Op[(K, V)] = { val FilterKeys(pipe, fn) = node - rec(pipe).concatMap { case (k, v) => if (fn(k)) { (k, v) :: Nil } else Nil } + rec(pipe).concatMap { case (k, v) => + if (fn(k)) { (k, v) :: Nil } + else Nil + } } go(fk) @@ -89,7 +90,7 @@ object MemoryPlanner { case (SourcePipe(src), _) => val optsrc = srcs(src) - Op.Source({ cec => MemorySource.readOption(optsrc, src.toString)(cec) }) + Op.Source { cec => MemorySource.readOption(optsrc, src.toString)(cec) } case (slk @ SumByLocalKeys(_, _), rec) => def sum[K, V](sblk: SumByLocalKeys[K, V]) = { @@ -101,12 +102,12 @@ object MemoryPlanner { while (iter.hasNext) { val (k, v) = iter.next map(k) = map.get(k) match { - case None => v + case None => v case Some(v1) => sg.plus(v1, v) } } val res = new ArrayBuffer[(K, V)](map.size) - map.foreach { res += _ } + map.foreach(res += _) res } } @@ -131,23 +132,25 @@ object MemoryPlanner { def go[K, V1, V2, R](hcg: HashCoGroup[K, V1, V2, R]) = { val leftOp = rec(hcg.left) val rightOp = rec(ReduceStepPipe(HashJoinable.toReduceStep(hcg.right))) - Op.Join[(K, V1), (K, V2), (K, R)](leftOp, rightOp, { (v1s, v2s) => - val kv2 = v2s.groupBy(_._1) - val result = new ArrayBuffer[(K, R)]() - v1s.foreach { - case (k, v1) => + Op.Join[(K, V1), (K, V2), (K, R)]( + leftOp, + rightOp, + { (v1s, v2s) => + val kv2 = v2s.groupBy(_._1) + val result = new ArrayBuffer[(K, R)]() + v1s.foreach { case (k, v1) => val v2 = kv2.getOrElse(k, Nil).map(_._2) result ++= hcg.joiner(k, v1, v2).map((k, _)) + } + result } - result - }) + ) } go(hcg) case (CoGroupedPipe(cg), rec) => - def go[K, V](cg: CoGrouped[K, V]) = { + def go[K, V](cg: CoGrouped[K, V]) = Op.BulkJoin(cg.inputs.map(rec(_)), cg.joinFunction) - } go(cg) case (ReduceStepPipe(ir @ IdentityReduce(_, _, _, descriptions, _)), rec) => @@ -167,7 +170,7 @@ object MemoryPlanner { case (ReduceStepPipe(IdentityValueSortedReduce(_, pipe, ord, _, _, _)), rec) => 
def go[K, V](p: TypedPipe[(K, V)], ord: Ordering[V]) = { val op = rec(p) - Op.Reduce[K, V, V](op, { (k, vs) => vs }, Some(ord)) + Op.Reduce[K, V, V](op, (k, vs) => vs, Some(ord)) } go(pipe, ord) case (ReduceStepPipe(ValueSortedReduce(_, pipe, ord, fn, _, _)), rec) => @@ -178,4 +181,3 @@ object MemoryPlanner { }) } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryWriter.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryWriter.scala index b51119e700..5b76c4d37d 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryWriter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryWriter.scala @@ -1,42 +1,41 @@ package com.twitter.scalding.typed.memory_backend -import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext, Promise } -import com.stripe.dagon.{ HMap, Rule } +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} +import com.stripe.dagon.{HMap, Rule} import com.twitter.scalding.typed._ -import com.twitter.scalding.{ CancellationHandler, CFuture } -import com.twitter.scalding.{ Config, Execution, ExecutionCounters } -import Execution.{ ToWrite, Writer } +import com.twitter.scalding.{CFuture, CancellationHandler} +import com.twitter.scalding.{Config, Execution, ExecutionCounters} +import Execution.{ToWrite, Writer} + /** - * This is the state of a single outer Execution execution running - * in memory mode + * This is the state of a single outer Execution execution running in memory mode */ class MemoryWriter(mem: MemoryMode) extends Writer { def start(): Unit = () + /** * This is called by an Execution to end processing */ def finished(): Unit = () - private[this] case class State( - id: Long, - forced: HMap[TypedPipe, ({ type F[T] = Future[Iterable[T]] })#F]) { + private[this] case class State(id: Long, forced: HMap[TypedPipe, ({ type F[T] = Future[Iterable[T]] })#F]) { def simplifiedForce[A](t: TypedPipe[A], it: Future[Iterable[A]]): State = copy(forced = forced.updated(t, it)) } - private[this] val state = new AtomicBox[State](State(0, HMap.empty[TypedPipe, ({ type F[T] = Future[Iterable[T]] })#F])) + private[this] val state = + new AtomicBox[State](State(0, HMap.empty[TypedPipe, ({ type F[T] = Future[Iterable[T]] })#F])) /** - * do a batch of writes, possibly optimizing, and return a new unique - * Long. + * do a batch of writes, possibly optimizing, and return a new unique Long. * * empty writes are legitimate and should still return a Long */ - def execute( - conf: Config, - writes: List[ToWrite[_]])(implicit cec: ConcurrentExecutionContext): CFuture[(Long, ExecutionCounters)] = { + def execute(conf: Config, writes: List[ToWrite[_]])(implicit + cec: ConcurrentExecutionContext + ): CFuture[(Long, ExecutionCounters)] = { val planner = MemoryPlanner.planner(conf, mem.srcs) @@ -61,10 +60,9 @@ class MemoryWriter(mem: MemoryMode) extends Writer { } /** - * TODO - * If we have a typed pipe rooted twice, it is not clear it has fanout. If it does not - * we will not materialize it, so both branches can't own it. Since we only emit Iterable - * out, this may be okay because no external readers can modify, but worth thinking of + * TODO If we have a typed pipe rooted twice, it is not clear it has fanout. If it does not we will not + * materialize it, so both branches can't own it. 
Since we only emit Iterable out, this may be okay + * because no external readers can modify, but worth thinking of */ val idActs: (Long, List[Action]) = state.update { s => val (nextState, acts) = optimizedWrites.foldLeft((s, List.empty[Action])) { @@ -111,7 +109,7 @@ class MemoryWriter(mem: MemoryMode) extends Writer { } val (id, acts) = idActs // now we run the actions: - val fut = Future.traverse(acts) { fn => fn() }.map(_ => (id, ExecutionCounters.empty)) + val fut = Future.traverse(acts)(fn => fn()).map(_ => (id, ExecutionCounters.empty)) // wrap the future in a CFuture -- this is uncancellable in memory mode CFuture.uncancellable(fut) } @@ -119,26 +117,28 @@ class MemoryWriter(mem: MemoryMode) extends Writer { /** * This should only be called after a call to execute */ - def getForced[T]( - conf: Config, - initial: TypedPipe[T])(implicit cec: ConcurrentExecutionContext): Future[TypedPipe[T]] = + def getForced[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ConcurrentExecutionContext + ): Future[TypedPipe[T]] = state.get.forced.get(initial) match { - case None => Future.failed(new Exception(s"$initial not forced")) + case None => Future.failed(new Exception(s"$initial not forced")) case Some(f) => f.map(TypedPipe.from(_)) } - private def getSource[A](src: TypedSource[A])(implicit cec: ConcurrentExecutionContext): Future[Iterable[A]] = + private def getSource[A](src: TypedSource[A])(implicit + cec: ConcurrentExecutionContext + ): Future[Iterable[A]] = MemorySource.readOption(mem.srcs(src), src.toString).map(_.toList) /** * This should only be called after a call to execute */ - def getIterable[T]( - conf: Config, - initial: TypedPipe[T])(implicit cec: ConcurrentExecutionContext): Future[Iterable[T]] = initial match { - case TypedPipe.EmptyTypedPipe => Future.successful(Nil) + def getIterable[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ConcurrentExecutionContext + ): Future[Iterable[T]] = initial match { + case TypedPipe.EmptyTypedPipe => Future.successful(Nil) case TypedPipe.IterablePipe(iter) => Future.successful(iter) - case TypedPipe.SourcePipe(src) => getSource(src) - case other => getForced(conf, other).flatMap(getIterable(conf, _)) + case TypedPipe.SourcePipe(src) => getSource(src) + case other => getForced(conf, other).flatMap(getIterable(conf, _)) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/Op.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/Op.scala index 676b0b11db..f6a645508a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/Op.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/memory_backend/Op.scala @@ -1,10 +1,10 @@ package com.twitter.scalding.typed.memory_backend import com.twitter.scalding.typed._ -import java.util.{ ArrayList, Collections } +import java.util.{ArrayList, Collections} import scala.collection.JavaConverters._ -import scala.collection.mutable.{ ArrayBuffer, Map => MMap } -import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext, Promise } +import scala.collection.mutable.{ArrayBuffer, Map => MMap} +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} sealed trait Op[+O] { def result(implicit cec: ConcurrentExecutionContext): Future[ArrayBuffer[_ <: O]] @@ -13,7 +13,7 @@ sealed trait Op[+O] { transform { in: IndexedSeq[O] => val res = ArrayBuffer[O1]() val it = in.iterator - while(it.hasNext) { + while (it.hasNext) { val i = it.next fn(i).foreach(res += 
_) } @@ -51,7 +51,7 @@ object Op { case None => val promise = Promise[ArrayBuffer[_ <: O]]() (Some(promise), Right(promise)) - case s@Some(promise) => + case s @ Some(promise) => (s, Left(promise)) } @@ -85,7 +85,7 @@ object Op { input.result.map { array => val res: ArrayBuffer[O] = array.asInstanceOf[ArrayBuffer[O]] var pos = 0 - while(pos < array.length) { + while (pos < array.length) { res.update(pos, fn(array(pos))) pos = pos + 1 } @@ -99,7 +99,7 @@ object Op { val array = array0.asInstanceOf[ArrayBuffer[I]] var pos = 0 var writePos = 0 - while(pos < array.length) { + while (pos < array.length) { val item = array(pos) if (fn(item)) { array(writePos) = item @@ -127,10 +127,10 @@ object Op { } final case class Reduce[K, V1, V2]( - input: Op[(K, V1)], - fn: (K, Iterator[V1]) => Iterator[V2], - ord: Option[Ordering[V1]] - ) extends Op[(K, V2)] { + input: Op[(K, V1)], + fn: (K, Iterator[V1]) => Iterator[V2], + ord: Option[Ordering[V1]] + ) extends Op[(K, V2)] { def result(implicit cec: ConcurrentExecutionContext): Future[ArrayBuffer[(K, V2)]] = input.result.map { kvs => @@ -149,18 +149,16 @@ object Op { valuesByKey.foreach { case (k, vs) => ord.foreach(Collections.sort[V1](vs, _)) val v2iter = fn(k, vs.iterator.asScala) - while(v2iter.hasNext) { + while (v2iter.hasNext) { res += ((k, v2iter.next)) } } res - } + } } - final case class Join[A, B, C]( - opA: Op[A], - opB: Op[B], - fn: (IndexedSeq[A], IndexedSeq[B]) => ArrayBuffer[C]) extends Op[C] { + final case class Join[A, B, C](opA: Op[A], opB: Op[B], fn: (IndexedSeq[A], IndexedSeq[B]) => ArrayBuffer[C]) + extends Op[C] { def result(implicit cec: ConcurrentExecutionContext) = { // start both futures in parallel @@ -170,9 +168,11 @@ object Op { } } - final case class BulkJoin[K, A](ops: List[Op[(K, Any)]], joinF: MultiJoinFunction[K, A]) extends Op[(K, A)] { + final case class BulkJoin[K, A](ops: List[Op[(K, Any)]], joinF: MultiJoinFunction[K, A]) + extends Op[(K, A)] { def result(implicit cec: ConcurrentExecutionContext) = - Future.traverse(ops)(_.result) + Future + .traverse(ops)(_.result) .map { items => // TODO this is not by any means optimal. // we could copy into arrays then sort by key and iterate diff --git a/scalding-core/src/test/scala/com/twitter/scalding/AlgebraicReductionsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/AlgebraicReductionsTest.scala index f1bf4ba070..6b30c0328c 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/AlgebraicReductionsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/AlgebraicReductionsTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class AlgebraJob(args: Args) extends Job(args) { Tsv("input", ('x, 'y, 'z, 'w)) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ArgHelpTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ArgHelpTest.scala index 23711c7184..9acb3d5272 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ArgHelpTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ArgHelpTest.scala @@ -1,6 +1,6 @@ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} case class ArgHelperTest(testFn: Seq[DescribedArg] => Unit) extends ArgHelper { override def helpRequest(describedArgs: Seq[DescribedArg]): Nothing = { @@ -69,7 +69,8 @@ class ArgHelpTest extends WordSpec with Matchers { it should { "fail when all args are not described" in { val args = List(OptionalArg("name", "Name of person"), OptionalArg("phone", "Person's phone")) - val config = Config.unitTestDefault.setArgs(Args(List("--name", "Bill", "--phone", "111", "--address", "123"))) + val config = + Config.unitTestDefault.setArgs(Args(List("--name", "Bill", "--phone", "111", "--address", "123"))) intercept[DescriptionValidationException] { ArgHelp.validatedDescribe(args, job.unit).waitFor(config, Local(true)).get diff --git a/scalding-core/src/test/scala/com/twitter/scalding/BlockJoinTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/BlockJoinTest.scala index f53c92a84b..16aa6a383a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/BlockJoinTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/BlockJoinTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import cascading.pipe.joiner._ @@ -38,7 +38,7 @@ class InnerProductJob(args: Args) extends Job(args) { .map(('s1, 's2) -> 'score) { v: (Int, Int) => v._1 * v._2 } - .groupBy('x1, 'x2) { _.sum[Double]('score) } + .groupBy('x1, 'x2)(_.sum[Double]('score)) .write(Tsv("output")) } @@ -49,7 +49,9 @@ class BlockJoinPipeTest extends WordSpec with Matchers { val in2 = List(("0", "1", "1"), ("1", "0", "2"), ("2", "4", "5")) val correctOutput = Set((0, 1, 2.0), (0, 0, 1.0), (1, 1, 4.0), (2, 1, 8.0)) - def runJobWithArguments(left: Int = 1, right: Int = 1, joiner: String = "i")(callback: Buffer[(Int, Int, Double)] => Unit): Unit = { + def runJobWithArguments(left: Int = 1, right: Int = 1, joiner: String = "i")( + callback: Buffer[(Int, Int, Double)] => Unit + ): Unit = JobTest(new InnerProductJob(_)) .source(Tsv("input0"), in1) .source(Tsv("input1"), in2) @@ -61,7 +63,6 @@ class BlockJoinPipeTest extends WordSpec with Matchers { } .run .finish() - } "correctly compute product with 1 left block and 1 right block" in { runJobWithArguments() { outBuf => diff --git a/scalding-core/src/test/scala/com/twitter/scalding/CascadeTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/CascadeTest.scala index 828db0c621..3b5106cb8f 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/CascadeTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/CascadeTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import java.io.BufferedWriter import java.io.FileWriter @@ -25,24 +25,26 @@ import cascading.cascade.Cascade import cascading.flow.FlowSkipIfSinkNotStale class Job1(args: Args) extends Job(args) { - Tsv(args("input0"), ('line)).pipe.map[String, String]('line -> 'line)((x: String) => "job1:" + x).write(Tsv(args("output0"), fields = 'line)) + Tsv(args("input0"), 'line).pipe + .map[String, String]('line -> 'line)((x: String) => "job1:" + x) + .write(Tsv(args("output0"), fields = 'line)) } class Job2(args: Args) extends Job(args) { - Tsv(args("output0"), ('line)).pipe.map[String, String]('line -> 'line)((x: String) => "job2" + x).write(Tsv(args("output1"))) + Tsv(args("output0"), 'line).pipe + .map[String, String]('line -> 'line)((x: String) => "job2" + x) + .write(Tsv(args("output1"))) } class CascadeTestJob(args: Args) extends CascadeJob(args) { val jobs = List(new Job1(args), new Job2(args)) - override def preProcessCascade(cascade: Cascade) = { + override def preProcessCascade(cascade: Cascade) = cascade.setFlowSkipStrategy(new FlowSkipIfSinkNotStale()) - } - override def postProcessCascade(cascade: Cascade) = { + override def postProcessCascade(cascade: Cascade) = println(cascade.getCascadeStats()) - } } @@ -52,7 +54,7 @@ class TwoPhaseCascadeTest extends WordSpec with Matchers with FieldConversions { .arg("input0", "input0") .arg("output0", "output0") .arg("output1", "output1") - .source(Tsv("input0", ('line)), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"))) + .source(Tsv("input0", 'line), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"))) .sink[String](Tsv("output1")) { ob 
=> "verify output got changed by both flows" in { ob.toList shouldBe List("job2job1:line1", "job2job1:line2", "job2job1:line3", "job2job1:line4") @@ -76,10 +78,16 @@ class TwoPhaseCascadeTest extends WordSpec with Matchers with FieldConversions { val output1 = File.createTempFile("cascading-job-output1-", "") output1.mkdir() - val args = Array[String]("com.twitter.scalding.CascadeTestJob", "--local", - "--input0", input0.getAbsolutePath, - "--output0", output0.getAbsolutePath, - "--output1", output1.getAbsolutePath) + val args = Array[String]( + "com.twitter.scalding.CascadeTestJob", + "--local", + "--input0", + input0.getAbsolutePath, + "--output0", + output0.getAbsolutePath, + "--output1", + output1.getAbsolutePath + ) Tool.main(args) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/CoGroupTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/CoGroupTest.scala index e158ba0b6f..dfae14822b 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/CoGroupTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/CoGroupTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ WordSpec, Matchers } +import org.scalatest.{Matchers, WordSpec} class StarJoinJob(args: Args) extends Job(args) { val in0 = Tsv("input0").read.mapTo((0, 1) -> ('x0, 'a)) { input: (Int, Int) => input } @@ -23,11 +23,12 @@ class StarJoinJob(args: Args) extends Job(args) { val in2 = Tsv("input2").read.mapTo((0, 1) -> ('x2, 'c)) { input: (Int, Int) => input } val in3 = Tsv("input3").read.mapTo((0, 1) -> ('x3, 'd)) { input: (Int, Int) => input } - in0.coGroupBy('x0) { - _.coGroup('x1, in1, OuterJoinMode) - .coGroup('x2, in2, OuterJoinMode) - .coGroup('x3, in3, OuterJoinMode) - } + in0 + .coGroupBy('x0) { + _.coGroup('x1, in1, OuterJoinMode) + .coGroup('x2, in2, OuterJoinMode) + .coGroup('x3, in3, OuterJoinMode) + } .project('x0, 'a, 'b, 'c, 'd) .write(Tsv("output")) } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ConfigTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ConfigTest.scala index ea0db7005d..cc028623d1 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ConfigTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ConfigTest.scala @@ -12,14 +12,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import com.twitter.scalding.filecache.{ HadoopCachedFile, URIHasher } +import com.twitter.scalding.filecache.{HadoopCachedFile, URIHasher} import java.net.URI import org.apache.hadoop.mapreduce.MRJobConfig -import org.scalatest.{ WordSpec, Matchers } +import org.scalatest.{Matchers, WordSpec} import org.scalacheck.Arbitrary import org.scalacheck.Properties import org.scalacheck.Prop.forAll @@ -30,8 +30,7 @@ class ConfigTest extends WordSpec with Matchers { "A Config" should { "cascadingAppJar works" in { val cls = getClass - Config.default.setCascadingAppJar(cls) - .getCascadingAppJar should contain (Success(cls)) + Config.default.setCascadingAppJar(cls).getCascadingAppJar should contain(Success(cls)) } "default has serialization set" in { val sers = Config.default.get("io.serializations").get.split(",").toList @@ -44,9 +43,9 @@ class ConfigTest extends WordSpec with Matchers { val date = RichDate.now val (oldDate, newConf) = Config.empty.maybeSetSubmittedTimestamp(date) oldDate shouldBe empty - newConf.getSubmittedTimestamp should contain (date) + newConf.getSubmittedTimestamp should contain(date) val (stillOld, new2) = newConf.maybeSetSubmittedTimestamp(date + Seconds(1)) - stillOld should contain (date) + stillOld should contain(date) new2 shouldBe newConf } "adding UniqueIDs works" in { @@ -66,8 +65,7 @@ class ConfigTest extends WordSpec with Matchers { } "Default serialization should have tokens" in { Config.default.getCascadingSerializationTokens should not be empty - Config.default.getCascadingSerializationTokens - .values + Config.default.getCascadingSerializationTokens.values .map(Class.forName) .filter(c => c.isPrimitive || c.isArray) shouldBe empty @@ -81,8 +79,17 @@ class ConfigTest extends WordSpec with Matchers { // the only Kryo classes we don't assign tokens for are the primitives + array (kryoClasses -- tokenClasses).forall { c => // primitives cannot be forName'd - val prim = Set(classOf[Boolean], classOf[Byte], classOf[Short], - classOf[Int], classOf[Long], classOf[Float], classOf[Double], classOf[Char], classOf[Unit]) + val prim = Set( + classOf[Boolean], + classOf[Byte], + classOf[Short], + classOf[Int], + classOf[Long], + classOf[Float], + classOf[Double], + classOf[Char], + classOf[Unit] + ) .map(_.getName) prim(c) || Class.forName(c).isArray @@ -91,8 +98,7 @@ class ConfigTest extends WordSpec with Matchers { "addDistributedCacheFile works" in { val (cachedFile, path) = ConfigTest.makeCachedFileAndPath("test.txt") - Config - .empty + Config.empty .addDistributedCacheFiles(cachedFile) .get(MRJobConfig.CACHE_FILES) shouldBe Some(path) } @@ -100,13 +106,11 @@ class ConfigTest extends WordSpec with Matchers { val (cachedFileFirst, pathFirst) = ConfigTest.makeCachedFileAndPath("first.txt") val (cachedFileSecond, pathSecond) = ConfigTest.makeCachedFileAndPath("second.txt") - Config - .empty + Config.empty .addDistributedCacheFiles(cachedFileFirst, cachedFileSecond) .get(MRJobConfig.CACHE_FILES) shouldBe Some(s"$pathFirst,$pathSecond") - Config - .empty + Config.empty .addDistributedCacheFiles(cachedFileFirst) .addDistributedCacheFiles(cachedFileSecond) .get(MRJobConfig.CACHE_FILES) shouldBe Some(s"$pathFirst,$pathSecond") @@ -142,12 +146,14 @@ object ConfigProps extends Properties("Config") { property("++ == c2.orElse(c1)") = forAll { (c1: Config, c2: Config, keys: Set[String]) => val merged = c1 ++ c2 val testKeys = c1.toMap.keySet | c2.toMap.keySet ++ keys - testKeys.forall { k => merged.get(k) == c2.get(k).orElse(c1.get(k)) } + 
testKeys.forall(k => merged.get(k) == c2.get(k).orElse(c1.get(k))) } property("adding many UniqueIDs works") = forAll { (l: List[String]) => - val uids = l.filterNot { s => s.isEmpty || s.contains(",") }.map(UniqueID(_)) - (uids.foldLeft(Config.empty) { (conf, id) => - conf.addUniqueId(id) - }.getUniqueIds == uids.toSet) + val uids = l.filterNot(s => s.isEmpty || s.contains(",")).map(UniqueID(_)) + (uids + .foldLeft(Config.empty) { (conf, id) => + conf.addUniqueId(id) + } + .getUniqueIds == uids.toSet) } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/CoreTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/CoreTest.scala index 897df9e1c1..0da65fe482 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/CoreTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/CoreTest.scala @@ -12,23 +12,24 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import cascading.tuple.Fields import cascading.tuple.TupleEntry -import com.twitter.algebird.{ Fold, Semigroup } +import com.twitter.algebird.{Fold, Semigroup} import com.twitter.scalding.source.DailySuffixTsv import com.twitter.scalding.typed.TypedPipeGen -import java.lang.{ Integer => JInt } -import org.scalacheck.{ Arbitrary, Gen } +import java.lang.{Integer => JInt} +import org.scalacheck.{Arbitrary, Gen} import org.scalatest.prop.PropertyChecks class NumberJoinerJob(args: Args) extends Job(args) { val in0 = TypedTsv[(Int, Int)]("input0").read.rename((0, 1) -> ('x0, 'y0)) val in1 = Tsv("input1").read.mapTo((0, 1) -> ('x1, 'y1)) { input: (Long, Long) => input } - in0.joinWithSmaller('x0 -> 'x1, in1) + in0 + .joinWithSmaller('x0 -> 'x1, in1) .write(Tsv("output")) } @@ -42,9 +43,9 @@ class NumberJoinTest extends WordSpec with Matchers { .sink[(Int, Int, Long, Long)](Tsv("output")) { outBuf => val unordered = outBuf.toSet unordered should have size 3 - unordered should contain (0, 1, 0L, 1L) - unordered should contain (1, 2, 1L, 3L) - unordered should contain (2, 4, 2L, 9L) + unordered should contain(0, 1, 0L, 1L) + unordered should contain(1, 2, 1L, 3L) + unordered should contain(2, 4, 2L, 9L) } .run .runHadoop @@ -54,17 +55,20 @@ class NumberJoinTest extends WordSpec with Matchers { } class SpillingJob(args: Args) extends Job(args) { - TypedTsv[(Int, Int)]("input").read.rename((0, 1) -> ('n, 'v)) + TypedTsv[(Int, Int)]("input").read + .rename((0, 1) -> ('n, 'v)) .groupBy('n) { group => group.spillThreshold(3).sum[Int]('v).size - }.write(Tsv("output")) + } + .write(Tsv("output")) } class SpillingTest extends WordSpec with Matchers { "A SpillingJob" should { val src = (0 to 9).map(_ -> 1) ++ List(0 -> 4) - val result = src.groupBy(_._1) - .mapValues { v => (v.map(_._2).sum, v.size) } + val result = src + .groupBy(_._1) + .mapValues(v => (v.map(_._2).sum, v.size)) .map { case (a, (b, c)) => (a, b, c) } .toSet @@ -74,7 +78,8 @@ class SpillingTest extends WordSpec with Matchers { .source(TypedTsv[(Int, Int)]("input"), src) .sink[(Int, Int, Int)](Tsv("output")) { outBuf => outBuf.toSet shouldBe result - }.run + } + .run .runHadoop .finish() } @@ -89,9 +94,9 @@ class GroupRandomlyJob(args: Args) extends Job(args) { import GroupRandomlyJob.NumShards Tsv("fakeInput").read - .mapTo(0 -> 'num) { (line: 
String) => line.toInt } - .groupRandomly(NumShards) { _.max('num) } - .groupAll { _.size } + .mapTo(0 -> 'num)((line: String) => line.toInt) + .groupRandomly(NumShards)(_.max('num)) + .groupAll(_.size) .write(Tsv("fakeOutput")) } @@ -99,10 +104,10 @@ class GroupRandomlyJobTest extends WordSpec with Matchers { import GroupRandomlyJob.NumShards "A GroupRandomlyJob" should { - val input = (0 to 10000).map { i => Tuple1(i.toString) } + val input = (0 to 10000).map(i => Tuple1(i.toString)) JobTest(new GroupRandomlyJob(_)) .source(Tsv("fakeInput"), input) - .sink[(Int)](Tsv("fakeOutput")) { outBuf => + .sink[Int](Tsv("fakeOutput")) { outBuf => val numShards = outBuf(0) numShards shouldBe NumShards } @@ -112,11 +117,10 @@ class GroupRandomlyJobTest extends WordSpec with Matchers { } class ShuffleJob(args: Args) extends Job(args) { - Tsv("fakeInput") - .read - .mapTo(0 -> 'num) { (line: String) => line.toInt } + Tsv("fakeInput").read + .mapTo(0 -> 'num)((line: String) => line.toInt) .shuffle(shards = 1, seed = 42L) - .groupAll{ _.toList[Int]('num -> 'num) } + .groupAll(_.toList[Int]('num -> 'num)) .write(Tsv("fakeOutput")) } @@ -124,7 +128,7 @@ class ShuffleJobTest extends WordSpec with Matchers { val expectedShuffle: List[Int] = List(10, 5, 9, 12, 0, 1, 4, 8, 11, 6, 2, 3, 7) "A ShuffleJob" should { - val input = (0 to 12).map { Tuple1(_) } + val input = (0 to 12).map(Tuple1(_)) JobTest(new ShuffleJob(_)) .source(Tsv("fakeInput"), input) .sink[(List[Int])](Tsv("fakeOutput")) { outBuf => @@ -136,14 +140,15 @@ class ShuffleJobTest extends WordSpec with Matchers { } class MapToGroupBySizeSumMaxJob(args: Args) extends Job(args) { - TextLine(args("input")).read. + TextLine(args("input")).read + . //1 is the line mapTo(1 -> ('kx, 'x)) { line: String => val x = line.toDouble ((x > 0.5), x) - }. - groupBy('kx) { _.size.sum[Double]('x -> 'sx).max('x) }. - write(Tsv(args("output"))) + } + .groupBy('kx)(_.size.sum[Double]('x -> 'sx).max('x)) + .write(Tsv(args("output"))) } class MapToGroupBySizeSumMaxTest extends WordSpec with Matchers { @@ -152,13 +157,13 @@ class MapToGroupBySizeSumMaxTest extends WordSpec with Matchers { //Here is our input data: val input = (0 to 100).map { i: Int => (i.toString, r.nextDouble.toString) } //Here is our expected output: - val goldenOutput = input.map { - case (line: String, x: String) => + val goldenOutput = input + .map { case (line: String, x: String) => val xv = x.toDouble; ((xv > 0.5), xv) - }. - groupBy { case (kx: Boolean, x: Double) => kx }. 
- mapValues { vals => + } + .groupBy { case (kx: Boolean, x: Double) => kx } + .mapValues { vals => val vlist = vals.map { case (k: Boolean, x: Double) => x }.toList val size = vlist.size val sum = vlist.sum @@ -171,9 +176,8 @@ class MapToGroupBySizeSumMaxTest extends WordSpec with Matchers { .arg("output", "fakeOutput") .source(TextLine("fakeInput"), input) .sink[(Boolean, Int, Double, Double)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k: Boolean, sz: Int, sm: Double, mx: Double) => - (k, (sz, sm, mx)) + val actualOutput = outBuf.map { case (k: Boolean, sz: Int, sm: Double, mx: Double) => + (k, (sz, sm, mx)) }.toMap "produce correct size, sum, max" in { goldenOutput shouldBe actualOutput @@ -186,15 +190,27 @@ class MapToGroupBySizeSumMaxTest extends WordSpec with Matchers { class PartitionJob(args: Args) extends Job(args) { Tsv("input", new Fields("age", "weight")) - .partition('age -> 'isAdult) { (_: Int) > 18 } { _.average('weight) } + .partition('age -> 'isAdult)((_: Int) > 18)(_.average('weight)) .project('isAdult, 'weight) .write(Tsv("output")) } class PartitionJobTest extends WordSpec with Matchers { "A PartitionJob" should { - val input = List((3, 23), (23, 154), (15, 123), (53, 143), (7, 85), (19, 195), - (42, 187), (35, 165), (68, 121), (13, 103), (17, 173), (2, 13)) + val input = List( + (3, 23), + (23, 154), + (15, 123), + (53, 143), + (7, 85), + (19, 195), + (42, 187), + (35, 165), + (68, 121), + (13, 103), + (17, 173), + (2, 13) + ) val (adults, minors) = input.partition { case (age, _) => age > 18 } val Seq(adultWeights, minorWeights) = Seq(adults, minors).map { list => @@ -202,7 +218,8 @@ class PartitionJobTest extends WordSpec with Matchers { } val expectedOutput = Map( true -> adultWeights.sum / adultWeights.size.toDouble, - false -> minorWeights.sum / minorWeights.size.toDouble) + false -> minorWeights.sum / minorWeights.size.toDouble + ) JobTest(new com.twitter.scalding.PartitionJob(_)) .source(Tsv("input", new Fields("age", "weight")), input) .sink[(Boolean, Double)](Tsv("output")) { outBuf => @@ -216,17 +233,21 @@ class PartitionJobTest extends WordSpec with Matchers { class MRMJob(args: Args) extends Job(args) { val in = Tsv("input").read.mapTo((0, 1) -> ('x, 'y)) { xy: (Int, Int) => xy } // XOR reduction (insane, I guess: - in.groupBy('x) { _.reduce('y) { (left: Int, right: Int) => left ^ right } } + in.groupBy('x)(_.reduce('y)((left: Int, right: Int) => left ^ right)) .write(Tsv("outputXor")) // set val setPipe = in.groupBy('x) { - _.mapReduceMap('y -> 'y) { (input: Int) => Set(input) } { (left: Set[Int], right: Set[Int]) => left ++ right } { (output: Set[Int]) => output.toList } + _.mapReduceMap('y -> 'y)((input: Int) => Set(input)) { (left: Set[Int], right: Set[Int]) => + left ++ right + }((output: Set[Int]) => output.toList) } - setPipe.flatten[Int]('y -> 'y) + setPipe + .flatten[Int]('y -> 'y) .write(Tsv("outputSet")) - setPipe.flattenTo[Int]('y -> 'y) + setPipe + .flattenTo[Int]('y -> 'y) .write(Tsv("outputSetTo")) } @@ -248,7 +269,7 @@ class MRMTest extends WordSpec with Matchers { } .sink[Int](Tsv("outputSetTo")) { outBuf => "use flattenTo" in { - outBuf.toList.sorted shouldBe (input.map { _._2 }.sorted) + outBuf.toList.sorted shouldBe (input.map(_._2).sorted) } } .run @@ -257,11 +278,9 @@ class MRMTest extends WordSpec with Matchers { } class JoinJob(args: Args) extends Job(args) { - val p1 = Tsv(args("input1")) - .read + val p1 = Tsv(args("input1")).read .mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } - val p2 = 
Tsv(args("input2")) - .read + val p2 = Tsv(args("input2")).read .mapTo((0, 1) -> ('k2, 'v2)) { v: (String, Int) => v } p1.joinWithSmaller('k1 -> 'k2, p2) .project('k1, 'v1, 'v2) @@ -281,9 +300,8 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) .sink[(String, Int, Int)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k: String, v1: Int, v2: Int) => - (k, (v1, v2)) + val actualOutput = outBuf.map { case (k: String, v1: Int, v2: Int) => + (k, (v1, v2)) }.toMap "join tuples with the same key" in { correctOutput shouldBe actualOutput @@ -309,13 +327,15 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { assert( TypedPipeChecker.inMemoryToList(leftWithRight).sorted == - TypedPipeChecker.inMemoryToList(rightWithLeft).sorted) + TypedPipeChecker.inMemoryToList(rightWithLeft).sorted + ) } } "correctly work with mapValueStream" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .mapValueStream { _ => Iterator.single(scala.util.Random.nextInt()) @@ -338,7 +358,8 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with mapValues" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .mapValues(identity) @@ -356,7 +377,8 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with mapGroup" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .mapGroup { (_, _) => Iterator.single(scala.util.Random.nextInt()) @@ -388,9 +410,7 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { } val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) - .group - .sumLeft + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sumLeft val right = TypedPipe.from(List("c" -> 3, "b" -> 2)) @@ -406,9 +426,7 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with sum" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) - .group - .sum + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sum val right = TypedPipe.from(List("c" -> 3, "b" -> 2)) @@ -424,11 +442,12 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with foldWithKey" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .foldWithKey { _ => - Fold.foldLeft(Value()) { - case (acc, int) => acc.copy(Some(int)) + Fold.foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) } } @@ -446,10 +465,11 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with foldLeft" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group - .foldLeft(Value()) { - case (acc, int) => acc.copy(Some(int)) + .foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) } val right = @@ -471,7 +491,8 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with mapValueStream" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .mapValueStream { _ => Iterator.single(scala.util.Random.nextInt()) @@ -491,7 +512,8 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with mapValues" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, 
"b" -> 1)) .group .mapValues(identity) @@ -509,7 +531,8 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with mapGroup" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .mapGroup { (_, _) => Iterator.single(scala.util.Random.nextInt()) @@ -538,9 +561,7 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { } val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) - .group - .sumLeft + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sumLeft val right = TypedPipe.from(List("c" -> 3, "b" -> 2)) @@ -556,9 +577,7 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with sum" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) - .group - .sum + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sum val right = TypedPipe.from(List("c" -> 3, "b" -> 2)) @@ -574,11 +593,12 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with foldWithKey" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .foldWithKey { _ => - Fold.foldLeft(Value()) { - case (acc, int) => acc.copy(Some(int)) + Fold.foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) } } @@ -596,10 +616,11 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { "correctly work with foldLeft" in { val left = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group - .foldLeft(Value()) { - case (acc, int) => acc.copy(Some(int)) + .foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) } val right = @@ -624,7 +645,8 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { TypedPipe.from(List("c" -> 3, "b" -> 2)) val right = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .mapValueStream { _ => Iterator.single(scala.util.Random.nextInt()) @@ -644,7 +666,8 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { TypedPipe.from(List("c" -> 3, "b" -> 2)) val right = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .mapValues(identity) @@ -662,7 +685,8 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { TypedPipe.from(List("c" -> 3, "b" -> 2)) val right = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .mapGroup { (_, _) => Iterator.single(scala.util.Random.nextInt()) @@ -691,9 +715,7 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { TypedPipe.from(List("c" -> 3, "b" -> 2)) val right = - TypedPipe.from(List("a" -> 1, "b" -> 1)) - .group - .sumLeft + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sumLeft val res = left.rightJoin(right) @@ -709,9 +731,7 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { TypedPipe.from(List("c" -> 3, "b" -> 2)) val right = - TypedPipe.from(List("a" -> 1, "b" -> 1)) - .group - .sum + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sum val res = left.rightJoin(right) @@ -727,11 +747,12 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { TypedPipe.from(List("c" -> 3, "b" -> 2)) val right = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group .foldWithKey { _ => - Fold.foldLeft(Value()) { - case (acc, int) => acc.copy(Some(int)) + Fold.foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) } } @@ -749,10 +770,11 @@ class 
JoinTest extends WordSpec with Matchers with PropertyChecks { TypedPipe.from(List("c" -> 3, "b" -> 2)) val right = - TypedPipe.from(List("a" -> 1, "b" -> 1)) + TypedPipe + .from(List("a" -> 1, "b" -> 1)) .group - .foldLeft(Value()) { - case (acc, int) => acc.copy(Some(int)) + .foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) } val res = left.rightJoin(right) @@ -767,16 +789,14 @@ class JoinTest extends WordSpec with Matchers with PropertyChecks { } class CollidingKeyJoinJob(args: Args) extends Job(args) { - val p1 = Tsv(args("input1")) - .read + val p1 = Tsv(args("input1")).read .mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } // An an extra fake key to do a join - .map('k1 -> 'k2) { (k: String) => k + k } - val p2 = Tsv(args("input2")) - .read + .map('k1 -> 'k2)((k: String) => k + k) + val p2 = Tsv(args("input2")).read .mapTo((0, 1) -> ('k1, 'v2)) { v: (String, Int) => v } // An an extra fake key to do a join - .map('k1 -> 'k3) { (k: String) => k + k } + .map('k1 -> 'k3)((k: String) => k + k) p1.joinWithSmaller(('k1, 'k2) -> ('k1, 'k3), p2) .write(Tsv(args("output"))) } @@ -794,9 +814,8 @@ class CollidingKeyJoinTest extends WordSpec with Matchers { .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) .sink[(String, Int, String, Int, String)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k: String, v1: Int, k2: String, v2: Int, k3: String) => - (k, (v1, k2, v2, k3)) + val actualOutput = outBuf.map { case (k: String, v1: Int, k2: String, v2: Int, k3: String) => + (k, (v1, k2, v2, k3)) }.toMap "join tuples with the same key" in { correctOutput shouldBe actualOutput @@ -808,11 +827,9 @@ class CollidingKeyJoinTest extends WordSpec with Matchers { } class TinyJoinJob(args: Args) extends Job(args) { - val p1 = Tsv(args("input1")) - .read + val p1 = Tsv(args("input1")).read .mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } - val p2 = Tsv(args("input2")) - .read + val p2 = Tsv(args("input2")).read .mapTo((0, 1) -> ('k2, 'v2)) { v: (String, Int) => v } p1.joinWithTiny('k1 -> 'k2, p2) .project('k1, 'v1, 'v2) @@ -832,9 +849,8 @@ class TinyJoinTest extends WordSpec with Matchers { .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) .sink[(String, Int, Int)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k: String, v1: Int, v2: Int) => - (k, (v1, v2)) + val actualOutput = outBuf.map { case (k: String, v1: Int, v2: Int) => + (k, (v1, v2)) }.toMap (idx + ": join tuples with the same key") in { actualOutput shouldBe correctOutput @@ -848,11 +864,9 @@ class TinyJoinTest extends WordSpec with Matchers { } class TinyCollisionJoinJob(args: Args) extends Job(args) { - val p1 = Tsv(args("input1")) - .read + val p1 = Tsv(args("input1")).read .mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } - val p2 = Tsv(args("input2")) - .read + val p2 = Tsv(args("input2")).read .mapTo((0, 1) -> ('k1, 'v2)) { v: (String, Int) => v } p1.joinWithTiny('k1 -> 'k1, p2) .write(Tsv(args("output"))) @@ -871,9 +885,8 @@ class TinyCollisionJoinTest extends WordSpec with Matchers { .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) .sink[(String, Int, Int)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k: String, v1: Int, v2: Int) => - (k, (v1, v2)) + val actualOutput = outBuf.map { case (k: String, v1: Int, v2: Int) => + (k, (v1, v2)) }.toMap "join tuples with the same key" in { correctOutput shouldBe actualOutput @@ -889,7 +902,8 @@ class TinyThenSmallJoin(args: Args) extends 
Job(args) { val pipe1 = Tsv("in1", ('x1, 'y1)).read val pipe2 = Tsv("in2", ('x2, 'y2)).read - pipe0.joinWithTiny('x0 -> 'x1, pipe1) + pipe0 + .joinWithTiny('x0 -> 'x1, pipe1) .joinWithSmaller('x0 -> 'x2, pipe2) .map(('y0, 'y1, 'y2) -> ('y0, 'y1, 'y2)) { v: (TC, TC, TC) => (v._1.n, v._2.n, v._3.n) @@ -905,8 +919,7 @@ class TinyThenSmallJoinTest extends WordSpec with Matchers with FieldConversions val input0 = List((1, TC(2)), (2, TC(3)), (3, TC(4))) val input1 = List((1, TC(20)), (2, TC(30)), (3, TC(40))) val input2 = List((1, TC(200)), (2, TC(300)), (3, TC(400))) - val correct = List((1, 2, 1, 20, 1, 200), - (2, 3, 2, 30, 2, 300), (3, 4, 3, 40, 3, 400)) + val correct = List((1, 2, 1, 20, 1, 200), (2, 3, 2, 30, 2, 300), (3, 4, 3, 40, 3, 400)) var idx = 0 JobTest(new TinyThenSmallJoin(_)) .source(Tsv("in0", ('x0, 'y0)), input0) @@ -932,7 +945,7 @@ class LeftJoinJob(args: Args) extends Job(args) { p1.leftJoinWithSmaller('k1 -> 'k2, p2) .project('k1, 'v1, 'v2) // Null sent to TSV will not be read in properly - .map('v2 -> 'v2) { v: AnyRef => Option(v).map { _.toString }.getOrElse("NULL") } + .map('v2 -> 'v2) { v: AnyRef => Option(v).map(_.toString).getOrElse("NULL") } .write(Tsv(args("output"))) } @@ -940,8 +953,7 @@ class LeftJoinTest extends WordSpec with Matchers { "A LeftJoinJob" should { val input1 = List("a" -> 1, "b" -> 2, "c" -> 3) val input2 = List("b" -> -1, "c" -> 5, "d" -> 4) - val correctOutput = Map[String, (Int, AnyRef)]("a" -> (1, "NULL"), "b" -> (2, "-1"), - "c" -> (3, "5")) + val correctOutput = Map[String, (Int, AnyRef)]("a" -> (1, "NULL"), "b" -> (2, "-1"), "c" -> (3, "5")) var idx = 0 JobTest(new LeftJoinJob(_)) .arg("input1", "fakeInput1") @@ -975,7 +987,7 @@ class LeftJoinWithLargerJob(args: Args) extends Job(args) { p1.joinWithLarger('k1 -> 'k2, p2, new cascading.pipe.joiner.LeftJoin) .project('k1, 'v1, 'v2) // Null sent to TSV will not be read in properly - .map('v2 -> 'v2) { v: AnyRef => Option(v).map { _.toString }.getOrElse("NULL") } + .map('v2 -> 'v2) { v: AnyRef => Option(v).map(_.toString).getOrElse("NULL") } .write(Tsv(args("output"))) } @@ -983,8 +995,7 @@ class LeftJoinWithLargerTest extends WordSpec with Matchers { "A LeftJoinWithLargerJob" should { val input1 = List("a" -> 1, "b" -> 2, "c" -> 3) val input2 = List("b" -> -1, "c" -> 5, "d" -> 4) - val correctOutput = Map[String, (Int, AnyRef)]("a" -> (1, "NULL"), "b" -> (2, "-1"), - "c" -> (3, "5")) + val correctOutput = Map[String, (Int, AnyRef)]("a" -> (1, "NULL"), "b" -> (2, "-1"), "c" -> (3, "5")) var idx = 0 JobTest(new LeftJoinWithLargerJob(_)) .arg("input1", "fakeInput1") @@ -1011,15 +1022,17 @@ class LeftJoinWithLargerTest extends WordSpec with Matchers { class MergeTestJob(args: Args) extends Job(args) { val in = TextLine(args("in")).read.mapTo(1 -> ('x, 'y)) { line: String => - val p = line.split(" ").map { _.toDouble } + val p = line.split(" ").map(_.toDouble) (p(0), p(1)) } - val big = in.filter('x) { (x: Double) => (x > 0.5) } - val small = in.filter('x) { (x: Double) => (x <= 0.5) } - (big ++ small).groupBy('x) { _.max('y) } + val big = in.filter('x)((x: Double) => (x > 0.5)) + val small = in.filter('x)((x: Double) => (x <= 0.5)) + (big ++ small) + .groupBy('x)(_.max('y)) .write(Tsv(args("out"))) // Self merge should work - (big ++ big).groupBy('x) { _.max('y) } + (big ++ big) + .groupBy('x)(_.max('y)) .write(Tsv("out2")) } @@ -1027,16 +1040,15 @@ class MergeTest extends WordSpec with Matchers { "A MergeTest" should { val r = new java.util.Random //Here is our input data: - val input = (0 to 
100).map { i => (i.toString, r.nextDouble.toString + " " + r.nextDouble.toString) } + val input = (0 to 100).map(i => (i.toString, r.nextDouble.toString + " " + r.nextDouble.toString)) //Here is our expected output: - val parsed = input.map { - case (line: String, x: String) => - val t = x.split(" ").map { _.toDouble } - (t(0), t(1)) + val parsed = input.map { case (line: String, x: String) => + val t = x.split(" ").map(_.toDouble) + (t(0), t(1)) } val big = parsed.filter(_._1 > 0.5) val small = parsed.filter(_._1 <= 0.5) - val golden = (big ++ small).groupBy{ _._1 }.mapValues { itup => (itup.map{ _._2 }.max) } + val golden = (big ++ small).groupBy(_._1).mapValues(itup => (itup.map(_._2).max)) //Now we have the expected input and output: JobTest(new MergeTestJob(_)) .arg("in", "fakeInput") @@ -1049,7 +1061,7 @@ class MergeTest extends WordSpec with Matchers { } .sink[(Double, Double)](Tsv("out2")) { outBuf => "correctly self merge" in { - outBuf.toMap shouldBe (big.groupBy(_._1).mapValues{ iter => iter.map(_._2).max }) + outBuf.toMap shouldBe big.groupBy(_._1).mapValues { iter => iter.map(_._2).max } } } .run @@ -1058,13 +1070,15 @@ class MergeTest extends WordSpec with Matchers { } class SizeAveStdJob(args: Args) extends Job(args) { - TextLine(args("input")).mapTo('x, 'y) { line => - val p = line.split(" ").map { _.toDouble }.slice(0, 2) - (p(0), p(1)) - }.map('x -> 'x) { (x: Double) => (4 * x).toInt } + TextLine(args("input")) + .mapTo('x, 'y) { line => + val p = line.split(" ").map(_.toDouble).slice(0, 2) + (p(0), p(1)) + } + .map('x -> 'x)((x: Double) => (4 * x).toInt) .groupBy('x) { _.sizeAveStdev('y -> ('size, 'yave, 'ystdev)) - //Make sure this doesn't ruin the calculation + //Make sure this doesn't ruin the calculation .sizeAveStdev('y -> ('size2, 'yave2, 'ystdev2)) .average('y) } @@ -1075,12 +1089,11 @@ class SizeAveStdJob(args: Args) extends Job(args) { class SizeAveStdSpec extends WordSpec with Matchers { "A sizeAveStd job" should { val r = new java.util.Random - def powerLawRand = { + def powerLawRand = // Generates a 1/x powerlaw with a max value or 1e40 scala.math.pow(1e40, r.nextDouble) - } //Here is our input data: - val input = (0 to 10000).map { i => (i.toString, r.nextDouble.toString + " " + powerLawRand.toString) } + val input = (0 to 10000).map(i => (i.toString, r.nextDouble.toString + " " + powerLawRand.toString)) val output = input .map { numline => val vec = numline._2.split(" ").map(_.toDouble) @@ -1092,7 +1105,7 @@ class SizeAveStdSpec extends WordSpec with Matchers { val size = all.size.toLong val ave = all.sum / size //Compute the standard deviation: - val vari = all.map { x => (x - ave) * (x - ave) }.sum / (size) + val vari = all.map(x => (x - ave) * (x - ave)).sum / size val stdev = scala.math.sqrt(vari) (size, ave, stdev) } @@ -1121,12 +1134,13 @@ class SizeAveStdSpec extends WordSpec with Matchers { } class DoubleGroupJob(args: Args) extends Job(args) { - TextLine(args("in")).mapTo('x, 'y) { line => - val p = line.split(" ") - (p(0), p(1)) - } - .groupBy('x) { _.size } - .groupBy('size) { _.size('cnt) } + TextLine(args("in")) + .mapTo('x, 'y) { line => + val p = line.split(" ") + (p(0), p(1)) + } + .groupBy('x)(_.size) + .groupBy('size)(_.size('cnt)) .write(Tsv(args("out"))) } @@ -1135,13 +1149,18 @@ class DoubleGroupSpec extends WordSpec with Matchers { JobTest(new DoubleGroupJob(_)) .arg("in", "fakeIn") .arg("out", "fakeOut") - .source(TextLine("fakeIn"), List("0" -> "one 1", - "1" -> "two 1", - "2" -> "two 2", - "3" -> "three 3", - "4" -> "three 4", - "5" 
-> "three 5", - "6" -> "just one")) + .source( + TextLine("fakeIn"), + List( + "0" -> "one 1", + "1" -> "two 1", + "2" -> "two 2", + "3" -> "three 3", + "4" -> "three 4", + "5" -> "three 5", + "6" -> "just one" + ) + ) .sink[(Long, Long)](Tsv("fakeOut")) { outBuf => "correctly build histogram" in { val outM = outBuf.toMap @@ -1156,11 +1175,12 @@ class DoubleGroupSpec extends WordSpec with Matchers { } class GroupUniqueJob(args: Args) extends Job(args) { - TextLine(args("in")).mapTo('x, 'y) { line => - val p = line.split(" ") - (p(0), p(1)) - } - .groupBy('x) { _.size } + TextLine(args("in")) + .mapTo('x, 'y) { line => + val p = line.split(" ") + (p(0), p(1)) + } + .groupBy('x)(_.size) .unique('size) .write(Tsv(args("out"))) } @@ -1170,14 +1190,19 @@ class GroupUniqueSpec extends WordSpec with Matchers { JobTest(new GroupUniqueJob(_)) .arg("in", "fakeIn") .arg("out", "fakeOut") - .source(TextLine("fakeIn"), List("0" -> "one 1", - "1" -> "two 1", - "2" -> "two 2", - "3" -> "three 3", - "4" -> "three 4", - "5" -> "three 5", - "6" -> "just one")) - .sink[(Long)](Tsv("fakeOut")) { outBuf => + .source( + TextLine("fakeIn"), + List( + "0" -> "one 1", + "1" -> "two 1", + "2" -> "two 2", + "3" -> "three 3", + "4" -> "three 4", + "5" -> "three 5", + "6" -> "just one" + ) + ) + .sink[Long](Tsv("fakeOut")) { outBuf => "correctly count unique sizes" in { outBuf.toSet should have size 3 } @@ -1188,11 +1213,12 @@ class GroupUniqueSpec extends WordSpec with Matchers { } class DiscardTestJob(args: Args) extends Job(args) { - TextLine(args("in")).flatMapTo('words) { line => line.split("\\s+") } + TextLine(args("in")) + .flatMapTo('words)(line => line.split("\\s+")) .map('words -> 'wsize) { word: String => word.length } .discard('words) .map('* -> 'correct) { te: TupleEntry => !te.getFields.contains('words) } - .groupAll { _.forall('correct -> 'correct) { x: Boolean => x } } + .groupAll(_.forall('correct -> 'correct) { x: Boolean => x }) .write(Tsv(args("out"))) } @@ -1217,8 +1243,8 @@ class DiscardTest extends WordSpec with Matchers { class HistogramJob(args: Args) extends Job(args) { TextLine(args("in")).read - .groupBy('line) { _.size } - .groupBy('size) { _.size('freq) } + .groupBy('line)(_.size) + .groupBy('size)(_.size('freq)) .write(Tsv(args("out"))) } @@ -1244,9 +1270,9 @@ class HistogramTest extends WordSpec with Matchers { class ForceReducersJob(args: Args) extends Job(args) { TextLine("in").read .rename((0, 1) -> ('num, 'line)) - .flatMap('line -> 'words){ l: String => l.split(" ") } - .groupBy('num){ _.toList[String]('words -> 'wordList).forceToReducers } - .map('wordList -> 'wordList){ w: List[String] => w.mkString(" ") } + .flatMap('line -> 'words) { l: String => l.split(" ") } + .groupBy('num)(_.toList[String]('words -> 'wordList).forceToReducers) + .map('wordList -> 'wordList) { w: List[String] => w.mkString(" ") } .project('num, 'wordList) .write(Tsv("out")) } @@ -1272,16 +1298,16 @@ class ForceReducersTest extends WordSpec with Matchers { class ToListJob(args: Args) extends Job(args) { TextLine(args("in")).read - .flatMap('line -> 'words){ l: String => l.split(" ") } - .groupBy('offset){ _.toList[String]('words -> 'wordList) } - .map('wordList -> 'wordList){ w: List[String] => w.mkString(" ") } + .flatMap('line -> 'words) { l: String => l.split(" ") } + .groupBy('offset)(_.toList[String]('words -> 'wordList)) + .map('wordList -> 'wordList) { w: List[String] => w.mkString(" ") } .project('offset, 'wordList) .write(Tsv(args("out"))) } class NullListJob(args: Args) extends Job(args) { 
TextLine(args("in")).read - .groupBy('offset){ _.toList[String]('line -> 'lineList).spillThreshold(100) } + .groupBy('offset)(_.toList[String]('line -> 'lineList).spillThreshold(100)) .map('lineList -> 'lineList) { ll: List[String] => ll.mkString(" ") } .write(Tsv(args("out"))) } @@ -1329,7 +1355,7 @@ class CrossJob(args: Args) extends Job(args) { val p1 = Tsv(args("in1")).read .mapTo((0, 1) -> ('x, 'y)) { tup: (Int, Int) => tup } val p2 = Tsv(args("in2")).read - .mapTo(0 -> 'z) { (z: Int) => z } + .mapTo(0 -> 'z)((z: Int) => z) p1.crossWithTiny(p2).write(Tsv(args("out"))) } @@ -1341,7 +1367,7 @@ class CrossTest extends WordSpec with Matchers { .arg("in2", "fakeIn2") .arg("out", "fakeOut") .source(Tsv("fakeIn1"), List(("0", "1"), ("1", "2"), ("2", "3"))) - .source(Tsv("fakeIn2"), List("4", "5").map { Tuple1(_) }) + .source(Tsv("fakeIn2"), List("4", "5").map(Tuple1(_))) .sink[(Int, Int, Int)](Tsv("fakeOut")) { outBuf => (idx + ": must look exactly right") in { outBuf should have size 6 @@ -1358,11 +1384,11 @@ class CrossTest extends WordSpec with Matchers { class GroupAllCrossJob(args: Args) extends Job(args) { val p1 = Tsv(args("in1")).read .mapTo((0, 1) -> ('x, 'y)) { tup: (Int, Int) => tup } - .groupAll { _.max('x) } + .groupAll(_.max('x)) .map('x -> 'x) { x: Int => List(x) } val p2 = Tsv(args("in2")).read - .mapTo(0 -> 'z) { (z: Int) => z } + .mapTo(0 -> 'z)((z: Int) => z) p2.crossWithTiny(p1) .map('x -> 'x) { l: List[Int] => l.size } .project('x, 'z) @@ -1377,7 +1403,7 @@ class GroupAllCrossTest extends WordSpec with Matchers { .arg("in2", "fakeIn2") .arg("out", "fakeOut") .source(Tsv("fakeIn1"), List(("0", "1"), ("1", "2"), ("2", "3"))) - .source(Tsv("fakeIn2"), List("4", "5").map { Tuple1(_) }) + .source(Tsv("fakeIn2"), List("4", "5").map(Tuple1(_))) .sink[(Int, Int)](Tsv("fakeOut")) { outBuf => (idx + ": must look exactly right") in { outBuf should have size 2 @@ -1395,7 +1421,7 @@ class SmallCrossJob(args: Args) extends Job(args) { val p1 = Tsv(args("in1")).read .mapTo((0, 1) -> ('x, 'y)) { tup: (Int, Int) => tup } val p2 = Tsv(args("in2")).read - .mapTo(0 -> 'z) { (z: Int) => z } + .mapTo(0 -> 'z)((z: Int) => z) p1.crossWithSmaller(p2).write(Tsv(args("out"))) } @@ -1407,7 +1433,7 @@ class SmallCrossTest extends WordSpec with Matchers { .arg("in2", "fakeIn2") .arg("out", "fakeOut") .source(Tsv("fakeIn1"), List(("0", "1"), ("1", "2"), ("2", "3"))) - .source(Tsv("fakeIn2"), List("4", "5").map { Tuple1(_) }) + .source(Tsv("fakeIn2"), List("4", "5").map(Tuple1(_))) .sink[(Int, Int, Int)](Tsv("fakeOut")) { outBuf => (idx + ": must look exactly right") in { outBuf should have size 6 @@ -1423,9 +1449,9 @@ class SmallCrossTest extends WordSpec with Matchers { class TopKJob(args: Args) extends Job(args) { Tsv(args("in")).read - .mapTo(0 -> 'x) { (tup: Int) => tup } + .mapTo(0 -> 'x)((tup: Int) => tup) //Take the smallest 3 values: - .groupAll { _.sortedTake[Int]('x -> 'x, 3) } + .groupAll(_.sortedTake[Int]('x -> 'x, 3)) .write(Tsv(args("out"))) } @@ -1434,7 +1460,7 @@ class TopKTest extends WordSpec with Matchers { JobTest(new TopKJob(_)) .arg("in", "fakeIn") .arg("out", "fakeOut") - .source(Tsv("fakeIn"), List(3, 24, 1, 4, 5).map { Tuple1(_) }) + .source(Tsv("fakeIn"), List(3, 24, 1, 4, 5).map(Tuple1(_))) .sink[List[Int]](Tsv("fakeOut")) { outBuf => "must look exactly right" in { outBuf should have size 1 @@ -1450,7 +1476,7 @@ class ScanJob(args: Args) extends Job(args) { Tsv("in", ('x, 'y, 'z)) .groupBy('x) { _.sortBy('y) - .scanLeft('y -> 'ys)(0) { (oldV: Int, newV: Int) => oldV + newV } 
+ .scanLeft('y -> 'ys)(0)((oldV: Int, newV: Int) => oldV + newV) } .project('x, 'ys, 'z) .map('z -> 'z) { z: Int => z } //Make sure the null z is converted to an int @@ -1481,7 +1507,7 @@ class TakeJob(args: Args) extends Job(args) { val input = Tsv("in").read .mapTo((0, 1, 2) -> ('x, 'y, 'z)) { tup: (Int, Int, Int) => tup } - input.groupBy('x) { _.take(2) }.write(Tsv("out2")) + input.groupBy('x)(_.take(2)).write(Tsv("out2")) input.groupAll.write(Tsv("outall")) } @@ -1510,7 +1536,7 @@ class DropJob(args: Args) extends Job(args) { val input = Tsv("in").read .mapTo((0, 1, 2) -> ('x, 'y, 'z)) { tup: (Int, Int, Int) => tup } - input.groupBy('x) { _.drop(2) }.write(Tsv("out2")) + input.groupBy('x)(_.drop(2)).write(Tsv("out2")) input.groupAll.write(Tsv("outall")) } @@ -1540,11 +1566,13 @@ class PivotJob(args: Args) extends Job(args) { .write(Tsv("unpivot")) .groupBy('k) { _.pivot(('col, 'val) -> ('w, 'y, 'z)) - }.write(Tsv("pivot")) + } + .write(Tsv("pivot")) .unpivot(('w, 'y, 'z) -> ('col, 'val)) .groupBy('k) { _.pivot(('col, 'val) -> ('w, 'y, 'z, 'default), 2.0) - }.write(Tsv("pivot_with_default")) + } + .write(Tsv("pivot_with_default")) } class PivotTest extends WordSpec with Matchers with FieldConversions { @@ -1555,8 +1583,14 @@ class PivotTest extends WordSpec with Matchers with FieldConversions { .sink[(String, String, String)](Tsv("unpivot")) { outBuf => "unpivot columns correctly" in { outBuf should have size 6 - outBuf.toList.sorted shouldBe (List(("1", "w", "a"), ("1", "y", "b"), ("1", "z", "c"), - ("2", "w", "d"), ("2", "y", "e"), ("2", "z", "f")).sorted) + outBuf.toList.sorted shouldBe (List( + ("1", "w", "a"), + ("1", "y", "b"), + ("1", "z", "c"), + ("2", "w", "d"), + ("2", "y", "e"), + ("2", "z", "f") + ).sorted) } } .sink[(String, String, String, String)](Tsv("pivot")) { outBuf => @@ -1588,7 +1622,8 @@ class IterableSourceJob(args: Args) extends Job(args) { .write(Tsv("tiny")) //Now without fields and using the implicit: Tsv("in", ('x, 'w)) - .joinWithTiny('x -> 0, list).write(Tsv("imp")) + .joinWithTiny('x -> 0, list) + .write(Tsv("imp")) } class IterableSourceTest extends WordSpec with Matchers with FieldConversions { @@ -1622,10 +1657,13 @@ class IterableSourceTest extends WordSpec with Matchers with FieldConversions { } class HeadLastJob(args: Args) extends Job(args) { - Tsv("input", ('x, 'y)).groupBy('x) { - _.sortBy('y) - .head('y -> 'yh).last('y -> 'yl) - }.write(Tsv("output")) + Tsv("input", ('x, 'y)) + .groupBy('x) { + _.sortBy('y) + .head('y -> 'yh) + .last('y -> 'yl) + } + .write(Tsv("output")) } class HeadLastTest extends WordSpec with Matchers { @@ -1645,9 +1683,11 @@ class HeadLastTest extends WordSpec with Matchers { } class HeadLastUnsortedJob(args: Args) extends Job(args) { - Tsv("input", ('x, 'y)).groupBy('x) { - _.head('y -> 'yh).last('y -> 'yl) - }.write(Tsv("output")) + Tsv("input", ('x, 'y)) + .groupBy('x) { + _.head('y -> 'yh).last('y -> 'yl) + } + .write(Tsv("output")) } class HeadLastUnsortedTest extends WordSpec with Matchers { @@ -1667,11 +1707,13 @@ class HeadLastUnsortedTest extends WordSpec with Matchers { } class MkStringToListJob(args: Args) extends Job(args) { - Tsv("input", ('x, 'y)).groupBy('x) { - _.sortBy('y) - .mkString('y -> 'ystring, ",") - .toList[Int]('y -> 'ylist) - }.write(Tsv("output")) + Tsv("input", ('x, 'y)) + .groupBy('x) { + _.sortBy('y) + .mkString('y -> 'ystring, ",") + .toList[Int]('y -> 'ylist) + } + .write(Tsv("output")) } class MkStringToListTest extends WordSpec with Matchers with FieldConversions { @@ -1713,19 +1755,21 @@ 
class InsertJobTest extends WordSpec with Matchers { } class FoldJob(args: Args) extends Job(args) { - import scala.collection.mutable.{ Set => MSet } - Tsv("input", ('x, 'y)).groupBy('x) { - // DON'T USE MUTABLE, IT IS UNCOOL AND DANGEROUS!, but we test, just in case - _.foldLeft('y -> 'yset)(MSet[Int]()){ (ms: MSet[Int], y: Int) => - ms += y - ms + import scala.collection.mutable.{Set => MSet} + Tsv("input", ('x, 'y)) + .groupBy('x) { + // DON'T USE MUTABLE, IT IS UNCOOL AND DANGEROUS!, but we test, just in case + _.foldLeft('y -> 'yset)(MSet[Int]()) { (ms: MSet[Int], y: Int) => + ms += y + ms + } } - }.write(Tsv("output")) + .write(Tsv("output")) } class FoldJobTest extends WordSpec with Matchers { import Dsl._ - import scala.collection.mutable.{ Set => MSet } + import scala.collection.mutable.{Set => MSet} val input = List((1, 30), (1, 10), (1, 20), (2, 0)) "A FoldTestJob" should { @@ -1746,8 +1790,8 @@ class FoldJobTest extends WordSpec with Matchers { case class V(v: Int) class InnerCaseJob(args: Args) extends Job(args) { val res = TypedTsv[Int]("input") - .mapTo(('xx, 'vx)) { x => (x * x, V(x)) } - .groupBy('xx) { _.head('vx) } + .mapTo(('xx, 'vx))(x => (x * x, V(x))) + .groupBy('xx)(_.head('vx)) .map('vx -> 'x) { v: V => v.v } .project('x, 'xx) .write(Tsv("output")) @@ -1770,8 +1814,7 @@ class InnerCaseTest extends WordSpec with Matchers { } class NormalizeJob(args: Args) extends Job(args) { - Tsv("in") - .read + Tsv("in").read .mapTo((0, 1) -> ('x, 'y)) { tup: (Double, Int) => tup } .normalize('x) .project('x, 'y) @@ -1794,12 +1837,10 @@ class NormalizeTest extends WordSpec with Matchers { } class ForceToDiskJob(args: Args) extends Job(args) { - val x = Tsv("in", ('x, 'y)) - .read + val x = Tsv("in", ('x, 'y)).read .filter('x) { x: Int => x > 0 } .rename('x -> 'x1) - Tsv("in", ('x, 'y)) - .read + Tsv("in", ('x, 'y)).read .joinWithTiny('y -> 'y, x.forceToDisk) .project('x, 'x1, 'y) .write(Tsv("out")) @@ -1810,13 +1851,13 @@ class ForceToDiskTest extends WordSpec with Matchers { "A ForceToDiskJob" should { var idx = 0 - val input = (1 to 1000).flatMap { i => List((-1, i), (1, i)) }.toList + val input = (1 to 1000).flatMap(i => List((-1, i), (1, i))).toList JobTest(new ForceToDiskJob(_)) .source(Tsv("in", ('x, 'y)), input) .sink[(Int, Int, Int)](Tsv("out")) { outBuf => (idx + ": run correctly when combined with joinWithTiny") in { outBuf should have size 2000 - val correct = (1 to 1000).flatMap { y => List((1, 1, y), (-1, 1, y)) }.sorted + val correct = (1 to 1000).flatMap(y => List((1, 1, y), (-1, 1, y))).sorted outBuf.toList.sorted shouldBe correct } idx += 1 @@ -1828,10 +1869,9 @@ class ForceToDiskTest extends WordSpec with Matchers { } class ThrowsErrorsJob(args: Args) extends Job(args) { - Tsv("input", ('letter, 'x)) - .read + Tsv("input", ('letter, 'x)).read .addTrap(Tsv("trapped")) - .map(('letter, 'x) -> 'yPrime){ fields: Product => + .map(('letter, 'x) -> 'yPrime) { fields: Product => val x = fields.productElement(1).asInstanceOf[Int] if (x == 1) throw new Exception("Erroneous Ones") else x } @@ -1877,12 +1917,13 @@ object TypedThrowsErrorsJob { class TypedThrowsErrorsJob(args: Args) extends Job(args) { import TypedThrowsErrorsJob._ - TypedPipe.from(input) - .map { trans1(_) } + TypedPipe + .from(input) + .map(trans1(_)) .addTrap(trap1) - .map { tup => if (tup._2 == 1) throw new Exception("Oh no!") else trans2(tup) } + .map(tup => if (tup._2 == 1) throw new Exception("Oh no!") else trans2(tup)) .addTrap(trap2) - .map { tup => if (tup._2 % 2 == 0) throw new Exception("Oh 
no!") else trans3(tup) } + .map(tup => if (tup._2 % 2 == 0) throw new Exception("Oh no!") else trans3(tup)) .write(output) } @@ -1899,11 +1940,12 @@ object TypedThrowsErrorsJob2 { class TypedThrowsErrorsJob2(args: Args) extends Job(args) { import TypedThrowsErrorsJob2._ - TypedPipe.from(input) - .map { trans1(_) } + TypedPipe + .from(input) + .map(trans1(_)) .addTrap(trap) - .map { tup => if (tup._2 == 1) throw new Exception("Oh no!") else trans2(tup) } - .map { tup => if (tup._2 % 2 == 0) throw new Exception("Oh no!") else trans3(tup) } + .map(tup => if (tup._2 == 1) throw new Exception("Oh no!") else trans2(tup)) + .map(tup => if (tup._2 % 2 == 0) throw new Exception("Oh no!") else trans3(tup)) .write(output) } @@ -1960,12 +2002,12 @@ class TypedItsATrapTest extends WordSpec with Matchers { class GroupAllToListTestJob(args: Args) extends Job(args) { TypedTsv[(Long, String, Double)]("input") .mapTo('a, 'b) { case (id, k, v) => (id, Map(k -> v)) } - .groupBy('a) { _.sum[Map[String, Double]]('b) } + .groupBy('a)(_.sum[Map[String, Double]]('b)) .groupAll { _.toList[(Long, Map[String, Double])](('a, 'b) -> 'abList) } - .map('abList -> 'abMap) { - list: List[(Long, Map[String, Double])] => list.toMap + .map('abList -> 'abMap) { list: List[(Long, Map[String, Double])] => + list.toMap } .project('abMap) .map('abMap -> 'abMap) { x: AnyRef => x.toString } @@ -1993,7 +2035,7 @@ class GroupAllToListTest extends WordSpec with Matchers { class ToListGroupAllToListTestJob(args: Args) extends Job(args) { TypedTsv[(Long, String)]("input") .mapTo('b, 'c) { case (k, v) => (k, v) } - .groupBy('c) { _.toList[Long]('b -> 'bList) } + .groupBy('c)(_.toList[Long]('b -> 'bList)) .groupAll { _.toList[(String, List[Long])](('c, 'bList) -> 'cbList) } @@ -2007,7 +2049,10 @@ class ToListGroupAllToListSpec extends WordSpec with Matchers { "A ToListGroupAllToListTestJob" should { JobTest(new ToListGroupAllToListTestJob(_)) - .source(TypedTsv[(Long, String)]("input"), List((1L, "us"), (1L, "gb"), (2L, "jp"), (3L, "jp"), (3L, "gb"))) + .source( + TypedTsv[(Long, String)]("input"), + List((1L, "us"), (1L, "gb"), (2L, "jp"), (3L, "jp"), (3L, "gb")) + ) .sink[String](Tsv("output")) { outBuf => "must properly aggregate stuff in hadoop mode" in { outBuf should have size 1 @@ -2019,7 +2064,10 @@ class ToListGroupAllToListSpec extends WordSpec with Matchers { .finish() JobTest(new ToListGroupAllToListTestJob(_)) - .source(TypedTsv[(Long, String)]("input"), List((1L, "us"), (1L, "gb"), (2L, "jp"), (3L, "jp"), (3L, "gb"))) + .source( + TypedTsv[(Long, String)]("input"), + List((1L, "us"), (1L, "gb"), (2L, "jp"), (3L, "jp"), (3L, "gb")) + ) .sink[List[(String, List[Long])]](Tsv("output")) { outBuf => "must properly aggregate stuff in local model" in { outBuf should have size 1 @@ -2069,11 +2117,11 @@ class HangingTest extends Specification { .finish() } } -*/ + */ class Function2Job(args: Args) extends Job(args) { import FunctionImplicits._ - Tsv("in", ('x, 'y)).mapTo(('x, 'y) -> 'xy) { (x: String, y: String) => x + y }.write(Tsv("output")) + Tsv("in", ('x, 'y)).mapTo(('x, 'y) -> 'xy)((x: String, y: String) => x + y).write(Tsv("output")) } class Function2Test extends WordSpec with Matchers { @@ -2103,18 +2151,23 @@ class SampleWithReplacementTest extends WordSpec with Matchers { import com.twitter.scalding.mathematics.Poisson val p = new Poisson(1.0, 0) - val simulated = (1 to 100).map{ - i => i -> p.nextInt - }.filterNot(_._2 == 0).toSet + val simulated = (1 to 100) + .map { i => + i -> p.nextInt + } + .filterNot(_._2 == 0) + 
.toSet "A SampleWithReplacementJob" should { JobTest(new SampleWithReplacementJob(_)) .source(Tsv("in"), (1 to 100).map(i => i)) .sink[Int](Tsv("output")) { outBuf => "sampleWithReplacement must sample items according to a poisson distribution" in { - outBuf.toList.groupBy(i => i) + outBuf.toList + .groupBy(i => i) .map(p => p._1 -> p._2.size) - .filterNot(_._2 == 0).toSet shouldBe simulated + .filterNot(_._2 == 0) + .toSet shouldBe simulated } } .run @@ -2133,8 +2186,21 @@ class VerifyTypesJob(args: Args) extends Job(args) { class VerifyTypesJobTest extends WordSpec with Matchers { "Verify types operation" should { "put bad records in a trap" in { - val input = List((3, "aaa"), (23, 154), (15, "123"), (53, 143), (7, 85), (19, 195), - (42, 187), (35, 165), (68, 121), (13, "34"), (17, 173), (2, 13), (2, "break")) + val input = List( + (3, "aaa"), + (23, 154), + (15, "123"), + (53, 143), + (7, 85), + (19, 195), + (42, 187), + (35, 165), + (68, 121), + (13, "34"), + (17, 173), + (2, 13), + (2, "break") + ) JobTest(new VerifyTypesJob(_)) .source(Tsv("input", new Fields("age", "weight")), input) @@ -2152,8 +2218,7 @@ class VerifyTypesJobTest extends WordSpec with Matchers { } class SortingJob(args: Args) extends Job(args) { - Tsv("in", ('x, 'y, 'z)) - .read + Tsv("in", ('x, 'y, 'z)).read .groupAll(_.sortBy('y)) .write(Tsv("output")) } @@ -2176,14 +2241,16 @@ class SortingJobTest extends WordSpec with Matchers { class CollectJob(args: Args) extends Job(args) { Tsv("input", new Fields("name", "age")) - .collectTo[(String, Int), String](('name, 'age) -> 'adultFirstNames) { case (name, age) if age > 18 => name.split(" ").head } + .collectTo[(String, Int), String](('name, 'age) -> 'adultFirstNames) { + case (name, age) if age > 18 => name.split(" ").head + } .write(Tsv("output")) } class CollectJobTest extends WordSpec with Matchers { "A CollectJob" should { val input = List(("steve m", 21), ("john f", 89), ("s smith", 12), ("jill q", 55), ("some child", 8)) - val expectedOutput = input.collect{ case (name, age) if age > 18 => name.split(" ").head } + val expectedOutput = input.collect { case (name, age) if age > 18 => name.split(" ").head } JobTest(new CollectJob(_)) .source(Tsv("input", new Fields("name", "age")), input) @@ -2242,7 +2309,7 @@ class CounterJob(args: Args) extends Job(args) { val age_group_older_than_18 = Stat("age_group_older_than_18") val reduce_hit = Stat("reduce_hit") Tsv("input", new Fields("name", "age")) - .filter('age){ age: Int => + .filter('age) { age: Int => foo_bar.incBy(2) true } @@ -2251,11 +2318,10 @@ class CounterJob(args: Args) extends Job(args) { age_group_older_than_18.inc() name.split(" ").head } - .groupAll{ - _.reduce('age -> 'sum_of_ages) { - (acc: Int, age: Int) => - reduce_hit.inc() - acc + age + .groupAll { + _.reduce('age -> 'sum_of_ages) { (acc: Int, age: Int) => + reduce_hit.inc() + acc + age } } .write(Tsv("output")) @@ -2264,22 +2330,19 @@ class CounterJob(args: Args) extends Job(args) { class CounterJobTest extends WordSpec with Matchers { "A CounterJob" should { val input = List(("steve m", 21), ("john f", 89), ("s smith", 12), ("jill q", 55), ("some child", 8)) - val expectedOutput = input.collect{ case (name, age) if age > 18 => age }.sum.toString + val expectedOutput = input.collect { case (name, age) if age > 18 => age }.sum.toString "have the right counter and output values" in { JobTest(new CounterJob(_)) .source(Tsv("input", new Fields("name", "age")), input) - .sink[String](Tsv("output")) { outBuf => outBuf(0) shouldBe expectedOutput } - 
.counter("foo_bar") { _ shouldBe 10 } - .counter("age_group_older_than_18") { _ shouldBe 3 } - .counter("reduce_hit") { _ shouldBe 2 } - .counter("bad_group_bad_counter") { _ shouldBe 0 } + .sink[String](Tsv("output"))(outBuf => outBuf(0) shouldBe expectedOutput) + .counter("foo_bar")(_ shouldBe 10) + .counter("age_group_older_than_18")(_ shouldBe 3) + .counter("reduce_hit")(_ shouldBe 2) + .counter("bad_group_bad_counter")(_ shouldBe 0) // This is redundant but just added here to show both methods for counter tests .counters { - _ shouldBe Map( - "foo_bar" -> 10, - "age_group_older_than_18" -> 3, - "reduce_hit" -> 2) + _ shouldBe Map("foo_bar" -> 10, "age_group_older_than_18" -> 3, "reduce_hit" -> 2) } .run .finish() @@ -2299,7 +2362,11 @@ object DailySuffixTsvJob { class DailySuffixTsvJob(args: Args) extends Job(args) with UtcDateRangeJob { import TDsl._ - DailySuffixTsvJob.source("input0").read.toTypedPipe[(String, Int)]((0, 1)).write(TypedTsv[(String, Int)]("output0")) + DailySuffixTsvJob + .source("input0") + .read + .toTypedPipe[(String, Int)]((0, 1)) + .write(TypedTsv[(String, Int)]("output0")) } class DailySuffixTsvTest extends WordSpec with Matchers { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/CumulativeSumTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/CumulativeSumTest.scala index a3648d5291..f8158e785a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/CumulativeSumTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/CumulativeSumTest.scala @@ -5,29 +5,27 @@ import org.scalatest.WordSpec import com.twitter.scalding.typed.CumulativeSum._ class AddRankingWithCumulativeSum(args: Args) extends Job(args) { - TypedPipe.from(TypedTsv[(String, Double)]("input1")) - .map { - case (gender, height) => - (gender, (height, 1L)) + TypedPipe + .from(TypedTsv[(String, Double)]("input1")) + .map { case (gender, height) => + (gender, (height, 1L)) } .cumulativeSum - .map { - case (gender, (height, rank)) => - (gender, height, rank) + .map { case (gender, (height, rank)) => + (gender, height, rank) } .write(TypedTsv("result1")) } class AddRankingWithPartitionedCumulativeSum(args: Args) extends Job(args) { - TypedPipe.from(TypedTsv[(String, Double)]("input1")) - .map { - case (gender, height) => - (gender, (height, 1L)) + TypedPipe + .from(TypedTsv[(String, Double)]("input1")) + .map { case (gender, height) => + (gender, (height, 1L)) } - .cumulativeSum { h => (h / 100).floor.toLong } - .map { - case (gender, (height, rank)) => - (gender, height, rank) + .cumulativeSum(h => (h / 100).floor.toLong) + .map { case (gender, (height, rank)) => + (gender, height, rank) } .write(TypedTsv("result1")) } @@ -45,7 +43,8 @@ class CumulativeSumTest1 extends WordSpec { ("female", "272.2"), ("male", "284.1"), ("male", "225.4"), - ("female", "228.6")) + ("female", "228.6") + ) // Each group sorted and ranking added highest person to shortest val expectedOutput1 = Set( @@ -58,7 +57,8 @@ class CumulativeSumTest1 extends WordSpec { ("male", 265.2, 5), ("male", 225.4, 4), ("female", 272.2, 4), - ("female", 228.6, 3)) + ("female", 228.6, 3) + ) "A simple ranking cumulative sum job" should { JobTest("com.twitter.scalding.AddRankingWithCumulativeSum") diff --git a/scalding-core/src/test/scala/com/twitter/scalding/DistinctByTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/DistinctByTest.scala index be03eadcd7..71e4bb4084 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/DistinctByTest.scala +++ 
b/scalding-core/src/test/scala/com/twitter/scalding/DistinctByTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import com.twitter.scalding.typed.CoGrouped.distinctBy @@ -37,8 +37,9 @@ object DistinctByProps extends Properties("CoGrouped.DistinctBy") { val fn = { (i: Int) => idx += 1; idx } distinctBy(l)(fn) == l } - property("distinctBy works like groupBy(fn).map(_._2.head).toSet") = forAll { (l: List[Int], fn: Int => Byte) => - distinctBy(l)(fn).toSet == l.groupBy(fn).map(_._2.head).toSet + property("distinctBy works like groupBy(fn).map(_._2.head).toSet") = forAll { + (l: List[Int], fn: Int => Byte) => + distinctBy(l)(fn).toSet == l.groupBy(fn).map(_._2.head).toSet } property("distinctBy matches a mutable implementation") = forAll { (l: List[Int], fn: Int => Byte) => val dlist = distinctBy(l)(fn) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionAppProperties.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionAppProperties.scala index 2a7524b315..e61ec12c0a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionAppProperties.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionAppProperties.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import org.scalacheck.Properties @@ -21,7 +21,11 @@ import org.scalacheck.Prop._ // Be careful here in that Array[String] equality isn't contents based. its java referenced based. object ExecutionAppProperties extends Properties("ExecutionApp Properties") { - def debugPrint(inputArgs: Array[String], resultingHadoop: HadoopArgs, resultingNonHadoop: NonHadoopArgs): Unit = { + def debugPrint( + inputArgs: Array[String], + resultingHadoop: HadoopArgs, + resultingNonHadoop: NonHadoopArgs + ): Unit = { val errorMsg = "Input Args: " + inputArgs.map("\"" + _ + "\"").mkString(",") + "\n" + "Hadoop Args: " + resultingHadoop.toArray.mkString(",") + "\n" + "Non-Hadoop Args: " + resultingNonHadoop.toArray.mkString(",") + "\n" @@ -35,27 +39,29 @@ object ExecutionAppProperties extends Properties("ExecutionApp Properties") { res } - property("adding an hadoop lib jars in the middle will extract it right") = forAll { (leftArgs: Array[String], rightArgs: Array[String]) => - // in the process of validating the hadoop args we give this to generic options parser - // as a result this file must exist. the parser enforces this. - val inputHadoopArgs = Array("-libjars", "/etc/hosts") - val totalArgStr = leftArgs ++ inputHadoopArgs ++ rightArgs - val (hadoopArgs, nonHadoop) = ExecutionApp.extractUserHadoopArgs(totalArgStr) - val res = (!hadoopArgs.toArray.isEmpty) && - (nonHadoop.toArray.sameElements(leftArgs ++ rightArgs)) && - (inputHadoopArgs.sameElements(hadoopArgs.toArray)) - if (!res) debugPrint(totalArgStr, hadoopArgs, nonHadoop) - res + property("adding an hadoop lib jars in the middle will extract it right") = forAll { + (leftArgs: Array[String], rightArgs: Array[String]) => + // in the process of validating the hadoop args we give this to generic options parser + // as a result this file must exist. 
the parser enforces this. + val inputHadoopArgs = Array("-libjars", "/etc/hosts") + val totalArgStr = leftArgs ++ inputHadoopArgs ++ rightArgs + val (hadoopArgs, nonHadoop) = ExecutionApp.extractUserHadoopArgs(totalArgStr) + val res = (!hadoopArgs.toArray.isEmpty) && + (nonHadoop.toArray.sameElements(leftArgs ++ rightArgs)) && + (inputHadoopArgs.sameElements(hadoopArgs.toArray)) + if (!res) debugPrint(totalArgStr, hadoopArgs, nonHadoop) + res } - property("adding an hadoop -D parameter in the middle will extract it right") = forAll { (leftArgs: Array[String], rightArgs: Array[String]) => - val inputHadoopArgs = Array("-Dx.y.z=123") - val totalArgStr = leftArgs ++ inputHadoopArgs ++ rightArgs - val (hadoopArgs, nonHadoop) = ExecutionApp.extractUserHadoopArgs(totalArgStr) - val res = (!hadoopArgs.toArray.isEmpty) && - (nonHadoop.toArray.sameElements(leftArgs ++ rightArgs)) && - (inputHadoopArgs.sameElements(hadoopArgs.toArray)) - if (!res) debugPrint(totalArgStr, hadoopArgs, nonHadoop) - res + property("adding an hadoop -D parameter in the middle will extract it right") = forAll { + (leftArgs: Array[String], rightArgs: Array[String]) => + val inputHadoopArgs = Array("-Dx.y.z=123") + val totalArgStr = leftArgs ++ inputHadoopArgs ++ rightArgs + val (hadoopArgs, nonHadoop) = ExecutionApp.extractUserHadoopArgs(totalArgStr) + val res = (!hadoopArgs.toArray.isEmpty) && + (nonHadoop.toArray.sameElements(leftArgs ++ rightArgs)) && + (inputHadoopArgs.sameElements(hadoopArgs.toArray)) + if (!res) debugPrint(totalArgStr, hadoopArgs, nonHadoop) + res } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionOptimizationRulesTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionOptimizationRulesTest.scala index 9fcda42a91..079b0b734b 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionOptimizationRulesTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionOptimizationRulesTest.scala @@ -17,7 +17,9 @@ import scala.collection.JavaConverters._ import scala.collection.mutable.Buffer class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { - class MemorySource[T: TupleConverter](inFields: Fields = Fields.NONE) extends Mappable[T] with TypedSink[T] { + class MemorySource[T: TupleConverter](inFields: Fields = Fields.NONE) + extends Mappable[T] + with TypedSink[T] { private[this] val buf = Buffer[Tuple]() private[this] val name: String = UUID.randomUUID.toString @@ -33,13 +35,13 @@ class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { case _ => sys.error("MemorySink only usable with cascading local") } - def fields = { + def fields = if (inFields.isNone && setter.arity > 0) { Dsl.intFields(0 until setter.arity) } else inFields - } - override def converter[U >: T]: TupleConverter[U] = TupleConverter.asSuperConverter[T, U](implicitly[TupleConverter[T]]) + override def converter[U >: T]: TupleConverter[U] = + TupleConverter.asSuperConverter[T, U](implicitly[TupleConverter[T]]) private lazy val hdfsTap: Tap[_, _, _] = new MemorySourceTap(buf.asJava, fields) @@ -48,11 +50,11 @@ class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { sys.error("IterableSource is a Read-only Source") } mode match { - case Local(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), buf) - case Test(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), buf) - case Hdfs(_, _) => hdfsTap + case Local(_) => new MemoryTap[InputStream, OutputStream](new 
NullScheme(fields, fields), buf) + case Test(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), buf) + case Hdfs(_, _) => hdfsTap case HadoopTest(_, _) => hdfsTap - case _ => throw ModeException("Unsupported mode for IterableSource: " + mode.toString) + case _ => throw ModeException("Unsupported mode for IterableSource: " + mode.toString) } } } @@ -68,26 +70,25 @@ class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { override def apply(p: TypedPipe[Int]): TypedPipe[Int] = p.map(_ + i) } - def mapped(exec: Gen[Execution[TypedPipe[Int]]]): Gen[Execution[TypedPipe[Int]]] = { + def mapped(exec: Gen[Execution[TypedPipe[Int]]]): Gen[Execution[TypedPipe[Int]]] = exec.flatMap { pipe => Gen.frequency( (1, Execution.Mapped(pipe, PlusOne())), (5, Arbitrary.arbitrary[Int].map(i => Execution.Mapped(pipe, PlusI(i)))) ) } - } - case class ReplaceTo[T](to: Execution[TypedPipe[Int]]) extends (TypedPipe[Int] => Execution[TypedPipe[Int]]) { + case class ReplaceTo[T](to: Execution[TypedPipe[Int]]) + extends (TypedPipe[Int] => Execution[TypedPipe[Int]]) { override def apply(v1: TypedPipe[Int]): Execution[TypedPipe[Int]] = to } - def flatMapped(exec: Gen[Execution[TypedPipe[Int]]]): Gen[Execution[TypedPipe[Int]]] = { + def flatMapped(exec: Gen[Execution[TypedPipe[Int]]]): Gen[Execution[TypedPipe[Int]]] = exec.flatMap { from => exec.map { to => from.flatMap(ReplaceTo(to)) } } - } def zipped[A, B](left: Gen[Execution[A]], right: Gen[Execution[B]]): Gen[Execution[(A, B)]] = for { @@ -128,21 +129,26 @@ class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { ) val iterableExec = - Gen.oneOf( - zippedWrites, - zippedFlatMapped, - zippedMapped, - zipped(mappedOrFlatMapped, write(TypedPipeGen.genWithIterableSources)), - zipped(write(TypedPipeGen.genWithIterableSources), mappedOrFlatMapped) - ).map { exec => - exec flatMap { - case (left, right) => left.toIterableExecution.zip(right.toIterableExecution) - } map { - case (left, right) => left ++ right - } map { - _.toList.sorted + Gen + .oneOf( + zippedWrites, + zippedFlatMapped, + zippedMapped, + zipped(mappedOrFlatMapped, write(TypedPipeGen.genWithIterableSources)), + zipped(write(TypedPipeGen.genWithIterableSources), mappedOrFlatMapped) + ) + .map { exec => + exec + .flatMap { case (left, right) => + left.toIterableExecution.zip(right.toIterableExecution) + } + .map { case (left, right) => + left ++ right + } + .map { + _.toList.sorted + } } - } import ExecutionOptimizationRules._ @@ -170,7 +176,7 @@ class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { val (dag, _) = Dag(ex, ExecutionOptimizationRules.toLiteral) dag.allNodes.count { case Execution.WriteExecution(_, _, _) => true - case _ => false + case _ => false } } @@ -181,7 +187,8 @@ class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { } test("optimization rules are reproducible") { - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 500) + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) forAll(genExec, genRule) { (exec, rule) => val optimized = ExecutionOptimizationRules.apply(exec, rule) @@ -191,7 +198,8 @@ class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { } test("standard rules are reproducible") { - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 500) + implicit val generatorDrivenConfig: 
PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) forAll(genExec) { exec => val optimized = ExecutionOptimizationRules.stdOptimizations(exec) @@ -229,7 +237,8 @@ class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { val pipe = TypedPipe.from(List(1, 2, 3)) val sink = new MemorySource[Int]() - val job0 = pipe.writeExecution(sink) + val job0 = pipe + .writeExecution(sink) .zip(Execution.from("hello")) .zip(pipe.writeExecution(sink)) @@ -237,13 +246,15 @@ class ExecutionOptimizationRulesTest extends FunSuite with PropertyChecks { assert(writeCount(ExecutionOptimizationRules.stdOptimizations(job0)) == 1) - val job1 = pipe.writeExecution(sink) + val job1 = pipe + .writeExecution(sink) .zip(Execution.from("hello").zip(pipe.writeExecution(sink))) assert(writeCount(job1) == 2) assert(writeCount(ExecutionOptimizationRules.stdOptimizations(job1)) == 1) - val job2 = pipe.writeExecution(sink) + val job2 = pipe + .writeExecution(sink) .zip(Execution.from("world")) .zip(Execution.from("hello").zip(pipe.writeExecution(sink))) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionTest.scala index cd32e127f2..35abe77405 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.MacroEqualityOrderedSerialization @@ -21,18 +21,19 @@ import java.nio.file.Files import java.io.File import java.util import java.util.concurrent.CountDownLatch -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.collection.JavaConverters._ -import scala.concurrent.{ Future, Promise, ExecutionContext => ConcurrentExecutionContext } -import scala.util.{ Failure, Success, Try } -import cascading.flow.{ Flow, FlowDef, FlowListener } +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} +import scala.util.{Failure, Success, Try} +import cascading.flow.{Flow, FlowDef, FlowListener} import com.twitter.scalding.typed.cascading_backend.AsyncFlowDefRunner.TempFileCleanup import com.twitter.scalding.cascading_interop.FlowListenerPromise.FlowStopException import org.apache.hadoop.conf.Configuration object ExecutionTestJobs { def wordCount(in: String, out: String) = - TypedPipe.from(TextLine(in)) + TypedPipe + .from(TextLine(in)) .flatMap(_.split("\\s+")) .map((_, 1L)) .sumByKey @@ -57,10 +58,10 @@ object ExecutionTestJobs { } def writeExecutionWithTempFile(tempFile: String, testData: List[String]): Execution[List[String]] = { - val forced = TypedPipe.from(testData).map(s => s) - .forceToDiskExecution + val forced = TypedPipe.from(testData).map(s => s).forceToDiskExecution - Execution.withConfig(forced) { conf => conf + ("hadoop.tmp.dir" -> tempFile) } + Execution + .withConfig(forced)(conf => conf + ("hadoop.tmp.dir" -> tempFile)) .flatMap(_.toIterableExecution) .map(_.toList) } @@ -79,11 +80,18 @@ class WordCountEc(args: Args) extends TestExecutionJob[Unit](args) { def execution = ExecutionTestJobs.wordCount(args("input"), args("output")) } 
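// A minimal sketch of the Execution composition pattern these tests exercise, assuming
// only the API already used in this file (TypedPipe.from, sumByKey, toIterableExecution,
// Execution.withConfig, zip, waitFor). The object name and the "hadoop.tmp.dir" value
// below are illustrative placeholders.
object ExecutionCompositionSketch {
  import com.twitter.scalding._

  // Word count expressed as an Execution that materializes its result in memory.
  val counts: Execution[Map[String, Long]] =
    TypedPipe
      .from(List("a b b c c c", "d d d d"))
      .flatMap(_.split("\\s+"))
      .map((_, 1L))
      .sumByKey
      .toIterableExecution
      .map(_.toMap)

  // withConfig overrides the Config only for the wrapped Execution; zip pairs the results
  // of two Executions submitted in the same run.
  val both: Execution[(Map[String, Long], Map[String, Long])] =
    Execution
      .withConfig(counts)(conf => conf + ("hadoop.tmp.dir" -> "/tmp/scalding-sketch"))
      .zip(counts)

  // waitFor blocks and returns a scala.util.Try holding the zipped result.
  val result: scala.util.Try[(Map[String, Long], Map[String, Long])] =
    both.waitFor(Config.default, Local(true))
}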
-class ExecutionWithTempFiles(args: Args, tempFile: String, testData: List[String]) extends TestExecutionJob[List[String]](args) { +class ExecutionWithTempFiles(args: Args, tempFile: String, testData: List[String]) + extends TestExecutionJob[List[String]](args) { override def execution = ExecutionTestJobs.writeExecutionWithTempFile(tempFile, testData) } -class ZippedExecutionWithTempFiles(args: Args, tempFileOne: String, tempFileTwo: String, testDataOne: List[String], testDataTwo: List[String]) extends TestExecutionJob[(List[String], List[String])](args) { +class ZippedExecutionWithTempFiles( + args: Args, + tempFileOne: String, + tempFileTwo: String, + testDataOne: List[String], + testDataTwo: List[String] +) extends TestExecutionJob[(List[String], List[String])](args) { override def execution = { val executionOne = ExecutionTestJobs.writeExecutionWithTempFile(tempFileOne, testDataOne) val executionTwo = ExecutionTestJobs.writeExecutionWithTempFile(tempFileTwo, testDataTwo) @@ -94,16 +102,16 @@ class ZippedExecutionWithTempFiles(args: Args, tempFileOne: String, tempFileTwo: case class MyCustomType(s: String) class NormalJobToExecutionTestJob(args: Args) extends Job(args) { - TypedPipe.from(0 to 100) + TypedPipe + .from(0 to 100) .groupBy(_ % 3) .sum .write(source.NullSink) } class FlowListenerWithException extends FlowListener { - override def onStarting(flow: Flow[_]): Unit = { + override def onStarting(flow: Flow[_]): Unit = throw new RuntimeException("something goes wrong") - } override def onCompleted(flow: Flow[_]): Unit = {} @@ -153,21 +161,27 @@ class ExecutionTest extends WordSpec with Matchers { "An Execution" should { "run" in { - ExecutionTestJobs.wordCount2(TypedPipe.from(List("a b b c c c", "d d d d"))) - .waitFor(Config.default, Local(false)).get.toMap shouldBe Map("a" -> 1L, "b" -> 2L, "c" -> 3L, "d" -> 4L) + ExecutionTestJobs + .wordCount2(TypedPipe.from(List("a b b c c c", "d d d d"))) + .waitFor(Config.default, Local(false)) + .get + .toMap shouldBe Map("a" -> 1L, "b" -> 2L, "c" -> 3L, "d" -> 4L) } "run with zip" in { - (ExecutionTestJobs.zipped(TypedPipe.from(0 until 100), TypedPipe.from(100 until 200)) + (ExecutionTestJobs + .zipped(TypedPipe.from(0 until 100), TypedPipe.from(100 until 200)) .shouldSucceed() match { - case (it1, it2) => (it1.head, it2.head) - }) shouldBe ((0 until 100).sum, (100 until 200).sum) + case (it1, it2) => (it1.head, it2.head) + }) shouldBe ((0 until 100).sum, (100 until 200).sum) } "run with exception in flow listener" in { val exec = ExecutionTestJobs.wordCount2(TypedPipe.from(List("a", "b"))) - Execution.withConfig(exec) { config => - config.addFlowListener((_, _) => new FlowListenerWithException()) - }.shouldFailWith("Flow was stopped") + Execution + .withConfig(exec) { config => + config.addFlowListener((_, _) => new FlowListenerWithException()) + } + .shouldFailWith("Flow was stopped") } "lift to try" in { val res = ExecutionTestJobs @@ -201,15 +215,18 @@ class ExecutionTest extends WordSpec with Matchers { } "If either fails, zip fails, else we get success" in { val neverHappens = Promise[Int]().future - Execution.fromFuture { _ => neverHappens } + Execution + .fromFuture(_ => neverHappens) .zip(Execution.failed(new Exception("oh no"))) .shouldFail() - Execution.failed(new Exception("oh no")) - .zip(Execution.fromFuture { _ => neverHappens }) + Execution + .failed(new Exception("oh no")) + .zip(Execution.fromFuture(_ => neverHappens)) .shouldFail() // If both are good, we succeed: - Execution.from(1) + Execution + .from(1) 
.zip(Execution.from("1")) .shouldSucceed() shouldBe (1, "1") } @@ -217,7 +234,8 @@ class ExecutionTest extends WordSpec with Matchers { "If one write fails, the other gets cancelled" in { @volatile var cancelledEx: Option[Throwable] = None - val failedTp: TypedPipe[Int] = TypedPipe.from(Seq(0)).groupAll.sum.values.map { _ => throw new Exception("oh no") } + val failedTp: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) val failedEx: Execution[Iterable[Int]] = failedTp.toIterableExecution val mapCountDownLatch = new CountDownLatch(1) @@ -253,7 +271,8 @@ class ExecutionTest extends WordSpec with Matchers { // do the same on the other side @volatile var cancelledEx2: Option[Throwable] = None - val failedTp2: TypedPipe[Int] = TypedPipe.from(Seq(0)).groupAll.sum.values.map { _ => throw new Exception("oh no") } + val failedTp2: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) val failedEx2: Execution[Iterable[Int]] = failedTp2.toIterableExecution val mapCountDownLatch2 = new CountDownLatch(1) @@ -290,7 +309,8 @@ class ExecutionTest extends WordSpec with Matchers { "If one write fails, the flatmapped execution gets cancelled" in { @volatile var cancelledEx: Option[Throwable] = None - val failedTp: TypedPipe[Int] = TypedPipe.from(Seq(0)).groupAll.sum.values.map { _ => throw new Exception("oh no") } + val failedTp: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) val failedEx: Execution[Iterable[Int]] = failedTp.toIterableExecution val mapCountDownLatch = new CountDownLatch(1) @@ -298,19 +318,27 @@ class ExecutionTest extends WordSpec with Matchers { val otherTp: TypedPipe[Int] = TypedPipe.from(Seq(1)).groupAll.sum.values val onCompleteCountDownLatch = new CountDownLatch(1) - val otherEx: Execution[Iterable[Int]] = otherTp.toIterableExecution.flatMap { _ => - TypedPipe.from(Seq(2)).groupAll.sum.values.map { i => - // block until we are done - mapCountDownLatch.await() - i - }.toIterableExecution - }.onComplete { t => - if (t.isFailure) { - // capture the exception - cancelledEx = t.failed.toOption + val otherEx: Execution[Iterable[Int]] = otherTp.toIterableExecution + .flatMap { _ => + TypedPipe + .from(Seq(2)) + .groupAll + .sum + .values + .map { i => + // block until we are done + mapCountDownLatch.await() + i + } + .toIterableExecution + } + .onComplete { t => + if (t.isFailure) { + // capture the exception + cancelledEx = t.failed.toOption + } + onCompleteCountDownLatch.countDown() } - onCompleteCountDownLatch.countDown() - } val zipped = failedEx.zip(otherEx) @@ -328,7 +356,8 @@ class ExecutionTest extends WordSpec with Matchers { // do the same on the other side @volatile var cancelledEx2: Option[Throwable] = None - val failedTp2: TypedPipe[Int] = TypedPipe.from(Seq(0)).groupAll.sum.values.map { _ => throw new Exception("oh no") } + val failedTp2: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) val failedEx2: Execution[Iterable[Int]] = failedTp2.toIterableExecution val mapCountDownLatch2 = new CountDownLatch(1) @@ -336,19 +365,27 @@ class ExecutionTest extends WordSpec with Matchers { val otherTp2: TypedPipe[Int] = TypedPipe.from(Seq(1)).groupAll.sum.values val onCompleteCountDownLatch2 = new CountDownLatch(1) - val otherEx2: Execution[Iterable[Int]] = otherTp2.toIterableExecution.flatMap { _ => - TypedPipe.from(Seq(2)).groupAll.sum.values.map { i => - // block until we are done - 
mapCountDownLatch2.await() - i - }.toIterableExecution - }.onComplete { t => - if (t.isFailure) { - // capture the exception - cancelledEx2 = t.failed.toOption + val otherEx2: Execution[Iterable[Int]] = otherTp2.toIterableExecution + .flatMap { _ => + TypedPipe + .from(Seq(2)) + .groupAll + .sum + .values + .map { i => + // block until we are done + mapCountDownLatch2.await() + i + } + .toIterableExecution + } + .onComplete { t => + if (t.isFailure) { + // capture the exception + cancelledEx2 = t.failed.toOption + } + onCompleteCountDownLatch2.countDown() } - onCompleteCountDownLatch2.countDown() - } val zipped2 = otherEx2.zip(failedEx2) @@ -367,17 +404,20 @@ class ExecutionTest extends WordSpec with Matchers { "recoverWith may fail to match" in { val exception = new RuntimeException() - val result = Execution.from[Unit] { - throw exception - }.recoverWith { - case _: NullPointerException => Execution.unit - }.waitFor(Config.default, Local(true)) + val result = Execution + .from[Unit] { + throw exception + } + .recoverWith { case _: NullPointerException => + Execution.unit + } + .waitFor(Config.default, Local(true)) result shouldBe Failure(exception) } "recover from failure" in { - val tp = TypedPipe.from(Seq(1)).groupAll.sum.values.map { _ => throw new Exception("oh no") } + val tp = TypedPipe.from(Seq(1)).groupAll.sum.values.map(_ => throw new Exception("oh no")) val recoveredTp = TypedPipe.from(Seq(2)).groupAll.sum.values val recoveredEx = tp.toIterableExecution.recoverWith { case t: Throwable => recoveredTp.toIterableExecution @@ -390,7 +430,8 @@ class ExecutionTest extends WordSpec with Matchers { "not recover when cancelled by another execution" in { @volatile var cancelledEx: Option[Throwable] = None - val failedTp: TypedPipe[Int] = TypedPipe.from(Seq(0)).groupAll.sum.values.map { _ => throw new Exception("oh no") } + val failedTp: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) val failedEx: Execution[Iterable[Int]] = failedTp.toIterableExecution val mapCountDownLatch = new CountDownLatch(1) @@ -403,15 +444,17 @@ class ExecutionTest extends WordSpec with Matchers { val onCompleteCountDownLatch = new CountDownLatch(1) val recoveredTp = TypedPipe.from(Seq(2)) - val otherEx: Execution[Iterable[Int]] = blockingTp.toIterableExecution.recoverWith { case t: Throwable => - recoveredTp.toIterableExecution - }.onComplete { t => - if (t.isFailure) { - // capture the exception - cancelledEx = t.failed.toOption + val otherEx: Execution[Iterable[Int]] = blockingTp.toIterableExecution + .recoverWith { case t: Throwable => + recoveredTp.toIterableExecution + } + .onComplete { t => + if (t.isFailure) { + // capture the exception + cancelledEx = t.failed.toOption + } + onCompleteCountDownLatch.countDown() } - onCompleteCountDownLatch.countDown() - } val zipped = failedEx.zip(otherEx) @@ -427,11 +470,10 @@ class ExecutionTest extends WordSpec with Matchers { mapCountDownLatch.countDown() } - "Config transformer will isolate Configs" in { def doesNotHaveVariable(message: String) = Execution.getConfig.flatMap { cfg => if (cfg.get("test.cfg.variable").isDefined) - Execution.failed(new Exception(s"${message}\n: var: ${cfg.get("test.cfg.variable")}")) + Execution.failed(new Exception(s"$message\n: var: ${cfg.get("test.cfg.variable")}")) else Execution.from(()) } @@ -443,10 +485,10 @@ class ExecutionTest extends WordSpec with Matchers { Execution.from(()) } - def addOption(cfg: Config) = cfg.+ ("test.cfg.variable", "dummyValue") + def addOption(cfg: 
Config) = cfg.+("test.cfg.variable", "dummyValue") doesNotHaveVariable("Should not see variable before we've started transforming") - .flatMap{ _ => Execution.withConfig(hasVariable)(addOption) } + .flatMap(_ => Execution.withConfig(hasVariable)(addOption)) .flatMap(_ => doesNotHaveVariable("Should not see variable in flatMap's after the isolation")) .map(_ => true) .shouldSucceed() shouldBe true @@ -463,11 +505,11 @@ class ExecutionTest extends WordSpec with Matchers { Execution.from(()) } - def addOption(cfg: Config) = cfg.+ ("test.cfg.variable", "dummyValue") + def addOption(cfg: Config) = cfg.+("test.cfg.variable", "dummyValue") // Here we run without the option, with the option, and finally without again. incrementor - .flatMap{ _ => Execution.withConfig(incrementor)(addOption) } + .flatMap(_ => Execution.withConfig(incrementor)(addOption)) .flatMap(_ => incrementor) .map(_ => true) .shouldSucceed() shouldBe true @@ -492,19 +534,23 @@ class ExecutionTest extends WordSpec with Matchers { val sink = TypedTsv[Int](sinkF) val src = TypedTsv[Int](srcF) - val operationTP = (TypedPipe.from(src) ++ TypedPipe.from((1 until 100).toList)).writeExecution(sink).getCounters.map(_._2.toMap) + val operationTP = (TypedPipe.from(src) ++ TypedPipe + .from((1 until 100).toList)).writeExecution(sink).getCounters.map(_._2.toMap) - def addOption(cfg: Config) = cfg.+ ("test.cfg.variable", "dummyValue") + def addOption(cfg: Config) = cfg.+("test.cfg.variable", "dummyValue") // Here we run without the option, with the option, and finally without again. val (oldCounters, newCounters) = operationTP - .flatMap{ oc => + .flatMap { oc => writeNums(List(1, 2, 3, 4, 5, 6, 7)) - Execution.withConfig(operationTP)(addOption).map { nc => (oc, nc) } + Execution.withConfig(operationTP)(addOption).map(nc => (oc, nc)) } .shouldSucceed() - assert(oldCounters != newCounters, "With new configs given the source changed we shouldn't cache so the counters should be different") + assert( + oldCounters != newCounters, + "With new configs given the source changed we shouldn't cache so the counters should be different" + ) } @@ -542,11 +588,11 @@ class ExecutionTest extends WordSpec with Matchers { val parser = new ExecutionApp { def job = Execution.from(()) } "parse hadoop args correctly" in { val conf = parser.config(Array("-Dmapred.reduce.tasks=100", "--local"))._1 - conf.get("mapred.reduce.tasks") should contain ("100") + conf.get("mapred.reduce.tasks") should contain("100") conf.getArgs.boolean("local") shouldBe true val (conf1, Hdfs(_, hconf)) = parser.config(Array("--test", "-Dmapred.reduce.tasks=110", "--hdfs")) - conf1.get("mapred.reduce.tasks") should contain ("110") + conf1.get("mapred.reduce.tasks") should contain("110") conf1.getArgs.boolean("test") shouldBe true hconf.get("mapred.reduce.tasks") shouldBe "110" } @@ -577,12 +623,12 @@ class ExecutionTest extends WordSpec with Matchers { Execution.fromFn((_, _) => flowDef) } - def pipeBuilder(implicit flowDef: FlowDef, mode: Mode): TypedPipe[Int] = { - TypedPipe.from(TextLine(args("input"))) + def pipeBuilder(implicit flowDef: FlowDef, mode: Mode): TypedPipe[Int] = + TypedPipe + .from(TextLine(args("input"))) .map(_.toInt) .map(_ * 2) .write(TypedTsv[Int]("out")) - } } val input = List((0, "1"), (1, "2"), (2, "3"), (3, "4"), (4, "5")) @@ -607,7 +653,8 @@ class ExecutionTest extends WordSpec with Matchers { Files.exists(tempFileOne) should be(true) Files.exists(tempFileTwo) should be(true) - val cleanupThread = TempFileCleanup(List(tempFileOne.toFile.getAbsolutePath, 
tempFileTwo.toFile.getAbsolutePath), mode) + val cleanupThread = + TempFileCleanup(List(tempFileOne.toFile.getAbsolutePath, tempFileTwo.toFile.getAbsolutePath), mode) cleanupThread.run() Files.exists(tempFileOne) should be(false) @@ -621,7 +668,8 @@ class ExecutionTest extends WordSpec with Matchers { isTempFileCleanupHook(hook) should be(false) } - ExecutionTestJobs.writeExecutionWithTempFile(tempFile, testData) + ExecutionTestJobs + .writeExecutionWithTempFile(tempFile, testData) .shouldSucceedHadoop() // This is hacky, but there's a small chance that the new cleanup hook isn't registered by the time we get here @@ -668,7 +716,8 @@ class ExecutionTest extends WordSpec with Matchers { isTempFileCleanupHook(hook) should be(false) } - ExecutionTestJobs.writeExecutionWithTempFile(tempFileOne, testDataOne) + ExecutionTestJobs + .writeExecutionWithTempFile(tempFileOne, testDataOne) .zip(ExecutionTestJobs.writeExecutionWithTempFile(tempFileTwo, testDataTwo)) .shouldSucceedHadoop() @@ -692,12 +741,13 @@ class ExecutionTest extends WordSpec with Matchers { var first = 0 var second = 0 var third = 0 - val e1 = Execution.from({ first += 1; 42 }) + val e1 = Execution.from { first += 1; 42 } val e2 = e1.flatMap { x => second += 1 Execution.from(2 * x) } val e3 = e1.map { x => third += 1; x * 3 } + /** * Notice both e3 and e2 need to evaluate e1. */ @@ -706,56 +756,64 @@ class ExecutionTest extends WordSpec with Matchers { assert((first, second, third) == (1, 1, 1)) } "zip does not duplicate counters" in { - val c1 = Execution.withId { implicit uid => - val stat = Stat("test") - val e1 = TypedPipe.from(0 until 100).map { x => - stat.inc - x - } - .writeExecution(source.NullSink) - - e1.zip(e1) - } - .getCounters.map { case (_, c) => c("test") } + val c1 = Execution + .withId { implicit uid => + val stat = Stat("test") + val e1 = TypedPipe + .from(0 until 100) + .map { x => + stat.inc + x + } + .writeExecution(source.NullSink) - val c2 = Execution.withId { implicit uid => - val stat = Stat("test") - val e2 = TypedPipe.from(0 until 100).map { x => - stat.inc - x + e1.zip(e1) } - .writeExecution(source.NullSink) + .getCounters + .map { case (_, c) => c("test") } + + val c2 = Execution + .withId { implicit uid => + val stat = Stat("test") + val e2 = TypedPipe + .from(0 until 100) + .map { x => + stat.inc + x + } + .writeExecution(source.NullSink) - e2.flatMap(Execution.from(_)).zip(e2) - } - .getCounters.map { case (_, c) => c("test") } + e2.flatMap(Execution.from(_)).zip(e2) + } + .getCounters + .map { case (_, c) => c("test") } c1.shouldSucceed() should ===(100) c2.shouldSucceed() should ===(100) } "zip does not duplicate pure counters" in { val c1 = { - val e1 = TypedPipe.from(0 until 100) + val e1 = TypedPipe + .from(0 until 100) .tallyAll("scalding", "test") .writeExecution(source.NullSink) - e1.zip(e1) - .getCounters.map { case (_, c) => - println(c.toMap) - c(("test", "scalding")) - } + e1.zip(e1).getCounters.map { case (_, c) => + println(c.toMap) + c(("test", "scalding")) + } } val c2 = { - val e2 = TypedPipe.from(0 until 100) + val e2 = TypedPipe + .from(0 until 100) .tallyAll("scalding", "test") .writeExecution(source.NullSink) - e2.flatMap(Execution.from(_)).zip(e2) - .getCounters.map { case (_, c) => - println(c.toMap) - c(("test", "scalding")) - } + e2.flatMap(Execution.from(_)).zip(e2).getCounters.map { case (_, c) => + println(c.toMap) + c(("test", "scalding")) + } } c1.shouldSucceed() should ===(100) @@ -766,33 +824,45 @@ class ExecutionTest extends WordSpec with Matchers { var 
timesEvaluated = 0 import com.twitter.scalding.serialization.macros.impl.BinaryOrdering._ // Attempt to use up 4 boxed classes for every execution - def baseExecution(idx: Int): Execution[Unit] = TypedPipe.from(0 until 1000).map(_.toShort).flatMap { i => - timesEvaluated += 1 - List((i, i), (i, i)) - }.sumByKey.map { - case (k, v) => + def baseExecution(idx: Int): Execution[Unit] = TypedPipe + .from(0 until 1000) + .map(_.toShort) + .flatMap { i => + timesEvaluated += 1 + List((i, i), (i, i)) + } + .sumByKey + .map { case (k, v) => (k.toInt, v) - }.sumByKey.map { - case (k, v) => + } + .sumByKey + .map { case (k, v) => (k.toLong, v) - }.sumByKey.map { - case (k, v) => + } + .sumByKey + .map { case (k, v) => (k.toString, v) - }.sumByKey.map { - case (k, v) => + } + .sumByKey + .map { case (k, v) => (MyCustomType(k), v) - }.sumByKey.writeExecution(TypedTsv(s"/tmp/asdf_${idx}")) + } + .sumByKey + .writeExecution(TypedTsv(s"/tmp/asdf_$idx")) implicitly[OrderedSerialization[MyCustomType]] match { - case mos: MacroEqualityOrderedSerialization[_] => assert(mos.uniqueId == "com.twitter.scalding.MyCustomType") - case _ => sys.error("Ordered serialization should have been the MacroEqualityOrderedSerialization for this test") + case mos: MacroEqualityOrderedSerialization[_] => + assert(mos.uniqueId == "com.twitter.scalding.MyCustomType") + case _ => + sys.error( + "Ordered serialization should have been the MacroEqualityOrderedSerialization for this test" + ) } - def executionLoop(idx: Int): Execution[Unit] = { + def executionLoop(idx: Int): Execution[Unit] = if (idx > 0) baseExecution(idx).flatMap(_ => executionLoop(idx - 1)) else Execution.unit - } executionLoop(55).shouldSucceed() assert(timesEvaluated == 55 * 1000, "Should run the 55 execution loops for 1000 elements") @@ -801,61 +871,80 @@ class ExecutionTest extends WordSpec with Matchers { "evaluate shared portions just once, writeExecution" in { var timesEvaluated = 0 - val baseTp = TypedPipe.from(0 until 1000).flatMap { i => - timesEvaluated += 1 - List(i, i) - }.fork + val baseTp = TypedPipe + .from(0 until 1000) + .flatMap { i => + timesEvaluated += 1 + List(i, i) + } + .fork - val fde1 = baseTp.map{ _ * 3 }.writeExecution(TypedTsv("/tmp/asdf")) - val fde2 = baseTp.map{ _ * 5 }.writeExecution(TypedTsv("/tmp/asdf2")) + val fde1 = baseTp.map(_ * 3).writeExecution(TypedTsv("/tmp/asdf")) + val fde2 = baseTp.map(_ * 5).writeExecution(TypedTsv("/tmp/asdf2")) val res = fde1.zip(fde2) res.shouldSucceed() - assert(timesEvaluated == 1000, "Should share the common sub section of the graph when we zip two write Executions") + assert( + timesEvaluated == 1000, + "Should share the common sub section of the graph when we zip two write Executions" + ) } "evaluate shared portions just once, forceToDiskExecution" in { var timesEvaluated = 0 - val baseTp = TypedPipe.from(0 until 1000).flatMap { i => - timesEvaluated += 1 - List(i, i) - }.fork + val baseTp = TypedPipe + .from(0 until 1000) + .flatMap { i => + timesEvaluated += 1 + List(i, i) + } + .fork - val fde1 = baseTp.map{ _ * 3 }.forceToDiskExecution - val fde2 = baseTp.map{ _ * 5 }.forceToDiskExecution + val fde1 = baseTp.map(_ * 3).forceToDiskExecution + val fde2 = baseTp.map(_ * 5).forceToDiskExecution val res = fde1.zip(fde2) res.shouldSucceed() - assert(timesEvaluated == 1000, "Should share the common sub section of the graph when we zip two write Executions") + assert( + timesEvaluated == 1000, + "Should share the common sub section of the graph when we zip two write Executions" + ) } "evaluate 
shared portions just once, forceToDiskExecution with execution cache" in { var timesEvaluated = 0 - val baseTp = TypedPipe.from(0 until 1000).flatMap { i => - timesEvaluated += 1 - List(i, i) - }.fork + val baseTp = TypedPipe + .from(0 until 1000) + .flatMap { i => + timesEvaluated += 1 + List(i, i) + } + .fork - val fde1 = baseTp.map{ _ * 3 }.forceToDiskExecution - val fde2 = baseTp.map{ _ * 5 }.forceToDiskExecution + val fde1 = baseTp.map(_ * 3).forceToDiskExecution + val fde2 = baseTp.map(_ * 5).forceToDiskExecution - val res = fde1.zip(fde2).flatMap{ _ => fde1 }.flatMap(_.toIterableExecution) + val res = fde1.zip(fde2).flatMap(_ => fde1).flatMap(_.toIterableExecution) res.shouldSucceed() - assert(timesEvaluated == 1000, "Should share the common sub section of the graph when we zip two write Executions and then flatmap") + assert( + timesEvaluated == 1000, + "Should share the common sub section of the graph when we zip two write Executions and then flatmap" + ) } "Ability to do isolated caches so we don't exhaust memory" in { - def memoryWastingExecutionGenerator(id: Int): Execution[Array[Long]] = Execution.withNewCache(Execution.from(id).flatMap{ idx => - Execution.from(Array.fill(4000000)(idx.toLong)) - }) + def memoryWastingExecutionGenerator(id: Int): Execution[Array[Long]] = + Execution.withNewCache(Execution.from(id).flatMap { idx => + Execution.from(Array.fill(4000000)(idx.toLong)) + }) - def writeAll(numExecutions: Int): Execution[Unit] = { + def writeAll(numExecutions: Int): Execution[Unit] = if (numExecutions > 0) { memoryWastingExecutionGenerator(numExecutions).flatMap { _ => writeAll(numExecutions - 1) @@ -863,7 +952,6 @@ class ExecutionTest extends WordSpec with Matchers { } else { Execution.from(()) } - } writeAll(400).shouldSucceed() } @@ -874,7 +962,8 @@ class ExecutionTest extends WordSpec with Matchers { } "handle an error running in parallel" in { - val executions = Execution.failed(new Exception("failed")) :: 0.to(10).map(i => Execution.from[Int](i)).toList + val executions = + Execution.failed(new Exception("failed")) :: 0.to(10).map(i => Execution.from[Int](i)).toList val result = Execution.withParallelism(executions, 3) @@ -896,12 +985,16 @@ class ExecutionTest extends WordSpec with Matchers { seen += 1 } - val executions = 0.to(10).map{ i => - Execution - .from[Int](i) - .map{ i => Thread.sleep(10 - i); i } - .onComplete(t => updateSeen(t.get)) - }.toList.reverse + val executions = 0 + .to(10) + .map { i => + Execution + .from[Int](i) + .map { i => Thread.sleep(10 - i); i } + .onComplete(t => updateSeen(t.get)) + } + .toList + .reverse val result = Execution.withParallelism(executions, 1) @@ -962,27 +1055,29 @@ class ExecutionTest extends WordSpec with Matchers { } "Execution#map" in { - reconstructibleLaws( - Execution.fromFuture(futureF).map(mapF), - Execution.fromFuture(futureF).map(mapF2)) + reconstructibleLaws(Execution.fromFuture(futureF).map(mapF), Execution.fromFuture(futureF).map(mapF2)) } "Execution.zip" in { reconstructibleLaws( Execution.zip(Execution.fromFuture(futureF2), Execution.withId(withIdF)), - Execution.zip(Execution.fromFuture(futureF2), Execution.withId(withIdF2))) + Execution.zip(Execution.fromFuture(futureF2), Execution.withId(withIdF2)) + ) } "Execution.sequence" in { reconstructibleLaws( - Execution.sequence(Seq( - Execution.fromFuture(futureF), - Execution.withId(withIdF), - Execution.fromFuture(futureF2).map(mapF))), - Execution.sequence(Seq( - Execution.fromFuture(futureF), - Execution.withId(withIdF), - Execution.fromFn(fnF)))) + 
Execution.sequence( + Seq( + Execution.fromFuture(futureF), + Execution.withId(withIdF), + Execution.fromFuture(futureF2).map(mapF) + ) + ), + Execution.sequence( + Seq(Execution.fromFuture(futureF), Execution.withId(withIdF), Execution.fromFn(fnF)) + ) + ) } } @@ -991,11 +1086,13 @@ class ExecutionTest extends WordSpec with Matchers { // want equality to be consistent trait MutableX[T] { protected var x: Int - def setX(newX: Int): Unit = { x = newX } + def setX(newX: Int): Unit = x = newX def makeExecution: Execution[T] } - case class FromFutureMutable(var x: Int = 0) extends Function1[ConcurrentExecutionContext, Future[Int]] with MutableX[Int] { + case class FromFutureMutable(var x: Int = 0) + extends Function1[ConcurrentExecutionContext, Future[Int]] + with MutableX[Int] { def apply(context: ConcurrentExecutionContext) = Future.successful(x) def makeExecution = Execution.fromFuture(this) } @@ -1003,7 +1100,9 @@ class ExecutionTest extends WordSpec with Matchers { def apply(config: Config, mode: Mode) = null def makeExecution = Execution.fromFn(this) } - case class WithIdMutable(var x: Int = 0) extends Function1[UniqueID, Execution[Int]] with MutableX[Int] { + case class WithIdMutable(var x: Int = 0) + extends Function1[UniqueID, Execution[Int]] + with MutableX[Int] { def apply(id: UniqueID) = Execution.fromFuture(FromFutureMutable(x)) def makeExecution = Execution.withId(this) } @@ -1037,17 +1136,14 @@ class ExecutionTest extends WordSpec with Matchers { def makeExecution = Execution.sequence(Seq(m1.makeExecution, m2.makeExecution)) } - def mutableLaws[T, U <: MutableX[T]]( - mutableGen: => U, - expectedOpt: Option[Int => T] = None): Unit = { + def mutableLaws[T, U <: MutableX[T]](mutableGen: => U, expectedOpt: Option[Int => T] = None): Unit = { expectedOpt.foreach { expected => require(expected(10) != expected(20)) } - def validate(ex: Execution[T], seed: Int): Unit = { + def validate(ex: Execution[T], seed: Int): Unit = expectedOpt.foreach { expected => assert(ex.shouldSucceed() == expected(seed)) } - } val mutable1 = mutableGen mutable1.setX(10) @@ -1084,7 +1180,7 @@ class ExecutionTest extends WordSpec with Matchers { } "Execution.fromFuture" in { - mutableLaws(FromFutureMutable(), Some({ x: Int => x })) + mutableLaws(FromFutureMutable(), Some { x: Int => x }) } "Execution.fromFn" in { @@ -1092,26 +1188,29 @@ class ExecutionTest extends WordSpec with Matchers { } "Execution.withId" in { - mutableLaws(WithIdMutable(), Some({ x: Int => x })) + mutableLaws(WithIdMutable(), Some { x: Int => x }) } "Execution#map" in { - mutableLaws(MapMutable(), Some({ x: Int => x * x })) + mutableLaws(MapMutable(), Some { x: Int => x * x }) } "Execution#zip" in { - mutableLaws(ZipMutable(), Some({ x: Int => (x, x + 20) })) + mutableLaws(ZipMutable(), Some { x: Int => (x, x + 20) }) } "Execution.sequence" in { - mutableLaws(SequenceMutable(), Some({ x: Int => Seq(x, x * 3) })) + mutableLaws(SequenceMutable(), Some { x: Int => Seq(x, x * 3) }) } } } "Simple jobs" should { "convert to Execution and run" in { - val ex = Job.toExecutionFromClass(classOf[NormalJobToExecutionTestJob], Execution.failed(new Exception("couldn't run"))) + val ex = Job.toExecutionFromClass( + classOf[NormalJobToExecutionTestJob], + Execution.failed(new Exception("couldn't run")) + ) val res = ex.waitFor(Config.empty, Local(true)) assert(res.isSuccess) } @@ -1145,8 +1244,11 @@ class ExecutionTest extends WordSpec with Matchers { } "work in a mapped TypedSource" in { val workingDir = System.getProperty("user.dir") - val job = 
TypedPipe.from(TextLine(workingDir + "/../tutorial/data/hello.txt")).map(_.size).toIterableExecution - assert(job.waitFor(Config.empty, Local(true)).get.toList == List("Hello world", "Goodbye world").map(_.size)) + val job = + TypedPipe.from(TextLine(workingDir + "/../tutorial/data/hello.txt")).map(_.size).toIterableExecution + assert( + job.waitFor(Config.empty, Local(true)).get.toList == List("Hello world", "Goodbye world").map(_.size) + ) } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionUtilTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionUtilTest.scala index b31118e1f9..7ff0485c14 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionUtilTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionUtilTest.scala @@ -1,6 +1,6 @@ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class ExecutionUtilTest extends WordSpec with Matchers { import ExecutionUtil._ diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExpandLibJarsGlobsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExpandLibJarsGlobsTest.scala index c8ab4c52af..e478924bb7 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ExpandLibJarsGlobsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExpandLibJarsGlobsTest.scala @@ -1,7 +1,7 @@ package com.twitter.scalding import java.io.File -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class ExpandLibJarsGlobsTest extends WordSpec with Matchers { def touch(parent: File, p: String): String = { @@ -11,7 +11,8 @@ class ExpandLibJarsGlobsTest extends WordSpec with Matchers { } def getTmpRoot = { - val tmpRoot = new File(System.getProperty("java.io.tmpdir"), scala.util.Random.nextInt(Int.MaxValue).toString) + val tmpRoot = + new File(System.getProperty("java.io.tmpdir"), scala.util.Random.nextInt(Int.MaxValue).toString) require(tmpRoot.mkdirs(), "Failed to make temporary directory") tmpRoot.deleteOnExit() tmpRoot @@ -23,18 +24,21 @@ class ExpandLibJarsGlobsTest extends WordSpec with Matchers { val tmpRoot = getTmpRoot // Has a side effect, but returns us the jars absolute paths val jars = (0 until 20).map { idx => - touch(tmpRoot, s"myF_${idx}.jar") + touch(tmpRoot, s"myF_$idx.jar") } ++ (0 until 20).map { idx => - touch(tmpRoot, s".myHidden.jar.myF_${idx}.jar") + touch(tmpRoot, s".myHidden.jar.myF_$idx.jar") } - val resultingLibJars1 = ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/*.jar"))(1).split(",") + val resultingLibJars1 = + ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/*.jar"))(1).split(",") assert(resultingLibJars1.sorted.toList == jars.sorted.toList) - val resultingLibJars2 = ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/"))(1).split(",") + val resultingLibJars2 = + ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/"))(1).split(",") assert(resultingLibJars2.sorted.toList == jars.sorted.toList) - val resultingLibJars3 = ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/*"))(1).split(",") + val resultingLibJars3 = + ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/*"))(1).split(",") assert(resultingLibJars3.sorted.toList == jars.sorted.toList) } @@ -43,12 +47,14 @@ class ExpandLibJarsGlobsTest extends WordSpec with Matchers { // Has a side effect, but returns us the jars absolute paths val jars = (0 until 20).map { idx => - 
touch(tmpRoot, s"myF_${idx}.jar") + touch(tmpRoot, s"myF_$idx.jar") } ++ (0 until 20).map { idx => - touch(tmpRoot, s".myHidden.jar.myF_${idx}.jar") + touch(tmpRoot, s".myHidden.jar.myF_$idx.jar") } - val resultingLibJars1 = ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/*.zip"))(1).split(",").filter(_.nonEmpty) + val resultingLibJars1 = ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/*.zip"))(1) + .split(",") + .filter(_.nonEmpty) assert(resultingLibJars1.isEmpty) } @@ -58,21 +64,25 @@ class ExpandLibJarsGlobsTest extends WordSpec with Matchers { // Has a side effect, but returns us the jars absolute paths val jars1 = (0 until 20).map { idx => - touch(tmpRoot1, s"myF_${idx}.jar") + touch(tmpRoot1, s"myF_$idx.jar") } ++ (0 until 20).map { idx => - touch(tmpRoot1, s".myHidden.jar.myF_${idx}.jar") + touch(tmpRoot1, s".myHidden.jar.myF_$idx.jar") } val jars2 = (0 until 1).map { idx => - touch(tmpRoot2, s"myF_${idx}.jar") + touch(tmpRoot2, s"myF_$idx.jar") } // Using wildcards for both - val resultingLibJars1 = ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot1.getAbsolutePath}/*.jar,${tmpRoot2.getAbsolutePath}/*.jar"))(1).split(",") + val resultingLibJars1 = ExpandLibJarsGlobs( + Array("-libjars", s"${tmpRoot1.getAbsolutePath}/*.jar,${tmpRoot2.getAbsolutePath}/*.jar") + )(1).split(",") assert(resultingLibJars1.sorted.toList == (jars1 ++ jars2).sorted.toList) // No wildcards for second dir - val resultingLibJars2 = ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot1.getAbsolutePath}/*.jar,${tmpRoot2.getAbsolutePath}/myF_0.jar"))(1).split(",") + val resultingLibJars2 = ExpandLibJarsGlobs( + Array("-libjars", s"${tmpRoot1.getAbsolutePath}/*.jar,${tmpRoot2.getAbsolutePath}/myF_0.jar") + )(1).split(",") assert(resultingLibJars2.sorted.toList == (jars1 ++ jars2).sorted.toList) } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/FieldImpsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/FieldImpsTest.scala index ba772559ed..260051deae 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/FieldImpsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/FieldImpsTest.scala @@ -12,26 +12,22 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tuple.Fields -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class FieldImpsTest extends WordSpec with Matchers with FieldConversions { - def setAndCheck[T <: Comparable[_]](v: T)(implicit conv: (T) => Fields): Unit = { + def setAndCheck[T <: Comparable[_]](v: T)(implicit conv: (T) => Fields): Unit = conv(v) shouldBe (new Fields(v)) - } - def setAndCheckS[T <: Comparable[_]](v: Seq[T])(implicit conv: (Seq[T]) => Fields): Unit = { + def setAndCheckS[T <: Comparable[_]](v: Seq[T])(implicit conv: (Seq[T]) => Fields): Unit = conv(v) shouldBe (new Fields(v: _*)) - } - def setAndCheckSym(v: Symbol): Unit = { + def setAndCheckSym(v: Symbol): Unit = (v: Fields) shouldBe (new Fields(v.toString.tail)) - } - def setAndCheckSymS(v: Seq[Symbol]): Unit = { + def setAndCheckSymS(v: Seq[Symbol]): Unit = (v: Fields) shouldBe (new Fields(v.map(_.toString.tail): _*)) - } def setAndCheckField(v: Field[_]): Unit = { val vF: Fields = v val fields = new Fields(v.id) @@ -44,12 +40,10 @@ class FieldImpsTest extends WordSpec with Matchers with FieldConversions { fields.setComparators(v.map(_.ord): _*) checkFieldsWithComparators(vF, fields) } - def setAndCheckEnumValue(v: Enumeration#Value): Unit = { + def setAndCheckEnumValue(v: Enumeration#Value): Unit = (v: Fields) shouldBe (new Fields(v.toString)) - } - def setAndCheckEnumValueS(v: Seq[Enumeration#Value]): Unit = { + def setAndCheckEnumValueS(v: Seq[Enumeration#Value]): Unit = (v: Fields) shouldBe (new Fields(v.map(_.toString): _*)) - } def checkFieldsWithComparators(actual: Fields, expected: Fields): Unit = { // sometimes one or the other is actually a RichFields, so rather than test for // actual.equals(expected), we just check that all the field names and comparators line up @@ -60,7 +54,7 @@ class FieldImpsTest extends WordSpec with Matchers with FieldConversions { "Field" should { "contain manifest" in { val field = Field[Long]("foo") - field.mf should contain (implicitly[Manifest[Long]]) + field.mf should contain(implicitly[Manifest[Long]]) } } "RichFields" should { @@ -80,7 +74,10 @@ class FieldImpsTest extends WordSpec with Matchers with FieldConversions { val comparator = implicitly[Ordering[String]] fields.setComparators(comparator, comparator) val fieldList: List[Field[_]] = fields.toFieldList - fieldList shouldBe List(new StringField[String]("foo")(comparator, None), new StringField[String]("bar")(comparator, None)) + fieldList shouldBe List( + new StringField[String]("foo")(comparator, None), + new StringField[String]("bar")(comparator, None) + ) } "throw an exception on when converting a virtual Fields instance" in { @@ -203,7 +200,7 @@ class FieldImpsTest extends WordSpec with Matchers with FieldConversions { f2 = (0 until 10) -> 'you f2 shouldBe (new Fields((0 until 10).map(int2Integer): _*), new Fields("you")) - f2 = (('hey, 'world) -> 'other) + f2 = ('hey, 'world) -> 'other f2 shouldBe (new Fields("hey", "world"), new Fields("other")) f2 = 0 -> 2 @@ -214,12 +211,12 @@ class FieldImpsTest extends WordSpec with Matchers with FieldConversions { val foo = Field[java.math.BigInteger]("foo") val bar = Field[java.math.BigDecimal]("bar") - f2 = ((foo, bar) -> 'bell) + f2 = (foo, bar) -> 'bell var fields = new Fields("foo", "bar") fields.setComparators(foo.ord, bar.ord) f2 shouldBe (fields, new Fields("bell")) - f2 = (foo -> ('bar, 'bell)) + f2 = foo -> ('bar, 'bell) fields = RichFields(foo) fields.setComparators(foo.ord) f2 shouldBe (fields, new Fields("bar", 
"bell")) @@ -232,7 +229,7 @@ class FieldImpsTest extends WordSpec with Matchers with FieldConversions { f2 shouldBe (new Fields("one", "two", "three"), new Fields("n1", "n2", "n3")) f2 = List(4, 5, 6) -> List(1, 2, 3) f2 shouldBe (new Fields(int2Integer(4), int2Integer(5), int2Integer(6)), - new Fields(int2Integer(1), int2Integer(2), int2Integer(3))) + new Fields(int2Integer(1), int2Integer(2), int2Integer(3))) object Schema extends Enumeration { val one, two, three = Value diff --git a/scalding-core/src/test/scala/com/twitter/scalding/FileSourceTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/FileSourceTest.scala index a8ea468d10..e2dbd5cee9 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/FileSourceTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/FileSourceTest.scala @@ -12,13 +12,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.scheme.NullScheme import cascading.tuple.Fields import org.apache.hadoop.conf.Configuration -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class MultiTsvInputJob(args: Args) extends Job(args) { try { @@ -32,7 +32,8 @@ class MultiTsvInputJob(args: Args) extends Job(args) { class SequenceFileInputJob(args: Args) extends Job(args) { try { SequenceFile("input0").read.write(SequenceFile("output0")) - WritableSequenceFile("input1", ('query, 'queryStats)).read.write(WritableSequenceFile("output1", ('query, 'queryStats))) + WritableSequenceFile("input1", ('query, 'queryStats)).read + .write(WritableSequenceFile("output1", ('query, 'queryStats))) } catch { case e: Exception => e.printStackTrace() } @@ -51,39 +52,36 @@ class FileSourceTest extends WordSpec with Matchers { import Dsl._ "A MultipleTsvFile Source" should { - JobTest(new MultiTsvInputJob(_)). - source(MultipleTsvFiles(List("input0", "input1"), ('query, 'queryStats)), - List(("foobar", 1), ("helloworld", 2))). - sink[(String, Int)](Tsv("output0")) { - outBuf => - "take multiple Tsv files as input sources" in { - outBuf should have length 2 - outBuf.toList shouldBe List(("foobar", 1), ("helloworld", 2)) - } + JobTest(new MultiTsvInputJob(_)) + .source( + MultipleTsvFiles(List("input0", "input1"), ('query, 'queryStats)), + List(("foobar", 1), ("helloworld", 2)) + ) + .sink[(String, Int)](Tsv("output0")) { outBuf => + "take multiple Tsv files as input sources" in { + (outBuf should have).length(2) + outBuf.toList shouldBe List(("foobar", 1), ("helloworld", 2)) } + } .run .finish() } "A WritableSequenceFile Source" should { - JobTest(new SequenceFileInputJob(_)). - source(SequenceFile("input0"), - List(("foobar0", 1), ("helloworld0", 2))). - source(WritableSequenceFile("input1", ('query, 'queryStats)), - List(("foobar1", 1), ("helloworld1", 2))). 
- sink[(String, Int)](SequenceFile("output0")) { - outBuf => - "sequence file input" in { - outBuf should have length 2 - outBuf.toList shouldBe List(("foobar0", 1), ("helloworld0", 2)) - } - } - .sink[(String, Int)](WritableSequenceFile("output1", ('query, 'queryStats))) { - outBuf => - "writable sequence file input" in { - outBuf should have length 2 - outBuf.toList shouldBe List(("foobar1", 1), ("helloworld1", 2)) - } + JobTest(new SequenceFileInputJob(_)) + .source(SequenceFile("input0"), List(("foobar0", 1), ("helloworld0", 2))) + .source(WritableSequenceFile("input1", ('query, 'queryStats)), List(("foobar1", 1), ("helloworld1", 2))) + .sink[(String, Int)](SequenceFile("output0")) { outBuf => + "sequence file input" in { + (outBuf should have).length(2) + outBuf.toList shouldBe List(("foobar0", 1), ("helloworld0", 2)) + } + } + .sink[(String, Int)](WritableSequenceFile("output1", ('query, 'queryStats))) { outBuf => + "writable sequence file input" in { + (outBuf should have).length(2) + outBuf.toList shouldBe List(("foobar1", 1), ("helloworld1", 2)) + } } .run .finish() @@ -95,7 +93,7 @@ class FileSourceTest extends WordSpec with Matchers { .source(MultipleTextLineFiles("input0", "input1"), List("foobar", "helloworld")) .sink[String](Tsv("output0")) { outBuf => "take multiple text files as input sources" in { - outBuf should have length 2 + (outBuf should have).length(2) outBuf.toList shouldBe List("foobar", "helloworld") } } @@ -105,29 +103,26 @@ class FileSourceTest extends WordSpec with Matchers { "TextLine.toIterator" should { "correctly read strings" in { - TextLine("../tutorial/data/hello.txt").toIterator(Config.default, Local(true)).toList shouldBe List("Hello world", "Goodbye world") + TextLine("../tutorial/data/hello.txt").toIterator(Config.default, Local(true)).toList shouldBe List( + "Hello world", + "Goodbye world" + ) } } /** - * The layout of the test data looks like this: - * /test_data/2013/02 does not exist + * The layout of the test data looks like this: /test_data/2013/02 does not exist * - * /test_data/2013/03 (dir with a single data file in it) - * /test_data/2013/03/2013-03.txt + * /test_data/2013/03 (dir with a single data file in it) /test_data/2013/03/2013-03.txt * - * /test_data/2013/04 (dir with a single data file and a _SUCCESS file) - * /test_data/2013/04/2013-04.txt + * /test_data/2013/04 (dir with a single data file and a _SUCCESS file) /test_data/2013/04/2013-04.txt * /test_data/2013/04/_SUCCESS * - * /test_data/2013/05 (logically empty dir: git does not support empty dirs) + * /test_data/2013/05 (logically empty dir: git does not support empty dirs) * - * /test_data/2013/06 (dir with only a _SUCCESS file) - * /test_data/2013/06/_SUCCESS + * /test_data/2013/06 (dir with only a _SUCCESS file) /test_data/2013/06/_SUCCESS * - * /test_data/2013/07 - * /test_data/2013/07/2013-07.txt - * /test_data/2013/07/_SUCCESS + * /test_data/2013/07 /test_data/2013/07/2013-07.txt /test_data/2013/07/_SUCCESS */ "default pathIsGood" should { import TestFileSource.pathIsGood @@ -247,11 +242,11 @@ class FileSourceTest extends WordSpec with Matchers { "FixedPathSource.hdfsWritePath" should { "crib if path == *" in { - intercept[AssertionError] { TestFixedPathSource("*").hdfsWritePath } + intercept[AssertionError](TestFixedPathSource("*").hdfsWritePath) } "crib if path == /*" in { - intercept[AssertionError] { TestFixedPathSource("/*").hdfsWritePath } + intercept[AssertionError](TestFixedPathSource("/*").hdfsWritePath) } "remove /* from a path ending in /*" in { @@ -306,7 
+301,7 @@ object TestPath { def getCurrentDirectory = new java.io.File(".").getCanonicalPath def prefix = getCurrentDirectory.split("/").last match { case "scalding-core" => getCurrentDirectory - case _ => getCurrentDirectory + "/scalding-core" + case _ => getCurrentDirectory + "/scalding-core" } val testfsPathRoot = prefix + "/src/test/resources/com/twitter/scalding/test_filesystem/" } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/FlowStateMapTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/FlowStateMapTest.scala index e179db2afa..941154d264 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/FlowStateMapTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/FlowStateMapTest.scala @@ -3,7 +3,7 @@ package com.twitter.scalding import org.scalatest.FunSuite import cascading.flow.FlowDef -import com.twitter.scalding.source.{ TypedText, NullSink } +import com.twitter.scalding.source.{NullSink, TypedText} import com.twitter.scalding.typed.cascading_backend.CascadingBackend class FlowStateMapTest extends FunSuite { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/IntegralCompTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/IntegralCompTest.scala index 5d01df0157..a9a3bb8196 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/IntegralCompTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/IntegralCompTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class IntegralCompTest extends WordSpec with Matchers { def box[T](t: T) = t.asInstanceOf[AnyRef] @@ -39,8 +39,8 @@ class IntegralCompTest extends WordSpec with Matchers { "handle null inputs" in { intComp.hashCode(null) shouldBe 0 List(box(1), box("hey"), box(2L), box(0.0)).foreach { x => - intComp.compare(null, x) should be < (0) - intComp.compare(x, null) should be > (0) + intComp.compare(null, x) should be < 0 + intComp.compare(x, null) should be > 0 intComp.compare(x, x) shouldBe 0 } intComp.compare(null, null) shouldBe 0 @@ -53,8 +53,8 @@ class IntegralCompTest extends WordSpec with Matchers { } List((box(1), box(2L)), (box(2), box(3L)), (box(3), box(4L))) .foreach { pair => - intComp.compare(pair._1, pair._2) should be < (0) - intComp.compare(pair._2, pair._1) should be > (0) + intComp.compare(pair._1, pair._2) should be < 0 + intComp.compare(pair._2, pair._1) should be > 0 } } "Compare strings properly" in { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/IterableExecutionSerializationTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/IterableExecutionSerializationTest.scala index 7aec317e8d..2719ac817e 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/IterableExecutionSerializationTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/IterableExecutionSerializationTest.scala @@ -3,7 +3,7 @@ package com.twitter.scalding import com.twitter.bijection.JavaSerializationInjection import com.twitter.chill.KryoPool import com.twitter.chill.config.ScalaAnyRefMapConfig -import com.twitter.scalding.serialization.{ Externalizer, KryoHadoop } +import com.twitter.scalding.serialization.{Externalizer, KryoHadoop} import com.twitter.scalding.source.TypedText 
import org.scalatest.FunSuite @@ -14,7 +14,8 @@ class ToIterableSerializationTest extends FunSuite { } val myFoo = new Foo - val testIterableExecution = Execution.toIterable(TypedPipe.from(TypedText.tsv[Int]("foo")).map(_ * myFoo.field)) + val testIterableExecution = + Execution.toIterable(TypedPipe.from(TypedText.tsv[Int]("foo")).map(_ * myFoo.field)) test("toIterableExecution should roundtrip") { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/JobTestTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/JobTestTest.scala index bbfd175774..ab93c91258 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/JobTestTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/JobTestTest.scala @@ -1,12 +1,13 @@ package com.twitter.scalding import com.twitter.scalding.source.TypedText -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} /** * Simple identity job that reads from a Tsv and writes to a Tsv with no change. * - * @param args to the job. "input" specifies the input file, and "output" the output file. + * @param args + * to the job. "input" specifies the input file, and "output" the output file. */ class SimpleTestJob(args: Args) extends Job(args) { Tsv(args("input")).read.write(Tsv(args("output"))) @@ -29,12 +30,15 @@ class JobTestTest extends WordSpec with Matchers { .arg("input", "input") .arg("output", "output") .source(incorrectSource, testInput) - .sink[(String, Int)](Tsv("output")){ outBuf => { outBuf shouldBe testInput } } + .sink[(String, Int)](Tsv("output"))(outBuf => outBuf shouldBe testInput) .run - the[IllegalArgumentException] thrownBy { + (the[IllegalArgumentException] thrownBy { runJobTest() - } should have message (s"Failed to create tap for: ${requiredSource}, with error: requirement failed: " + TestTapFactory.sourceNotFoundError.format(requiredSource)) + } should have).message( + s"Failed to create tap for: $requiredSource, with error: requirement failed: " + TestTapFactory.sourceNotFoundError + .format(requiredSource) + ) } "use local mode by default" in { JobTest(new SimpleTestJob(_)).getTestMode(true, None) match { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/KryoTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/KryoTest.scala index ef94369e6a..600b9fb916 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/KryoTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/KryoTest.scala @@ -12,38 +12,32 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} -import java.io.{ ByteArrayOutputStream => BOS } -import java.io.{ ByteArrayInputStream => BIS } +import java.io.{ByteArrayOutputStream => BOS} +import java.io.{ByteArrayInputStream => BIS} import scala.collection.immutable.ListMap import scala.collection.immutable.HashMap -import com.twitter.algebird.{ - AveragedValue, - DecayedValue, - HyperLogLogMonoid, - Moments, - Monoid -} +import com.twitter.algebird.{AveragedValue, DecayedValue, HyperLogLogMonoid, Moments, Monoid} -import com.twitter.chill.config.{ ConfiguredInstantiator, ScalaMapConfig } +import com.twitter.chill.config.{ConfiguredInstantiator, ScalaMapConfig} import com.twitter.chill.hadoop.HadoopConfig import com.twitter.chill.hadoop.KryoSerialization -import com.esotericsoftware.kryo.io.{ Input, Output } +import com.esotericsoftware.kryo.io.{Input, Output} import org.apache.hadoop.conf.Configuration /* -* This is just a test case for Kryo to deal with. It should -* be outside KryoTest, otherwise the enclosing class, KryoTest -* will also need to be serialized -*/ + * This is just a test case for Kryo to deal with. It should + * be outside KryoTest, otherwise the enclosing class, KryoTest + * will also need to be serialized + */ case class TestCaseClassForSerialization(x: String, y: Int) case class TestValMap(val map: Map[String, Double]) @@ -80,17 +74,14 @@ class KryoTest extends WordSpec with Matchers { ks.close res.asInstanceOf[T] } - def singleRT[T <: AnyRef](in: T): T = { + def singleRT[T <: AnyRef](in: T): T = deserObj[T](in.getClass, serObj(in)) - } //These are analogous to how Hadoop will serialize - def serialize(ins: List[AnyRef]) = { - ins.map { v => (v.getClass, serObj(v)) } - } - def deserialize(input: List[(Class[_], Array[Byte])]) = { - input.map { tup => deserObj[AnyRef](tup._1, tup._2) } - } + def serialize(ins: List[AnyRef]) = + ins.map(v => (v.getClass, serObj(v))) + def deserialize(input: List[(Class[_], Array[Byte])]) = + input.map(tup => deserObj[AnyRef](tup._1, tup._2)) def serializationRT(ins: List[AnyRef]) = deserialize(serialize(ins)) "KryoSerializers and KryoDeserializers" should { @@ -109,40 +100,49 @@ class KryoTest extends WordSpec with Matchers { "round trip any non-array object" in { implicit val hllmon: HyperLogLogMonoid = new HyperLogLogMonoid(4) - val test = List(1, 2, "hey", (1, 2), Args("--this is --a --b --test 34"), + val test = List( + 1, + 2, + "hey", + (1, 2), + Args("--this is --a --b --test 34"), ("hey", "you"), ("slightly", 1L, "longer", 42, "tuple"), Map(1 -> 2, 4 -> 5), 0 to 100, - (0 to 42).toList, Seq(1, 100, 1000), + (0 to 42).toList, + Seq(1, 100, 1000), Map("good" -> 0.5, "bad" -> -1.0), Set(1, 2, 3, 4, 10), ListMap("good" -> 0.5, "bad" -> -1.0), HashMap("good" -> 0.5, "bad" -> -1.0), TestCaseClassForSerialization("case classes are: ", 10), - TestValMap(Map("you" -> 1.0, "every" -> 2.0, "body" -> 3.0, "a" -> 1.0, - "b" -> 2.0, "c" -> 3.0, "d" -> 4.0)), + TestValMap( + Map("you" -> 1.0, "every" -> 2.0, "body" -> 3.0, "a" -> 1.0, "b" -> 2.0, "c" -> 3.0, "d" -> 4.0) + ), TestValHashMap(HashMap("you" -> 1.0)), Vector(1, 2, 3, 4, 5), TestValMap(null), Some("junk"), DecayedValue(1.0, 2.0), - Moments(100.0), Monoid.plus(Moments(100), Moments(2)), + Moments(100.0), + Monoid.plus(Moments(100), Moments(2)), AveragedValue(100, 32.0), // Serialize an instance of the HLL monoid hllmon.toHLL(42), - Monoid.sum(List(1, 2, 3, 4).map { hllmon.toHLL(_) }), - 'hai) + 
Monoid.sum(List(1, 2, 3, 4).map(hllmon.toHLL(_))), + 'hai + ) .asInstanceOf[List[AnyRef]] serializationRT(test) shouldBe test // HyperLogLogMonoid doesn't have a good equals. :( singleRT(new HyperLogLogMonoid(5)).bits shouldBe 5 } "handle arrays" in { - def arrayRT[T](arr: Array[T]): Unit = { + def arrayRT[T](arr: Array[T]): Unit = serializationRT(List(arr)).head - .asInstanceOf[Array[T]].toList shouldBe (arr.toList) - } + .asInstanceOf[Array[T]] + .toList shouldBe (arr.toList) arrayRT(Array(0)) arrayRT(Array(0.1)) arrayRT(Array("hey")) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/LargePlanTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/LargePlanTest.scala index 930e56ee2a..496ba79de5 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/LargePlanTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/LargePlanTest.scala @@ -8,22 +8,22 @@ import scala.concurrent.duration._ /** * on branch 0.17.x: - * - size=2 took 0.5 seconds - * - size=4 took 0.2 seconds - * - size=8 took 0.3 seconds - * - size=16 took 0.4 seconds - * - size=32 took 0.7 seconds - * - size=64 took 18.9 seconds - * - size=128 timed out (after 60 seconds) + * - size=2 took 0.5 seconds + * - size=4 took 0.2 seconds + * - size=8 took 0.3 seconds + * - size=16 took 0.4 seconds + * - size=32 took 0.7 seconds + * - size=64 took 18.9 seconds + * - size=128 timed out (after 60 seconds) * * on branch cascading3: - * - size=2 took 0.6 seconds - * - size=4 took 0.3 seconds - * - size=8 took 0.3 seconds - * - size=16 took 0.4 seconds - * - size=32 took 0.5 seconds - * - size=64 took 1.2 seconds - * - size=128 took 2.7 seconds + * - size=2 took 0.6 seconds + * - size=4 took 0.3 seconds + * - size=8 took 0.3 seconds + * - size=16 took 0.4 seconds + * - size=32 took 0.5 seconds + * - size=64 took 1.2 seconds + * - size=128 took 2.7 seconds */ class LargePlanTest extends FunSuite { @@ -48,17 +48,16 @@ class LargePlanTest extends FunSuite { val exec = pipe.toIterableExecution val fut = exec.run(Config.empty, Local(true)) val values = Await.result(fut, Timeout) - val secs = "%.1f" format ((System.currentTimeMillis() - t0) / 1000.0) + val secs = "%.1f".format((System.currentTimeMillis() - t0) / 1000.0) assert(true) println(s"size=$size took $secs seconds") } - test("size=2") { run(2) } - test("size=4") { run(4) } - test("size=8") { run(8) } - test("size=16") { run(16) } - test("size=32") { run(32) } - test("size=64") { run(64) } + test("size=2")(run(2)) + test("size=4")(run(4)) + test("size=8")(run(8)) + test("size=16")(run(16)) + test("size=32")(run(32)) + test("size=64")(run(64)) // test("size=128") { run(128) } } - diff --git a/scalding-core/src/test/scala/com/twitter/scalding/LookupJoinTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/LookupJoinTest.scala index 2d09d6c1c0..2cd855eac3 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/LookupJoinTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/LookupJoinTest.scala @@ -12,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import com.twitter.scalding.typed.LookupJoin -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.algebird.Semigroup @@ -25,9 +25,10 @@ object LookupJoinedTest { // Not defined if there is a collision in K and T, so make those unique: def genList(maxTime: Int, maxKey: Int, sz: Int): List[(Int, Int, Int)] = { val rng = new java.util.Random - (0 until sz).view.map { _ => - (rng.nextInt(maxTime), rng.nextInt(maxKey), rng.nextInt) - } + (0 until sz).view + .map { _ => + (rng.nextInt(maxTime), rng.nextInt(maxKey), rng.nextInt) + } .groupBy { case (t, k, v) => (t, k) } .mapValues(_.headOption.toList) .values @@ -41,19 +42,22 @@ class LookupJoinerJob(args: Args) extends Job(args) { val in0 = TypedTsv[(Int, Int, Int)]("input0") val in1 = TypedTsv[(Int, Int, Int)]("input1") - LookupJoin(TypedPipe.from(in0).map { case (t, k, v) => (t, (k, v)) }, - TypedPipe.from(in1).map { case (t, k, v) => (t, (k, v)) }) - .map { - case (t, (k, (v, opt))) => - (t.toString, k.toString, v.toString, opt.toString) + LookupJoin( + TypedPipe.from(in0).map { case (t, k, v) => (t, (k, v)) }, + TypedPipe.from(in1).map { case (t, k, v) => (t, (k, v)) } + ) + .map { case (t, (k, (v, opt))) => + (t.toString, k.toString, v.toString, opt.toString) } .write(TypedTsv[(String, String, String, String)]("output")) - LookupJoin.rightSumming(TypedPipe.from(in0).map { case (t, k, v) => (t, (k, v)) }, - TypedPipe.from(in1).map { case (t, k, v) => (t, (k, v)) }) - .map { - case (t, (k, (v, opt))) => - (t.toString, k.toString, v.toString, opt.toString) + LookupJoin + .rightSumming( + TypedPipe.from(in0).map { case (t, k, v) => (t, (k, v)) }, + TypedPipe.from(in1).map { case (t, k, v) => (t, (k, v)) } + ) + .map { case (t, (k, (v, opt))) => + (t.toString, k.toString, v.toString, opt.toString) } .write(TypedTsv[(String, String, String, String)]("output2")) } @@ -67,7 +71,8 @@ class LookupJoinedTest extends WordSpec with Matchers { def lookup(t: T, k: K): Option[W] = { val ord = Ordering.by { tkw: (T, K, W) => tkw._1 } serv.get(k).flatMap { in1s => - in1s.filter { case (t1, _, _) => Ordering[T].lt(t1, t) } + in1s + .filter { case (t1, _, _) => Ordering[T].lt(t1, t) } .reduceOption(ord.max(_, _)) .map { _._3 @@ -81,21 +86,24 @@ class LookupJoinedTest extends WordSpec with Matchers { implicit val ord: Ordering[(T, K, W)] = Ordering.by { _._1 } - val serv: Map[K, List[(T, K, W)]] = in1.groupBy(_._2).map { - case (k, v) => - (k, v.toList - .sorted + val serv: Map[K, List[(T, K, W)]] = in1.groupBy(_._2).map { case (k, v) => + ( + k, + v.toList.sorted .scanLeft(None: Option[(T, K, W)]) { (old, newer) => - old.map { case (_, _, w) => (newer._1, newer._2, Semigroup.plus(w, newer._3)) } + old + .map { case (_, _, w) => (newer._1, newer._2, Semigroup.plus(w, newer._3)) } .orElse(Some(newer)) } - .collect { case Some(v) => v }) + .collect { case Some(v) => v } + ) } def lookup(t: T, k: K): Option[W] = { val ord = Ordering.by { tkw: (T, K, W) => tkw._1 } serv.get(k).flatMap { in1s => - in1s.filter { case (t1, _, _) => Ordering[T].lt(t1, t) } + in1s + .filter { case (t1, _, _) => Ordering[T].lt(t1, t) } .reduceOption(ord.max(_, _)) .map { _._3 @@ -114,16 +122,16 @@ class LookupJoinedTest extends WordSpec with Matchers { JobTest(new LookupJoinerJob(_)) .source(TypedTsv[(Int, Int, Int)]("input0"), in0) .source(TypedTsv[(Int, Int, Int)]("input1"), in1) - .sink[(String, String, String, String)]( - TypedTsv[(String, String, String, String)]("output")) { outBuf => - 
outBuf.toSet should equal (lookupJoin(in0, in1).toSet) - in0.size should equal (outBuf.size) - } - .sink[(String, String, String, String)]( - TypedTsv[(String, String, String, String)]("output2")) { outBuf => + .sink[(String, String, String, String)](TypedTsv[(String, String, String, String)]("output")) { + outBuf => + outBuf.toSet should equal(lookupJoin(in0, in1).toSet) + in0.size should equal(outBuf.size) + } + .sink[(String, String, String, String)](TypedTsv[(String, String, String, String)]("output2")) { + outBuf => outBuf.toSet should equal(lookupSumJoin(in0, in1).toSet) in0.size should equal(outBuf.size) - } + } .run //.runHadoop .finish() @@ -140,11 +148,13 @@ class WindowLookupJoinerJob(args: Args) extends Job(args) { def gate(left: Int, right: Int) = (left.toLong - right.toLong) < window - LookupJoin.withWindow(TypedPipe.from(in0).map { case (t, k, v) => (t, (k, v)) }, - TypedPipe.from(in1).map { case (t, k, v) => (t, (k, v)) })(gate _) - .map { - case (t, (k, (v, opt))) => - (t.toString, k.toString, v.toString, opt.toString) + LookupJoin + .withWindow( + TypedPipe.from(in0).map { case (t, k, v) => (t, (k, v)) }, + TypedPipe.from(in1).map { case (t, k, v) => (t, (k, v)) } + )(gate _) + .map { case (t, (k, (v, opt))) => + (t.toString, k.toString, v.toString, opt.toString) } .write(TypedTsv[(String, String, String, String)]("output")) } @@ -159,10 +169,10 @@ class WindowLookupJoinedTest extends WordSpec with Matchers { def lookup(t: Int, k: K): Option[W] = { val ord = Ordering.by { tkw: (Int, K, W) => tkw._1 } serv.get(k).flatMap { in1s => - in1s.filter { - case (t1, _, _) => + in1s + .filter { case (t1, _, _) => (t1 < t) && ((t.toLong - t1.toLong) < win) - } + } .reduceOption(ord.max(_, _)) .map { _._3 @@ -184,8 +194,8 @@ class WindowLookupJoinedTest extends WordSpec with Matchers { .arg("window", "100") .source(TypedTsv[(Int, Int, Int)]("input0"), in0) .source(TypedTsv[(Int, Int, Int)]("input1"), in1) - .sink[(String, String, String, String)]( - TypedTsv[(String, String, String, String)]("output")) { outBuf => + .sink[(String, String, String, String)](TypedTsv[(String, String, String, String)]("output")) { + outBuf => val results = outBuf.toList.sorted val correct = windowLookupJoin(in0, in1, 100).toList.sorted def some(it: List[(String, String, String, String)]) = @@ -196,12 +206,11 @@ class WindowLookupJoinedTest extends WordSpec with Matchers { some(results) shouldBe (some(correct)) none(results) shouldBe (none(correct)) - in0.size should equal (outBuf.size) - } + in0.size should equal(outBuf.size) + } .run //.runHadoop .finish() } } } - diff --git a/scalding-core/src/test/scala/com/twitter/scalding/PackTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/PackTest.scala index 6584104b75..fb61419a48 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/PackTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/PackTest.scala @@ -12,12 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tuple.TupleEntry -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.beans.BeanProperty import scala.collection.mutable.Buffer @@ -25,9 +25,8 @@ import scala.collection.mutable.Buffer class IntContainer { private var firstValue = 0 def getFirstValue = firstValue - def setFirstValue(v: Int): Unit = { + def setFirstValue(v: Int): Unit = firstValue = v - } @BeanProperty // Test the other syntax var secondValue = 0 @@ -92,8 +91,7 @@ class FatContainer { case class IntCaseClass(firstValue: Int, secondValue: Int) class ContainerPopulationJob(args: Args) extends Job(args) { - Tsv("input") - .read + Tsv("input").read .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } .pack[IntContainer](('firstValue, 'secondValue) -> 'combined) .project('combined) @@ -103,15 +101,13 @@ class ContainerPopulationJob(args: Args) extends Job(args) { } class ContainerToPopulationJob(args: Args) extends Job(args) { - Tsv("input") - .read + Tsv("input").read .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } .packTo[IntContainer](('firstValue, 'secondValue) -> 'combined) .unpackTo[IntContainer]('combined -> ('firstValue, 'secondValue)) .write(Tsv("output")) - Tsv("input") - .read + Tsv("input").read .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } .packTo[IntCaseClass](('firstValue, 'secondValue) -> 'combined) .unpackTo[IntCaseClass]('combined -> ('firstValue, 'secondValue)) @@ -119,8 +115,7 @@ class ContainerToPopulationJob(args: Args) extends Job(args) { } class FatContainerPopulationJob(args: Args) extends Job(args) { - Tsv("input") - .read + Tsv("input").read .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } .map(('firstValue, 'secondValue) -> 'fatContainer) { v: (Int, Int) => FatContainer.fromFibonacci(v._1, v._2) @@ -131,8 +126,7 @@ class FatContainerPopulationJob(args: Args) extends Job(args) { } class FatContainerToPopulationJob(args: Args) extends Job(args) { - Tsv("input") - .read + Tsv("input").read .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } .map(('firstValue, 'secondValue) -> 'fatContainer) { v: (Int, Int) => FatContainer.fromFibonacci(v._1, v._2) @@ -142,10 +136,7 @@ class FatContainerToPopulationJob(args: Args) extends Job(args) { } class PackTest extends WordSpec with Matchers { - val inputData = List( - (1, 2), - (2, 2), - (3, 2)) + val inputData = List((1, 2), (2, 2), (3, 2)) "A ContainerPopulationJob" should { JobTest(new ContainerPopulationJob(_)) @@ -180,7 +171,8 @@ class PackTest extends WordSpec with Matchers { } val fatInputData = List((8, 13)) - val fatCorrect = List(8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811) + val fatCorrect = List(8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, + 28657, 46368, 75025, 121393, 196418, 317811) "A FatContainerPopulationJob" should { JobTest(new FatContainerPopulationJob(_)) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/PartitionSourceTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/PartitionSourceTest.scala index d70e542f70..c5001e9de0 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/PartitionSourceTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/PartitionSourceTest.scala @@ -12,14 +12,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.io.File -import scala.io.{ Source => ScalaSource } +import scala.io.{Source => ScalaSource} -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import cascading.tap.SinkMode import cascading.tuple.Fields @@ -27,7 +27,7 @@ import cascading.tuple.TupleEntry import cascading.util.Util import cascading.tap.partition.Partition -import com.twitter.scalding.{ PartitionedTsv => StandardPartitionedTsv } +import com.twitter.scalding.{PartitionedTsv => StandardPartitionedTsv} object PartitionSourceTestHelpers { import Dsl._ @@ -46,7 +46,8 @@ object PartitionSourceTestHelpers { // Define once, here, otherwise testMode.getWritePathFor() won't work val DelimitedPartitionedTsv = StandardPartitionedTsv("base", "/", 'col1) - val CustomPartitionedTsv = StandardPartitionedTsv("base", new CustomPartition('col1, 'col2), false, Fields.ALL, SinkMode.REPLACE) + val CustomPartitionedTsv = + StandardPartitionedTsv("base", new CustomPartition('col1, 'col2), false, Fields.ALL, SinkMode.REPLACE) val PartialPartitionedTsv = StandardPartitionedTsv("base", "/", ('col1, 'col2), false, ('col1, 'col3)) } @@ -101,7 +102,7 @@ class DelimitedPartitionSourceTest extends WordSpec with Matchers { val directory = new File(testMode.getWritePathFor(DelimitedPartitionedTsv)) - directory.listFiles().map({ _.getName() }).toSet shouldBe Set("A", "B") + directory.listFiles().map { _.getName() }.toSet shouldBe Set("A", "B") val aSource = ScalaSource.fromFile(new File(directory, "A/part-00000-00000")) val bSource = ScalaSource.fromFile(new File(directory, "B/part-00000-00001")) @@ -135,7 +136,7 @@ class CustomPartitionSourceTest extends WordSpec with Matchers { val directory = new File(testMode.getWritePathFor(CustomPartitionedTsv)) - directory.listFiles().map({ _.getName() }).toSet shouldBe Set("{A}->{x}", "{B}->{y}") + directory.listFiles().map { _.getName() }.toSet shouldBe Set("{A}->{x}", "{B}->{y}") val aSource = ScalaSource.fromFile(new File(directory, "{A}->{x}/part-00000-00000")) val bSource = ScalaSource.fromFile(new File(directory, "{B}->{y}/part-00000-00001")) @@ -170,7 +171,7 @@ class PartialPartitionSourceTest extends WordSpec with Matchers { val directory = new File(testMode.getWritePathFor(PartialPartitionedTsv)) - directory.listFiles().map({ _.getName() }).toSet shouldBe Set("A", "B") + directory.listFiles().map { _.getName() }.toSet shouldBe Set("A", "B") val aSource = ScalaSource.fromFile(new File(directory, "A/x/part-00000-00000")) val bSource = ScalaSource.fromFile(new File(directory, "B/y/part-00000-00001")) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala index 698206cea6..88f0d0470d 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala @@ -1,7 +1,7 @@ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } -import org.apache.hadoop.fs.{ Path => HadoopPath, PathFilter } +import org.scalatest.{Matchers, WordSpec} +import org.apache.hadoop.fs.{Path => HadoopPath, PathFilter} class PathFilterTest extends WordSpec with Matchers { "RichPathFilter" should { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ReduceOperationsTest.scala 
b/scalding-core/src/test/scala/com/twitter/scalding/ReduceOperationsTest.scala index d8f8d2675f..7add21b039 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ReduceOperationsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ReduceOperationsTest.scala @@ -12,17 +12,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class SortWithTakeJob(args: Args) extends Job(args) { try { Tsv("input0", ('key, 'item_id, 'score)).read .groupBy('key) { _.sortWithTake[(Long, Double)]((('item_id, 'score), 'top_items), 5) { - (item_0: (Long, Double), item_1: (Long, Double)) => if (item_0._2 == item_1._2) { item_0._1 > item_1._1 } else { item_0._2 > item_1._2 } + (item_0: (Long, Double), item_1: (Long, Double)) => + if (item_0._2 == item_1._2) { item_0._1 > item_1._1 } + else { item_0._2 > item_1._2 } } } .map('top_items -> 'top_items) { @@ -78,8 +80,8 @@ class ApproximateUniqueCountJob(args: Args) extends Job(args) { .groupBy('category) { _.approximateUniqueCount[String]('os -> 'os_count) } - .map('os_count -> 'os_count) { - osCount: Double => osCount.toLong + .map('os_count -> 'os_count) { osCount: Double => + osCount.toLong } .write(Tsv("output0")) } catch { @@ -89,7 +91,17 @@ class ApproximateUniqueCountJob(args: Args) extends Job(args) { class ReduceOperationsTest extends WordSpec with Matchers { import Dsl._ - val inputData = List(("a", 2L, 3.0), ("a", 3L, 3.0), ("a", 1L, 3.5), ("b", 1L, 6.0), ("b", 2L, 5.0), ("b", 3L, 4.0), ("b", 4L, 3.0), ("b", 5L, 2.0), ("b", 6L, 1.0)) + val inputData = List( + ("a", 2L, 3.0), + ("a", 3L, 3.0), + ("a", 1L, 3.5), + ("b", 1L, 6.0), + ("b", 2L, 5.0), + ("b", 3L, 4.0), + ("b", 4L, 3.0), + ("b", 5L, 2.0), + ("b", 6L, 1.0) + ) "A sortWithTake job" should { JobTest(new SortWithTakeJob(_)) @@ -98,7 +110,8 @@ class ReduceOperationsTest extends WordSpec with Matchers { "grouped list" in { val whatWeWant: Map[String, String] = Map( "a" -> List((1L, 3.5), (3L, 3.0), (2L, 3.0)).toString, - "b" -> List((1L, 6.0), (2L, 5.0), (3L, 4.0), (4L, 3.0), (5L, 2.0)).toString) + "b" -> List((1L, 6.0), (2L, 5.0), (3L, 4.0), (4L, 3.0), (5L, 2.0)).toString + ) val whatWeGet: Map[String, List[(Long, Double)]] = buf.toMap whatWeGet.get("a").getOrElse("apples") shouldBe (whatWeWant.get("a").getOrElse("oranges")) whatWeGet.get("b").getOrElse("apples") shouldBe (whatWeWant.get("b").getOrElse("oranges")) @@ -114,7 +127,8 @@ class ReduceOperationsTest extends WordSpec with Matchers { "grouped list" in { val whatWeWant: Map[String, String] = Map( "a" -> List((1L, 3.5), (2L, 3.0), (3L, 3.0)).toString, - "b" -> List((1L, 6.0), (2L, 5.0), (3L, 4.0), (4L, 3.0), (5L, 2.0)).toString) + "b" -> List((1L, 6.0), (2L, 5.0), (3L, 4.0), (4L, 3.0), (5L, 2.0)).toString + ) val whatWeGet: Map[String, List[(Long, Double)]] = buf.toMap whatWeGet.get("a").getOrElse("apples") shouldBe (whatWeWant.get("a").getOrElse("oranges")) whatWeGet.get("b").getOrElse("apples") shouldBe (whatWeWant.get("b").getOrElse("oranges")) @@ -131,7 +145,8 @@ class ReduceOperationsTest extends WordSpec with Matchers { "grouped list" in { val whatWeWant: Map[String, String] = Map( "a" -> List((3L, 3.0), (2L, 3.0), (1L, 3.5)).toString, - "b" -> List((6L, 1.0), (5L, 2.0), (4L, 3.0), (3L, 4.0), (2L, 
5.0)).toString) + "b" -> List((6L, 1.0), (5L, 2.0), (4L, 3.0), (3L, 4.0), (2L, 5.0)).toString + ) val whatWeGet: Map[String, List[(Long, Double)]] = buf.toMap whatWeGet.get("a").getOrElse("apples") shouldBe (whatWeWant.get("a").getOrElse("oranges")) whatWeGet.get("b").getOrElse("apples") shouldBe (whatWeWant.get("b").getOrElse("oranges")) @@ -145,15 +160,14 @@ class ReduceOperationsTest extends WordSpec with Matchers { val inputData = List( ("laptop", "mbp 15' retina", "macosx"), ("mobile", "iphone5", "ios"), - ("mobile", "droid x", "android")) + ("mobile", "droid x", "android") + ) JobTest(new ApproximateUniqueCountJob(_)) .source(Tsv("input0", ('category, 'model, 'os)), inputData) .sink[(String, Long)](Tsv("output0")) { buf => "grouped OS count" in { - val whatWeWant: Map[String, Long] = Map( - "laptop" -> 1, - "mobile" -> 2) + val whatWeWant: Map[String, Long] = Map("laptop" -> 1, "mobile" -> 2) val whatWeGet: Map[String, Long] = buf.toMap whatWeGet should have size 2 whatWeGet.get("laptop").getOrElse("apples") shouldBe (whatWeWant.get("laptop").getOrElse("oranges")) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ReferencedClassFinderTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ReferencedClassFinderTest.scala index 6d6af5bf8d..e13898d91b 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ReferencedClassFinderTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ReferencedClassFinderTest.scala @@ -1,7 +1,7 @@ package com.twitter.scalding import org.apache.hadoop.io.BytesWritable -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} case class C1(a: Int) case class C2(b: Int) @@ -37,7 +37,8 @@ class ReferencedClassFinderTest extends WordSpec with Matchers { "Identify and tokenize used case classes" in { val job = JobTest(new ReferencedClassFinderExample(_)) .arg("output", "outputFile") - .sink[(C2, C3)](TypedTsv[(C2, C3)]("outputFile")){ _: Any => Unit }.initJob(false) + .sink[(C2, C3)](TypedTsv[(C2, C3)]("outputFile")) { _: Any => Unit } + .initJob(false) val config = Config.tryFrom(job.config).get val tokenizedClasses = config.getCascadingSerializationTokens.values.toSet val kryoRegisteredClasses = config.getKryoRegisteredClasses @@ -56,15 +57,15 @@ class ReferencedClassFinderTest extends WordSpec with Matchers { tokenizedClasses should not contain (classOf[BytesWritable].getName) kryoRegisteredClasses should not contain (classOf[BytesWritable]) // classOf[Int] will return the primitive int, so manually pass in scala's wrapper - tokenizedClasses should not contain ("scala.Int") - tokenizedClasses should not contain ("scala.Array") + tokenizedClasses should not contain "scala.Int" + tokenizedClasses should not contain "scala.Array" } "Run successfully" in { JobTest(new ReferencedClassFinderExample(_)) .arg("output", "outputFile") - .sink[(C2, C3)](TypedTsv[(C2, C3)]("outputFile")){ _: Any => Unit } + .sink[(C2, C3)](TypedTsv[(C2, C3)]("outputFile")) { _: Any => Unit } .runHadoop } } -} \ No newline at end of file +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/RegressionTests.scala b/scalding-core/src/test/scala/com/twitter/scalding/RegressionTests.scala index bd8116e96a..a2a01837c3 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/RegressionTests.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/RegressionTests.scala @@ -5,15 +5,17 @@ import org.scalatest.FunSuite class RegressionTests extends FunSuite { test("hashJoins + merges that fail in cascading 
3") { val p1 = - TypedPipe.from(List(1, 2)) + TypedPipe + .from(List(1, 2)) .cross(TypedPipe.from(List(3, 4))) val p2 = - TypedPipe.from(List(5, 6)) + TypedPipe + .from(List(5, 6)) .cross(TypedPipe.from(List(8, 9))) - val p3 = (p1 ++ p2) - val p4 = (TypedPipe.from(List((8, 1), (10, 2))) ++ p3) + val p3 = p1 ++ p2 + val p4 = TypedPipe.from(List((8, 1), (10, 2))) ++ p3 val expected = List((1, 3), (1, 4), (2, 3), (2, 4), (5, 8), (5, 9), (6, 8), (6, 9), (8, 1), (10, 2)) val values = p4.toIterableExecution diff --git a/scalding-core/src/test/scala/com/twitter/scalding/RichPipeSpecification.scala b/scalding-core/src/test/scala/com/twitter/scalding/RichPipeSpecification.scala index 1e204c279f..6b0f1d5229 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/RichPipeSpecification.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/RichPipeSpecification.scala @@ -3,50 +3,55 @@ package com.twitter.scalding import java.util.UUID import org.scalacheck.Prop._ -import org.scalacheck.{ Gen, Properties } +import org.scalacheck.{Gen, Properties} object RichPipeSpecification extends Properties("RichPipe") { import Gen._ - import cascading.pipe.{ Pipe => CPipe } + import cascading.pipe.{Pipe => CPipe} def extractPipeNumber(pipeName: String) = pipeName match { case RichPipe.FormerAssignedPipeNamePattern(pipenum) => pipenum.toInt - case _ => 0 + case _ => 0 } /* Note: in these tests, we can never compare to equality with the basePipeNumber or offsets from that; as the pipe assigned names number sequence is a global atomic integer, and the test framework might run other tests in parallel to this, we can only count on it being monotonically increasing. */ - property("assignName carries over the old number " + - "if it was already an assigned name") = forAll(posNum[Int]) { (oldNum: Int) => + property( + "assignName carries over the old number " + + "if it was already an assigned name" + ) = forAll(posNum[Int]) { (oldNum: Int) => val basePipeNumber = extractPipeNumber(RichPipe.getNextName) - val p = new CPipe(s"_pipe_${oldNum}") + val p = new CPipe(s"_pipe_$oldNum") val ap = RichPipe.assignName(p) val newNum = extractPipeNumber(ap.getName) - (newNum > basePipeNumber) && ap.getName.endsWith(s"-${oldNum}") + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$oldNum") } - property("assignName carries over the last (12-hexdigits) group from the UUID " + - "if the old name included one") = forAll(alphaStr, uuid, alphaStr) { - (prefix: String, uuid: UUID, suffix: String) => - val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + property( + "assignName carries over the last (12-hexdigits) group from the UUID " + + "if the old name included one" + ) = forAll(alphaStr, uuid, alphaStr) { (prefix: String, uuid: UUID, suffix: String) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) - val lastGroup = uuid.toString.split("-").last - val p = new CPipe(prefix + uuid + suffix) - val ap = RichPipe.assignName(p) + val lastGroup = uuid.toString.split("-").last + val p = new CPipe(prefix + uuid + suffix) + val ap = RichPipe.assignName(p) - val newNum = extractPipeNumber(ap.getName) + val newNum = extractPipeNumber(ap.getName) - (newNum > basePipeNumber) && ap.getName.endsWith(s"-${lastGroup}") + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$lastGroup") } - property("assignName carries over the last (12-hexdigits) group from the *last* UUID " + - "if the old name included more than one") = forAll(alphaStr, uuid, alphaStr, uuid, alphaStr) { + property( + "assignName carries over the 
last (12-hexdigits) group from the *last* UUID " + + "if the old name included more than one" + ) = forAll(alphaStr, uuid, alphaStr, uuid, alphaStr) { (prefix: String, uuid1: UUID, middle: String, uuid: UUID, suffix: String) => val basePipeNumber = extractPipeNumber(RichPipe.getNextName) @@ -56,50 +61,53 @@ object RichPipeSpecification extends Properties("RichPipe") { val newNum = extractPipeNumber(ap.getName) - (newNum > basePipeNumber) && ap.getName.endsWith(s"-${lastGroup}") + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$lastGroup") } - property("assignName carries over the over the old number " - + "if it was already an assigned name carrying bits from a UUID") = forAll(posNum[Int], uuid) { - (oldNum: Int, uuid: UUID) => - val basePipeNumber = extractPipeNumber(RichPipe.getNextName) - val lastGroup = uuid.toString.split("-").last + property( + "assignName carries over the over the old number " + + "if it was already an assigned name carrying bits from a UUID" + ) = forAll(posNum[Int], uuid) { (oldNum: Int, uuid: UUID) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + val lastGroup = uuid.toString.split("-").last - val p = new CPipe(s"_pipe_${oldNum}-${lastGroup}") - val ap = RichPipe.assignName(p) + val p = new CPipe(s"_pipe_$oldNum-$lastGroup") + val ap = RichPipe.assignName(p) - val newNum = extractPipeNumber(ap.getName) + val newNum = extractPipeNumber(ap.getName) - (newNum > basePipeNumber) && ap.getName.endsWith(s"-${oldNum}") + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$oldNum") } - val smallNames = Gen.choose(0, 12) flatMap { sz => Gen.listOfN(sz, alphaChar) } map (_.mkString) + val smallNames = Gen.choose(0, 12).flatMap(sz => Gen.listOfN(sz, alphaChar)).map(_.mkString) - val longNames = Gen.choose(13, 256) flatMap { sz => Gen.listOfN(sz, alphaChar) } map (_.mkString) + val longNames = Gen.choose(13, 256).flatMap(sz => Gen.listOfN(sz, alphaChar)).map(_.mkString) - property("assignName carries over the whole old name " - + "if it's 12 characters or less") = forAll(smallNames) { - (name: String) => - val basePipeNumber = extractPipeNumber(RichPipe.getNextName) - val p = new CPipe(name) - val ap = RichPipe.assignName(p) + property( + "assignName carries over the whole old name " + + "if it's 12 characters or less" + ) = forAll(smallNames) { (name: String) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + val p = new CPipe(name) + val ap = RichPipe.assignName(p) - val newNum = extractPipeNumber(ap.getName) + val newNum = extractPipeNumber(ap.getName) - (newNum > basePipeNumber) && ap.getName.endsWith(s"-${name}") + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$name") } - property("assignName carries over the last 12 characters of the old name " - + "if it's more than 12 characters") = forAll(longNames) { - (name: String) => - val basePipeNumber = extractPipeNumber(RichPipe.getNextName) - val nameEnd = name.subSequence(name.length - 12, name.length) - val p = new CPipe(name) - val ap = RichPipe.assignName(p) + property( + "assignName carries over the last 12 characters of the old name " + + "if it's more than 12 characters" + ) = forAll(longNames) { (name: String) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + val nameEnd = name.subSequence(name.length - 12, name.length) + val p = new CPipe(name) + val ap = RichPipe.assignName(p) - val newNum = extractPipeNumber(ap.getName) + val newNum = extractPipeNumber(ap.getName) - (newNum > basePipeNumber) && ap.getName.endsWith(s"-${nameEnd}") + (newNum > 
basePipeNumber) && ap.getName.endsWith(s"-$nameEnd") } -} \ No newline at end of file +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala index 7823077261..9e415cee3f 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala @@ -1,21 +1,17 @@ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} /** - * Simple Example: First group data by gender and then sort by height reverse order. - * Then add another column for each group which is the rank order of the height. + * Simple Example: First group data by gender and then sort by height reverse order. Then add another column + * for each group which is the rank order of the height. */ class AddRankingWithScanLeft(args: Args) extends Job(args) { - Tsv("input1", ('gender, 'height)) - .read + Tsv("input1", ('gender, 'height)).read .groupBy('gender) { group => group.sortBy('height).reverse - group.scanLeft(('height) -> ('rank))((0L)) { - (rank: Long, user_id: Double) => - { - (rank + 1L) - } + group.scanLeft('height -> 'rank)(0L) { (rank: Long, user_id: Double) => + (rank + 1L) } } // scanLeft generates an extra line per group, thus remove it @@ -28,12 +24,8 @@ class ScanLeftTest extends WordSpec with Matchers { import Dsl._ // --- A simple ranking job - val sampleInput1 = List( - ("male", "165.2"), - ("female", "172.2"), - ("male", "184.1"), - ("male", "125.4"), - ("female", "128.6")) + val sampleInput1 = + List(("male", "165.2"), ("female", "172.2"), ("male", "184.1"), ("male", "125.4"), ("female", "128.6")) // Each group sorted and ranking added highest person to shortest val expectedOutput1 = Set( @@ -41,7 +33,8 @@ class ScanLeftTest extends WordSpec with Matchers { ("male", 165.2, 2), ("male", 125.4, 3), ("female", 172.2, 1), - ("female", 128.6, 2)) + ("female", 128.6, 2) + ) "A simple ranking scanleft job" should { JobTest(new AddRankingWithScanLeft(_)) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/SideEffectTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/SideEffectTest.scala index b2964fb64e..bf8c92e725 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/SideEffectTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/SideEffectTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} /* * Zip uses side effect construct to create zipped list. 
@@ -28,18 +28,17 @@ class Zip(args: Args) extends Job(args) { def release(): Unit = () } - val zipped = Tsv("line", ('line)).pipe - .using { createState } - .flatMap[String, (String, String)] ('line -> ('l1, 'l2)) { - case (accu, line) => - if (accu.lastLine == null) { - accu.lastLine = line - List() - } else { - val zipped = List((accu.lastLine, line)) - accu.lastLine = line - zipped - } + val zipped = Tsv("line", 'line).pipe + .using(createState) + .flatMap[String, (String, String)]('line -> ('l1, 'l2)) { case (accu, line) => + if (accu.lastLine == null) { + accu.lastLine = line + List() + } else { + val zipped = List((accu.lastLine, line)) + accu.lastLine = line + zipped + } } .project('l1, 'l2) @@ -49,7 +48,7 @@ class Zip(args: Args) extends Job(args) { class SideEffectTest extends WordSpec with Matchers with FieldConversions { "Zipper should do create zipped sequence. Coded with side effect" should { JobTest(new Zip(_)) - .source(Tsv("line", ('line)), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"))) + .source(Tsv("line", 'line), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"))) .sink[(String, String)](Tsv("zipped")) { ob => "correctly compute zipped sequence" in { val res = ob.toList @@ -73,7 +72,7 @@ class ZipBuffer(args: Args) extends Job(args) { def release(): Unit = () } - val zipped = Tsv("line", ('line)).pipe + val zipped = Tsv("line", 'line).pipe .map('line -> 'oddOrEven) { line: String => line.substring(line.length - 1).toInt % 2 match { case 0 => "even" @@ -81,15 +80,13 @@ class ZipBuffer(args: Args) extends Job(args) { } } .groupBy('oddOrEven) { - _.using { createState } + _.using(createState) .mapStream('line -> ('l1, 'l2)) { (accu, iter: Iterator[String]) => - { - accu.lastLine = iter.next() - for (line <- iter) yield { - val result = (accu.lastLine, line) - accu.lastLine = line - result - } + accu.lastLine = iter.next() + for (line <- iter) yield { + val result = (accu.lastLine, line) + accu.lastLine = line + result } } } @@ -101,11 +98,22 @@ class ZipBuffer(args: Args) extends Job(args) { class SideEffectBufferTest extends WordSpec with Matchers with FieldConversions { "ZipBuffer should do create two zipped sequences, one for even lines and one for odd lines. Coded with side effect" should { JobTest("com.twitter.scalding.ZipBuffer") - .source(Tsv("line", ('line)), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"), Tuple1("line5"), Tuple1("line6"))) + .source( + Tsv("line", 'line), + List( + Tuple1("line1"), + Tuple1("line2"), + Tuple1("line3"), + Tuple1("line4"), + Tuple1("line5"), + Tuple1("line6") + ) + ) .sink[(String, String)](Tsv("zipped")) { ob => "correctly compute zipped sequence" in { val res = ob.toList.sorted - val expected = List(("line1", "line3"), ("line3", "line5"), ("line2", "line4"), ("line4", "line6")).sorted + val expected = + List(("line1", "line3"), ("line3", "line5"), ("line2", "line4"), ("line4", "line6")).sorted res shouldBe expected } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/SkewJoinTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/SkewJoinTest.scala index 354aa08800..48ff6242cf 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/SkewJoinTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/SkewJoinTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.collection.mutable.Buffer @@ -23,10 +23,11 @@ class SkewJoinJob(args: Args) extends Job(args) { val sampleRate = args.getOrElse("sampleRate", "0.001").toDouble val reducers = args.getOrElse("reducers", "-1").toInt val replicationFactor = args.getOrElse("replicationFactor", "1").toInt - val replicator = if (args.getOrElse("replicator", "a") == "a") - SkewReplicationA(replicationFactor) - else - SkewReplicationB() + val replicator = + if (args.getOrElse("replicator", "a") == "a") + SkewReplicationA(replicationFactor) + else + SkewReplicationB() val in0 = Tsv("input0").read.mapTo((0, 1, 2) -> ('x1, 'y1, 's1)) { input: (Int, Int, Int) => input } val in1 = Tsv("input1").read.mapTo((0, 1, 2) -> ('x2, 'y2, 's2)) { input: (Int, Int, Int) => input } @@ -47,13 +48,18 @@ object JoinTestHelper { val rng = new java.util.Random def generateInput(size: Int, max: Int): List[(String, String, String)] = { def next: String = rng.nextInt(max).toString - (0 to size).map { i => (next, next, next) }.toList + (0 to size).map(i => (next, next, next)).toList } type JoinResult = (Int, Int, Int, Int, Int, Int) - def runJobWithArguments(fn: (Args) => Job, sampleRate: Double = 0.001, reducers: Int = -1, - replicationFactor: Int = 1, replicator: String = "a"): (List[JoinResult], List[JoinResult]) = { + def runJobWithArguments( + fn: (Args) => Job, + sampleRate: Double = 0.001, + reducers: Int = -1, + replicationFactor: Int = 1, + replicator: String = "a" + ): (List[JoinResult], List[JoinResult]) = { val skewResult = Buffer[JoinResult]() val innerResult = Buffer[JoinResult]() @@ -64,8 +70,8 @@ object JoinTestHelper { .arg("replicator", replicator) .source(Tsv("input0"), generateInput(1000, 100)) .source(Tsv("input1"), generateInput(100, 100)) - .sink[(Int, Int, Int, Int, Int, Int)](Tsv("output")) { outBuf => skewResult ++= outBuf } - .sink[(Int, Int, Int, Int, Int, Int)](Tsv("jws-output")) { outBuf => innerResult ++= outBuf } + .sink[(Int, Int, Int, Int, Int, Int)](Tsv("output"))(outBuf => skewResult ++= outBuf) + .sink[(Int, Int, Int, Int, Int, Int)](Tsv("jws-output"))(outBuf => innerResult ++= outBuf) .run //.runHadoop //this takes MUCH longer to run. 
Commented out by default, but tests pass on my machine .finish() @@ -128,10 +134,11 @@ class CollidingKeySkewJoinJob(args: Args) extends Job(args) { val sampleRate = args.getOrElse("sampleRate", "0.001").toDouble val reducers = args.getOrElse("reducers", "-1").toInt val replicationFactor = args.getOrElse("replicationFactor", "1").toInt - val replicator = if (args.getOrElse("replicator", "a") == "a") - SkewReplicationA(replicationFactor) - else - SkewReplicationB() + val replicator = + if (args.getOrElse("replicator", "a") == "a") + SkewReplicationA(replicationFactor) + else + SkewReplicationB() val in0 = Tsv("input0").read.mapTo((0, 1, 2) -> ('k1, 'k3, 'v1)) { input: (Int, Int, Int) => input } val in1 = Tsv("input1").read.mapTo((0, 1, 2) -> ('k2, 'k3, 'v2)) { input: (Int, Int, Int) => input } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/SourceSpec.scala b/scalding-core/src/test/scala/com/twitter/scalding/SourceSpec.scala index 51c0e80ad1..0eb57c8ac7 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/SourceSpec.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/SourceSpec.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import cascading.pipe.Pipe import cascading.tuple.Fields @@ -47,8 +47,10 @@ class SourceSpec extends WordSpec with Matchers { } } - class DailySuffixTsvSecond(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends DailySuffixSource(prefix, dateRange) with DelimitedScheme { + class DailySuffixTsvSecond(prefix: String, fs: Fields = Fields.ALL)( + override implicit val dateRange: DateRange + ) extends DailySuffixSource(prefix, dateRange) + with DelimitedScheme { override val fields = fs } @@ -65,55 +67,63 @@ class SourceSpec extends WordSpec with Matchers { } } -case class AddOneTsv(p: String) extends FixedPathSource(p) - with DelimitedScheme with Mappable[(Int, String, String)] { +case class AddOneTsv(p: String) + extends FixedPathSource(p) + with DelimitedScheme + with Mappable[(Int, String, String)] { import Dsl._ import TDsl._ override val transformInTest = true override val sourceFields = new Fields("one", "two", "three") override def converter[U >: (Int, String, String)] = - TupleConverter.asSuperConverter[(Int, String, String), U](implicitly[TupleConverter[(Int, String, String)]]) - override def transformForRead(p: Pipe) = { - p.mapTo((0, 1) -> ('one, 'two, 'three)) { - t: (Int, String) => t :+ "1" + TupleConverter.asSuperConverter[(Int, String, String), U]( + implicitly[TupleConverter[(Int, String, String)]] + ) + override def transformForRead(p: Pipe) = + p.mapTo((0, 1) -> ('one, 'two, 'three)) { t: (Int, String) => + t :+ "1" } - } } -case class RemoveOneTsv(p: String) extends FixedPathSource(p) - with DelimitedScheme with Mappable[(Int, String, String)] { +case class RemoveOneTsv(p: String) + extends FixedPathSource(p) + with DelimitedScheme + with Mappable[(Int, String, String)] { override val transformInTest = true import Dsl._ override val sourceFields = new Fields("one", "two", "three") override def converter[U >: (Int, String, String)] = - TupleConverter.asSuperConverter[(Int, String, String), U](implicitly[TupleConverter[(Int, String, String)]]) - 
override def transformForWrite(p: Pipe) = { - p.mapTo(('one, 'two, 'three) -> (0, 1)) { - t: (Int, String, String) => (t._1, t._2) + TupleConverter.asSuperConverter[(Int, String, String), U]( + implicitly[TupleConverter[(Int, String, String)]] + ) + override def transformForWrite(p: Pipe) = + p.mapTo(('one, 'two, 'three) -> (0, 1)) { t: (Int, String, String) => + (t._1, t._2) } - } } class AddRemoveOneJob(args: Args) extends Job(args) { - AddOneTsv("input") - .read + AddOneTsv("input").read //just for fun lets just switch all 1s with 2s .map('three -> 'three) { s: String => "2" } - .write(RemoveOneTsv("output")) } class MapTypedPipe(args: Args) extends Job(args) { - TypedPipe.from(TypedText.tsv[(Int, String)]("input")) + TypedPipe + .from(TypedText.tsv[(Int, String)]("input")) .map(MapFunctionAndThenTest.mapFunction) .write(TypedText.tsv[(Int, String, Int)]("output")) } class IdentityTypedPipe(args: Args) extends Job(args) { - TypedPipe.from( - TypedText.tsv[(Int, String)]("input") - .andThen(MapFunctionAndThenTest.mapFunction)) + TypedPipe + .from( + TypedText + .tsv[(Int, String)]("input") + .andThen(MapFunctionAndThenTest.mapFunction) + ) .write(TypedText.tsv[(Int, String, Int)]("output")) } @@ -129,7 +139,7 @@ class TypedPipeAndThenTest extends WordSpec with Matchers { "Mappable.andThen is like TypedPipe.map" should { JobTest(new MapTypedPipe(_)) .source(TypedText.tsv[(Int, String)]("input"), input) - .typedSink(TypedText.tsv[(Int, String, Int)]("output")){ outputBuffer => + .typedSink(TypedText.tsv[(Int, String, Int)]("output")) { outputBuffer => val outMap = outputBuffer.toList "TypedPipe return proper results" in { outMap should have size 3 @@ -141,7 +151,7 @@ class TypedPipeAndThenTest extends WordSpec with Matchers { JobTest(new IdentityTypedPipe(_)) .source(TypedText.tsv[(Int, String)]("input"), input) - .typedSink(TypedText.tsv[(Int, String, Int)]("output")){ outputBuffer => + .typedSink(TypedText.tsv[(Int, String, Int)]("output")) { outputBuffer => val outMap = outputBuffer.toList "Mappable.andThen return proper results" in { outMap should have size 3 diff --git a/scalding-core/src/test/scala/com/twitter/scalding/StatsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/StatsTest.scala index 664ee49bfc..59fa003e32 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/StatsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/StatsTest.scala @@ -1,14 +1,15 @@ package com.twitter.scalding import cascading.flow.FlowException -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.util.Try class StatsTestJob1(args: Args) extends Job(args) with CounterVerification { val nonZero = Stat("number of non-zero records", "stats") - TypedPipe.from(TypedTsv[(String, Int)](args("input"))) + TypedPipe + .from(TypedTsv[(String, Int)](args("input"))) .map { kv => if (kv._2 != 0) nonZero.inc() (kv._1.toLowerCase, kv._2) @@ -29,14 +30,13 @@ class StatsTest extends WordSpec with Matchers { val goodInput = List(("a", 0), ("b", 1), ("c", 2)) val badInput = List(("a", 0), ("b", 0), ("c", 0)) - def runJobTest[T: TupleSetter](f: Args => Job, input: List[T]): Unit = { + def runJobTest[T: TupleSetter](f: Args => Job, input: List[T]): Unit = JobTest(f) .arg("input", "input") .arg("output", "output") .source(TypedTsv[(String, Int)]("input"), input) - .sink[(String, Int)](TypedTsv[(String, Int)]("output")){ outBuf => outBuf shouldBe input } + .sink[(String, Int)](TypedTsv[(String, Int)]("output"))(outBuf => outBuf shouldBe input) 
.run - } "StatsTestJob" should { "pass if verifyCounters() is true" in { @@ -52,7 +52,10 @@ class StatsTest extends WordSpec with Matchers { it should { "skip verifyCounters() if job fails" in { - (the[FlowException] thrownBy runJobTest(new StatsTestJob1(_), List((null, 0)))).getCause.getCause shouldBe a[NullPointerException] + (the[FlowException] thrownBy runJobTest( + new StatsTestJob1(_), + List((null, 0)) + )).getCause.getCause shouldBe a[NullPointerException] } } @@ -62,4 +65,4 @@ class StatsTest extends WordSpec with Matchers { } } -} \ No newline at end of file +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/StringUtilityTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/StringUtilityTest.scala index eaae37b2a8..fa02d1fbcc 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/StringUtilityTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/StringUtilityTest.scala @@ -1,6 +1,6 @@ package com.twitter.scalding -import org.scalatest.{ PropSpec, Matchers, WordSpec } +import org.scalatest.{Matchers, PropSpec, WordSpec} import org.scalacheck.Prop.forAll import org.scalatest.prop.Checkers import org.scalacheck.Gen @@ -51,12 +51,11 @@ class StringUtilityPropertyTest extends PropSpec with Checkers { property("fastSplit(s, sep) should match s.split(sep, -1) for non-regex sep") { check { - forAll(randomStringGen, randomSeparator) { - (str, separator) => - val t = str.mkString("") - val r1 = t.split(separator, -1).toList - val r2 = StringUtility.fastSplit(t, separator) - r1 == r2 + forAll(randomStringGen, randomSeparator) { (str, separator) => + val t = str.mkString("") + val r1 = t.split(separator, -1).toList + val r2 = StringUtility.fastSplit(t, separator) + r1 == r2 } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TemplateSourceTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TemplateSourceTest.scala index 7ab5ff7bc2..6a56711a5a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TemplateSourceTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TemplateSourceTest.scala @@ -12,14 +12,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import java.io.File -import scala.io.{ Source => ScalaSource } +import scala.io.{Source => ScalaSource} -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class TemplateTestJob(args: Args) extends Job(args) { try { @@ -51,7 +51,7 @@ class TemplateSourceTest extends WordSpec with Matchers { val directory = new File(testMode.getWritePathFor(TemplatedTsv("base", "%s", 'col1))) - directory.listFiles().map({ _.getName() }).toSet shouldBe Set("A", "B") + directory.listFiles().map { _.getName() }.toSet shouldBe Set("A", "B") val aSource = ScalaSource.fromFile(new File(directory, "A/part-00000")) val bSource = ScalaSource.fromFile(new File(directory, "B/part-00000")) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala index 6ba9ae5a10..d92c0f02c7 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala @@ -1,9 +1,9 @@ package com.twitter.scalding import cascading.tap.Tap -import cascading.tuple.{ Fields, Tuple } +import cascading.tuple.{Fields, Tuple} import scala.collection.mutable.Buffer -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class TestTapFactoryTest extends WordSpec with Matchers { "A test tap created by TestTapFactory" should { @@ -14,19 +14,19 @@ class TestTapFactoryTest extends WordSpec with Matchers { // Map of sources to use when creating the tap-- does not contain testSource val emptySourceMap = Map[Source, Buffer[Tuple]]() - val testMode = Test { emptySourceMap.get(_) } + val testMode = Test(emptySourceMap.get(_)) val testTapFactory = TestTapFactory(testSource, new Fields()) def createIllegalTap(accessMode: AccessMode): Tap[Any, Any, Any] = testTapFactory.createTap(accessMode)(testMode).asInstanceOf[Tap[Any, Any, Any]] - the[IllegalArgumentException] thrownBy { + (the[IllegalArgumentException] thrownBy { createIllegalTap(Read) - } should have message ("requirement failed: " + TestTapFactory.sourceNotFoundError.format(testSource)) + } should have).message("requirement failed: " + TestTapFactory.sourceNotFoundError.format(testSource)) - the[IllegalArgumentException] thrownBy { + (the[IllegalArgumentException] thrownBy { createIllegalTap(Write) - } should have message ("requirement failed: " + TestTapFactory.sinkNotFoundError.format(testSource)) + } should have).message("requirement failed: " + TestTapFactory.sinkNotFoundError.format(testSource)) } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TimePathedSourceTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TimePathedSourceTest.scala index 9b57ce8730..561a17e3a1 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TimePathedSourceTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TimePathedSourceTest.scala @@ -12,12 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import java.util.TimeZone -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class TimePathedSourceTest extends WordSpec with Matchers { "TimePathedSource.hdfsWritePath" should { @@ -25,15 +25,15 @@ class TimePathedSourceTest extends WordSpec with Matchers { val utcTZ = DateOps.UTC "crib if path == /*" in { - intercept[AssertionError] { TestTimePathedSource("/*", dateRange, utcTZ).hdfsWritePath } + intercept[AssertionError](TestTimePathedSource("/*", dateRange, utcTZ).hdfsWritePath) } "crib if path doesn't end with /*" in { - intercept[AssertionError] { TestTimePathedSource("/my/invalid/path", dateRange, utcTZ).hdfsWritePath } + intercept[AssertionError](TestTimePathedSource("/my/invalid/path", dateRange, utcTZ).hdfsWritePath) } "work for path ending with /*" in { - TestTimePathedSource("/my/path/*", dateRange, utcTZ).hdfsWritePath startsWith "/my/path" + TestTimePathedSource("/my/path/*", dateRange, utcTZ).hdfsWritePath.startsWith("/my/path") } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TupleTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TupleTest.scala index 8050af4c24..f32ac670aa 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TupleTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TupleTest.scala @@ -12,12 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.tuple.{ TupleEntry, Tuple => CTuple } +import cascading.tuple.{Tuple => CTuple, TupleEntry} -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class TupleTest extends WordSpec with Matchers { def get[T](ctup: CTuple)(implicit tc: TupleConverter[T]) = tc(new TupleEntry(ctup)) @@ -94,7 +94,10 @@ class TupleTest extends WordSpec with Matchers { assert(TupleConverter.tuple2Converter[Int, Int] == TupleConverter.tuple2Converter[Int, Int]) assert(TupleConverter.tuple2Converter[Int, String] == TupleConverter.tuple2Converter[Int, String]) - assert(TupleConverter.tuple2Converter[Int, (Int, String)] == TupleConverter.tuple2Converter[Int, (Int, String)]) + assert( + TupleConverter.tuple2Converter[Int, (Int, String)] == TupleConverter + .tuple2Converter[Int, (Int, String)] + ) assert(TupleSetter.singleSetter[Int] == TupleSetter.singleSetter[Int]) assert(TupleSetter.singleSetter[String] == TupleSetter.singleSetter[String]) @@ -102,15 +105,26 @@ class TupleTest extends WordSpec with Matchers { assert(TupleSetter.tup2Setter[(Int, Int)] == TupleSetter.tup2Setter[(Int, Int)]) assert(TupleSetter.tup2Setter[(String, Int)] == TupleSetter.tup2Setter[(String, Int)]) - assert(TupleSetter.tup2Setter[((Int, String), String)] == TupleSetter.tup2Setter[((Int, String), String)]) + assert( + TupleSetter.tup2Setter[((Int, String), String)] == TupleSetter.tup2Setter[((Int, String), String)] + ) } "CascadingBackend can tell Converter/Setter inverses" in { import com.twitter.scalding.typed.cascading_backend.CascadingBackend - assert(CascadingBackend.areDefiniteInverse(TupleConverter.singleConverter[Any], TupleSetter.singleSetter[Any])) - assert(!CascadingBackend.areDefiniteInverse(TupleConverter.singleConverter[Any], TupleSetter.tup2Setter[(Any, Any)])) - assert(CascadingBackend.areDefiniteInverse(TupleConverter.tuple2Converter[Any, Any], 
TupleSetter.tup2Setter[(Any, Any)])) + assert( + CascadingBackend + .areDefiniteInverse(TupleConverter.singleConverter[Any], TupleSetter.singleSetter[Any]) + ) + assert( + !CascadingBackend + .areDefiniteInverse(TupleConverter.singleConverter[Any], TupleSetter.tup2Setter[(Any, Any)]) + ) + assert( + CascadingBackend + .areDefiniteInverse(TupleConverter.tuple2Converter[Any, Any], TupleSetter.tup2Setter[(Any, Any)]) + ) } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedDelimitedTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedDelimitedTest.scala index eca74034e8..cf204690ef 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TypedDelimitedTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedDelimitedTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding.source.DailySuffixTypedTsv class TypedTsvJob(args: Args) extends Job(args) { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedFieldsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedFieldsTest.scala index 679437faae..b7387a23c2 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TypedFieldsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedFieldsTest.scala @@ -12,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.flow.FlowException -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class TypedFieldsTest extends WordSpec with Matchers { @@ -37,7 +37,7 @@ class TypedFieldsTest extends WordSpec with Matchers { .arg("input", "inputFile") .arg("output", "outputFile") .source(TextLine("inputFile"), List("0" -> "5,foo", "1" -> "6,bar", "2" -> "9,foo")) - .sink[(Opaque, Int)](Tsv("outputFile")){ outputBuffer => + .sink[(Opaque, Int)](Tsv("outputFile")) { outputBuffer => val outMap = outputBuffer.map { case (opaque: Opaque, i: Int) => (opaque.str, i) }.toMap outMap should have size 2 outMap("foo") shouldBe 14 @@ -50,15 +50,14 @@ class TypedFieldsTest extends WordSpec with Matchers { } - def untypedJob(): Unit = { + def untypedJob(): Unit = JobTest(new UntypedFieldsJob(_)) .arg("input", "inputFile") .arg("output", "outputFile") .source(TextLine("inputFile"), List("0" -> "5,foo", "1" -> "6,bar", "2" -> "9,foo")) - .sink[(Opaque, Int)](Tsv("outputFile")){ _ => } + .sink[(Opaque, Int)](Tsv("outputFile")) { _ => } .run .finish() - } } @@ -69,7 +68,7 @@ class UntypedFieldsJob(args: Args) extends Job(args) { val split = line.split(",") (split(0).toInt, new Opaque(split(1))) } - .groupBy('y) { _.sum[Double]('x) } + .groupBy('y)(_.sum[Double]('x)) .write(Tsv(args("output"))) } @@ -79,7 +78,7 @@ class UntypedFieldsJob(args: Args) extends Job(args) { class TypedFieldsJob(args: Args) extends Job(args) { implicit val ordering: Ordering[Opaque] = new Ordering[Opaque] { - def compare(a: Opaque, b: Opaque) = a.str compare b.str + def compare(a: Opaque, b: Opaque) = a.str.compare(b.str) } val xField = Field[String]('x) @@ -90,7 +89,7 @@ class TypedFieldsJob(args: Args) extends Job(args) { val split = line.split(",") (split(0).toInt, new Opaque(split(1))) } - .groupBy(yField) { _.sum[Double](xField -> xField) } + .groupBy(yField)(_.sum[Double](xField -> xField)) .write(Tsv(args("output"))) } @@ -100,8 +99,8 @@ class TypedFieldsJob(args: Args) extends Job(args) { class Opaque(val str: String) { override def equals(other: Any) = other match { - case other: Opaque => str equals other.str - case _ => false + case other: Opaque => str.equals(other.str) + case _ => false } override def hashCode = str.hashCode } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeCheckerTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeCheckerTest.scala index 0329199716..accbd489f2 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeCheckerTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeCheckerTest.scala @@ -1,13 +1,13 @@ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class TypedPipeCheckerTest extends WordSpec with Matchers { import TypedPipeChecker._ "TypedPipeChecker" should { "run asserts on pipe" in { - checkOutput(TypedPipe.from(List(1, 2, 3, 4))){ rows => + checkOutput(TypedPipe.from(List(1, 2, 3, 4))) { rows => assert(rows.size == 4) assert(rows == List(1, 2, 3, 4)) } @@ -25,7 +25,7 @@ class TypedPipeCheckerTest extends WordSpec with Matchers { "allow for a list of input to be run through a transform function" in { def transform(pipe: TypedPipe[Int]) = pipe.map(identity) - checkOutputTransform(List(1, 2, 3))(transform){ rows => + checkOutputTransform(List(1, 2, 3))(transform) { rows => assert(rows == List(1, 2, 3)) } } diff --git 
a/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeTest.scala index fcd21bbcb7..81ba82e28e 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeTest.scala @@ -12,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.scalatest.{ FunSuite, Matchers, WordSpec } -import com.twitter.scalding.source.{ FixedTypedText, TypedText } +import org.scalatest.{FunSuite, Matchers, WordSpec} +import com.twitter.scalding.source.{FixedTypedText, TypedText} import scala.collection.mutable // Use the scalacheck generators import org.scalacheck.Gen @@ -27,9 +27,9 @@ import TDsl._ import typed.MultiJoin object TUtil { - def printStack(fn: => Unit): Unit = { - try { fn } catch { case e: Throwable => e.printStackTrace; throw e } - } + def printStack(fn: => Unit): Unit = + try { fn } + catch { case e: Throwable => e.printStackTrace; throw e } implicit class JobTestExt(test: JobTest) { def writesLessDataThen(limitInBytes: Int): JobTest = test @@ -41,8 +41,9 @@ object TUtil { class TupleAdderJob(args: Args) extends Job(args) { - TypedText.tsv[(String, String)]("input") - .map{ f => + TypedText + .tsv[(String, String)]("input") + .map { f => (1 +: f) ++ (2, 3) } .write(TypedText.tsv[(Int, String, String, Int, Int)]("output")) @@ -52,11 +53,12 @@ class TupleAdderTest extends WordSpec with Matchers { "A TupleAdderJob" should { JobTest(new TupleAdderJob(_)) .source(TypedText.tsv[(String, String)]("input"), List(("a", "a"), ("b", "b"))) - .sink[(Int, String, String, Int, Int)](TypedText.tsv[(Int, String, String, Int, Int)]("output")) { outBuf => - "be able to use generated tuple adders" in { - outBuf should have size 2 - outBuf.toSet shouldBe Set((1, "a", "a", 2, 3), (1, "b", "b", 2, 3)) - } + .sink[(Int, String, String, Int, Int)](TypedText.tsv[(Int, String, String, Int, Int)]("output")) { + outBuf => + "be able to use generated tuple adders" in { + outBuf should have size 2 + outBuf.toSet shouldBe Set((1, "a", "a", 2, 3), (1, "b", "b", 2, 3)) + } } .run .finish() @@ -66,8 +68,8 @@ class TupleAdderTest extends WordSpec with Matchers { class TypedPipeJob(args: Args) extends Job(args) { //Word count using TypedPipe TextLine("inputFile") - .flatMap { _.split("\\s+") } - .map { w => (w, 1L) } + .flatMap(_.split("\\s+")) + .map(w => (w, 1L)) .forceToDisk .group //.forceToReducers @@ -82,7 +84,7 @@ class TypedPipeTest extends WordSpec with Matchers { TUtil.printStack { JobTest(new TypedPipeJob(_)) .source(TextLine("inputFile"), List("0" -> "hack hack hack and hack")) - .sink[(String, Long)](TypedText.tsv[(String, Long)]("outputFile")){ outputBuffer => + .sink[(String, Long)](TypedText.tsv[(String, Long)]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap (idx + ": count words correctly") in { outMap("hack") shouldBe 4 @@ -100,7 +102,7 @@ class TypedPipeTest extends WordSpec with Matchers { class TypedSumByKeyJob(args: Args) extends Job(args) { //Word count using TypedPipe TextLine("inputFile") - .flatMap { l => l.split("\\s+").map((_, 1L)) } + .flatMap(l => l.split("\\s+").map((_, 1L))) .sumByKey .write(TypedText.tsv[(String, Long)]("outputFile")) } @@ -111,7 +113,7 @@ class TypedSumByKeyTest 
extends WordSpec with Matchers { TUtil.printStack { JobTest(new TypedSumByKeyJob(_)) .source(TextLine("inputFile"), List("0" -> "hack hack hack and hack")) - .sink[(String, Long)](TypedText.tsv[(String, Long)]("outputFile")){ outputBuffer => + .sink[(String, Long)](TypedText.tsv[(String, Long)]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap (idx + ": count words correctly") in { outMap("hack") shouldBe 4 @@ -127,7 +129,8 @@ class TypedSumByKeyTest extends WordSpec with Matchers { } class TypedPipeSortByJob(args: Args) extends Job(args) { - TypedPipe.from(TypedText.tsv[(Int, Float, String)]("input")) + TypedPipe + .from(TypedText.tsv[(Int, Float, String)]("input")) .groupBy(_._1) .sortBy(_._2) .mapValues(_._3) @@ -138,14 +141,18 @@ class TypedPipeSortByJob(args: Args) extends Job(args) { class TypedPipeSortByTest extends FunSuite { test("groups should not be disturbed by sortBy") { JobTest(new TypedPipeSortByJob(_)) - .source(TypedText.tsv[(Int, Float, String)]("input"), - List((0, 0.6f, "6"), + .source( + TypedText.tsv[(Int, Float, String)]("input"), + List( + (0, 0.6f, "6"), (0, 0.5f, "5"), (0, 0.1f, "1"), (1, 0.1f, "10"), (1, 0.5f, "50"), - (1, 0.51f, "510"))) - .sink[(Int, String)](TypedText.tsv[(Int, String)]("output")){ outputBuffer => + (1, 0.51f, "510") + ) + ) + .sink[(Int, String)](TypedText.tsv[(Int, String)]("output")) { outputBuffer => val map = outputBuffer.toList.groupBy(_._1) assert(map.size == 2, "should be two keys") assert(map.forall { case (_, vs) => vs.size == 1 }, "only one key per value") @@ -159,8 +166,10 @@ class TypedPipeSortByTest extends FunSuite { } class TypedPipeJoinJob(args: Args) extends Job(args) { - (Tsv("inputFile0").read.toTypedPipe[(Int, Int)](0, 1).group - leftJoin TypedPipe.from[(Int, Int)](Tsv("inputFile1").read, (0, 1)).group) + Tsv("inputFile0").read + .toTypedPipe[(Int, Int)](0, 1) + .group + .leftJoin(TypedPipe.from[(Int, Int)](Tsv("inputFile1").read, (0, 1)).group) .toTypedPipe .write(TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")) } @@ -170,16 +179,17 @@ class TypedPipeJoinTest extends WordSpec with Matchers { JobTest(new com.twitter.scalding.TypedPipeJoinJob(_)) .source(Tsv("inputFile0"), List((0, 0), (1, 1), (2, 2), (3, 3), (4, 5))) .source(Tsv("inputFile1"), List((0, 1), (1, 2), (2, 3), (3, 4))) - .typedSink[(Int, (Int, Option[Int]))](TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")){ outputBuffer => - val outMap = outputBuffer.toMap - "correctly join" in { - outMap should have size 5 - outMap(0) shouldBe (0, Some(1)) - outMap(1) shouldBe (1, Some(2)) - outMap(2) shouldBe (2, Some(3)) - outMap(3) shouldBe (3, Some(4)) - outMap(4) shouldBe (5, None) - } + .typedSink[(Int, (Int, Option[Int]))](TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")) { + outputBuffer => + val outMap = outputBuffer.toMap + "correctly join" in { + outMap should have size 5 + outMap(0) shouldBe (0, Some(1)) + outMap(1) shouldBe (1, Some(2)) + outMap(2) shouldBe (2, Some(3)) + outMap(3) shouldBe (3, Some(4)) + outMap(4) shouldBe (5, None) + } }(implicitly[TypeDescriptor[(Int, (Int, Option[Int]))]].converter) .run .finish() @@ -191,7 +201,8 @@ class OpaqueJoinBox(i: Int) { def get = i } class TypedPipeJoinKryoJob(args: Args) extends Job(args) { val box = new OpaqueJoinBox(2) - TypedPipe.from(TypedText.tsv[(Int, Int)]("inputFile0")) + TypedPipe + .from(TypedText.tsv[(Int, Int)]("inputFile0")) .join(TypedPipe.from(TypedText.tsv[(Int, Int)]("inputFile1"))) .mapValues { case (x, y) => x * y * box.get } .write(TypedText.tsv[(Int, 
Int)]("outputFile")) @@ -214,7 +225,7 @@ class TypedPipeJoinKryoTest extends WordSpec with Matchers { JobTest(new com.twitter.scalding.TypedPipeJoinKryoJob(_)) .source(TypedText.tsv[(Int, Int)]("inputFile0"), List((0, 0), (1, 1), (2, 2), (3, 3), (4, 5))) .source(TypedText.tsv[(Int, Int)]("inputFile1"), List((0, 1), (1, 2), (2, 3), (3, 4))) - .typedSink[(Int, Int)](TypedText.tsv[(Int, Int)]("outputFile")){ outputBuffer => + .typedSink[(Int, Int)](TypedText.tsv[(Int, Int)]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap "correctly join" in { outMap should have size 4 @@ -230,7 +241,8 @@ class TypedPipeJoinKryoTest extends WordSpec with Matchers { } class TypedPipeDistinctJob(args: Args) extends Job(args) { - Tsv("inputFile").read.toTypedPipe[(Int, Int)](0, 1) + Tsv("inputFile").read + .toTypedPipe[(Int, Int)](0, 1) .distinct .write(TypedText.tsv[(Int, Int)]("outputFile")) } @@ -239,7 +251,7 @@ class TypedPipeDistinctTest extends WordSpec with Matchers { "A TypedPipeDistinctJob" should { JobTest(new TypedPipeDistinctJob(_)) .source(Tsv("inputFile"), List((0, 0), (1, 1), (2, 2), (2, 2), (2, 5))) - .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("outputFile")){ outputBuffer => + .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap "correctly count unique item sizes" in { outputBuffer.toSet should have size 4 @@ -262,7 +274,7 @@ class TypedPipeDistinctWordsTest extends WordSpec with Matchers { var idx = 0 JobTest(new TypedPipeDistinctWordsJob(_)) .source(TextLine("inputFile"), List(1 -> "a b b c", 2 -> "c d e")) - .sink[String](TextLine("outputFile")){ outputBuffer => + .sink[String](TextLine("outputFile")) { outputBuffer => s"$idx: correctly count unique item sizes" in { outputBuffer.toSet should have size 5 } @@ -275,7 +287,8 @@ class TypedPipeDistinctWordsTest extends WordSpec with Matchers { } class TypedPipeDistinctByJob(args: Args) extends Job(args) { - Tsv("inputFile").read.toTypedPipe[(Int, Int)](0, 1) + Tsv("inputFile").read + .toTypedPipe[(Int, Int)](0, 1) .distinctBy(_._2) .write(TypedText.tsv[(Int, Int)]("outputFile")) } @@ -284,11 +297,11 @@ class TypedPipeDistinctByTest extends WordSpec with Matchers { "A TypedPipeDistinctByJob" should { JobTest(new TypedPipeDistinctByJob(_)) .source(Tsv("inputFile"), List((0, 1), (1, 1), (2, 2), (2, 2), (2, 5))) - .typedSink(TypedText.tsv[(Int, Int)]("outputFile")){ outputBuffer => + .typedSink(TypedText.tsv[(Int, Int)]("outputFile")) { outputBuffer => "correctly count unique item sizes" in { val outSet = outputBuffer.toSet outSet should have size 3 - List(outSet) should contain oneOf (Set((0, 1), (2, 2), (2, 5)), Set((1, 1), (2, 2), (2, 5))) + (List(outSet) should contain).oneOf(Set((0, 1), (2, 2), (2, 5)), Set((1, 1), (2, 2), (2, 5))) } } .run @@ -297,14 +310,11 @@ class TypedPipeDistinctByTest extends WordSpec with Matchers { } class TypedPipeGroupedDistinctJob(args: Args) extends Job(args) { - val groupedTP = Tsv("inputFile").read.toTypedPipe[(Int, Int)](0, 1) - .group + val groupedTP = Tsv("inputFile").read.toTypedPipe[(Int, Int)](0, 1).group - groupedTP - .distinctValues + groupedTP.distinctValues .write(TypedText.tsv[(Int, Int)]("outputFile1")) - groupedTP - .distinctSize + groupedTP.distinctSize .write(TypedText.tsv[(Int, Long)]("outputFile2")) } @@ -312,13 +322,13 @@ class TypedPipeGroupedDistinctJobTest extends WordSpec with Matchers { "A TypedPipeGroupedDistinctJob" should { JobTest(new TypedPipeGroupedDistinctJob(_)) .source(Tsv("inputFile"), List((0, 0), (0, 1), (0, 
1), (1, 0), (1, 1))) - .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("outputFile1")){ outputBuffer => + .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("outputFile1")) { outputBuffer => val outSet = outputBuffer.toSet "correctly generate unique items" in { outSet should have size 4 } } - .sink[(Int, Int)](TypedText.tsv[(Int, Long)]("outputFile2")){ outputBuffer => + .sink[(Int, Int)](TypedText.tsv[(Int, Long)]("outputFile2")) { outputBuffer => val outMap = outputBuffer.toMap "correctly count unique item sizes" in { outMap(0) shouldBe 2 @@ -331,7 +341,8 @@ class TypedPipeGroupedDistinctJobTest extends WordSpec with Matchers { } class TypedPipeHashJoinJob(args: Args) extends Job(args) { - TypedText.tsv[(Int, Int)]("inputFile0") + TypedText + .tsv[(Int, Int)]("inputFile0") .group .hashLeftJoin(TypedText.tsv[(Int, Int)]("inputFile1").group) .write(TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")) @@ -342,7 +353,7 @@ class TypedPipeHashJoinTest extends WordSpec with Matchers { JobTest(new TypedPipeHashJoinJob(_)) .source(TypedText.tsv[(Int, Int)]("inputFile0"), List((0, 0), (1, 1), (2, 2), (3, 3), (4, 5))) .source(TypedText.tsv[(Int, Int)]("inputFile1"), List((0, 1), (1, 2), (2, 3), (3, 4))) - .typedSink(TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")){ outputBuffer => + .typedSink(TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap "correctly join" in { outMap should have size 5 @@ -366,34 +377,38 @@ class TypedPipeTwoHashJoinsInARowTest extends WordSpec with Matchers { val tp2 = TypedPipe.from(elements.map(v => (v, 2 * v))) val tp3 = TypedPipe.from(elements.map(v => (v, 3 * v))) TypedPipeChecker.checkOutput(tp1.hashJoin(tp2).hashJoin(tp3))(result => - result shouldBe elements.map(v => (v, ((v, 2 * v), 3 * v)))) + result shouldBe elements.map(v => (v, ((v, 2 * v), 3 * v))) + ) } } } class TypedImplicitJob(args: Args) extends Job(args) { def revTup[K, V](in: (K, V)): (V, K) = (in._2, in._1) - TextLine("inputFile").read.typed(1 -> ('maxWord, 'maxCnt)) { tpipe: TypedPipe[String] => - tpipe.flatMap { _.split("\\s+") } - .map { w => (w, 1L) } - .group - .sum - .groupAll - // Looks like swap, but on the values in the grouping: - .mapValues { revTup _ } - .forceToReducers - .max - // Throw out the Unit key and reverse the value tuple - .values - .swap - }.write(TypedText.tsv[(String, Int)]("outputFile")) + TextLine("inputFile").read + .typed(1 -> ('maxWord, 'maxCnt)) { tpipe: TypedPipe[String] => + tpipe + .flatMap(_.split("\\s+")) + .map(w => (w, 1L)) + .group + .sum + .groupAll + // Looks like swap, but on the values in the grouping: + .mapValues(revTup _) + .forceToReducers + .max + // Throw out the Unit key and reverse the value tuple + .values + .swap + } + .write(TypedText.tsv[(String, Int)]("outputFile")) } class TypedPipeTypedTest extends WordSpec with Matchers { "A TypedImplicitJob" should { JobTest(new TypedImplicitJob(_)) .source(TextLine("inputFile"), List("0" -> "hack hack hack and hack")) - .typedSink(TypedText.tsv[(String, Int)]("outputFile")){ outputBuffer => + .typedSink(TypedText.tsv[(String, Int)]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap "find max word" in { outMap should have size 1 @@ -411,7 +426,8 @@ class TypedWithOnCompleteJob(args: Args) extends Job(args) { def onCompleteMapper() = onCompleteMapperStat.inc() def onCompleteReducer() = onCompleteReducerStat.inc() // find repeated words ignoring case - TypedText.tsv[String]("input") + TypedText + .tsv[String]("input") .map(_.toUpperCase) 
.onComplete(onCompleteMapper) .groupBy(identity) @@ -428,11 +444,18 @@ class TypedPipeWithOnCompleteTest extends WordSpec with Matchers { "A TypedWithOnCompleteJob" should { JobTest(new TypedWithOnCompleteJob(_)) .source(TypedText.tsv[String]("input"), inputText.split("\\s+").map(Tuple1(_))) - .counter("onCompleteMapper") { cnt => "have onComplete called on mapper" in { assert(cnt == 1) } } - .counter("onCompleteReducer") { cnt => "have onComplete called on reducer" in { assert(cnt == 1) } } + .counter("onCompleteMapper")(cnt => "have onComplete called on mapper" in { assert(cnt == 1) }) + .counter("onCompleteReducer")(cnt => "have onComplete called on reducer" in { assert(cnt == 1) }) .sink[String](TypedText.tsv[String]("output")) { outbuf => "have the correct output" in { - val correct = inputText.split("\\s+").map(_.toUpperCase).groupBy(x => x).filter(_._2.size > 1).keys.toList.sorted + val correct = inputText + .split("\\s+") + .map(_.toUpperCase) + .groupBy(x => x) + .filter(_._2.size > 1) + .keys + .toList + .sorted val sortedL = outbuf.toList.sorted assert(sortedL == correct) } @@ -478,22 +501,28 @@ class TypedPipeWithOuterAndLeftJoinTest extends WordSpec with Matchers { } class TJoinCountJob(args: Args) extends Job(args) { - (TypedPipe.from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)).group - join TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)).group) + TypedPipe + .from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .group + .join(TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)).group) .size .write(TypedText.tsv[(Int, Long)]("out")) //Also check simple joins: - (TypedPipe.from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)).group - join TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)).group) + TypedPipe + .from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .group + .join(TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)).group) //Flatten out to three values: .toTypedPipe - .map { kvw => (kvw._1, kvw._2._1, kvw._2._2) } + .map(kvw => (kvw._1, kvw._2._1, kvw._2._2)) .write(TypedText.tsv[(Int, Int, Int)]("out2")) //Also check simple leftJoins: - (TypedPipe.from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)).group - leftJoin TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)).group) + TypedPipe + .from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .group + .leftJoin(TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)).group) //Flatten out to three values: .toTypedPipe .map { kvw: (Int, (Int, Option[Int])) => @@ -507,22 +536,25 @@ class TJoinCountJob(args: Args) extends Job(args) { */ class TNiceJoinCountJob(args: Args) extends Job(args) { - (TypedPipe.from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) - join TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))) + TypedPipe + .from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .join(TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))) .size .write(TypedText.tsv[(Int, Long)]("out")) //Also check simple joins: - (TypedPipe.from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) - join TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))) + TypedPipe + .from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .join(TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))) //Flatten out to three values: .toTypedPipe - .map { kvw => (kvw._1, kvw._2._1, kvw._2._2) } + .map(kvw => (kvw._1, kvw._2._1, kvw._2._2)) .write(TypedText.tsv[(Int, Int, Int)]("out2")) //Also check simple leftJoins: - (TypedPipe.from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) - leftJoin TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))) + TypedPipe + .from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + 
.leftJoin(TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))) //Flatten out to three values: .toTypedPipe .map { kvw: (Int, (Int, Option[Int])) => @@ -534,22 +566,25 @@ class TNiceJoinCountJob(args: Args) extends Job(args) { class TNiceJoinByCountJob(args: Args) extends Job(args) { import com.twitter.scalding.typed.Syntax._ - (TypedPipe.from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) - joinBy TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)))(_._1, _._1) + (TypedPipe + .from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .joinBy(TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))))(_._1, _._1) .size .write(TypedText.tsv[(Int, Long)]("out")) //Also check simple joins: - (TypedPipe.from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) - joinBy TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)))(_._1, _._1) + (TypedPipe + .from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .joinBy(TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))))(_._1, _._1) //Flatten out to three values: .toTypedPipe - .map { kvw => (kvw._1, kvw._2._1._2, kvw._2._2._2) } + .map(kvw => (kvw._1, kvw._2._1._2, kvw._2._2._2)) .write(TypedText.tsv[(Int, Int, Int)]("out2")) //Also check simple leftJoins: - (TypedPipe.from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) - leftJoinBy TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)))(_._1, _._1) + (TypedPipe + .from[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .leftJoinBy(TypedPipe.from[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))))(_._1, _._1) //Flatten out to three values: .toTypedPipe .map { kvw: (Int, ((Int, Int), Option[(Int, Int)])) => @@ -561,9 +596,13 @@ class TNiceJoinByCountJob(args: Args) extends Job(args) { class TypedPipeJoinCountTest extends WordSpec with Matchers { import Dsl._ - val joinTests = List("com.twitter.scalding.TJoinCountJob", "com.twitter.scalding.TNiceJoinCountJob", "com.twitter.scalding.TNiceJoinByCountJob") + val joinTests = List( + "com.twitter.scalding.TJoinCountJob", + "com.twitter.scalding.TNiceJoinCountJob", + "com.twitter.scalding.TNiceJoinByCountJob" + ) - joinTests.foreach{ jobName => + joinTests.foreach { jobName => "A " + jobName should { var idx = 0 JobTest(jobName) @@ -579,20 +618,34 @@ class TypedPipeJoinCountTest extends WordSpec with Matchers { idx += 1 } .typedSink(TypedText.tsv[(Int, Int, Int)]("out2")) { outbuf2 => - val outMap = outbuf2.groupBy { _._1 } + val outMap = outbuf2.groupBy(_._1) (idx + ": correctly do a simple join") in { outMap should have size 2 outMap(0).toList.sorted shouldBe List((0, 1, 10), (0, 2, 10)) - outMap(1).toList.sorted shouldBe List((1, 1, 10), (1, 1, 20), (1, 1, 30), (1, 5, 10), (1, 5, 20), (1, 5, 30)) + outMap(1).toList.sorted shouldBe List( + (1, 1, 10), + (1, 1, 20), + (1, 1, 30), + (1, 5, 10), + (1, 5, 20), + (1, 5, 30) + ) } idx += 1 } .typedSink(TypedText.tsv[(Int, Int, Int)]("out3")) { outbuf => - val outMap = outbuf.groupBy { _._1 } + val outMap = outbuf.groupBy(_._1) (idx + ": correctly do a simple leftJoin") in { outMap should have size 3 outMap(0).toList.sorted shouldBe List((0, 1, 10), (0, 2, 10)) - outMap(1).toList.sorted shouldBe List((1, 1, 10), (1, 1, 20), (1, 1, 30), (1, 5, 10), (1, 5, 20), (1, 5, 30)) + outMap(1).toList.sorted shouldBe List( + (1, 1, 10), + (1, 1, 20), + (1, 1, 30), + (1, 5, 10), + (1, 5, 20), + (1, 5, 30) + ) outMap(2).toList.sorted shouldBe List((2, 10, -1)) } idx += 1 @@ -605,7 +658,8 @@ class TypedPipeJoinCountTest extends WordSpec with Matchers { } class TCrossJob(args: Args) extends Job(args) { - (TextLine("in0") cross TextLine("in1")) + TextLine("in0") + 
.cross(TextLine("in1")) .write(TypedText.tsv[(String, String)]("crossed")) } @@ -619,10 +673,7 @@ class TypedPipeCrossTest extends WordSpec with Matchers { .typedSink(TypedText.tsv[(String, String)]("crossed")) { outbuf => val sortedL = outbuf.toList.sorted (idx + ": create a cross-product") in { - sortedL shouldBe List(("all", "body"), - ("all", "every"), - ("you", "body"), - ("you", "every")) + sortedL shouldBe List(("all", "body"), ("all", "every"), ("you", "body"), ("you", "every")) } idx += 1 } @@ -634,10 +685,11 @@ class TypedPipeCrossTest extends WordSpec with Matchers { } class TJoinTakeJob(args: Args) extends Job(args) { - val items0 = TextLine("in0").flatMap { s => (1 to 10).map((_, s)) }.group - val items1 = TextLine("in1").map { s => (s.toInt, ()) }.group + val items0 = TextLine("in0").flatMap(s => (1 to 10).map((_, s))).group + val items1 = TextLine("in1").map(s => (s.toInt, ())).group - items0.join(items1.take(1)) + items0 + .join(items1.take(1)) .mapValues(_._1) // discard the () .toTypedPipe .write(TypedText.tsv[(Int, String)]("joined")) @@ -665,10 +717,7 @@ class TypedJoinTakeTest extends WordSpec with Matchers { } class TGroupAllJob(args: Args) extends Job(args) { - TextLine("in") - .groupAll - .sorted - .values + TextLine("in").groupAll.sorted.values .write(TypedText.tsv[String]("out")) } @@ -681,7 +730,7 @@ class TypedGroupAllTest extends WordSpec with Matchers { .source(TextLine("in"), input) .typedSink(TypedText.tsv[String]("out")) { outbuf => val sortedL = outbuf.toList - val correct = input.map { _._2 }.sorted + val correct = input.map(_._2).sorted (idx + ": create sorted output") in { sortedL shouldBe correct } @@ -714,22 +763,22 @@ class TSelfJoinTest extends WordSpec with Matchers { class TJoinWordCount(args: Args) extends Job(args) { - def countWordsIn(pipe: TypedPipe[(String)]) = { - pipe.flatMap { _.split("\\s+").map(_.toLowerCase) } + def countWordsIn(pipe: TypedPipe[String]) = + pipe + .flatMap(_.split("\\s+").map(_.toLowerCase)) .groupBy(identity) .mapValueStream(input => Iterator(input.size)) .forceToReducers - } val first = countWordsIn(TypedPipe.from(TextLine("in0"))) val second = countWordsIn(TypedPipe.from(TextLine("in1"))) - first.outerJoin(second) + first + .outerJoin(second) .toTypedPipe - .map { - case (word, (firstCount, secondCount)) => - (word, firstCount.getOrElse(0), secondCount.getOrElse(0)) + .map { case (word, (firstCount, secondCount)) => + (word, firstCount.getOrElse(0), secondCount.getOrElse(0)) } .write(TypedText.tsv[(String, Int, Int)]("out")) } @@ -739,16 +788,13 @@ class TypedJoinWCTest extends WordSpec with Matchers { TUtil.printStack { val in0 = List((0, "you all everybody"), (1, "a b c d"), (2, "a b c")) val in1 = List((0, "you"), (1, "a b c d"), (2, "a a b b c c")) - def count(in: List[(Int, String)]): Map[String, Int] = { - in.flatMap { _._2.split("\\s+").map { _.toLowerCase } }.groupBy { identity }.mapValues { _.size } - } - def outerjoin[K, U, V](m1: Map[K, U], z1: U, m2: Map[K, V], z2: V): Map[K, (U, V)] = { - (m1.keys ++ m2.keys).map { k => (k, (m1.getOrElse(k, z1), m2.getOrElse(k, z2))) }.toMap - } - val correct = outerjoin(count(in0), 0, count(in1), 0) - .toList - .map { tup => (tup._1, tup._2._1, tup._2._2) } - .sorted + def count(in: List[(Int, String)]): Map[String, Int] = + in.flatMap(_._2.split("\\s+").map(_.toLowerCase)).groupBy(identity).mapValues(_.size) + def outerjoin[K, U, V](m1: Map[K, U], z1: U, m2: Map[K, V], z2: V): Map[K, (U, V)] = + (m1.keys ++ m2.keys).map(k => (k, (m1.getOrElse(k, z1), m2.getOrElse(k, 
z2)))).toMap + val correct = outerjoin(count(in0), 0, count(in1), 0).toList.map { tup => + (tup._1, tup._2._1, tup._2._2) + }.sorted JobTest(new TJoinWordCount(_)) .source(TextLine("in0"), in0) @@ -773,7 +819,7 @@ class TypedLimitJob(args: Args) extends Job(args) { class TypedLimitTest extends WordSpec with Matchers { "A TypedLimitJob" should { JobTest(new TypedLimitJob(_)) - .source(TypedText.tsv[String]("input"), (0 to 100).map { i => Tuple1(i.toString) }) + .source(TypedText.tsv[String]("input"), (0 to 100).map(i => Tuple1(i.toString))) .typedSink(TypedText.tsv[String]("output")) { outBuf => "not have more than the limited outputs" in { outBuf.size should be <= 10 @@ -785,7 +831,9 @@ class TypedLimitTest extends WordSpec with Matchers { } class TypedFlattenJob(args: Args) extends Job(args) { - TypedText.tsv[String]("input").map { _.split(" ").toList } + TypedText + .tsv[String]("input") + .map(_.split(" ").toList) .flatten .write(TypedText.tsv[String]("output")) } @@ -838,7 +886,7 @@ class TypedMergeTest extends WordSpec with Matchers { class TypedShardJob(args: Args) extends Job(args) { (TypedPipe.from(TypedText.tsv[String]("input")) ++ - (TypedPipe.empty.map { _ => "hey" }) ++ + TypedPipe.empty.map { _ => "hey" } ++ TypedPipe.from(List("item"))) .shard(10) .write(TypedText.tsv[String]("output")) @@ -863,8 +911,9 @@ class TypedShardTest extends WordSpec with Matchers { } class TypedLocalSumJob(args: Args) extends Job(args) { - TypedPipe.from(TypedText.tsv[String]("input")) - .flatMap { s => s.split(" ").map((_, 1L)) } + TypedPipe + .from(TypedText.tsv[String]("input")) + .flatMap(s => s.split(" ").map((_, 1L))) .sumByLocalKeys .write(TypedText.tsv[(String, Long)]("output")) } @@ -881,7 +930,7 @@ class TypedLocalSumTest extends WordSpec with Matchers { s"$idx: not expand and have correct total sum" in { import com.twitter.algebird.MapAlgebra.sumByKey val lres = outBuf.toList - val fmapped = mk.flatMap { s => s.split(" ").map((_, 1L)) } + val fmapped = mk.flatMap(s => s.split(" ").map((_, 1L))) lres.size should be <= (fmapped.size) sumByKey(lres) shouldBe (sumByKey(fmapped)) } @@ -894,7 +943,8 @@ class TypedLocalSumTest extends WordSpec with Matchers { } class TypedHeadJob(args: Args) extends Job(args) { - TypedPipe.from(TypedText.tsv[(Int, Int)]("input")) + TypedPipe + .from(TypedText.tsv[(Int, Int)]("input")) .group .head .write(TypedText.tsv[(Int, Int)]("output")) @@ -905,7 +955,7 @@ class TypedHeadTest extends WordSpec with Matchers { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt)) JobTest(new TypedHeadJob(_)) .source(TypedText.tsv[(Int, Int)]("input"), mk) .typedSink(TypedText.tsv[(Int, Int)]("output")) { outBuf => @@ -923,16 +973,12 @@ class TypedHeadTest extends WordSpec with Matchers { class TypedSortWithTakeJob(args: Args) extends Job(args) { val in = TypedPipe.from(TypedText.tsv[(Int, Int)]("input")) - in - .group + in.group .sortedReverseTake(5) .flattenValues .write(TypedText.tsv[(Int, Int)]("output")) - in - .group - .sorted - .reverse + in.group.sorted.reverse .bufferedTake(5) .write(TypedText.tsv[(Int, Int)]("output2")) } @@ -942,7 +988,7 @@ class TypedSortWithTakeTest extends WordSpec with Matchers { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt)) JobTest(new 
TypedSortWithTakeJob(_)) .source(TypedText.tsv[(Int, Int)]("input"), mk) .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("output")) { outBuf => @@ -963,7 +1009,8 @@ class TypedSortWithTakeTest extends WordSpec with Matchers { } class TypedLookupJob(args: Args) extends Job(args) { - TypedPipe.from(TypedText.tsv[Int]("input0")) + TypedPipe + .from(TypedText.tsv[Int]("input0")) .hashLookup(TypedPipe.from(TypedText.tsv[(Int, String)]("input1")).group) .mapValues { o: Option[String] => o.getOrElse("") } .write(TypedText.tsv[(Int, String)]("output")) @@ -974,16 +1021,19 @@ class TypedLookupJobTest extends WordSpec with Matchers { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt.toString) } + val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt.toString)) JobTest(new TypedLookupJob(_)) .source(TypedText.tsv[Int]("input0"), (-1 to 100)) .source(TypedText.tsv[(Int, String)]("input1"), mk) .typedSink(TypedText.tsv[(Int, String)]("output")) { outBuf => "correctly TypedPipe.hashLookup" in { val data = mk.groupBy(_._1) - val correct = (-1 to 100).flatMap { k => - data.get(k).getOrElse(List((k, ""))) - }.toList.sorted + val correct = (-1 to 100) + .flatMap { k => + data.get(k).getOrElse(List((k, ""))) + } + .toList + .sorted outBuf should have size (correct.size) outBuf.toList.sorted shouldBe correct } @@ -994,7 +1044,8 @@ class TypedLookupJobTest extends WordSpec with Matchers { } class TypedLookupReduceJob(args: Args) extends Job(args) { - TypedPipe.from(TypedText.tsv[Int]("input0")) + TypedPipe + .from(TypedText.tsv[Int]("input0")) .hashLookup(TypedPipe.from(TypedText.tsv[(Int, String)]("input1")).group.max) .mapValues { o: Option[String] => o.getOrElse("") } .write(TypedText.tsv[(Int, String)]("output")) @@ -1005,20 +1056,24 @@ class TypedLookupReduceJobTest extends WordSpec with Matchers { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt.toString) } + val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt.toString)) JobTest(new TypedLookupReduceJob(_)) .source(TypedText.tsv[Int]("input0"), (-1 to 100)) .source(TypedText.tsv[(Int, String)]("input1"), mk) .typedSink(TypedText.tsv[(Int, String)]("output")) { outBuf => "correctly TypedPipe.hashLookup" in { - val data = mk.groupBy(_._1) + val data = mk + .groupBy(_._1) .mapValues { kvs => val (k, v) = kvs.maxBy(_._2) (k, v) } - val correct = (-1 to 100).map { k => - data.get(k).getOrElse((k, "")) - }.toList.sorted + val correct = (-1 to 100) + .map { k => + data.get(k).getOrElse((k, "")) + } + .toList + .sorted outBuf should have size (correct.size) outBuf.toList.sorted shouldBe correct } @@ -1029,9 +1084,10 @@ class TypedLookupReduceJobTest extends WordSpec with Matchers { } class TypedFilterJob(args: Args) extends Job(args) { - TypedPipe.from(TypedText.tsv[Int]("input")) - .filter { _ > 50 } - .filterNot { _ % 2 == 0 } + TypedPipe + .from(TypedText.tsv[Int]("input")) + .filter(_ > 50) + .filterNot(_ % 2 == 0) .write(TypedText.tsv[Int]("output")) } @@ -1040,7 +1096,7 @@ class TypedFilterTest extends WordSpec with Matchers { "filter and filterNot elements" in { val input = -1 to 100 val isEven = (i: Int) => i % 2 == 0 - val expectedOutput = input filter { _ > 50 } filterNot isEven + val expectedOutput = input.filter(_ > 50).filterNot(isEven) TUtil.printStack { JobTest(new com.twitter.scalding.TypedFilterJob(_)) @@ -1057,7 +1113,7 @@ class TypedFilterTest extends WordSpec 
with Matchers { } class TypedPartitionJob(args: Args) extends Job(args) { - val (p1, p2) = TypedPipe.from(TypedText.tsv[Int]("input")).partition { _ > 50 } + val (p1, p2) = TypedPipe.from(TypedText.tsv[Int]("input")).partition(_ > 50) p1.write(TypedText.tsv[Int]("output1")) p2.write(TypedText.tsv[Int]("output2")) } @@ -1066,7 +1122,7 @@ class TypedPartitionTest extends WordSpec with Matchers { "A TypedPipe" should { "partition elements" in { val input = -1 to 100 - val (expected1, expected2) = input partition { _ > 50 } + val (expected1, expected2) = input.partition(_ > 50) TUtil.printStack { JobTest(new com.twitter.scalding.TypedPartitionJob(_)) @@ -1107,7 +1163,7 @@ class TypedMultiJoinJobTest extends WordSpec with Matchers { val rng = new java.util.Random val COUNT = 100 * 100 val KEYS = 10 - def mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + def mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt)) val mk0 = mk val mk1 = mk val mk2 = mk @@ -1132,9 +1188,8 @@ class TypedMultiJoinJobTest extends WordSpec with Matchers { v2 <- d2.get(k) } yield (v0s, (k, v1, v2))) } - .flatMap { - case (v0s, (k, v1, v2)) => - v0s.map { (k, _, v1, v2) } + .flatMap { case (v0s, (k, v1, v2)) => + v0s.map((k, _, v1, v2)) } .sorted @@ -1149,9 +1204,11 @@ class TypedMultiJoinJobTest extends WordSpec with Matchers { class TypedMultiSelfJoinJob(args: Args) extends Job(args) { val zero = TypedPipe.from(TypedText.tsv[(Int, Int)]("input0")) - val one = TypedPipe.from(TypedText.tsv[(Int, Int)]("input1")) + val one = TypedPipe + .from(TypedText.tsv[(Int, Int)]("input1")) // forceToReducers makes sure the first and the second part of - .group.forceToReducers + .group + .forceToReducers val cogroup = zero.group .join(one.max) @@ -1172,7 +1229,7 @@ class TypedMultiSelfJoinJobTest extends WordSpec with Matchers { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - def mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + def mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt)) val mk0 = mk val mk1 = mk JobTest(new TypedMultiSelfJoinJob(_)) @@ -1181,9 +1238,8 @@ class TypedMultiSelfJoinJobTest extends WordSpec with Matchers { .typedSink(TypedText.tsv[(Int, Int, Int, Int)]("output")) { outBuf => "correctly do a multi-self-join" in { def group(it: Seq[(Int, Int)])(red: (Int, Int) => Int): Map[Int, Int] = - it.groupBy(_._1).map { - case (k, kvs) => - (k, kvs.map(_._2).reduce(red)) + it.groupBy(_._1).map { case (k, kvs) => + (k, kvs.map(_._2).reduce(red)) } val d0 = mk0.groupBy(_._1).mapValues(_.map { case (_, v) => v }) @@ -1198,9 +1254,8 @@ class TypedMultiSelfJoinJobTest extends WordSpec with Matchers { v2 <- d2.get(k) } yield (v0s, (k, v1, v2))) } - .flatMap { - case (v0s, (k, v1, v2)) => - v0s.map { (k, _, v1, v2) } + .flatMap { case (v0s, (k, v1, v2)) => + v0s.map((k, _, v1, v2)) } .sorted @@ -1214,9 +1269,10 @@ class TypedMultiSelfJoinJobTest extends WordSpec with Matchers { } class TypedMapGroup(args: Args) extends Job(args) { - TypedPipe.from(TypedText.tsv[(Int, Int)]("input")) + TypedPipe + .from(TypedText.tsv[(Int, Int)]("input")) .group - .mapGroup { (k, iters) => iters.map(_ * k) } + .mapGroup((k, iters) => iters.map(_ * k)) .max .write(TypedText.tsv[(Int, Int)]("output")) } @@ -1226,15 +1282,14 @@ class TypedMapGroupTest extends WordSpec with Matchers { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, 
rng.nextInt)) JobTest(new TypedMapGroup(_)) .source(TypedText.tsv[(Int, Int)]("input"), mk) .typedSink(TypedText.tsv[(Int, Int)]("output")) { outBuf => "correctly do a mapGroup" in { def mapGroup(it: Seq[(Int, Int)]): Map[Int, Int] = - it.groupBy(_._1).map { - case (k, kvs) => - (k, kvs.map { case (k, v) => k * v }.max) + it.groupBy(_._1).map { case (k, kvs) => + (k, kvs.map { case (k, v) => k * v }.max) } val correct = mapGroup(mk).toList.sorted @@ -1309,7 +1364,8 @@ class TypedSelfLeftCrossTest extends WordSpec with Matchers { class JoinMapGroupJob(args: Args) extends Job(args) { def r1 = TypedPipe.from(Seq((1, 10))) def r2 = TypedPipe.from(Seq((1, 1), (2, 2), (3, 3))) - r1.groupBy(_._1).join(r2.groupBy(_._1)) + r1.groupBy(_._1) + .join(r2.groupBy(_._1)) .mapGroup { case (a, b) => Iterator("a") } .write(TypedText.tsv("output")) } @@ -1356,7 +1412,8 @@ class MapValueStreamNonEmptyIteratorTest extends WordSpec with Matchers { } class NullSinkJob(args: Args, m: scala.collection.mutable.Buffer[Int]) extends Job(args) { - TypedPipe.from(0 to 100) + TypedPipe + .from(0 to 100) .map { i => m += i; i } // side effect .write(source.NullSink) } @@ -1384,13 +1441,12 @@ class TypedSketchJoinJob(args: Args) extends Job(args) { zero .sketch(args("reducers").toInt) .join(one) - .map{ case (k, (v0, v1)) => (k, v0, v1) } + .map { case (k, (v0, v1)) => (k, v0, v1) } .write(TypedText.tsv[(Int, Int, Int)]("output-sketch")) - zero - .group + zero.group .join(one.group) - .map{ case (k, (v0, v1)) => (k, v0, v1) } + .map { case (k, (v0, v1)) => (k, v0, v1) } .write(TypedText.tsv[(Int, Int, Int)]("output-join")) } @@ -1403,13 +1459,12 @@ class TypedSketchLeftJoinJob(args: Args) extends Job(args) { zero .sketch(args("reducers").toInt) .leftJoin(one) - .map{ case (k, (v0, v1)) => (k, v0, v1.getOrElse(-1)) } + .map { case (k, (v0, v1)) => (k, v0, v1.getOrElse(-1)) } .write(TypedText.tsv[(Int, Int, Int)]("output-sketch")) - zero - .group + zero.group .leftJoin(one.group) - .map{ case (k, (v0, v1)) => (k, v0, v1.getOrElse(-1)) } + .map { case (k, (v0, v1)) => (k, v0, v1.getOrElse(-1)) } .write(TypedText.tsv[(Int, Int, Int)]("output-join")) } @@ -1421,11 +1476,15 @@ object TypedSketchJoinTestHelper { (0 to size).flatMap { i => val k = next - (1 to dist(k)).map { j => (k, next) } + (1 to dist(k)).map(j => (k, next)) }.toList } - def runJobWithArguments(fn: (Args) => Job, reducers: Int, dist: (Int) => Int): (List[(Int, Int, Int)], List[(Int, Int, Int)]) = { + def runJobWithArguments( + fn: (Args) => Job, + reducers: Int, + dist: (Int) => Int + ): (List[(Int, Int, Int)], List[(Int, Int, Int)]) = { val sketchResult = Buffer[(Int, Int, Int)]() val innerResult = Buffer[(Int, Int, Int)]() @@ -1433,8 +1492,8 @@ object TypedSketchJoinTestHelper { .arg("reducers", reducers.toString) .source(TypedText.tsv[(Int, Int)]("input0"), generateInput(1000, 100, dist)) .source(TypedText.tsv[(Int, Int)]("input1"), generateInput(100, 100, x => 1)) - .typedSink(TypedText.tsv[(Int, Int, Int)]("output-sketch")) { outBuf => sketchResult ++= outBuf } - .typedSink(TypedText.tsv[(Int, Int, Int)]("output-join")) { outBuf => innerResult ++= outBuf } + .typedSink(TypedText.tsv[(Int, Int, Int)]("output-sketch"))(outBuf => sketchResult ++= outBuf) + .typedSink(TypedText.tsv[(Int, Int, Int)]("output-join"))(outBuf => innerResult ++= outBuf) .run .runHadoop .finish() @@ -1510,8 +1569,9 @@ class TypedPipeRequireTest extends FunSuite { def ex(req: Boolean) = { val ex = - TypedPipe.from((1 to 1000)) - .map { k => (k.toString, k) } + TypedPipe + .from((1 to 
1000)) + .map(k => (k.toString, k)) .join(TypedPipe.from((1 to 1000 by 5)).map(_.toString).asKeys) val g = if (req) ex.group.requireSingleValuePerKey.toTypedPipe @@ -1520,13 +1580,16 @@ class TypedPipeRequireTest extends FunSuite { g.toIterableExecution } - assert(ex(false).waitFor(Config.empty, Local(true)).get.toList.sorted == - ex(true).waitFor(Config.empty, Local(true)).get.toList.sorted) + assert( + ex(false).waitFor(Config.empty, Local(true)).get.toList.sorted == + ex(true).waitFor(Config.empty, Local(true)).get.toList.sorted + ) } } object TypedPipeConverterTest { - class TypedTsvWithCustomConverter[T: TypeDescriptor](nonSerializableObj: Any, path: String*) extends FixedTypedText[T](TypedText.TAB, path: _*) { + class TypedTsvWithCustomConverter[T: TypeDescriptor](nonSerializableObj: Any, path: String*) + extends FixedTypedText[T](TypedText.TAB, path: _*) { override def converter[U >: T]: TupleConverter[U] = super.converter.andThen { t: T => nonSerializableObj; t } } @@ -1536,7 +1599,8 @@ object TypedPipeConverterTest { val source = new TypedTsvWithCustomConverter[Int](new NonSerializableObj(), "input") class JobWithCustomConverter(args: Args) extends Job(args) { - TypedPipe.from(source) + TypedPipe + .from(source) .map(i => i + 1) .write(TypedText.tsv[Int]("output")) } @@ -1551,7 +1615,7 @@ class TypedPipeConverterTest extends FunSuite { JobTest(new JobWithCustomConverter(_)) .source(source, expected.map(_ - 1)) - .typedSink(TypedText.tsv[Int]("output")) { outBuf => result ++= outBuf } + .typedSink(TypedText.tsv[Int]("output"))(outBuf => result ++= outBuf) .runHadoop .finish() @@ -1566,13 +1630,15 @@ object TypedPipeCrossWithMapWithToPipeTest { val sink2 = TypedText.tsv[Int]("sink2") class TestJob(args: Args) extends Job(args) { - val mapPipe: TypedPipe[Map[Int, Int]] = TypedPipe.from(source) + val mapPipe: TypedPipe[Map[Int, Int]] = TypedPipe + .from(source) .groupAll .toList .mapValues(values => values.map(v => (v, v)).toMap) .values - val crossedMapped = TypedPipe.from(source) + val crossedMapped = TypedPipe + .from(source) .cross(mapPipe) .map { case (value, map) => map(value) } @@ -1611,7 +1677,8 @@ object TypedPipeCrossWithDifferentMapsAfterTest { val sink2 = TypedText.tsv[Int]("sink2") class TestJob(args: Args) extends Job(args) { - val mapPipe: TypedPipe[Map[Int, Int]] = TypedPipe.from(source) + val mapPipe: TypedPipe[Map[Int, Int]] = TypedPipe + .from(source) .groupAll .toList .mapValues(values => values.map(v => (v, v)).toMap) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedSinkWithTypedImplementationTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedSinkWithTypedImplementationTest.scala index 243b63041a..4b793504e4 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TypedSinkWithTypedImplementationTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedSinkWithTypedImplementationTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.flow.FlowDef @@ -44,7 +44,8 @@ class TypedSinkWithTypedImplementationRecursive(path: String) extends TypedSink[ } class TypedSinkWithTypedImplementationJob(args: Args) extends Job(args) { - TypedPipe.from(List("test")) + TypedPipe + .from(List("test")) .write(new TypedSinkWithTypedImplementation("output")) } @@ -61,7 +62,8 @@ class TypedSinkWithTypedImplementationTest extends WordSpec with Matchers { "A TypedSinkWithTypedImplementation" should { "should work with .writeExecution" in { val elements = List("test") - val elementsFromExecution = TypedPipe.from(elements) + val elementsFromExecution = TypedPipe + .from(elements) .writeExecution(new TypedSinkWithTypedImplementation("output")) .flatMap(_ => TypedPipe.from(TypedTsv[String]("output")).toIterableExecution) .waitFor(Config.default, HadoopTest(new Configuration(), _ => None)) @@ -75,12 +77,13 @@ class TypedSinkWithTypedImplementationTest extends WordSpec with Matchers { "A TypedSinkWithTypedImplementation" should { "should work with Execution.fromFn" in { val elements = List("test") - val elementsFromExecution = Execution.fromFn { case (confArg, modeArg) => - implicit val flowDef = new FlowDef - implicit val mode = modeArg - TypedPipe.from(elements).write(new TypedSinkWithTypedImplementation("output")) - flowDef - } + val elementsFromExecution = Execution + .fromFn { case (confArg, modeArg) => + implicit val flowDef = new FlowDef + implicit val mode = modeArg + TypedPipe.from(elements).write(new TypedSinkWithTypedImplementation("output")) + flowDef + } .flatMap(_ => TypedPipe.from(TypedTsv[String]("output")).toIterableExecution) .waitFor(Config.default, HadoopTest(new Configuration(), _ => None)) .get @@ -92,10 +95,13 @@ class TypedSinkWithTypedImplementationTest extends WordSpec with Matchers { "A TypedSinkWithTypedImplementationRecursive" should { "should fail" in { - assert(TypedPipe.from(List("test")) - .writeExecution(new TypedSinkWithTypedImplementationRecursive("output")) - .waitFor(Config.default, HadoopTest(new Configuration(), _ => None)) - .isFailure) + assert( + TypedPipe + .from(List("test")) + .writeExecution(new TypedSinkWithTypedImplementationRecursive("output")) + .waitFor(Config.default, HadoopTest(new Configuration(), _ => None)) + .isFailure + ) } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedSketchJoinJobForEmptyKeysTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedSketchJoinJobForEmptyKeysTest.scala index 275f4ca652..4aa144eef9 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TypedSketchJoinJobForEmptyKeysTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedSketchJoinJobForEmptyKeysTest.scala @@ -1,6 +1,6 @@ package com.twitter.scalding -import org.scalatest.{ WordSpec, Matchers } +import org.scalatest.{Matchers, WordSpec} class TypedSketchJoinJobForEmptyKeys(args: Args) extends Job(args) { // Deal with when a key appears in left but not right @@ -17,9 +17,8 @@ class TypedSketchJoinJobForEmptyKeys(args: Args) extends Job(args) { sketched.values sketched - .map { - case (a, (b, c)) => - (a, b, c.getOrElse(-1)) + .map { case (a, (b, c)) => + (a, b, c.getOrElse(-1)) } .write(TypedTsv("output")) } @@ -31,7 +30,7 @@ class TypedSketchJoinJobForEmptyKeysTest extends WordSpec with Matchers { .sink[(Int, Int, Int)](TypedTsv[(Int, Int, Int)]("output")) { outBuf => outBuf should have size 1 val unordered = outBuf.toSet - unordered should contain (1, 1111, -1) + unordered should contain(1, 1111, 
-1) } .run .finish() diff --git a/scalding-core/src/test/scala/com/twitter/scalding/WrappedJoinerTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/WrappedJoinerTest.scala index 94e76edff3..42b8561c2a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/WrappedJoinerTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/WrappedJoinerTest.scala @@ -2,11 +2,11 @@ package com.twitter.scalding import cascading.flow.FlowException import cascading.pipe.CoGroup -import cascading.pipe.joiner.{ JoinerClosure, InnerJoin } +import cascading.pipe.joiner.{InnerJoin, JoinerClosure} import cascading.tuple.Tuple -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} -import java.util.{ Iterator => JIterator } +import java.util.{Iterator => JIterator} class CheckFlowProcessJoiner(uniqueID: UniqueID) extends InnerJoin { override def getIterator(joinerClosure: JoinerClosure): JIterator[Tuple] = { @@ -64,7 +64,7 @@ class WrappedJoinerTest extends WordSpec with Matchers { fail("The test Job without WrappedJoiner should fail.") } catch { case ex: FlowException => - ex.getCause.getMessage should include ("the FlowProcess for unique id") + ex.getCause.getMessage should include("the FlowProcess for unique id") } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/XHandlerTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/XHandlerTest.scala index dd93ef5193..d84edbce49 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/XHandlerTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/XHandlerTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import cascading.flow.planner.PlannerException class XHandlerTest extends WordSpec with Matchers { @@ -28,7 +28,9 @@ class XHandlerTest extends WordSpec with Matchers { rxh.handlers.find(h => h(new NoSuchMethodError)) should not be empty rxh.handlers.find(h => h(new AbstractMethodError)) should not be empty rxh.handlers.find(h => h(new NoClassDefFoundError)) should not be empty - rxh.handlers.find(h => h(new ModeLoadException("dummy", new ClassNotFoundException))) should not be empty + rxh.handlers.find(h => + h(new ModeLoadException("dummy", new ClassNotFoundException)) + ) should not be empty } "be handled if exist in custom mapping" in { val cRxh = RichXHandler(RichXHandler.mapping ++ Map(classOf[NullPointerException] -> "NPE")) @@ -58,11 +60,21 @@ class XHandlerTest extends WordSpec with Matchers { val PlannerExceptionString = "cascadingflowplannerplannerexception" val ModeLoadExceptionString = "comtwitterscaldingmodeloadexception" RichXHandler.createXUrl(new PlannerException) shouldBe (RichXHandler.gitHubUrl + PlannerExceptionString) - RichXHandler.createXUrl(new InvalidSourceException("Invalid Source")) shouldBe (RichXHandler.gitHubUrl + InvalidSouceExceptionString) - RichXHandler.createXUrl(new NoSuchMethodError) shouldBe (RichXHandler.gitHubUrl + NoSuchMethodErrorString) - RichXHandler.createXUrl(new AbstractMethodError) shouldBe (RichXHandler.gitHubUrl + AbstractMethodErrorString) - RichXHandler.createXUrl(new NoClassDefFoundError) shouldBe (RichXHandler.gitHubUrl + NoClassDefFoundErrorString) - RichXHandler.createXUrl(ModeLoadException("dummy", new ClassNotFoundException)) shouldBe (RichXHandler.gitHubUrl + ModeLoadExceptionString) + RichXHandler.createXUrl( + new InvalidSourceException("Invalid Source") + ) shouldBe (RichXHandler.gitHubUrl + InvalidSouceExceptionString) + RichXHandler.createXUrl( + new NoSuchMethodError + ) shouldBe (RichXHandler.gitHubUrl + NoSuchMethodErrorString) + RichXHandler.createXUrl( + new AbstractMethodError + ) shouldBe (RichXHandler.gitHubUrl + AbstractMethodErrorString) + RichXHandler.createXUrl( + new NoClassDefFoundError + ) shouldBe (RichXHandler.gitHubUrl + NoClassDefFoundErrorString) + RichXHandler.createXUrl( + ModeLoadException("dummy", new ClassNotFoundException) + ) shouldBe (RichXHandler.gitHubUrl + ModeLoadExceptionString) } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/bdd/MultipleSourcesSpecTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/bdd/MultipleSourcesSpecTest.scala index 60975b4c08..8be0829dbe 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/bdd/MultipleSourcesSpecTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/bdd/MultipleSourcesSpecTest.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.bdd -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding.Dsl._ import com.twitter.scalding.RichPipe import scala.collection.mutable.Buffer @@ -11,45 +11,33 @@ class MultipleSourcesSpecTest extends WordSpec with Matchers with BddDsl { "A test with two sources" should { "accept an operation with two input pipes" in { Given { - List(("Stefano", "110"), ("Rajah", "220")) withSchema ('name, 'points) - } And { - List(("Stefano", "home1"), ("Rajah", "home2")) withSchema ('name, 'address) - } When { - (pipe1: RichPipe, pipe2: RichPipe) => - { - pipe1.joinWithSmaller('name -> 'name, pipe2).map('address 
-> 'address_transf) { - address: String => address + "_transf" - } - } - } Then { - buffer: Buffer[(String, String, String, String)] => - { - buffer.forall({ - case (_, _, _, addressTransf) => addressTransf.endsWith("_transf") - }) shouldBe true - } + List(("Stefano", "110"), ("Rajah", "220")).withSchema('name, 'points) + }.And { + List(("Stefano", "home1"), ("Rajah", "home2")).withSchema('name, 'address) + }.When { (pipe1: RichPipe, pipe2: RichPipe) => + pipe1.joinWithSmaller('name -> 'name, pipe2).map('address -> 'address_transf) { address: String => + address + "_transf" + } + }.Then { buffer: Buffer[(String, String, String, String)] => + buffer.forall { case (_, _, _, addressTransf) => + addressTransf.endsWith("_transf") + } shouldBe true } } "accept an operation with two input pipes using Tuples" in { Given { - List(new Tuple("Stefano", "110"), new Tuple("Rajah", "220")) withSchema ('name, 'points) - } And { - List(new Tuple("Stefano", "home1"), new Tuple("Rajah", "home2")) withSchema ('name, 'address) - } When { - (pipe1: RichPipe, pipe2: RichPipe) => - { - pipe1.joinWithSmaller('name -> 'name, pipe2).map('address -> 'address_transf) { - address: String => address + "_transf" - } - } - } Then { - buffer: Buffer[(String, String, String, String)] => - { - buffer.forall({ - case (_, _, _, addressTransf) => addressTransf.endsWith("_transf") - }) shouldBe true - } + List(new Tuple("Stefano", "110"), new Tuple("Rajah", "220")).withSchema('name, 'points) + }.And { + List(new Tuple("Stefano", "home1"), new Tuple("Rajah", "home2")).withSchema('name, 'address) + }.When { (pipe1: RichPipe, pipe2: RichPipe) => + pipe1.joinWithSmaller('name -> 'name, pipe2).map('address -> 'address_transf) { address: String => + address + "_transf" + } + }.Then { buffer: Buffer[(String, String, String, String)] => + buffer.forall { case (_, _, _, addressTransf) => + addressTransf.endsWith("_transf") + } shouldBe true } } } @@ -57,27 +45,21 @@ class MultipleSourcesSpecTest extends WordSpec with Matchers with BddDsl { "A test with three sources" should { "accept an operation with three input pipes" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col2) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col3) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col4) - } When { - (pipe1: RichPipe, pipe2: RichPipe, pipe3: RichPipe) => - { - pipe1 - .joinWithSmaller('col1 -> 'col1, pipe2) - .joinWithSmaller('col1 -> 'col1, pipe3) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col4) + }.When { (pipe1: RichPipe, pipe2: RichPipe, pipe3: RichPipe) => + pipe1 + .joinWithSmaller('col1 -> 'col1, pipe2) + .joinWithSmaller('col1 -> 'col1, pipe3) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } @@ -86,59 +68,47 @@ class MultipleSourcesSpecTest extends WordSpec with Matchers with BddDsl { "compile mixing an operation with inconsistent 
number of input pipes but fail at runtime" in { an[IllegalArgumentException] should be thrownBy { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col2) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col3) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col4) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col5) - } When { - (pipe1: RichPipe, pipe2: RichPipe, pipe3: RichPipe) => - { - pipe1 - .joinWithSmaller('col1 -> 'col1, pipe2) - .joinWithSmaller('col1 -> 'col1, pipe3) - .joinWithSmaller('col1 -> 'col1, pipe3) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col4) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col5) + }.When { (pipe1: RichPipe, pipe2: RichPipe, pipe3: RichPipe) => + pipe1 + .joinWithSmaller('col1 -> 'col1, pipe2) + .joinWithSmaller('col1 -> 'col1, pipe3) + .joinWithSmaller('col1 -> 'col1, pipe3) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } "be used with a function accepting a list of sources because there is no implicit for functions with more than three input pipes" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col2) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col4) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col5) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col6) - } When { - (pipes: List[RichPipe]) => - { - pipes.head - .joinWithSmaller('col1 -> 'col1, pipes(1)) - .joinWithSmaller('col1 -> 'col1, pipes(2)) - .joinWithSmaller('col1 -> 'col1, pipes(3)) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col4) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col5) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col6) + }.When { (pipes: List[RichPipe]) => + pipes.head + .joinWithSmaller('col1 -> 'col1, pipes(1)) + .joinWithSmaller('col1 -> 'col1, pipes(2)) + .joinWithSmaller('col1 -> 'col1, pipes(3)) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/bdd/SingleSourceSpecTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/bdd/SingleSourceSpecTest.scala index d095034fff..dc6d1452ae 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/bdd/SingleSourceSpecTest.scala +++ 
b/scalding-core/src/test/scala/com/twitter/scalding/bdd/SingleSourceSpecTest.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.bdd -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding.RichPipe import scala.collection.mutable.Buffer import cascading.pipe.Pipe @@ -11,95 +11,65 @@ class SingleSourceSpecTest extends WordSpec with Matchers with BddDsl { "A test with single source" should { "accept an operation with a single input rich pipe" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema (('col1, 'col2)) - } When { - pipe: RichPipe => - { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[(String, String, String)] => - { - buffer.forall({ - case (_, _, transformed) => transformed.endsWith("_transf") - }) shouldBe true - } + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema(('col1, 'col2)) + }.When { pipe: RichPipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" + } + }.Then { buffer: Buffer[(String, String, String)] => + buffer.forall { case (_, _, transformed) => + transformed.endsWith("_transf") + } shouldBe true } } "accept an operation with a single input pipe" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema (('col1, 'col2)) - } When { - pipe: Pipe => - { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[(String, String, String)] => - { - buffer.forall({ - case (_, _, transformed) => transformed.endsWith("_transf") - }) shouldBe true - } + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema(('col1, 'col2)) + }.When { pipe: Pipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" + } + }.Then { buffer: Buffer[(String, String, String)] => + buffer.forall { case (_, _, transformed) => + transformed.endsWith("_transf") + } shouldBe true } } "work with output as Tuple" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema (('col1, 'col2)) - } When { - pipe: RichPipe => - { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true - } + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema(('col1, 'col2)) + }.When { pipe: RichPipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" + } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } "work with input as simple type" in { Given { - List("col1_1", "col1_2") withSchema ('col1) - } When { - pipe: RichPipe => - { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(1).endsWith("_transf")) shouldBe true - } + List("col1_1", "col1_2").withSchema('col1) + }.When { pipe: RichPipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" + } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(1).endsWith("_transf")) shouldBe true } } "work with input as Tuple" in { Given { - List(new Tuple("col1_1", "col2_1"), new Tuple("col1_2", "col2_2")) withSchema (('col1, 'col2)) - } When { - pipe: RichPipe => - { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => 
tuple.getString(2).endsWith("_transf")) shouldBe true - } + List(new Tuple("col1_1", "col2_1"), new Tuple("col1_2", "col2_2")).withSchema(('col1, 'col2)) + }.When { pipe: RichPipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" + } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/bdd/SourceListSpecTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/bdd/SourceListSpecTest.scala index e659942834..7bef024c43 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/bdd/SourceListSpecTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/bdd/SourceListSpecTest.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.bdd -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding.RichPipe import scala.collection.mutable.Buffer import cascading.tuple.Tuple @@ -14,23 +14,18 @@ class SourceListSpecTest extends WordSpec with Matchers with BddDsl { an[IllegalArgumentException] should be thrownBy { Given { List( - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col3)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col4))) - } When { - (pipe1: RichPipe, pipe2: RichPipe) => - { - pipe1 - .joinWithSmaller('col1 -> 'col1, pipe2) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col4)) + ) + }.When { (pipe1: RichPipe, pipe2: RichPipe) => + pipe1 + .joinWithSmaller('col1 -> 'col1, pipe2) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } @@ -38,92 +33,72 @@ class SourceListSpecTest extends WordSpec with Matchers with BddDsl { "work properly with a multi rich-pipe function with same cardinality" in { Given { List( - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col3))) - } When { - (pipe1: RichPipe, pipe2: RichPipe) => - { - pipe1 - .joinWithSmaller('col1 -> 'col1, pipe2) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)) + ) + }.When { (pipe1: RichPipe, pipe2: RichPipe) => + pipe1 + .joinWithSmaller('col1 -> 'col1, pipe2) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } "work properly with a multi pipe function with same cardinality" in { Given { List( - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 
'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col3))) - } When { - (pipe1: Pipe, pipe2: Pipe) => - { - pipe1 - .joinWithSmaller('col1 -> 'col1, pipe2) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)) + ) + }.When { (pipe1: Pipe, pipe2: Pipe) => + pipe1 + .joinWithSmaller('col1 -> 'col1, pipe2) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } "work properly with a function accepting a list of rich pipes" in { Given { List( - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col3))) - } When { - (pipes: List[RichPipe]) => - { - pipes.head - .joinWithSmaller('col1 -> 'col1, pipes(1)) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)) + ) + }.When { (pipes: List[RichPipe]) => + pipes.head + .joinWithSmaller('col1 -> 'col1, pipes(1)) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } "work properly with a function accepting a list of pipes" in { Given { List( - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema ('col1, 'col3))) - } When { - (pipes: List[Pipe]) => - { - pipes.head - .joinWithSmaller('col1 -> 'col1, pipes(1)) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => - { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)) + ) + }.When { (pipes: List[Pipe]) => + pipes.head + .joinWithSmaller('col1 -> 'col1, pipes(1)) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/bdd/TypedApiTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/bdd/TypedApiTest.scala index 07f08ee60b..cd72c1023f 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/bdd/TypedApiTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/bdd/TypedApiTest.scala @@ -2,7 +2,7 @@ package com.twitter.scalding.bdd import cascading.flow.FlowException import com.twitter.scalding._ -import org.scalatest.{ Matchers, WordSpec } +import 
org.scalatest.{Matchers, WordSpec} import scala.math._ import scala.collection.mutable @@ -17,30 +17,29 @@ class TypedApiTest extends WordSpec with Matchers with TBddDsl { "accept an operation from working with a single tuple-typed pipe" in { Given { List(("Joe", "M", 40), ("Sarah", "F", 22)) - } When { - in: TypedPipe[(String, String, Int)] => - in.map[(String, Double)] { - case (name, "M", age) => (name, (1000.0 / (72 - age))) - case (name, _, age) => (name, (1000.0 / (80 - age))) - } - } Then { - buffer: mutable.Buffer[(String, Double)] => - buffer.toList shouldBe List(("Joe", 1000.0 / 32), ("Sarah", 1000.0 / 58)) + }.When { in: TypedPipe[(String, String, Int)] => + in.map[(String, Double)] { + case (name, "M", age) => (name, (1000.0 / (72 - age))) + case (name, _, age) => (name, (1000.0 / (80 - age))) + } + }.Then { buffer: mutable.Buffer[(String, Double)] => + buffer.toList shouldBe List(("Joe", 1000.0 / 32), ("Sarah", 1000.0 / 58)) } } "accept an operation from single case class-typed pipe" in { Given { List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) - } When { - in: TypedPipe[UserInfo] => - in.map { - case UserInfo(name, "M", age) => EstimatedContribution(name, (1000.0 / (72 - age))) - case UserInfo(name, _, age) => EstimatedContribution(name, (1000.0 / (80 - age))) - } - } Then { - buffer: mutable.Buffer[EstimatedContribution] => - buffer.toList shouldBe List(EstimatedContribution("Joe", 1000.0 / 32), EstimatedContribution("Sarah", 1000.0 / 58)) + }.When { in: TypedPipe[UserInfo] => + in.map { + case UserInfo(name, "M", age) => EstimatedContribution(name, (1000.0 / (72 - age))) + case UserInfo(name, _, age) => EstimatedContribution(name, (1000.0 / (80 - age))) + } + }.Then { buffer: mutable.Buffer[EstimatedContribution] => + buffer.toList shouldBe List( + EstimatedContribution("Joe", 1000.0 / 32), + EstimatedContribution("Sarah", 1000.0 / 58) + ) } } } @@ -50,42 +49,37 @@ class TypedApiTest extends WordSpec with Matchers with TBddDsl { "accept an operation from two tuple-typed pipes" in { Given { List(("Joe", "M"), ("Sarah", "F")) - } And { + }.And { List(("Joe", 40), ("Sarah", 22)) - } When { - (gender: TypedPipe[(String, String)], age: TypedPipe[(String, Int)]) => - gender - .group - .join(age.group) - .toTypedPipe - .map { value: (String, (String, Int)) => - val (name, (gender, age)) = value - (name, gender, age) - } - } Then { - buffer: mutable.Buffer[(String, String, Int)] => - buffer.toList shouldBe List(("Joe", "M", 40), ("Sarah", "F", 22)) + }.When { (gender: TypedPipe[(String, String)], age: TypedPipe[(String, Int)]) => + gender.group + .join(age.group) + .toTypedPipe + .map { value: (String, (String, Int)) => + val (name, (gender, age)) = value + (name, gender, age) + } + }.Then { buffer: mutable.Buffer[(String, String, Int)] => + buffer.toList shouldBe List(("Joe", "M", 40), ("Sarah", "F", 22)) } } "accept an operation from two case classes-typed pipes" in { Given { List(UserWithGender("Joe", "M"), UserWithGender("Sarah", "F")) - } And { + }.And { List(UserWithAge("Joe", 40), UserWithAge("Sarah", 22)) - } When { - (gender: TypedPipe[UserWithGender], age: TypedPipe[UserWithAge]) => - gender - .groupBy(_.name) - .join(age.groupBy(_.name)) - .mapValues { value: (UserWithGender, UserWithAge) => - val (withGender, withAge) = value - UserInfo(withGender.name, withGender.gender, withAge.age) - } - .values - } Then { - buffer: mutable.Buffer[UserInfo] => - buffer.toList shouldBe List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) + }.When { (gender: 
TypedPipe[UserWithGender], age: TypedPipe[UserWithAge]) => + gender + .groupBy(_.name) + .join(age.groupBy(_.name)) + .mapValues { value: (UserWithGender, UserWithAge) => + val (withGender, withAge) = value + UserInfo(withGender.name, withGender.gender, withAge.age) + } + .values + }.Then { buffer: mutable.Buffer[UserInfo] => + buffer.toList shouldBe List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) } } } @@ -95,9 +89,33 @@ class TypedApiTest extends WordSpec with Matchers with TBddDsl { GivenSources { List( List(UserWithGender("Joe", "M"), UserWithGender("Sarah", "F")), - List(UserWithAge("Joe", 40), UserWithAge("Sarah", 22))) - } When { - pipes: List[TypedPipe[_]] => + List(UserWithAge("Joe", 40), UserWithAge("Sarah", 22)) + ) + }.When { pipes: List[TypedPipe[_]] => + val gender = pipes(0).asInstanceOf[TypedPipe[UserWithGender]] // linter:ignore + val age = pipes(1).asInstanceOf[TypedPipe[UserWithAge]] // linter:ignore + + gender + .groupBy(_.name) + .join(age.groupBy(_.name)) + .mapValues { value: (UserWithGender, UserWithAge) => + val (withGender, withAge) = value + UserInfo(withGender.name, withGender.gender, withAge.age) + } + .values + }.Then { buffer: mutable.Buffer[UserInfo] => + buffer.toList shouldBe List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) + } + } + + "not checking the types of the sources and fail if any error occurs" in { + an[FlowException] should be thrownBy { + GivenSources { + List( + List(UserWithGender("Joe", "M"), UserWithGender("Sarah", "F")), + List(("Joe", 40), ("Sarah", 22)) + ) + }.When { pipes: List[TypedPipe[_]] => val gender = pipes(0).asInstanceOf[TypedPipe[UserWithGender]] // linter:ignore val age = pipes(1).asInstanceOf[TypedPipe[UserWithAge]] // linter:ignore @@ -109,34 +127,8 @@ class TypedApiTest extends WordSpec with Matchers with TBddDsl { UserInfo(withGender.name, withGender.gender, withAge.age) } .values - } Then { - buffer: mutable.Buffer[UserInfo] => + }.Then { buffer: mutable.Buffer[UserInfo] => buffer.toList shouldBe List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) - } - } - - "not checking the types of the sources and fail if any error occurs" in { - an[FlowException] should be thrownBy { - GivenSources { - List( - List(UserWithGender("Joe", "M"), UserWithGender("Sarah", "F")), - List(("Joe", 40), ("Sarah", 22))) - } When { - pipes: List[TypedPipe[_]] => - val gender = pipes(0).asInstanceOf[TypedPipe[UserWithGender]] // linter:ignore - val age = pipes(1).asInstanceOf[TypedPipe[UserWithAge]] // linter:ignore - - gender - .groupBy(_.name) - .join(age.groupBy(_.name)) - .mapValues { value: (UserWithGender, UserWithAge) => - val (withGender, withAge) = value - UserInfo(withGender.name, withGender.gender, withAge.age) - } - .values - } Then { - buffer: mutable.Buffer[UserInfo] => - buffer.toList shouldBe List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) } } } @@ -144,44 +136,41 @@ class TypedApiTest extends WordSpec with Matchers with TBddDsl { "be created when adding a source to four sources" in { Given { List(("Joe", "user1"), ("Sarah", "user2")) - } And { + }.And { List(("user1", "M"), ("user2", "F")) - } And { + }.And { List(("user1", 40), ("user2", 22)) - } And { - List(("user1", 1000l), ("user2", 800l)) - } And { + }.And { + List(("user1", 1000L), ("user2", 800L)) + }.And { List(("user1", true), ("user2", false)) - } When { - pipes: List[TypedPipe[_]] => - val withUserID = pipes(0).asInstanceOf[TypedPipe[(String, String)]] // linter:ignore - val withGender = pipes(1).asInstanceOf[TypedPipe[(String, 
String)]] - val withAge = pipes(2).asInstanceOf[TypedPipe[(String, Int)]] - val withIncome = pipes(3).asInstanceOf[TypedPipe[(String, Long)]] - val withSmoker = pipes(4).asInstanceOf[TypedPipe[(String, Boolean)]] + }.When { pipes: List[TypedPipe[_]] => + val withUserID = pipes(0).asInstanceOf[TypedPipe[(String, String)]] // linter:ignore + val withGender = pipes(1).asInstanceOf[TypedPipe[(String, String)]] + val withAge = pipes(2).asInstanceOf[TypedPipe[(String, Int)]] + val withIncome = pipes(3).asInstanceOf[TypedPipe[(String, Long)]] + val withSmoker = pipes(4).asInstanceOf[TypedPipe[(String, Boolean)]] - withUserID - .swap.group - .join(withGender.group) - .join(withAge.group) - .join(withIncome.group) - .join(withSmoker.group) - .flatMapValues { - case ((((name: String, gender: String), age: Int), income: Long), smoker) => - val lifeExpectancy = (gender, smoker) match { - case ("M", true) => 68 - case ("M", false) => 72 - case (_, true) => 76 - case (_, false) => 80 - } + withUserID.swap.group + .join(withGender.group) + .join(withAge.group) + .join(withIncome.group) + .join(withSmoker.group) + .flatMapValues { + case ((((name: String, gender: String), age: Int), income: Long), smoker) => + val lifeExpectancy = (gender, smoker) match { + case ("M", true) => 68 + case ("M", false) => 72 + case (_, true) => 76 + case (_, false) => 80 + } - Some(EstimatedContribution(name, floor(income / (lifeExpectancy - age)))) - case _ => None - } - .values - } Then { - buffer: mutable.Buffer[EstimatedContribution] => - buffer.toList shouldBe List(EstimatedContribution("Joe", 35.0), EstimatedContribution("Sarah", 13.0)) + Some(EstimatedContribution(name, floor(income / (lifeExpectancy - age)))) + case _ => None + } + .values + }.Then { buffer: mutable.Buffer[EstimatedContribution] => + buffer.toList shouldBe List(EstimatedContribution("Joe", 35.0), EstimatedContribution("Sarah", 13.0)) } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategyTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategyTest.scala index 0a2a34e767..0b8ce6e6ec 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategyTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategyTest.scala @@ -1,7 +1,7 @@ package com.twitter.scalding.estimation.memory import org.apache.hadoop.mapred.JobConf -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class MemoryEstimatorStepStrategyTest extends WordSpec with Matchers { "A Memory estimator step strategy" should { @@ -28,9 +28,8 @@ class MemoryEstimatorStepStrategyTest extends WordSpec with Matchers { def confWith(values: Map[String, String]): JobConf = { val conf = new JobConf(false) - values.foreach { - case (k, v) => - conf.set(k, v) + values.foreach { case (k, v) => + conf.set(k, v) } conf diff --git a/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimatorTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimatorTest.scala index 0173a9a16f..ac73f9878a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimatorTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimatorTest.scala @@ -1,12 +1,12 @@ package com.twitter.scalding.estimation.memory 
import cascading.flow.FlowStep -import com.twitter.scalding.estimation.{ FlowStepHistory, FlowStrategyInfo, HistoryService, Task } +import com.twitter.scalding.estimation.{FlowStepHistory, FlowStrategyInfo, HistoryService, Task} import org.apache.hadoop.mapred.JobConf import org.mockito.Mockito._ import org.mockito.Matchers._ -import org.scalatest.{ Matchers, WordSpec } -import scala.util.{ Success, Try } +import org.scalatest.{Matchers, WordSpec} +import scala.util.{Success, Try} class SmoothedHistoryMemoryEstimatorTest extends WordSpec with Matchers { import Utils._ @@ -18,8 +18,7 @@ class SmoothedHistoryMemoryEstimatorTest extends WordSpec with Matchers { "estimate correct numbers for only reducers" in { val estimation = SmoothedMemoryEstimator - .makeHistory(Seq( - "REDUCE" -> 1024.megabytes)) + .makeHistory(Seq("REDUCE" -> 1024.megabytes)) .estimate(TestFlowStrategyInfo.dummy) estimation shouldBe reduceEstimate((1228, 1536)) @@ -27,8 +26,7 @@ class SmoothedHistoryMemoryEstimatorTest extends WordSpec with Matchers { "estimate correct numbers for only mappers" in { val estimation = SmoothedMemoryEstimator - .makeHistory(Seq( - "MAP" -> 1024.megabytes)) + .makeHistory(Seq("MAP" -> 1024.megabytes)) .estimate(TestFlowStrategyInfo.dummy) estimation shouldBe mapEstimate((1228, 1536)) @@ -36,15 +34,18 @@ class SmoothedHistoryMemoryEstimatorTest extends WordSpec with Matchers { "estimate correct numbers" in { val estimation = SmoothedMemoryEstimator - .makeHistory(Seq( - "MAP" -> 800.megabytes, - "REDUCE" -> 800.megabytes, - "MAP" -> 1024.megabytes, - "REDUCE" -> 1024.megabytes, - "MAP" -> 1300.megabytes, - "REDUCE" -> 1300.megabytes, - "MAP" -> 723.megabytes, - "REDUCE" -> 723.megabytes)) + .makeHistory( + Seq( + "MAP" -> 800.megabytes, + "REDUCE" -> 800.megabytes, + "MAP" -> 1024.megabytes, + "REDUCE" -> 1024.megabytes, + "MAP" -> 1300.megabytes, + "REDUCE" -> 1300.megabytes, + "MAP" -> 723.megabytes, + "REDUCE" -> 723.megabytes + ) + ) .estimate(TestFlowStrategyInfo.dummy) estimation shouldBe Some(MemoryEstimate(Some((1228, 1536)), Some((1228, 1536)))) @@ -53,13 +54,15 @@ class SmoothedHistoryMemoryEstimatorTest extends WordSpec with Matchers { "estimate less than max cap" in { val conf = TestFlowStrategyInfo.dummy.step.getConfig val estimation = SmoothedMemoryEstimator - .makeHistory(Seq( - "MAP" -> (MemoryEstimatorConfig.getMaxContainerMemory(conf).megabyte + 1.gigabyte))) + .makeHistory(Seq("MAP" -> (MemoryEstimatorConfig.getMaxContainerMemory(conf).megabyte + 1.gigabyte))) .estimate(TestFlowStrategyInfo.dummy) val expectedEstimation = ( - (MemoryEstimatorConfig.getMaxContainerMemory(conf) / MemoryEstimatorConfig.getXmxScaleFactor(conf)).toLong, - MemoryEstimatorConfig.getMaxContainerMemory(conf)) + (MemoryEstimatorConfig.getMaxContainerMemory(conf) / MemoryEstimatorConfig.getXmxScaleFactor( + conf + )).toLong, + MemoryEstimatorConfig.getMaxContainerMemory(conf) + ) estimation shouldBe mapEstimate(expectedEstimation) } @@ -67,13 +70,17 @@ class SmoothedHistoryMemoryEstimatorTest extends WordSpec with Matchers { "estimate not less than min cap" in { val conf = TestFlowStrategyInfo.dummy.step.getConfig val estimation = SmoothedMemoryEstimator - .makeHistory(Seq( - "MAP" -> (MemoryEstimatorConfig.getMinContainerMemory(conf).megabyte - 500.megabyte))) + .makeHistory( + Seq("MAP" -> (MemoryEstimatorConfig.getMinContainerMemory(conf).megabyte - 500.megabyte)) + ) .estimate(TestFlowStrategyInfo.dummy) val expectedEstimation = ( - (MemoryEstimatorConfig.getMinContainerMemory(conf) / 
MemoryEstimatorConfig.getXmxScaleFactor(conf)).toLong, - MemoryEstimatorConfig.getMinContainerMemory(conf)) + (MemoryEstimatorConfig.getMinContainerMemory(conf) / MemoryEstimatorConfig.getXmxScaleFactor( + conf + )).toLong, + MemoryEstimatorConfig.getMinContainerMemory(conf) + ) estimation shouldBe mapEstimate(expectedEstimation) } @@ -86,42 +93,41 @@ object EmptyHistoryService extends HistoryService { } class DummyHistoryService(val history: Seq[(String, Long)]) extends HistoryService { - override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = { - Success(history.map { - case (taskType, memory) => - val task = Task( - details = Map( - Task.TaskType -> taskType), - counters = Map( - SmoothedHistoryMemoryEstimator.CommittedHeapBytes -> memory)) - val tasks = Seq(task) - FlowStepHistory( - keys = null, - submitTimeMillis = 0, - launchTimeMillis = 0L, - finishTimeMillis = 0L, - totalMaps = 0L, - totalReduces = 0L, - finishedMaps = 0L, - finishedReduces = 0L, - failedMaps = 0L, - failedReduces = 0L, - mapFileBytesRead = 0L, - mapFileBytesWritten = 0L, - mapOutputBytes = 0l, - reduceFileBytesRead = 0l, - hdfsBytesRead = 0l, - hdfsBytesWritten = 0L, - mapperTimeMillis = 0L, - reducerTimeMillis = 0L, - reduceShuffleBytes = 0L, - cost = 1.1, - tasks = tasks) + override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + Success(history.map { case (taskType, memory) => + val task = Task( + details = Map(Task.TaskType -> taskType), + counters = Map(SmoothedHistoryMemoryEstimator.CommittedHeapBytes -> memory) + ) + val tasks = Seq(task) + FlowStepHistory( + keys = null, + submitTimeMillis = 0, + launchTimeMillis = 0L, + finishTimeMillis = 0L, + totalMaps = 0L, + totalReduces = 0L, + finishedMaps = 0L, + finishedReduces = 0L, + failedMaps = 0L, + failedReduces = 0L, + mapFileBytesRead = 0L, + mapFileBytesWritten = 0L, + mapOutputBytes = 0L, + reduceFileBytesRead = 0L, + hdfsBytesRead = 0L, + hdfsBytesWritten = 0L, + mapperTimeMillis = 0L, + reducerTimeMillis = 0L, + reduceShuffleBytes = 0L, + cost = 1.1, + tasks = tasks + ) }) - } } -class SmoothedMemoryEstimator(override val historyService: HistoryService) extends SmoothedHistoryMemoryEstimator +class SmoothedMemoryEstimator(override val historyService: HistoryService) + extends SmoothedHistoryMemoryEstimator object SmoothedMemoryEstimator { def empty: SmoothedMemoryEstimator = new SmoothedMemoryEstimator(EmptyHistoryService) @@ -164,4 +170,4 @@ object Utils { def reduceEstimate(value: (Long, Long)): Some[MemoryEstimate] = Some(MemoryEstimate(None, reduceMemoryInMB = Some(value))) -} \ No newline at end of file +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/filecache/DistributedCacheFileSpec.scala b/scalding-core/src/test/scala/com/twitter/scalding/filecache/DistributedCacheFileSpec.scala index e396c66a49..2db8e0c6cc 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/filecache/DistributedCacheFileSpec.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/filecache/DistributedCacheFileSpec.scala @@ -12,13 +12,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.filecache import cascading.tuple.Tuple import com.twitter.scalding._ import java.net.URI -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.collection.mutable // TODO: fix? is it worth having the dep on mockito just for this? @@ -42,7 +42,7 @@ class DistributedCacheFileSpec extends WordSpec with Matchers { lazy val testMode = smartMock[Test] lazy val localMode = smartMock[Local] -*/ + */ val uriString = "hdfs://foo.example:1234/path/to/the/stuff/thefilename.blah" val uri = new URI(uriString) val hashHex = URIHasher(uri) @@ -80,5 +80,5 @@ class DistributedCacheFileSpec extends WordSpec with Matchers { an[RuntimeException] should be thrownBy (dcf.add()(mode)) } } - */ + */ } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/macros/MacrosUnitTests.scala b/scalding-core/src/test/scala/com/twitter/scalding/macros/MacrosUnitTests.scala index c3ad38ea6c..ef4996e52f 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/macros/MacrosUnitTests.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/macros/MacrosUnitTests.scala @@ -15,15 +15,15 @@ */ package com.twitter.scalding.macros -import cascading.tuple.{ Tuple => CTuple, TupleEntry } -import com.twitter.bijection.macros.{ IsCaseClass, MacroGenerated } +import cascading.tuple.{Tuple => CTuple, TupleEntry} +import com.twitter.bijection.macros.{IsCaseClass, MacroGenerated} import com.twitter.scalding._ import com.twitter.scalding.serialization.Externalizer import org.scalacheck.Arbitrary import org.scalacheck.Prop import org.scalacheck.Prop.forAll import org.scalacheck.Properties -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.reflect.runtime.universe._ // We avoid nesting these just to avoid any complications in the serialization test @@ -45,9 +45,8 @@ object MacroProperties extends Properties("TypeDescriptor.roundTrip") { converter(new TupleEntry(fields, setter(t))) == t } - def propertyFor[T: TypeTag: Arbitrary: TypeDescriptor]: Unit = { + def propertyFor[T: TypeTag: Arbitrary: TypeDescriptor]: Unit = property(typeTag[T].tpe.toString) = roundTrip[T] - } propertyFor[Int] propertyFor[Option[Int]] @@ -74,18 +73,21 @@ class MacrosUnitTests extends WordSpec with Matchers { def fields = sys.error("dummy") } - def isMacroTupleConverterAvailable[T](implicit proof: TupleConverter[T] = dummy.asInstanceOf[TupleConverter[T]]) = + def isMacroTupleConverterAvailable[T](implicit + proof: TupleConverter[T] = dummy.asInstanceOf[TupleConverter[T]] + ) = proof.isInstanceOf[MacroGenerated] - def isMacroTypeDescriptorAvailable[T](implicit proof: TypeDescriptor[T] = dummy2.asInstanceOf[TypeDescriptor[T]]) = + def isMacroTypeDescriptorAvailable[T](implicit + proof: TypeDescriptor[T] = dummy2.asInstanceOf[TypeDescriptor[T]] + ) = proof.isInstanceOf[MacroGenerated] def mgConv[T](te: TupleEntry)(implicit conv: TupleConverter[T]): T = isMg(conv)(te) def mgSet[T](t: T)(implicit set: TupleSetter[T]): TupleEntry = new TupleEntry(isMg(set)(t)) - def shouldRoundTrip[T: IsCaseClass: TupleSetter: TupleConverter](t: T): Unit = { + def shouldRoundTrip[T: IsCaseClass: TupleSetter: TupleConverter](t: T): Unit = t shouldBe mgConv(mgSet(t)) - } def shouldRoundTripOther[T: IsCaseClass: TupleSetter: TupleConverter](te: TupleEntry, t: T): Unit = { val inter = mgConv(te) @@ -93,9 +95,8 @@ class MacrosUnitTests extends WordSpec with Matchers { mgSet(inter) shouldBe te } - def canExternalize(t: AnyRef): Unit = { + def 
canExternalize(t: AnyRef): Unit = Externalizer(t).javaWorks shouldBe true - } "MacroGenerated TupleConverter" should { "Not compile for Option[Option[Int]]" in { @@ -135,12 +136,16 @@ class MacrosUnitTests extends WordSpec with Matchers { "Generate the converter SampleClassE" in { Macros.caseClassTupleConverter[SampleClassE] } "Generate the converter SampleClassF" in { Macros.caseClassTupleConverter[SampleClassF] } "Generate the converter SampleClassG" in { Macros.caseClassTupleConverterWithUnknown[SampleClassG] } - "Generate the converter Option[(Int, String)]" in { Macros.caseClassTupleConverter[Option[(Int, String)]] } + "Generate the converter Option[(Int, String)]" in { + Macros.caseClassTupleConverter[Option[(Int, String)]] + } "Generate the converter Option[(Int, Option[(Long, String)])]" in { Macros.caseClassTupleConverter[Option[(Int, Option[(Long, String)])]] } - "Not generate a convertor for SampleClassFail" in { isMacroTupleConverterAvailable[SampleClassFail] shouldBe false } + "Not generate a convertor for SampleClassFail" in { + isMacroTupleConverterAvailable[SampleClassFail] shouldBe false + } def doesJavaWork[T](implicit conv: TupleConverter[T]): Unit = canExternalize(isMg(conv)) @@ -161,7 +166,9 @@ class MacrosUnitTests extends WordSpec with Matchers { "Generate the converter SampleClassF" in { Macros.caseClassTypeDescriptor[SampleClassF] } "Generate the converter SampleClassG" in { Macros.caseClassTypeDescriptorWithUnknown[SampleClassG] } - "Not generate a convertor for SampleClassFail" in { isMacroTypeDescriptorAvailable[SampleClassFail] shouldBe false } + "Not generate a convertor for SampleClassFail" in { + isMacroTypeDescriptorAvailable[SampleClassFail] shouldBe false + } def doesJavaWork[T](implicit conv: TypeDescriptor[T]): Unit = canExternalize(isMg(conv)) @@ -179,7 +186,13 @@ class MacrosUnitTests extends WordSpec with Matchers { shouldRoundTrip(SampleClassB(SampleClassA(100, "onehundred"), SampleClassA(-1, "zero"), "what")) val a = SampleClassA(73, "hrmA1") val b = SampleClassB(a, a, "hrmB1") - val c = SampleClassC(a, b, SampleClassA(123980, "heyA2"), SampleClassB(a, SampleClassA(-1, "zeroA3"), "zooB2"), b) + val c = SampleClassC( + a, + b, + SampleClassA(123980, "heyA2"), + SampleClassB(a, SampleClassA(-1, "zeroA3"), "zooB2"), + b + ) shouldRoundTrip(b) shouldRoundTrip(c) shouldRoundTrip(SampleClassD(Some(c))) @@ -191,11 +204,13 @@ class MacrosUnitTests extends WordSpec with Matchers { } "Case Class should form expected tuple" in { - val input = SampleClassC(SampleClassA(1, "asdf"), + val input = SampleClassC( + SampleClassA(1, "asdf"), SampleClassB(SampleClassA(2, "bcdf"), SampleClassA(5, "jkfs"), "wetew"), SampleClassA(9, "xcmv"), SampleClassB(SampleClassA(23, "ck"), SampleClassA(13, "dafk"), "xcv"), - SampleClassB(SampleClassA(34, "were"), SampleClassA(654, "power"), "adsfmx")) + SampleClassB(SampleClassA(34, "were"), SampleClassA(654, "power"), "adsfmx") + ) val setter = implicitly[TupleSetter[SampleClassC]] val tup = setter(input) assert(tup.size == 19) @@ -254,11 +269,18 @@ class MacrosUnitTests extends WordSpec with Matchers { "Case Class should form expected Fields" in { val fields = Macros.toFields[SampleClassB] assert(fields.size === 5) - assert(fields.getTypes === Array[java.lang.reflect.Type](classOf[Int], classOf[String], classOf[Int], classOf[String], classOf[String])) + assert( + fields.getTypes === Array[java.lang.reflect.Type]( + classOf[Int], + classOf[String], + classOf[Int], + classOf[String], + classOf[String] + ) + ) val names = 
List("a1.x", "a1.y", "a2.x", "a2.y", "y") - names.zipWithIndex.foreach { - case (name, indx) => - assert(fields.get(indx) === name) + names.zipWithIndex.foreach { case (name, indx) => + assert(fields.get(indx) === name) } } @@ -277,11 +299,18 @@ class MacrosUnitTests extends WordSpec with Matchers { "Case Class should form expected Indexed Fields" in { val fields = Macros.toIndexedFields[SampleClassB] assert(fields.size === 5) - assert(fields.getTypes === Array[java.lang.reflect.Type](classOf[Int], classOf[String], classOf[Int], classOf[String], classOf[String])) - val names = (0 until fields.size) - names.zipWithIndex.foreach { - case (name, indx) => - assert(fields.get(indx) === name) + assert( + fields.getTypes === Array[java.lang.reflect.Type]( + classOf[Int], + classOf[String], + classOf[Int], + classOf[String], + classOf[String] + ) + ) + val names = 0 until fields.size + names.zipWithIndex.foreach { case (name, indx) => + assert(fields.get(indx) === name) } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/CombinatoricsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/CombinatoricsTest.scala index 9008203fcc..9a9f179971 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/CombinatoricsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/CombinatoricsTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ class CombinatoricsJob(args: Args) extends Job(args) { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/HistogramTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/HistogramTest.scala index 8940d4e534..35d7495fd8 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/HistogramTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/HistogramTest.scala @@ -12,23 +12,25 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.mathematics -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ class HistogramJob(args: Args) extends Job(args) { try { val hist = Tsv("input", 'n) - .groupAll{ _.histogram('n -> 'hist) } + .groupAll(_.histogram('n -> 'hist)) hist - .flatMapTo('hist -> ('bin, 'cdf)){ h: Histogram => h.cdf } + .flatMapTo('hist -> ('bin, 'cdf)) { h: Histogram => h.cdf } .write(Tsv("cdf-output")) hist - .mapTo('hist -> ('min, 'max, 'sum, 'mean, 'stdDev)){ h: Histogram => (h.min, h.max, h.sum, h.mean, h.stdDev) } + .mapTo('hist -> ('min, 'max, 'sum, 'mean, 'stdDev)) { h: Histogram => + (h.min, h.max, h.sum, h.mean, h.stdDev) + } .write(Tsv("stats-output")) } catch { @@ -43,7 +45,7 @@ class HistogramJobTest extends WordSpec with Matchers { val cdfOutput = Set((1.0, 0.3), (2.0, 0.5), (3.0, 0.8), (4.0, 0.9), (8.0, 1.0)) "A HistogramJob" should { JobTest(new HistogramJob(_)) - .source(Tsv("input", ('n)), inputData) + .source(Tsv("input", 'n), inputData) .sink[(Double, Double, Double, Double, Double)](Tsv("stats-output")) { buf => val (min, max, sum, mean, stdDev) = buf.head "correctly compute the min" in { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2OptimizationTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2OptimizationTest.scala index 00af2dbcfc..581fd54a91 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2OptimizationTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2OptimizationTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.mathematics import org.scalacheck.Arbitrary @@ -21,7 +21,7 @@ import org.scalacheck.Properties import org.scalacheck.Prop.forAll import org.scalacheck._ import org.scalacheck.Gen._ -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ import Matrix2._ import cascading.flow.FlowDef @@ -29,8 +29,7 @@ import com.twitter.algebird.Ring import com.twitter.scalding.IterableSource /** - * Unit tests used in development - * (stronger properties are tested in ScalaCheck tests at the end) + * Unit tests used in development (stronger properties are tested in ScalaCheck tests at the end) */ class Matrix2OptimizationSpec extends WordSpec with Matchers { import com.twitter.scalding.Test @@ -44,33 +43,49 @@ class Matrix2OptimizationSpec extends WordSpec with Matchers { implicit val ord1: Ordering[Int] = Ordering.Int implicit val ord2: Ordering[(Int, Int)] = Ordering.Tuple2[Int, Int] - def literal(tpipe: TypedPipe[(Int, Int, Double)], sizeHint: SizeHint): MatrixLiteral[Any, Any, Double] = MatrixLiteral(tpipe, sizeHint).asInstanceOf[MatrixLiteral[Any, Any, Double]] - def product(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Product[Any, Any, Any, Double] = Product(left, right, ring) - def sum(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Sum[Any, Any, Double] = Sum(left, right, ring) + def literal(tpipe: TypedPipe[(Int, Int, Double)], sizeHint: SizeHint): MatrixLiteral[Any, Any, Double] = + MatrixLiteral(tpipe, sizeHint).asInstanceOf[MatrixLiteral[Any, Any, Double]] + def product( + left: Matrix2[Any, Any, Double], + right: Matrix2[Any, Any, Double] + ): Product[Any, Any, Any, Double] = Product(left, right, ring) + def sum(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Sum[Any, Any, Double] = + Sum(left, right, ring) /** * Values used in tests */ // ((A1(A2 A3))((A4 A5) A6) val optimizedPlan = product( // linter:ignore - product(literal(globM, FiniteHint(30, 35)), - product(literal(globM, FiniteHint(35, 15)), - literal(globM, FiniteHint(15, 5)))), product( - product(literal(globM, FiniteHint(5, 10)), - literal(globM, FiniteHint(10, 20))), - literal(globM, FiniteHint(20, 25)))) + literal(globM, FiniteHint(30, 35)), + product(literal(globM, FiniteHint(35, 15)), literal(globM, FiniteHint(15, 5))) + ), + product( + product(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(10, 20))), + literal(globM, FiniteHint(20, 25)) + ) + ) val optimizedPlanCost = 1850 // originally 15125.0 // A1(A2(A3(A4(A5 A6)))) - val unoptimizedPlan = product(literal(globM, FiniteHint(30, 35)), // linter:ignore - product(literal(globM, FiniteHint(35, 15)), - product(literal(globM, FiniteHint(15, 5)), - product(literal(globM, FiniteHint(5, 10)), - product(literal(globM, FiniteHint(10, 20)), literal(globM, FiniteHint(20, 25))))))) + val unoptimizedPlan = product( + literal(globM, FiniteHint(30, 35)), // linter:ignore + product( + literal(globM, FiniteHint(35, 15)), + product( + literal(globM, FiniteHint(15, 5)), + product( + literal(globM, FiniteHint(5, 10)), + product(literal(globM, FiniteHint(10, 20)), literal(globM, FiniteHint(20, 25))) + ) + ) + ) + ) - val simplePlan = product(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 25))) // linter:ignore + val simplePlan = + product(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 25))) // linter:ignore val simplePlanCost = 750 //originally 26250 @@ -82,50 +97,78 @@ class 
Matrix2OptimizationSpec extends WordSpec with Matchers { // A1 * (A2 * (A3 * ( A4 + A4 ) * (A5 * (A6)))) - val unoptimizedGlobalPlan = product(literal(globM, FiniteHint(30, 35)), // linter:ignore - product(literal(globM, FiniteHint(35, 15)), - product(literal(globM, FiniteHint(15, 5)), - product(sum(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(5, 10))), - product(literal(globM, FiniteHint(10, 20)), literal(globM, FiniteHint(20, 25))))))) + val unoptimizedGlobalPlan = product( + literal(globM, FiniteHint(30, 35)), // linter:ignore + product( + literal(globM, FiniteHint(35, 15)), + product( + literal(globM, FiniteHint(15, 5)), + product( + sum(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(5, 10))), + product(literal(globM, FiniteHint(10, 20)), literal(globM, FiniteHint(20, 25))) + ) + ) + ) + ) // ((A1(A2 A3))(((A4 + A4) A5) A6) val optimizedGlobalPlan = product( // linter:ignore - product(literal(globM, FiniteHint(30, 35)), - product(literal(globM, FiniteHint(35, 15)), - literal(globM, FiniteHint(15, 5)))), product( - product(sum(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(5, 10))), - literal(globM, FiniteHint(10, 20))), - literal(globM, FiniteHint(20, 25)))) - - val productSequence = IndexedSeq(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 15)), - literal(globM, FiniteHint(15, 5)), literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(10, 20)), - literal(globM, FiniteHint(20, 25))) - - val combinedSequence = List(IndexedSeq(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 15)), - literal(globM, FiniteHint(15, 5)), literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(10, 20)), - literal(globM, FiniteHint(20, 25))), IndexedSeq(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 25)))) - - val planWithSum = product(literal(globM, FiniteHint(30, 35)), sum(literal(globM, FiniteHint(35, 25)), literal(globM, FiniteHint(35, 25)))) // linter:ignore + literal(globM, FiniteHint(30, 35)), + product(literal(globM, FiniteHint(35, 15)), literal(globM, FiniteHint(15, 5))) + ), + product( + product( + sum(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(5, 10))), + literal(globM, FiniteHint(10, 20)) + ), + literal(globM, FiniteHint(20, 25)) + ) + ) + + val productSequence = IndexedSeq( + literal(globM, FiniteHint(30, 35)), + literal(globM, FiniteHint(35, 15)), + literal(globM, FiniteHint(15, 5)), + literal(globM, FiniteHint(5, 10)), + literal(globM, FiniteHint(10, 20)), + literal(globM, FiniteHint(20, 25)) + ) + + val combinedSequence = List( + IndexedSeq( + literal(globM, FiniteHint(30, 35)), + literal(globM, FiniteHint(35, 15)), + literal(globM, FiniteHint(15, 5)), + literal(globM, FiniteHint(5, 10)), + literal(globM, FiniteHint(10, 20)), + literal(globM, FiniteHint(20, 25)) + ), + IndexedSeq(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 25))) + ) + + val planWithSum = product( + literal(globM, FiniteHint(30, 35)), + sum(literal(globM, FiniteHint(35, 25)), literal(globM, FiniteHint(35, 25))) + ) // linter:ignore val g = literal(globM, FiniteHint(30, 30)) // linter:ignore val g2 = product(g, g) // linter:ignore val g4 = product(g2, g2) // linter:ignore val optimizedGraph8 = product(g4, g4) // linter:ignore - val unoptimizedGraphVectorPlan = (g ^ (5)) * literal(globM, FiniteHint(Long.MaxValue, 1)) + val unoptimizedGraphVectorPlan = (g ^ 5) * literal(globM, FiniteHint(Long.MaxValue, 1)) val optimizedGraphVectorPlan = product( // linter:ignore - product( - 
literal(globM, FiniteHint(30, 30)), - literal(globM, FiniteHint(30, 30))), + product(literal(globM, FiniteHint(30, 30)), literal(globM, FiniteHint(30, 30))), product( literal(globM, FiniteHint(30, 30)), product( literal(globM, FiniteHint(30, 30)), - product( - literal(globM, FiniteHint(30, 30)), - literal(globM, FiniteHint(Long.MaxValue, 1)))))) + product(literal(globM, FiniteHint(30, 30)), literal(globM, FiniteHint(Long.MaxValue, 1))) + ) + ) + ) "Matrix multiplication chain optimization" should { "handle a single matrix" in { @@ -196,9 +239,14 @@ object Matrix2Props extends Properties("Matrix2") { implicit val ring: Ring[Double] = Ring.doubleRing implicit val ord1: Ordering[Int] = Ordering.Int - def literal(tpipe: TypedPipe[(Int, Int, Double)], sizeHint: SizeHint): MatrixLiteral[Any, Any, Double] = MatrixLiteral(tpipe, sizeHint).asInstanceOf[MatrixLiteral[Any, Any, Double]] - def product(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Product[Any, Any, Any, Double] = Product(left, right, ring) - def sum(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Sum[Any, Any, Double] = Sum(left, right, ring) + def literal(tpipe: TypedPipe[(Int, Int, Double)], sizeHint: SizeHint): MatrixLiteral[Any, Any, Double] = + MatrixLiteral(tpipe, sizeHint).asInstanceOf[MatrixLiteral[Any, Any, Double]] + def product( + left: Matrix2[Any, Any, Double], + right: Matrix2[Any, Any, Double] + ): Product[Any, Any, Any, Double] = Product(left, right, ring) + def sum(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Sum[Any, Any, Double] = + Sum(left, right, ring) /** * Helper methods used in tests for randomized generations @@ -218,21 +266,24 @@ object Matrix2Props extends Properties("Matrix2") { } } - def productChainGen(current: Int, target: Int, prevCol: Long, result: List[MatrixLiteral[Any, Any, Double]]): List[MatrixLiteral[Any, Any, Double]] = { + def productChainGen( + current: Int, + target: Int, + prevCol: Long, + result: List[MatrixLiteral[Any, Any, Double]] + ): List[MatrixLiteral[Any, Any, Double]] = if (current == target) result else { val (randomMatrix, cols) = genLeaf((prevCol, 0)) // linter:ignore productChainGen(current + 1, target, cols, result ++ List(randomMatrix)) } - } - def randomProduct(p: Int): Matrix2[Any, Any, Double] = { + def randomProduct(p: Int): Matrix2[Any, Any, Double] = if (p == 1) genLeaf((0, 0))._1 else { val full = productChainGen(0, p, 0, Nil).toIndexedSeq generateRandomPlan(0, full.size - 1, full) } - } def genNode(depth: Int): Gen[Matrix2[Any, Any, Double]] = for { v <- arbitrary[Int] @@ -255,7 +306,11 @@ object Matrix2Props extends Properties("Matrix2") { implicit def arbSeq: Arbitrary[IndexedSeq[MatrixLiteral[Any, Any, Double]]] = Arbitrary(genProdSeq) - def generateRandomPlan(i: Int, j: Int, p: IndexedSeq[MatrixLiteral[Any, Any, Double]]): Matrix2[Any, Any, Double] = { + def generateRandomPlan( + i: Int, + j: Int, + p: IndexedSeq[MatrixLiteral[Any, Any, Double]] + ): Matrix2[Any, Any, Double] = if (i == j) p(i) else { val genK = Gen.choose(i, j - 1) @@ -264,24 +319,24 @@ object Matrix2Props extends Properties("Matrix2") { val Y = generateRandomPlan(k + 1, j, p) // linter:ignore Product(X, Y, ring) } - } /** - * Function that recursively estimates a cost of a given MatrixFormula / plan. - * This is the used in the tests for checking whether an optimized plan has - * a cost <= a randomized plan. 
- * The cost estimation of this evaluation should return the same values as the one - * used in building optimized plans -- this is checked in the tests below. - * @return resulting cost + * Function that recursively estimates a cost of a given MatrixFormula / plan. This is the used in the tests + * for checking whether an optimized plan has a cost <= a randomized plan. The cost estimation of this + * evaluation should return the same values as the one used in building optimized plans -- this is checked + * in the tests below. + * @return + * resulting cost */ def evaluate(mf: Matrix2[Any, Any, Double]): BigInt = { /** - * This function strips off the formula into a list of independent product chains - * (i.e. same as matrixFormulaToChains in Prototype, but has Products - * instead of IndexedSeq[Literal]) + * This function strips off the formula into a list of independent product chains (i.e. same as + * matrixFormulaToChains in Prototype, but has Products instead of IndexedSeq[Literal]) */ - def toProducts(mf: Matrix2[Any, Any, Double]): (Option[Product[Any, Any, Any, Double]], List[Product[Any, Any, Any, Double]]) = { + def toProducts( + mf: Matrix2[Any, Any, Double] + ): (Option[Product[Any, Any, Any, Double]], List[Product[Any, Any, Any, Double]]) = mf match { case element @ MatrixLiteral(_, _) => (None, Nil) case Sum(left, right, _) => { @@ -310,14 +365,14 @@ object Matrix2Props extends Properties("Matrix2") { if (lastLP.isDefined && lastRP.isDefined) { (Some(Product(lastLP.get, lastRP.get, ring)), leftR ++ rightR) } else { - val newP = if (lastLP.isDefined) List(lastLP.get) else if (lastRP.isDefined) List(lastRP.get) else Nil + val newP = + if (lastLP.isDefined) List(lastLP.get) else if (lastRP.isDefined) List(lastRP.get) else Nil (None, newP ++ leftR ++ rightR) } } case HadamardProduct(_, _, _) => sys.error("Hadamard unexpected here") } - } /** * To create a companion tree which has respective ranges of each product @@ -326,7 +381,7 @@ object Matrix2Props extends Properties("Matrix2") { def diff: Int = range._2 - range._1 } - def labelTree(p: Matrix2[Any, Any, Double], start: Int): Option[LabeledTree] = { + def labelTree(p: Matrix2[Any, Any, Double], start: Int): Option[LabeledTree] = p match { case Product(left @ MatrixLiteral(_, _), right @ MatrixLiteral(_, _), _, _) => { Some(new LabeledTree((start, start + 1), None, None)) @@ -346,42 +401,50 @@ object Matrix2Props extends Properties("Matrix2") { } case _ => None } - } /** - * This function evaluates a product chain in the same way - * as the dynamic programming procedure computes cost - * (optimizeProductChain - computeCosts in Prototype) + * This function evaluates a product chain in the same way as the dynamic programming procedure computes + * cost (optimizeProductChain - computeCosts in Prototype) */ - def evaluateProduct(p: Matrix2[Any, Any, Double], labels: LabeledTree): Option[(BigInt, Matrix2[Any, Any, Double], Matrix2[Any, Any, Double])] = { + def evaluateProduct( + p: Matrix2[Any, Any, Double], + labels: LabeledTree + ): Option[(BigInt, Matrix2[Any, Any, Double], Matrix2[Any, Any, Double])] = p match { case Product(left @ MatrixLiteral(_, _), right @ MatrixLiteral(_, _), _, _) => { // reflects optimize when k==i: p(i).sizeHint * (p(k).sizeHint * p(j).sizeHint) - Some((left.sizeHint * (left.sizeHint * right.sizeHint)).total.get, - left, right) + Some((left.sizeHint * (left.sizeHint * right.sizeHint)).total.get, left, right) } case Product(left @ MatrixLiteral(_, _), right @ Product(_, _, _, _), _, _) => { val (cost, 
pLeft, pRight) = evaluateProduct(right, labels.right.get).get // linter:ignore // reflects optimize when k==i: p(i).sizeHint * (p(k).sizeHint * p(j).sizeHint) // diff is computed in the labeled tree - it measures "spread" of the tree // diff corresponds to (k - i) or (j - k - 1) in optimize: (k - i) * computeCosts(p, i, k) + (j - k - 1) * computeCosts(p, k + 1, j) - Some(labels.right.get.diff * cost + (left.sizeHint * (left.sizeHint * pRight.sizeHint)).total.get, - left, pRight) + Some( + labels.right.get.diff * cost + (left.sizeHint * (left.sizeHint * pRight.sizeHint)).total.get, + left, + pRight + ) } case Product(left @ Product(_, _, _, _), right @ MatrixLiteral(_, _), _, _) => { val (cost, pLeft, pRight) = evaluateProduct(left, labels.left.get).get // linter:ignore - Some(labels.left.get.diff * cost + (pLeft.sizeHint * (pRight.sizeHint * right.sizeHint)).total.get, - pLeft, right) + Some( + labels.left.get.diff * cost + (pLeft.sizeHint * (pRight.sizeHint * right.sizeHint)).total.get, + pLeft, + right + ) } case Product(left, right, _, _) => { val (cost1, p1Left, p1Right) = evaluateProduct(left, labels.left.get).get // linter:ignore val (cost2, p2Left, p2Right) = evaluateProduct(right, labels.right.get).get // linter:ignore - Some(labels.left.get.diff * cost1 + labels.right.get.diff * cost2 + (p1Left.sizeHint * (p1Right.sizeHint * p2Right.sizeHint)).total.get, - p1Left, p2Right) + Some( + labels.left.get.diff * cost1 + labels.right.get.diff * cost2 + (p1Left.sizeHint * (p1Right.sizeHint * p2Right.sizeHint)).total.get, + p1Left, + p2Right + ) } case _ => None } - } val (last, productList) = toProducts(mf) val products = if (last.isDefined) last.get :: productList else productList @@ -390,20 +453,21 @@ object Matrix2Props extends Properties("Matrix2") { // ScalaCheck properties /** - * Verifying "evaluate" function - that it does return - * the same overall costs as what is estimated in the optimization procedure + * Verifying "evaluate" function - that it does return the same overall costs as what is estimated in the + * optimization procedure */ property("evaluate function returns the same cost as optimize") = forAll { (a: Matrix2[Any, Any, Double]) => optimize(a)._1 == evaluate(optimize(a)._2) } /** - * "Proof": the goal property that estimated costs of optimized plans or product chains - * are less than or equal to costs of randomized equivalent plans or product chains + * "Proof": the goal property that estimated costs of optimized plans or product chains are less than or + * equal to costs of randomized equivalent plans or product chains */ - property("a cost of an optimized chain of matrix products is <= a random one") = forAll { (a: IndexedSeq[MatrixLiteral[Any, Any, Double]]) => - optimizeProductChain(a, Some(ring, MatrixJoiner2.default))._1 <= - evaluate(generateRandomPlan(0, a.length - 1, a)) + property("a cost of an optimized chain of matrix products is <= a random one") = forAll { + (a: IndexedSeq[MatrixLiteral[Any, Any, Double]]) => + optimizeProductChain(a, Some(ring, MatrixJoiner2.default))._1 <= + evaluate(generateRandomPlan(0, a.length - 1, a)) } property("cost of a random plan is <= a random one") = forAll { (a: Matrix2[Any, Any, Double]) => diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2Test.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2Test.scala index ba966a7f78..c7f87ae86e 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2Test.scala +++ 
b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2Test.scala @@ -12,13 +12,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics import com.twitter.scalding._ import com.twitter.scalding.serialization._ import com.twitter.scalding.source.TypedText -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.algebird.field._ class Matrix2Sum(args: Args) extends Job(args) { @@ -198,8 +198,17 @@ class Matrix2PropJob(args: Args) extends Job(args) { val tsv3 = TypedText.tsv[(Int, Double)]("row") val row = MatrixLiteral(TypedPipe.from(tsv3).map { case (idx, v) => ((), idx, v) }, NoClue) - mat.binarizeAs[Boolean].propagate(col).toTypedPipe.map { case (idx, x, v) => (idx, v) }.write(TypedText.tsv[(Int, Double)]("prop-col")) - row.propagateRow(mat.binarizeAs[Boolean]).toTypedPipe.map { case (x, idx, v) => (idx, v) }.write(TypedText.tsv[(Int, Double)]("prop-row")) + mat + .binarizeAs[Boolean] + .propagate(col) + .toTypedPipe + .map { case (idx, x, v) => (idx, v) } + .write(TypedText.tsv[(Int, Double)]("prop-col")) + row + .propagateRow(mat.binarizeAs[Boolean]) + .toTypedPipe + .map { case (x, idx, v) => (idx, v) } + .write(TypedText.tsv[(Int, Double)]("prop-row")) } class Matrix2Cosine(args: Args) extends Job(args) { @@ -258,12 +267,10 @@ class Scalar2Ops(args: Args) extends Job(args) { class Matrix2Test extends WordSpec with Matchers { import Dsl._ - def toSparseMat[Row, Col, V](iter: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = { - iter.map { it => ((it._1, it._2), it._3) }.toMap - } - def oneDtoSparseMat[Idx, V](iter: Iterable[(Idx, V)]): Map[(Idx, Idx), V] = { - iter.map { it => ((it._1, it._1), it._2) }.toMap - } + def toSparseMat[Row, Col, V](iter: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = + iter.map(it => ((it._1, it._2), it._3)).toMap + def oneDtoSparseMat[Idx, V](iter: Iterable[(Idx, V)]): Map[(Idx, Idx), V] = + iter.map(it => ((it._1, it._1), it._2)).toMap "A MatrixSum job" should { TUtil.printStack { @@ -272,7 +279,13 @@ class Matrix2Test extends WordSpec with Matchers { .source(Tsv("mat2", ('x2, 'y2, 'v2)), List((1, 3, 3.0), (2, 1, 8.0), (1, 2, 4.0))) .typedSink(TypedText.tsv[(Int, Int, Double)]("sum")) { ob => "correctly compute sums" in { - toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (1, 2) -> 8.0, (1, 3) -> 3.0, (2, 1) -> 8.0, (2, 2) -> 3.0) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 8.0, + (1, 3) -> 3.0, + (2, 1) -> 8.0, + (2, 2) -> 3.0 + ) } } .runHadoop @@ -287,7 +300,13 @@ class Matrix2Test extends WordSpec with Matchers { .source(TypedText.tsv[(Int, Int, Double)]("mat2"), List((1, 3, 3.0), (2, 1, 8.0), (1, 2, 4.0))) .typedSink(TypedText.tsv[(Int, Int, Double)]("sum")) { ob => "correctly compute sums" in { - toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (1, 2) -> 8.0, (1, 3) -> 3.0, (2, 1) -> 8.0, (2, 2) -> 3.0) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 8.0, + (1, 3) -> 3.0, + (2, 1) -> 8.0, + (2, 2) -> 3.0 + ) } } .runHadoop @@ -298,12 +317,16 @@ class Matrix2Test extends WordSpec with Matchers { "A Matrix2Sum3 job, where the Matrix contains tuples as values," should { TUtil.printStack { JobTest(new Matrix2Sum3(_)) - .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, (1.0, 3.0, 5.0)), (2, 2, (3.0, 2.0, 1.0)), (1, 2, (4.0, 5.0, 2.0)))) + 
.source( + Tsv("mat1", ('x1, 'y1, 'v1)), + List((1, 1, (1.0, 3.0, 5.0)), (2, 2, (3.0, 2.0, 1.0)), (1, 2, (4.0, 5.0, 2.0))) + ) .typedSink(TypedText.tsv[(Int, Int, (Double, Double, Double))]("sum")) { ob => "correctly compute sums" in { // Treat (Double, Double, Double) as string because that is what is actually returned // when using runHadoop - val result = Map((1, 1) -> (2.0, 6.0, 10.0), (2, 2) -> (6.0, 4.0, 2.0), (1, 2) -> (8.0, 10.0, 4.0)) + val result = + Map((1, 1) -> (2.0, 6.0, 10.0), (2, 2) -> (6.0, 4.0, 2.0), (1, 2) -> (8.0, 10.0, 4.0)) toSparseMat(ob) shouldBe result } }(implicitly[TypeDescriptor[(Int, Int, (Double, Double, Double))]].converter) @@ -320,7 +343,13 @@ class Matrix2Test extends WordSpec with Matchers { .source(Tsv("mat3", ('x3, 'y3, 'v3)), List((1, 3, 4.0), (2, 1, 1.0), (1, 2, 4.0))) .typedSink(TypedText.tsv[(Int, Int, Double)]("sum")) { ob => "correctly compute sums" in { - toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (1, 2) -> 12.0, (1, 3) -> 7.0, (2, 1) -> 9.0, (2, 2) -> 3.0) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 12.0, + (1, 3) -> 7.0, + (2, 1) -> 9.0, + (2, 2) -> 3.0 + ) } } .runHadoop @@ -420,13 +449,13 @@ class Matrix2Test extends WordSpec with Matchers { TUtil.printStack { JobTest(new Matrix2PropJob(_)) /* Sparse representation of the input matrix: - * [[0 1 1], - * [0 0 1], - * [1 0 0]] = List((0,1,1), (0,2,1), (1,2,1), (2,0,1)) - * - * Sparse representation of the input vector: - * [1.0 2.0 4.0] = List((0,1.0), (1,2.0), (2,4.0)) - */ + * [[0 1 1], + * [0 0 1], + * [1 0 0]] = List((0,1,1), (0,2,1), (1,2,1), (2,0,1)) + * + * Sparse representation of the input vector: + * [1.0 2.0 4.0] = List((0,1.0), (1,2.0), (2,4.0)) + */ .source(TypedText.tsv[(Int, Int, Int)]("graph"), List((0, 1, 1), (0, 2, 1), (1, 2, 1), (2, 0, 1))) .source(TypedText.tsv[(Int, Double)]("row"), List((0, 1.0), (1, 2.0), (2, 4.0))) .source(TypedText.tsv[(Int, Double)]("col"), List((0, 1.0), (1, 2.0), (2, 4.0))) @@ -451,7 +480,12 @@ class Matrix2Test extends WordSpec with Matchers { .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) .typedSink(TypedText.tsv[(Int, Int, Double)]("cosine")) { ob => "correctly compute cosine similarity" in { - toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (1, 2) -> 0.9701425001453319, (2, 1) -> 0.9701425001453319, (2, 2) -> 1.0) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 0.9701425001453319, + (2, 1) -> 0.9701425001453319, + (2, 2) -> 1.0 + ) } } .runHadoop @@ -462,16 +496,36 @@ class Matrix2Test extends WordSpec with Matchers { "A Matrix2 Normalize job" should { TUtil.printStack { JobTest(new Matrix2Normalize(_)) - .source(TypedText.tsv[(Int, Int, Double)]("mat1"), List((1, 1, 4.0), (1, 2, 1.0), (2, 2, 1.0), (3, 1, 1.0), (3, 2, 3.0), (3, 3, 4.0))) - .source(TypedText.tsv[(Int, Int, Long)]("mat2"), List((1, 1, 4L), (1, 2, 1L), (2, 2, 1L), (3, 1, 1L), (3, 2, 3L), (3, 3, 4L))) + .source( + TypedText.tsv[(Int, Int, Double)]("mat1"), + List((1, 1, 4.0), (1, 2, 1.0), (2, 2, 1.0), (3, 1, 1.0), (3, 2, 3.0), (3, 3, 4.0)) + ) + .source( + TypedText.tsv[(Int, Int, Long)]("mat2"), + List((1, 1, 4L), (1, 2, 1L), (2, 2, 1L), (3, 1, 1L), (3, 2, 3L), (3, 3, 4L)) + ) .typedSink(TypedText.tsv[(Int, Int, Double)]("normalized")) { ob => "correctly compute l1 normalization for matrix with double values" in { - toSparseMat(ob) shouldBe Map((1, 1) -> 0.8, (1, 2) -> 0.2, (2, 2) -> 1.0, (3, 1) -> 0.125, (3, 2) -> 0.375, (3, 3) -> 0.5) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 0.8, + (1, 2) -> 0.2, + (2, 2) -> 1.0, + 
(3, 1) -> 0.125, + (3, 2) -> 0.375, + (3, 3) -> 0.5 + ) } } - .typedSink(TypedText.tsv[(Int, Int, Double)]("long_normalized")){ ob => + .typedSink(TypedText.tsv[(Int, Int, Double)]("long_normalized")) { ob => "correctly compute l1 normalization for matrix with long values" in { - toSparseMat(ob) shouldBe Map((1, 1) -> 0.8, (1, 2) -> 0.2, (2, 2) -> 1.0, (3, 1) -> 0.125, (3, 2) -> 0.375, (3, 3) -> 0.5) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 0.8, + (1, 2) -> 0.2, + (2, 2) -> 1.0, + (3, 1) -> 0.125, + (3, 2) -> 0.375, + (3, 3) -> 0.5 + ) } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/MatrixTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/MatrixTest.scala index 9c2dac4dd7..a5de270d41 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/MatrixTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/MatrixTest.scala @@ -12,18 +12,18 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics import com.twitter.scalding._ -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.algebird.Group import com.twitter.algebird.field._ object TUtil { - def printStack(fn: => Unit): Unit = { - try { fn } catch { case e: Throwable => e.printStackTrace; throw e } - } + def printStack(fn: => Unit): Unit = + try { fn } + catch { case e: Throwable => e.printStackTrace; throw e } } class MatrixProd(args: Args) extends Job(args) { @@ -42,13 +42,13 @@ class MatrixBlockProd(args: Args) extends Job(args) { import Matrix._ val mat1 = Tsv("mat1", ('x1, 'y1, 'v1)) - .mapToBlockMatrix(('x1, 'y1, 'v1)) { (rcv: (String, Int, Double)) => (rcv._1(0), rcv._1, rcv._2, rcv._3) } + .mapToBlockMatrix(('x1, 'y1, 'v1))((rcv: (String, Int, Double)) => (rcv._1(0), rcv._1, rcv._2, rcv._3)) val mat2 = Tsv("mat1", ('x1, 'y1, 'v1)) .toMatrix[String, Int, Double]('x1, 'y1, 'v1) .toBlockMatrix(s => (s(0), s)) - val gram = mat1 dotProd mat2.transpose + val gram = mat1.dotProd(mat2.transpose) gram.pipe.write(Tsv("product")) } @@ -140,8 +140,7 @@ class ScalarOps(args: Args) extends Job(args) { class DiagonalOps(args: Args) extends Job(args) { import Matrix._ - val mat = Tsv("mat1", ('x1, 'y1, 'v1)) - .read + val mat = Tsv("mat1", ('x1, 'y1, 'v1)).read .toMatrix[Int, Int, Double]('x1, 'y1, 'v1) (mat * mat.diagonal).write(Tsv("mat-diag")) (mat.diagonal * mat).write(Tsv("diag-mat")) @@ -167,8 +166,8 @@ class MatrixMapWithVal(args: Args) extends Job(args) { val mat = TypedTsv[(Int, Int, Int)]("graph").toMatrix val row = TypedTsv[(Int, Double)]("row").toRow - mat.mapWithIndex { (v, r, c) => if (r == c) v else 0 }.write(Tsv("diag")) - row.mapWithIndex { (v, c) => if (c == 0) v else 0.0 }.write(Tsv("first")) + mat.mapWithIndex((v, r, c) => if (r == c) v else 0).write(Tsv("diag")) + row.mapWithIndex((v, c) => if (c == 0) v else 0.0).write(Tsv("first")) } class RowMatProd(args: Args) extends Job(args) { @@ -218,7 +217,7 @@ class RowRowHad(args: Args) extends Job(args) { val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val row1 = mat1.getRow(1) - val rowSum = row1 hProd row1 + val rowSum = row1.hProd(row1) rowSum.pipe.write(Tsv("rowRowHad")) } @@ -274,7 +273,7 @@ class ScalarRowRight(args: Args) extends Job(args) { // now with a scalar object - val p2 = 
Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) (row1 * sca1).write(Tsv("scalarObjRowRight")) @@ -289,7 +288,7 @@ class ScalarRowLeft(args: Args) extends Job(args) { // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) (sca1 * row1).write(Tsv("scalarObjRowLeft")) @@ -304,7 +303,7 @@ class ScalarColRight(args: Args) extends Job(args) { // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) (col1 * sca1).write(Tsv("scalarObjColRight")) @@ -319,7 +318,7 @@ class ScalarColLeft(args: Args) extends Job(args) { // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) (sca1 * col1).write(Tsv("scalarObjColLeft")) @@ -334,7 +333,7 @@ class ScalarDiagRight(args: Args) extends Job(args) { // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) (diag1 * sca1).write(Tsv("scalarObjDiagRight")) @@ -349,7 +348,7 @@ class ScalarDiagLeft(args: Args) extends Job(args) { // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) (sca1 * diag1).write(Tsv("scalarObjDiagLeft")) @@ -383,12 +382,10 @@ class RowNormalize(args: Args) extends Job(args) { class MatrixTest extends WordSpec with Matchers { import Dsl._ - def toSparseMat[Row, Col, V](iter: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = { - iter.map { it => ((it._1, it._2), it._3) }.toMap - } - def oneDtoSparseMat[Idx, V](iter: Iterable[(Idx, V)]): Map[(Idx, Idx), V] = { - iter.map { it => ((it._1, it._1), it._2) }.toMap - } + def toSparseMat[Row, Col, V](iter: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = + iter.map(it => ((it._1, it._2), it._3)).toMap + def oneDtoSparseMat[Idx, V](iter: Iterable[(Idx, V)]): Map[(Idx, Idx), V] = + iter.map(it => ((it._1, it._1), it._2)).toMap "A MatrixProd job" should { TUtil.printStack { @@ -407,7 +404,19 @@ class MatrixTest extends WordSpec with Matchers { "A MatrixBlockProd job" should { TUtil.printStack { JobTest(new MatrixBlockProd(_)) - .source(Tsv("mat1", ('x1, 'y1, 'v1)), List(("alpha1", 1, 1.0), ("alpha1", 2, 2.0), ("beta1", 1, 5.0), ("beta1", 2, 6.0), ("alpha2", 1, 3.0), ("alpha2", 2, 4.0), ("beta2", 1, 7.0), ("beta2", 2, 8.0))) + .source( + Tsv("mat1", ('x1, 'y1, 'v1)), + List( + ("alpha1", 1, 1.0), + ("alpha1", 2, 2.0), + ("beta1", 1, 5.0), + ("beta1", 2, 6.0), + ("alpha2", 1, 3.0), + ("alpha2", 2, 4.0), + ("beta2", 1, 7.0), + ("beta2", 2, 8.0) + ) + ) .sink[(String, String, Double)](Tsv("product")) { ob => "correctly compute block products" in { toSparseMat(ob) shouldBe Map( @@ -418,7 +427,8 @@ class MatrixTest extends WordSpec with Matchers { ("beta1", "beta1") -> 61.0, ("beta1", "beta2") -> 83.0, ("beta2", "beta1") -> 83.0, - ("beta2", "beta2") -> 113.0) + ("beta2", "beta2") -> 113.0 + ) } } .run @@ -433,7 +443,13 @@ class MatrixTest extends WordSpec with Matchers { .source(Tsv("mat2", ('x2, 'y2, 'v2)), List((1, 3, 3.0), (2, 1, 8.0), (1, 2, 4.0))) .sink[(Int, Int, Double)](Tsv("sum")) { ob => "correctly compute sums" in { - toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (1, 2) -> 8.0, (1, 3) -> 3.0, (2, 1) -> 8.0, (2, 2) -> 3.0) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 8.0, + (1, 3) -> 3.0, + (2, 1) -> 8.0, + (2, 2) -> 3.0 + ) } } .run @@ -444,10 +460,17 @@ class 
MatrixTest extends WordSpec with Matchers { "A MatrixSum job, where the Matrix contains tuples as values," should { TUtil.printStack { JobTest("com.twitter.scalding.mathematics.MatrixSum3") - .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, (1.0, 3.0, 5.0)), (2, 2, (3.0, 2.0, 1.0)), (1, 2, (4.0, 5.0, 2.0)))) + .source( + Tsv("mat1", ('x1, 'y1, 'v1)), + List((1, 1, (1.0, 3.0, 5.0)), (2, 2, (3.0, 2.0, 1.0)), (1, 2, (4.0, 5.0, 2.0))) + ) .sink[(Int, Int, (Double, Double, Double))](Tsv("sum")) { ob => "correctly compute sums" in { - toSparseMat(ob) shouldBe Map((1, 1) -> (2.0, 6.0, 10.0), (2, 2) -> (6.0, 4.0, 2.0), (1, 2) -> (8.0, 10.0, 4.0)) + toSparseMat(ob) shouldBe Map( + (1, 1) -> (2.0, 6.0, 10.0), + (2, 2) -> (6.0, 4.0, 2.0), + (1, 2) -> (8.0, 10.0, 4.0) + ) } } .run @@ -459,15 +482,15 @@ class MatrixTest extends WordSpec with Matchers { TUtil.printStack { JobTest(new Randwalk(_)) /* - * 1.0 4.0 - * 0.0 3.0 - * row normalized: - * 1.0/5.0 4.0/5.0 - * 0.0 1.0 - * product with itself: - * 1.0/25.0 (4.0/25.0 + 4.0/5.0) - * 0.0 1.0 - */ + * 1.0 4.0 + * 0.0 3.0 + * row normalized: + * 1.0/5.0 4.0/5.0 + * 0.0 1.0 + * product with itself: + * 1.0/25.0 (4.0/25.0 + 4.0/5.0) + * 0.0 1.0 + */ .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) .sink[(Int, Int, Double)](Tsv("randwalk")) { ob => "correctly compute matrix randwalk" in { @@ -475,9 +498,10 @@ class MatrixTest extends WordSpec with Matchers { val exact = Map((1, 1) -> (1.0 / 25.0), (1, 2) -> (4.0 / 25.0 + 4.0 / 5.0), (2, 2) -> 1.0) val grp = implicitly[Group[Map[(Int, Int), Double]]] // doubles are hard to compare - grp.minus(pMap, exact) - .mapValues { x => x * x } - .map { _._2 } + grp + .minus(pMap, exact) + .mapValues(x => x * x) + .map(_._2) .sum should be < 0.0001 } } @@ -491,7 +515,12 @@ class MatrixTest extends WordSpec with Matchers { .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) .sink[(Int, Int, Double)](Tsv("cosine")) { ob => "correctly compute cosine similarity" in { - toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (1, 2) -> 0.9701425001453319, (2, 1) -> 0.9701425001453319, (2, 2) -> 1.0) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 0.9701425001453319, + (2, 1) -> 0.9701425001453319, + (2, 2) -> 1.0 + ) } } .run @@ -579,8 +608,8 @@ class MatrixTest extends WordSpec with Matchers { TUtil.printStack { JobTest(new DiagonalOps(_)) /* [[1.0 4.0] - * [0.0 3.0]] - */ + * [0.0 3.0]] + */ .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) .sink[(Int, Int, Double)](Tsv("diag-mat")) { ob => "correctly compute diag * matrix" in { @@ -616,10 +645,10 @@ class MatrixTest extends WordSpec with Matchers { TUtil.printStack { JobTest(new PropJob(_)) /* [[0 1 1], - * [0 0 1], - * [1 0 0]] = List((0,1,1), (0,2,1), (1,2,1), (2,0,1)) - * [1.0 2.0 4.0] = List((0,1.0), (1,2.0), (2,4.0)) - */ + * [0 0 1], + * [1 0 0]] = List((0,1,1), (0,2,1), (1,2,1), (2,0,1)) + * [1.0 2.0 4.0] = List((0,1.0), (1,2.0), (2,4.0)) + */ .source(TypedTsv[(Int, Int, Int)]("graph"), List((0, 1, 1), (0, 2, 1), (1, 2, 1), (2, 0, 1))) .source(TypedTsv[(Int, Double)]("row"), List((0, 1.0), (1, 2.0), (2, 4.0))) .source(TypedTsv[(Int, Double)]("col"), List((0, 1.0), (1, 2.0), (2, 4.0))) @@ -804,7 +833,7 @@ class MatrixTest extends WordSpec with Matchers { TUtil.printStack { var idx = 0 JobTest(new ScalarRowRight(_)) - .source(Tsv("sca1", ('v)), List(3.0)) + .source(Tsv("sca1", 'v), List(3.0)) .source(Tsv("row1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) 
.sink[(Int, Double)](Tsv("scalarRowRight")) { ob => s"$idx: correctly compute a new row vector" in { @@ -827,7 +856,7 @@ class MatrixTest extends WordSpec with Matchers { TUtil.printStack { var idx = 0 JobTest(new ScalarRowLeft(_)) - .source(Tsv("sca1", ('v)), List(3.0)) + .source(Tsv("sca1", 'v), List(3.0)) .source(Tsv("row1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) .sink[(Int, Double)](Tsv("scalarRowLeft")) { ob => s"$idx: correctly compute a new row vector" in { @@ -850,7 +879,7 @@ class MatrixTest extends WordSpec with Matchers { TUtil.printStack { var idx = 0 JobTest(new ScalarColRight(_)) - .source(Tsv("sca1", ('v)), List(3.0)) + .source(Tsv("sca1", 'v), List(3.0)) .source(Tsv("col1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) .sink[(Int, Double)](Tsv("scalarColRight")) { ob => s"$idx: correctly compute a new col vector" in { @@ -873,7 +902,7 @@ class MatrixTest extends WordSpec with Matchers { TUtil.printStack { var idx = 0 JobTest(new ScalarColLeft(_)) - .source(Tsv("sca1", ('v)), List(3.0)) + .source(Tsv("sca1", 'v), List(3.0)) .source(Tsv("col1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) .sink[(Int, Double)](Tsv("scalarColLeft")) { ob => s"$idx: correctly compute a new col vector" in { @@ -896,7 +925,7 @@ class MatrixTest extends WordSpec with Matchers { TUtil.printStack { var idx = 0 JobTest(new ScalarDiagRight(_)) - .source(Tsv("sca1", ('v)), List(3.0)) + .source(Tsv("sca1", 'v), List(3.0)) .source(Tsv("diag1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) .sink[(Int, Double)](Tsv("scalarDiagRight")) { ob => s"$idx: correctly compute a new diag matrix" in { @@ -919,7 +948,7 @@ class MatrixTest extends WordSpec with Matchers { TUtil.printStack { var idx = 0 JobTest(new ScalarDiagLeft(_)) - .source(Tsv("sca1", ('v)), List(3.0)) + .source(Tsv("sca1", 'v), List(3.0)) .source(Tsv("diag1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) .sink[(Int, Double)](Tsv("scalarDiagLeft")) { ob => s"$idx: correctly compute a new diag matrix" in { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala index 4d41b7fa70..34d80be15a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.mathematics import org.scalacheck.Arbitrary @@ -24,28 +24,34 @@ object SizeHintProps extends Properties("SizeHint") { val noClueGen = const(NoClue) - val finiteHintGen = for ( - rows <- choose(-1L, 1000000L); + val finiteHintGen = for { + rows <- choose(-1L, 1000000L) cols <- choose(-1L, 1000000L) - ) yield FiniteHint(rows, cols) + } yield FiniteHint(rows, cols) - val sparseHintGen = for ( - rows <- choose(-1L, 1000000L); - cols <- choose(-1L, 1000000L); + val sparseHintGen = for { + rows <- choose(-1L, 1000000L) + cols <- choose(-1L, 1000000L) sparsity <- choose(0.0, 1.0) - ) yield SparseHint(sparsity, rows, cols) + } yield SparseHint(sparsity, rows, cols) - implicit val finiteArb: Arbitrary[FiniteHint] = Arbitrary { finiteHintGen } - implicit val sparseArb: Arbitrary[SparseHint] = Arbitrary { sparseHintGen } - implicit val genHint: Arbitrary[SizeHint] = Arbitrary { oneOf(noClueGen, finiteHintGen, sparseHintGen) } + implicit val finiteArb: Arbitrary[FiniteHint] = Arbitrary(finiteHintGen) + implicit val sparseArb: Arbitrary[SparseHint] = Arbitrary(sparseHintGen) + implicit val genHint: Arbitrary[SizeHint] = Arbitrary(oneOf(noClueGen, finiteHintGen, sparseHintGen)) property("a+b is at least as big as a") = forAll { (a: SizeHint, b: SizeHint) => - val addT = for (ta <- a.total; tsum <- (a + b).total) yield (tsum >= ta) + val addT = for { + ta <- a.total + tsum <- (a + b).total + } yield (tsum >= ta) addT.getOrElse(true) } property("a#*#b is at most as big as a") = forAll { (a: SizeHint, b: SizeHint) => - val addT = for (ta <- a.total; tsum <- (a #*# b).total) yield (tsum <= ta) + val addT = for { + ta <- a.total + tsum <- (a #*# b).total + } yield (tsum <= ta) addT.getOrElse(true) } @@ -57,8 +63,9 @@ object SizeHintProps extends Properties("SizeHint") { (a + b).asInstanceOf[SparseHint].sparsity >= a.sparsity } - property("Hadamard product does not increase sparsity fraction") = forAll { (a: SparseHint, b: SparseHint) => - (a #*# b).asInstanceOf[SparseHint].sparsity == (a.sparsity min b.sparsity) + property("Hadamard product does not increase sparsity fraction") = forAll { + (a: SparseHint, b: SparseHint) => + (a #*# b).asInstanceOf[SparseHint].sparsity == (a.sparsity.min(b.sparsity)) } property("transpose preserves size") = forAll { (a: SizeHint) => @@ -80,10 +87,10 @@ object SizeHintProps extends Properties("SizeHint") { } property("adding a sparse matrix to itself doesn't decrease size") = forAll { (a: SparseHint) => - (for ( - doubleSize <- (a + a).total; + (for { + doubleSize <- (a + a).total asize <- a.total - ) yield (doubleSize >= asize)).getOrElse(true) + } yield (doubleSize >= asize)).getOrElse(true) } property("diagonals are smaller") = forAll { (a: FiniteHint) => @@ -91,8 +98,8 @@ object SizeHintProps extends Properties("SizeHint") { } property("diagonals are about as big as the min(rows,cols)") = forAll { (a: FiniteHint) => - SizeHint.asDiagonal(a).total.getOrElse(BigInt(-1L)) <= (a.rows min a.cols) - SizeHint.asDiagonal(a).total.getOrElse(BigInt(-1L)) >= ((a.rows min a.cols) - 1L) + SizeHint.asDiagonal(a).total.getOrElse(BigInt(-1L)) <= (a.rows.min(a.cols)) + SizeHint.asDiagonal(a).total.getOrElse(BigInt(-1L)) >= ((a.rows.min(a.cols)) - 1L) } property("transpose law is obeyed in total") = forAll { (a: SizeHint, b: SizeHint) => diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/TypedSimilarityTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/TypedSimilarityTest.scala index 
a6c589e37c..f6e37d218a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/TypedSimilarityTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/TypedSimilarityTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics import com.twitter.scalding._ @@ -21,7 +21,7 @@ import com.twitter.algebird.Group import TDsl._ -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import GraphOperations._ @@ -34,10 +34,10 @@ class TypedCosineSimJob(args: Args) extends Job(args) { .map { case (from, to) => Edge(from, to, ()) } } // Just keep the degree - .map { edge => edge.mapData { _._2 } } + .map(edge => edge.mapData(_._2)) simOf(graph, { n: Int => n % 2 == 0 }, { n: Int => n % 2 == 1 }) - .map { edge => (edge.from, edge.to, edge.data) } + .map(edge => (edge.from, edge.to, edge.data)) .write(TypedTsv[(Int, Int, Double)]("out")) } @@ -50,7 +50,7 @@ class TypedDimsumCosineSimJob(args: Args) extends Job(args) { } simOf(graph, { n: Int => n % 2 == 0 }, { n: Int => n % 2 == 1 }) - .map { edge => (edge.from, edge.to, edge.data) } + .map(edge => (edge.from, edge.to, edge.data)) .toPipe('from, 'to, 'data) .write(TypedTsv[(Int, Int, Double)]("out")) } @@ -60,7 +60,7 @@ class TypedSimilarityTest extends WordSpec with Matchers { val rand = new java.util.Random(1) val edges = (0 to nodes).flatMap { n => // try to get at least 6 edges for each node - (0 to ((nodes / 5) max (6))).foldLeft(Set[(Int, Int)]()) { (set, idx) => + (0 to ((nodes / 5).max(6))).foldLeft(Set[(Int, Int)]()) { (set, idx) => if (set.size > 6) { set } else { set + (n -> rand.nextInt(nodes)) @@ -71,7 +71,7 @@ class TypedSimilarityTest extends WordSpec with Matchers { val MaxWeight = 2 val weightedEdges = (0 to nodes).flatMap { n => // try to get at least 10 edges for each node - (0 to ((nodes / 5) max (10))).foldLeft(Set[(Int, Int, Double)]()) { (set, idx) => + (0 to ((nodes / 5).max(10))).foldLeft(Set[(Int, Int, Double)]()) { (set, idx) => if (set.size > 10) { set } else { set + ((n, rand.nextInt(nodes), rand.nextDouble * MaxWeight)) @@ -82,21 +82,21 @@ class TypedSimilarityTest extends WordSpec with Matchers { def cosineOf(es: Seq[(Int, Int)]): Map[(Int, Int), Double] = { // Get followers of each node: val matrix: Map[Int, Map[Int, Double]] = - es.groupBy { _._2 }.mapValues { seq => seq.map { case (from, to) => (from, 1.0) }.toMap } - for ( - (k1, v1) <- matrix if (k1 % 2 == 0); - (k2, v2) <- matrix if (k2 % 2 == 1) - ) yield ((k1, k2) -> (dot(v1, v2) / scala.math.sqrt(dot(v1, v1) * dot(v2, v2)))) + es.groupBy(_._2).mapValues(seq => seq.map { case (from, to) => (from, 1.0) }.toMap) + for { + (k1, v1) <- matrix if k1 % 2 == 0 + (k2, v2) <- matrix if k2 % 2 == 1 + } yield ((k1, k2) -> (dot(v1, v2) / scala.math.sqrt(dot(v1, v1) * dot(v2, v2)))) } def weightedCosineOf(es: Seq[(Int, Int, Double)]): Map[(Int, Int), Double] = { // Get followers of each node: val matrix: Map[Int, Map[Int, Double]] = - es.groupBy { _._2 }.mapValues { seq => seq.map { case (from, to, weight) => (from, weight) }.toMap } - for ( - (k1, v1) <- matrix if (k1 % 2 == 0); - (k2, v2) <- matrix if (k2 % 2 == 1) - ) yield ((k1, k2) -> (dot(v1, v2) / scala.math.sqrt(dot(v1, v1) * dot(v2, v2)))) + es.groupBy(_._2).mapValues(seq => seq.map { case 
(from, to, weight) => (from, weight) }.toMap) + for { + (k1, v1) <- matrix if k1 % 2 == 0 + (k2, v2) <- matrix if k2 % 2 == 1 + } yield ((k1, k2) -> (dot(v1, v2) / scala.math.sqrt(dot(v1, v1) * dot(v2, v2)))) } "A TypedCosineJob" should { diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/BijectedSourceSinkTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/BijectedSourceSinkTest.scala index cfb15176e4..7327ccda84 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/BijectedSourceSinkTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/BijectedSourceSinkTest.scala @@ -12,17 +12,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ private[typed] object LongIntPacker { def lr(l: Int, r: Int): Long = (l.toLong << 32) | r def l(rowCol: Long) = (rowCol >>> 32).toInt - def r(rowCol: Long) = (rowCol & 0xFFFFFFFF).toInt + def r(rowCol: Long) = (rowCol & 0xffffffff).toInt } class MutatedSourceJob(args: Args) extends Job(args) { @@ -34,9 +34,10 @@ class MutatedSourceJob(args: Args) extends Job(args) { val in0: TypedPipe[(Int, Int)] = TypedPipe.from(BijectedSourceSink(TypedTsv[Long]("input0"))) - in0.map { tup: (Int, Int) => - (tup._1 * 2, tup._2 * 2) - } + in0 + .map { tup: (Int, Int) => + (tup._1 * 2, tup._2 * 2) + } .write(BijectedSourceSink(TypedTsv[Long]("output"))) } @@ -51,11 +52,11 @@ class MutatedSourceTest extends WordSpec with Matchers { unordered should have size 3 // Simple case, 2*8L won't run into the packer logic - unordered should contain (16L) + unordered should contain(16L) // Big one that should be in both the high and low 4 bytes of the Long val big = 4123423431L val newBig = LongIntPacker.lr(LongIntPacker.l(big) * 2, LongIntPacker.r(big) * 2) - unordered should contain (newBig) + unordered should contain(newBig) } .run .runHadoop @@ -65,7 +66,8 @@ class MutatedSourceTest extends WordSpec with Matchers { } class ContraMappedAndThenSourceJob(args: Args) extends Job(args) { - TypedPipe.from(TypedTsv[Long]("input0").andThen { x => (LongIntPacker.l(x), LongIntPacker.r(x)) }) + TypedPipe + .from(TypedTsv[Long]("input0").andThen(x => (LongIntPacker.l(x), LongIntPacker.r(x)))) .map { case (l, r) => (l * 2, r * 2) } .write(TypedTsv[Long]("output").contraMap { case (l, r) => LongIntPacker.lr(l, r) }) } @@ -81,11 +83,11 @@ class ContraMappedAndThenSourceTest extends WordSpec with Matchers { unordered should have size 3 // Simple case, 2*8L won't run into the packer logic - unordered should contain (16L) + unordered should contain(16L) // Big one that should be in both the high and low 4 bytes of the Long val big = 4123423431L val newBig = LongIntPacker.lr(LongIntPacker.l(big) * 2, LongIntPacker.r(big) * 2) - unordered should contain (newBig) + unordered should contain(newBig) } .run .runHadoop diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/CoGroupableTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/CoGroupableTest.scala index 0359c50f50..a029cc1899 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/CoGroupableTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/CoGroupableTest.scala @@ -11,7 
+11,9 @@ class CoGroupableTest extends FunSuite { assert(CoGroupable.atMostOneValue(init.group.mapValues(_ + 100).sum)) assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum)) assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum.mapValues(_ - 100))) - assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum.filter { case (k, v) => k > v })) + assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum.filter { + case (k, v) => k > v + })) assert(CoGroupable.atMostOneValue(init.group.mapValues(_ * 2).sum.join(init.group.sum))) assert(!CoGroupable.atMostOneValue(init.group)) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/HashEqualsArrayWrapperTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/HashEqualsArrayWrapperTest.scala index 504b966a37..b3649241a8 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/HashEqualsArrayWrapperTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/HashEqualsArrayWrapperTest.scala @@ -1,16 +1,16 @@ package com.twitter.scalding.typed -import org.scalacheck.{ Arbitrary, Prop } -import org.scalatest.{ FunSuite, PropSpec } -import org.scalatest.prop.{ Checkers, PropertyChecks } +import org.scalacheck.{Arbitrary, Prop} +import org.scalatest.{FunSuite, PropSpec} +import org.scalatest.prop.{Checkers, PropertyChecks} import scala.reflect.ClassTag object HashArrayEqualsWrapperLaws { - def check2[T](ordToTest: Ordering[HashEqualsArrayWrapper[T]])(implicit ord: Ordering[T], arb: Arbitrary[Array[T]]): Prop = - + def check2[T]( + ordToTest: Ordering[HashEqualsArrayWrapper[T]] + )(implicit ord: Ordering[T], arb: Arbitrary[Array[T]]): Prop = Prop.forAll { (left: Array[T], right: Array[T]) => - val leftWrapped = HashEqualsArrayWrapper.wrap(left) val rightWrapped = HashEqualsArrayWrapper.wrap(right) @@ -29,7 +29,6 @@ object HashArrayEqualsWrapperLaws { } def check[T](ordToTest: Ordering[Array[T]])(implicit ord: Ordering[T], arb: Arbitrary[Array[T]]): Prop = - Prop.forAll { (left: Array[T], right: Array[T]) => import scala.Ordering.Implicits.seqDerivedOrdering diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/InAnotherPackage.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/InAnotherPackage.scala index 108884acce..207a24adaf 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/InAnotherPackage.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/InAnotherPackage.scala @@ -1,14 +1,14 @@ package com.twitter.example.scalding.typed import com.twitter.scalding._ -import scala.concurrent.{ ExecutionContext => SExecutionContext, _ } +import scala.concurrent.{ExecutionContext => SExecutionContext, _} import SExecutionContext.Implicits.global object InAnotherPackage { - def buildF: Future[TypedPipe[(Int, Int)]] = { + def buildF: Future[TypedPipe[(Int, Int)]] = Future { - TypedPipe.from(List(1, 2, 3, 4, 555, 3)) + TypedPipe + .from(List(1, 2, 3, 4, 555, 3)) .map { case x => (x, x) } } - } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/MultiJoinTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/MultiJoinTest.scala index 96177ee165..9025f4523c 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/MultiJoinTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/MultiJoinTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed import org.scalatest.WordSpec @@ -23,7 +23,7 @@ class MultiJoinTest extends WordSpec { def addKeys[V](t: Seq[V]): Seq[(Int, V)] = t.iterator.zipWithIndex.map { case (v, k) => (k, v) }.toSeq - val doubles = TypedPipe.from(addKeys(List(1.0D, 2.0D, 3.0D))) + val doubles = TypedPipe.from(addKeys(List(1.0d, 2.0d, 3.0d))) val longs = TypedPipe.from(addKeys(List(10L, 20L, 30L))) val strings = TypedPipe.from(addKeys(List("one", "two", "three"))) val sets = TypedPipe.from(addKeys(List(Set(1), Set(2), Set(3)))) @@ -40,13 +40,17 @@ class MultiJoinTest extends WordSpec { "actually match the outputs of joins" in { val joinedFlat: CoGrouped[Int, (Double, Long, String, Set[Int], Map[Int, Int])] = - joined.mapValues { x => flattenNestedTuple(x) } + joined.mapValues(x => flattenNestedTuple(x)) - val leftJoinedFlat: CoGrouped[Int, (Double, Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]])] = - leftJoined.mapValues { x => flattenNestedTuple(x) } + val leftJoinedFlat + : CoGrouped[Int, (Double, Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]])] = + leftJoined.mapValues(x => flattenNestedTuple(x)) - val outerJoinedFlat: CoGrouped[Int, (Option[Double], Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]])] = - outerJoined.mapValues { x => flattenNestedOptionTuple(x) } + val outerJoinedFlat: CoGrouped[ + Int, + (Option[Double], Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]]) + ] = + outerJoined.mapValues(x => flattenNestedOptionTuple(x)) } "Have implicit flattenValueTuple methods for low arity" in { @@ -54,12 +58,16 @@ class MultiJoinTest extends WordSpec { val joinedFlat: CoGrouped[Int, (Double, Long, String, Set[Int], Map[Int, Int])] = joined.flattenValueTuple - val leftJoinedFlat: CoGrouped[Int, (Double, Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]])] = + val leftJoinedFlat + : CoGrouped[Int, (Double, Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]])] = leftJoined.flattenValueTuple - val outerJoinedFlat: CoGrouped[Int, (Option[Double], Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]])] = + val outerJoinedFlat: CoGrouped[ + Int, + (Option[Double], Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]]) + ] = outerJoined.flattenValueTuple } } -} \ No newline at end of file +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/NoStackLineNumberTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/NoStackLineNumberTest.scala index 6475d28fc0..93842b0dc3 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/NoStackLineNumberTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/NoStackLineNumberTest.scala @@ -12,15 +12,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.typed import org.scalatest.WordSpec import com.twitter.scalding._ -import scala.concurrent.{ ExecutionContext => SExecutionContext, _ } +import scala.concurrent.{ExecutionContext => SExecutionContext, _} import SExecutionContext.Implicits.global -import scala.concurrent.duration.{ Duration => SDuration } +import scala.concurrent.duration.{Duration => SDuration} import cascading.flow.FlowDef import org.apache.hadoop.conf.Configuration @@ -39,8 +39,7 @@ class NoStackLineNumberTest extends WordSpec { val pipe = Await.result(pipeFut, SDuration.Inf) // We pick up line number info via TypedPipe.withLine // So this should have some non-scalding info in it. - val allDesc = RichPipe(pipe) - .upstreamPipes + val allDesc = RichPipe(pipe).upstreamPipes .map(RichPipe.getPipeDescriptions(_).toSet) .foldLeft(Set.empty[String])(_ | _) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/OptimizationRulesTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/OptimizationRulesTest.scala index 7a3168cd02..6d7182a7e8 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/OptimizationRulesTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/OptimizationRulesTest.scala @@ -2,22 +2,22 @@ package com.twitter.scalding.typed import cascading.flow.FlowDef import cascading.tuple.Fields -import com.stripe.dagon.{ Dag, Rule } +import com.stripe.dagon.{Dag, Rule} import com.twitter.algebird.Monoid -import com.twitter.scalding.source.{ TypedText, NullSink } +import com.twitter.scalding.source.{NullSink, TypedText} import org.apache.hadoop.conf.Configuration -import com.twitter.scalding.{ Config, ExecutionContext, Local, Hdfs, FlowState, FlowStateMap, IterableSource } +import com.twitter.scalding.{Config, ExecutionContext, FlowState, FlowStateMap, Hdfs, IterableSource, Local} import com.twitter.scalding.typed.cascading_backend.CascadingBackend import com.twitter.scalding.typed.memory_backend.MemoryMode import org.scalatest.FunSuite import org.scalatest.prop.PropertyChecks -import org.scalacheck.{ Arbitrary, Gen } -import scala.util.{ Failure, Success, Try } +import org.scalacheck.{Arbitrary, Gen} +import scala.util.{Failure, Success, Try} object TypedPipeGen { val srcGen: Gen[TypedPipe[Int]] = { val g1 = Gen.listOf(Arbitrary.arbitrary[Int]).map(TypedPipe.from(_)) - val src = Gen.identifier.map { f => TypedPipe.from(TypedText.tsv[Int](f)) } + val src = Gen.identifier.map(f => TypedPipe.from(TypedText.tsv[Int](f))) Gen.oneOf(g1, src, Gen.const(TypedPipe.empty)) } @@ -25,33 +25,66 @@ object TypedPipeGen { val commonFreq = 10 val next1: Gen[TypedPipe[Int] => TypedPipe[Int]] = Gen.frequency( - (1, tpGen(srcGen).map { p: TypedPipe[Int] => - { x: TypedPipe[Int] => x.cross(p).keys } - }), - (2, tpGen(srcGen).map { p: TypedPipe[Int] => - { x: TypedPipe[Int] => x.cross(ValuePipe(2)).values } - }), + ( + 1, + tpGen(srcGen).map { + p: TypedPipe[Int] => + x: TypedPipe[Int] => x.cross(p).keys + } + ), + ( + 2, + tpGen(srcGen).map { + p: TypedPipe[Int] => + x: TypedPipe[Int] => x.cross(ValuePipe(2)).values + } + ), //Gen.const({ t: TypedPipe[Int] => t.debug }), debug spews a lot to the terminal - (commonFreq, Arbitrary.arbitrary[Int => Boolean].map { fn => - { t: TypedPipe[Int] => t.filter(fn) } - }), - (commonFreq, Arbitrary.arbitrary[Int => Int].map { fn => - { t: TypedPipe[Int] => t.map(fn) } - }), - (commonFreq, Arbitrary.arbitrary[Int => List[Int]].map { fn => - { t: TypedPipe[Int] => t.flatMap(fn.andThen(_.take(4))) } // the take 
is to not get too big - }), - (2, Gen.const({ t: TypedPipe[Int] => t.forceToDisk })), - (2, Gen.const({ t: TypedPipe[Int] => t.fork })), - (5, tpGen(srcGen).map { p: TypedPipe[Int] => - { x: TypedPipe[Int] => x ++ p } - }), - (1, Gen.identifier.map { id => - { t: TypedPipe[Int] => t.addTrap(TypedText.tsv[Int](id)) } - }), - (1, Gen.identifier.map { id => - { t: TypedPipe[Int] => t.withDescription(id) } - })) + ( + commonFreq, + Arbitrary.arbitrary[Int => Boolean].map { + fn => + t: TypedPipe[Int] => t.filter(fn) + } + ), + ( + commonFreq, + Arbitrary.arbitrary[Int => Int].map { + fn => + t: TypedPipe[Int] => t.map(fn) + } + ), + ( + commonFreq, + Arbitrary.arbitrary[Int => List[Int]].map { + fn => + t: TypedPipe[Int] => t.flatMap(fn.andThen(_.take(4))) // the take is to not get too big + } + ), + (2, Gen.const { t: TypedPipe[Int] => t.forceToDisk }), + (2, Gen.const { t: TypedPipe[Int] => t.fork }), + ( + 5, + tpGen(srcGen).map { + p: TypedPipe[Int] => + x: TypedPipe[Int] => x ++ p + } + ), + ( + 1, + Gen.identifier.map { + id => + t: TypedPipe[Int] => t.addTrap(TypedText.tsv[Int](id)) + } + ), + ( + 1, + Gen.identifier.map { + id => + t: TypedPipe[Int] => t.withDescription(id) + } + ) + ) val one = for { n <- next1 @@ -60,8 +93,9 @@ object TypedPipeGen { val next2: Gen[TypedPipe[(Int, Int)] => TypedPipe[Int]] = Gen.oneOf( - Gen.const({ p: TypedPipe[(Int, Int)] => p.values }), - Gen.const({ p: TypedPipe[(Int, Int)] => p.keys })) + Gen.const { p: TypedPipe[(Int, Int)] => p.values }, + Gen.const { p: TypedPipe[(Int, Int)] => p.keys } + ) val two = for { n <- next2 @@ -82,7 +116,7 @@ object TypedPipeGen { single <- tpGen(srcGen) fn <- Arbitrary.arbitrary[Int => List[(Int, Int)]] } yield single.flatMap(fn.andThen(_.take(4))) // take to not get too big - ) + ) val two = Gen.oneOf( for { @@ -105,13 +139,13 @@ object TypedPipeGen { } yield pair.sumByLocalKeys, for { pair <- keyRec - } yield pair.group.mapGroup { (k, its) => its }.toTypedPipe, + } yield pair.group.mapGroup((k, its) => its).toTypedPipe, for { pair <- keyRec - } yield pair.group.sorted.mapGroup { (k, its) => its }.toTypedPipe, + } yield pair.group.sorted.mapGroup((k, its) => its).toTypedPipe, for { pair <- keyRec - } yield pair.group.sorted.withReducers(2).mapGroup { (k, its) => its }.toTypedPipe, + } yield pair.group.sorted.withReducers(2).mapGroup((k, its) => its).toTypedPipe, for { p1 <- keyRec p2 <- keyRec @@ -123,7 +157,8 @@ object TypedPipeGen { for { p1 <- keyRec p2 <- keyRec - } yield p1.join(p2).mapValues { case (a, b) => a + 31 * b }.toTypedPipe) + } yield p1.join(p2).mapValues { case (a, b) => a + 31 * b }.toTypedPipe + ) // bias to consuming Int, since the we can stack overflow with the (Int, Int) // cases @@ -134,17 +169,16 @@ object TypedPipeGen { Gen.lzy(Gen.frequency((1, srcGen), (1, mapped(srcGen)))) /** - * This generates a TypedPipe that can't necessarily - * be run because it has fake sources + * This generates a TypedPipe that can't necessarily be run because it has fake sources */ val genWithFakeSources: Gen[TypedPipe[Int]] = tpGen(srcGen) /** - * This can always be run because all the sources are - * Iterable sources + * This can always be run because all the sources are Iterable sources */ val genWithIterableSources: Gen[TypedPipe[Int]] = - Gen.choose(0, 16) // don't make giant lists which take too long to evaluate + Gen + .choose(0, 16) // don't make giant lists which take too long to evaluate .flatMap { sz => tpGen(Gen.listOfN(sz, Arbitrary.arbitrary[Int]).map(TypedPipe.from(_))) } @@ -177,7 +211,8 @@ 
object TypedPipeGen { EmptyIterableIsEmpty, HashToShuffleCoGroup, ForceToDiskBeforeHashJoin, - MapValuesInReducers) + MapValuesInReducers + ) def genRuleFrom(rs: List[Rule[TypedPipe]]): Gen[Rule[TypedPipe]] = for { @@ -216,8 +251,7 @@ class ThrowingOptimizer extends OptimizationPhases { } /** - * Just convert everything to a constant - * so we can check that the optimization was applied + * Just convert everything to a constant so we can check that the optimization was applied */ class ConstantOptimizer extends OptimizationPhases { def phases = List(new Rule[TypedPipe] { @@ -243,9 +277,10 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { } test("optimization rules are reproducible") { - import TypedPipeGen.{ genWithFakeSources, genRule } + import TypedPipeGen.{genWithFakeSources, genRule} - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 500) + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) forAll(genWithFakeSources, genRule) { (t, rule) => val optimized = Dag.applyRule(t, toLiteral, rule) val optimized2 = Dag.applyRule(t, toLiteral, rule) @@ -256,7 +291,8 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { test("standard rules are reproducible") { import TypedPipeGen.genWithFakeSources - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 500) + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) forAll(genWithFakeSources) { t => val (dag1, id1) = Dag(t, toLiteral) val opt1 = dag1.applySeq(OptimizationRules.standardMapReduceRules) @@ -274,9 +310,14 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { // We don't want any further optimization on this job val conf = Config.empty.setOptimizationPhases(classOf[EmptyOptimizationPhases]) - assert(TypedPipeDiff.diff(t, optimized) - .toIterableExecution - .waitFor(conf, Local(true)).get.isEmpty) + assert( + TypedPipeDiff + .diff(t, optimized) + .toIterableExecution + .waitFor(conf, Local(true)) + .get + .isEmpty + ) } def optimizationLawMemory[T: Ordering](t: TypedPipe[T], rule: Rule[TypedPipe]) = { @@ -284,9 +325,14 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { // We don't want any further optimization on this job val conf = Config.empty.setOptimizationPhases(classOf[EmptyOptimizationPhases]) - assert(TypedPipeDiff.diff(t, optimized) - .toIterableExecution - .waitFor(conf, MemoryMode.empty).get.isEmpty) + assert( + TypedPipeDiff + .diff(t, optimized) + .toIterableExecution + .waitFor(conf, MemoryMode.empty) + .get + .isEmpty + ) } def optimizationReducesSteps[T](init: TypedPipe[T], rule: Rule[TypedPipe]) = { @@ -296,33 +342,38 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { } test("all optimization rules don't change results") { - import TypedPipeGen.{ genWithIterableSources, genRule } - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 50) + import TypedPipeGen.{genWithIterableSources, genRule} + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 50) forAll(genWithIterableSources, genRule)(optimizationLaw[Int] _) } test("dediamonding never changes results") { import TypedPipeGen.genWithIterableSources - implicit val generatorDrivenConfig: PropertyCheckConfiguration = 
PropertyCheckConfiguration(minSuccessful = 50) + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 50) forAll(genWithIterableSources)(optimizationLawMemory[Int](_, OptimizationRules.DeDiamondMappers)) } test("some past failures of the optimizationLaw") { import TypedPipe._ - val arg01 = (TypedPipe.empty.withDescription("foo") ++ TypedPipe.empty.withDescription("bar")).addTrap(TypedText.tsv[Int]("foo")) + val arg01 = (TypedPipe.empty.withDescription("foo") ++ TypedPipe.empty.withDescription("bar")) + .addTrap(TypedText.tsv[Int]("foo")) optimizationLaw(arg01, Rule.empty) } test("all optimization rules do not increase steps") { - import TypedPipeGen.{ allRules, genWithIterableSources, genRuleFrom } - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 200) + import TypedPipeGen.{allRules, genWithIterableSources, genRuleFrom} + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 200) val possiblyIncreasesSteps: Set[Rule[TypedPipe]] = - Set(OptimizationRules.AddExplicitForks, // explicit forks can cause cascading to add steps instead of recomputing values + Set( + OptimizationRules.AddExplicitForks, // explicit forks can cause cascading to add steps instead of recomputing values OptimizationRules.ForceToDiskBeforeHashJoin, // adding a forceToDisk can increase the number of steps OptimizationRules.HashToShuffleCoGroup // obviously changing a hashjoin to a cogroup can increase steps - ) + ) val gen = genRuleFrom(allRules.filterNot(possiblyIncreasesSteps)) @@ -336,7 +387,7 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { implicit val mode = Hdfs(true, conf) implicit val fd = new FlowDef Try(CascadingBackend.toPipe(t, new Fields("value"))) match { - case Failure(ex) => assert(ex.getMessage == "booom") + case Failure(ex) => assert(ex.getMessage == "booom") case Success(res) => fail(s"expected failure, got $res") } } @@ -346,7 +397,7 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { val config = Config.empty.setOptimizationPhases(classOf[ThrowingOptimizer]) ex.waitFor(config, Local(true)) match { - case Failure(ex) => assert(ex.getMessage == "booom") + case Failure(ex) => assert(ex.getMessage == "booom") case Success(res) => fail(s"expected failure, got $res") } } @@ -380,7 +431,7 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { val config = Config.empty.setOptimizationPhases(classOf[ConstantOptimizer]) ex.waitFor(config, Local(true)) match { - case Failure(ex) => fail(s"$ex") + case Failure(ex) => fail(s"$ex") case Success(res) => assert(res.isEmpty) } } @@ -392,51 +443,51 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { invert(TypedPipe.from(List(1, 2, 3))) invert(TypedPipe.from(List(1, 2, 3)).map(_ * 2)) invert { - TypedPipe.from(List(1, 2, 3)).map { i => (i, i) }.sumByKey.toTypedPipe + TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey.toTypedPipe } invert { - val p = TypedPipe.from(List(1, 2, 3)).map { i => (i, i) }.sumByKey + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey - p.mapGroup { (k, its) => Iterator.single(its.sum * k) } + p.mapGroup((k, its) => Iterator.single(its.sum * k)) } invert { - val p = TypedPipe.from(List(1, 2, 3)).map { i => (i, i) }.sumByKey + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey p.cross(TypedPipe.from(List("a", "b", "c")).sum) } invert { - val p = TypedPipe.from(List(1, 2, 
3)).map { i => (i, i) }.sumByKey + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey p.cross(TypedPipe.from(List("a", "b", "c"))) } invert { - val p = TypedPipe.from(List(1, 2, 3)).map { i => (i, i) }.sumByKey + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey p.forceToDisk } invert { - val p = TypedPipe.from(List(1, 2, 3)).map { i => (i, i) }.sumByKey + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey p.fork } invert { - val p1 = TypedPipe.from(List(1, 2, 3)).map { i => (i, i) } + val p1 = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)) val p2 = TypedPipe.from(TypedText.tsv[(Int, String)]("foo")) p1.join(p2).toTypedPipe } invert { - val p1 = TypedPipe.from(List(1, 2, 3)).map { i => (i, i) } + val p1 = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)) val p2 = TypedPipe.from(TypedText.tsv[(Int, String)]("foo")) p1.hashJoin(p2) } invert { - val p1 = TypedPipe.from(List(1, 2, 3)).map { i => (i, i) } + val p1 = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)) val p2 = TypedPipe.from(TypedText.tsv[(Int, String)]("foo")) p1.join(p2).filterKeys(_ % 2 == 0) @@ -450,9 +501,8 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { val filterFn = { i: Int => i % 2 == 0 } val fn1 = { i: Int => (0 to i) } - def eqCheck[T](t: => T) = { + def eqCheck[T](t: => T) = assert(t == t) - } eqCheck(tp.map(fn0)) eqCheck(tp.filter(filterFn)) @@ -503,10 +553,12 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { assert((end - start) / (1000L * 1000L) < millis) } - test("Dagon relies on fast hashCodes and fast equality. Test some example ones to make sure they are not exponential") { + test( + "Dagon relies on fast hashCodes and fast equality. Test some example ones to make sure they are not exponential" + ) { def testFib(fn: (TypedPipe[Int], TypedPipe[Int]) => TypedPipe[Int]) = - isFasterThan(1000){ + isFasterThan(1000) { fib(TypedPipe.from(List(0)), TypedPipe.from(List(1, 2)), 45)(fn).hashCode } @@ -522,8 +574,10 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { // without linear time equality, this fails when fib count is 35, at 50 // it would take a huge amount of time isFasterThan(1000) { - assert(fib(TypedPipe.from(List(0)), TypedPipe.from(List(1)), 50)(_ ++ _) == - fib(TypedPipe.from(List(0)), TypedPipe.from(List(1)), 50)(_ ++ _)) + assert( + fib(TypedPipe.from(List(0)), TypedPipe.from(List(1)), 50)(_ ++ _) == + fib(TypedPipe.from(List(0)), TypedPipe.from(List(1)), 50)(_ ++ _) + ) } } @@ -552,31 +606,52 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { def kv(s: String) = TypedPipe.from(TypedText.tsv[(Int, Int)](s)) - optimizedSteps(OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), 1) { + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { kv("a").sumByKey.toTypedPipe.group.max } - optimizedSteps(OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), 1) { + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { kv("a").join(kv("b")).toTypedPipe.group.max } - optimizedSteps(OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), 1) { + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { kv("a").sumByKey.toTypedPipe.join(kv("b")).toTypedPipe.group.max } - optimizedSteps(OptimizationRules.standardMapReduceRules 
::: List(OptimizationRules.ComposeReduceSteps), 1) { + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { kv("a").join(kv("b").sumByKey.toTypedPipe).toTypedPipe.group.max } - optimizedSteps(OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), 1) { + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { kv("a").join(kv("b").sumByKey.toTypedPipe.mapValues(_ * 2)).toTypedPipe.group.max } - optimizedSteps(OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), 1) { + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { kv("a").join(kv("b").sumByKey.toTypedPipe.flatMapValues(0 to _)).toTypedPipe.group.max } - optimizedSteps(OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), 1) { + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { kv("a").join(kv("b").sumByKey.toTypedPipe.filterKeys(_ > 2)).toTypedPipe.group.max } } @@ -633,17 +708,21 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { } test("merging pipes does not make them unplannable") { - val pipe1 = (TypedPipe.from(0 to 1000).map { x => (x, x) } ++ - (TypedPipe.from(0 to 2000).groupBy(_ % 17).sum.toTypedPipe)) + val pipe1 = TypedPipe.from(0 to 1000).map { x => + (x, x) + } ++ + (TypedPipe.from(0 to 2000).groupBy(_ % 17).sum.toTypedPipe) - val pipe2 = (TypedPipe.from(0 to 1000) ++ - TypedPipe.from(0 to 2000).filter(_ % 17 == 0)) + val pipe2 = TypedPipe.from(0 to 1000) ++ + TypedPipe.from(0 to 2000).filter(_ % 17 == 0) - val pipe3 = (TypedPipe.from(TypedText.tsv[Int]("src1")).map { x => (x, x) } ++ - (TypedPipe.from(TypedText.tsv[Int]("src2")).groupBy(_ % 17).sum.toTypedPipe)) + val pipe3 = TypedPipe.from(TypedText.tsv[Int]("src1")).map { x => + (x, x) + } ++ + (TypedPipe.from(TypedText.tsv[Int]("src2")).groupBy(_ % 17).sum.toTypedPipe) - val pipe4 = (TypedPipe.from(TypedText.tsv[Int]("src1")) ++ - TypedPipe.from(TypedText.tsv[Int]("src2")).filter(_ % 17 == 0)) + val pipe4 = TypedPipe.from(TypedText.tsv[Int]("src1")) ++ + TypedPipe.from(TypedText.tsv[Int]("src2")).filter(_ % 17 == 0) optimizedSteps(OptimizationRules.standardMapReduceRules, 2)(pipe1) optimizedSteps(OptimizationRules.standardMapReduceRules, 1)(pipe2) @@ -692,13 +771,13 @@ class OptimizationRulesTest extends FunSuite with PropertyChecks { // we need to use a flatMap to make sure that none of the optimizations are applied val pipe: TypedPipe[(Int, Int)] = TypedPipe .from(0 to 1000) - .flatMap { k => (k % 11, k % 13) :: Nil } + .flatMap(k => (k % 11, k % 13) :: Nil) .sumByKey .toTypedPipe // Do all kinds of different map only operations, but see them merged down to one flatMap val diamond = pipe.map(identity) ++ pipe.mapValues(_ ^ 11) ++ - pipe.flatMapValues { i => (0 until (i % 7)) } ++ + pipe.flatMapValues(i => (0 until (i % 7))) ++ pipe.flatMap { case (k, v) => (k, v) :: (v, k) :: Nil } ++ pipe.filter { case (k, v) => k > v } ++ pipe.filterKeys(_ % 3 == 0) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedDelimitedSourceTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedDelimitedSourceTest.scala index 9a262717a8..86feadc9fc 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedDelimitedSourceTest.scala +++ 
b/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedDelimitedSourceTest.scala @@ -16,9 +16,9 @@ package com.twitter.scalding.typed import java.io.File -import scala.io.{ Source => ScalaSource } +import scala.io.{Source => ScalaSource} -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ import TDsl._ @@ -57,7 +57,7 @@ class PartitionedDelimitedTest extends WordSpec with Matchers { val directory = new File(testMode.getWritePathFor(singlePartition)) - directory.listFiles().map({ _.getName() }).toSet shouldBe Set("A", "B") + directory.listFiles().map { _.getName() }.toSet shouldBe Set("A", "B") val aSource = ScalaSource.fromFile(new File(directory, "A/part-00000-00000")) val bSource = ScalaSource.fromFile(new File(directory, "B/part-00000-00001")) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedTextLineTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedTextLineTest.scala index cc5967c941..71d76a3d7f 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedTextLineTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedTextLineTest.scala @@ -16,9 +16,9 @@ package com.twitter.scalding.typed import java.io.File -import scala.io.{ Source => ScalaSource } +import scala.io.{Source => ScalaSource} -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ @@ -65,7 +65,7 @@ class PartitionedTextLineTest extends WordSpec with Matchers { val directory = new File(testMode.getWritePathFor(singlePartition)) println(directory) - directory.listFiles().map({ _.getName() }).toSet shouldBe Set("A", "B") + directory.listFiles().map { _.getName() }.toSet shouldBe Set("A", "B") val aSource = ScalaSource.fromFile(new File(directory, "A/part-00000-00000")) val bSource = ScalaSource.fromFile(new File(directory, "B/part-00000-00001")) @@ -93,7 +93,11 @@ class PartitionedTextLineTest extends WordSpec with Matchers { val directory = new File(testMode.getWritePathFor(multiplePartition)) println(directory) - directory.listFiles.flatMap(d => d.listFiles.map(d.getName + "/" + _.getName)).toSet shouldBe Set("A/X", "A/Y", "B/Z") + directory.listFiles.flatMap(d => d.listFiles.map(d.getName + "/" + _.getName)).toSet shouldBe Set( + "A/X", + "A/Y", + "B/Z" + ) val axSource = ScalaSource.fromFile(new File(directory, "A/X/part-00000-00000")) val aySource = ScalaSource.fromFile(new File(directory, "A/Y/part-00000-00001")) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/RequireOrderedSerializationTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/RequireOrderedSerializationTest.scala index 9e1fbf2617..1f66fb0df8 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/RequireOrderedSerializationTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/RequireOrderedSerializationTest.scala @@ -12,20 +12,22 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import com.twitter.scalding.serialization.OrderedSerialization import com.twitter.scalding.serialization.StringOrderedSerialization import com.twitter.scalding.serialization.RequireOrderedSerializationMode -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class NoOrderdSerJob(args: Args, requireOrderedSerializationMode: String) extends Job(args) { - override def config = super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode) + override def config = + super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode) - TypedPipe.from(TypedTsv[(String, String)]("input")) + TypedPipe + .from(TypedTsv[(String, String)]("input")) .group .max .write(TypedTsv[(String, String)]("output")) @@ -35,9 +37,11 @@ class OrderdSerJob(args: Args, requireOrderedSerializationMode: String) extends implicit def stringOS: OrderedSerialization[String] = new StringOrderedSerialization - override def config = super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode) + override def config = + super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode) - TypedPipe.from(TypedTsv[(String, String)]("input")) + TypedPipe + .from(TypedTsv[(String, String)]("input")) .group .sorted .max @@ -51,7 +55,7 @@ class RequireOrderedSerializationTest extends WordSpec with Matchers { def test(job: Args => Job) = JobTest(job) .source(TypedTsv[(String, String)]("input"), List(("a", "a"), ("b", "b"))) - .sink[(String, String)](TypedTsv[(String, String)]("output")) { outBuf => () } + .sink[(String, String)](TypedTsv[(String, String)]("output"))(outBuf => ()) .run .finish() diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/ResolverTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/ResolverTest.scala index 80c64cf7ff..0977c16f8e 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/ResolverTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/ResolverTest.scala @@ -33,33 +33,33 @@ class ResolverTest extends FunSuite { test("orElse order is correct") { - assert((pair(k1, v1) orElse pair(k1, v2))(k1) == Some(v1)) - assert((pair(k1, v2) orElse pair(k1, v1))(k1) == Some(v2)) - assert((pair(k2, v1) orElse pair(k1, v2))(k1) == Some(v2)) - assert((pair(k2, v2) orElse pair(k1, v1))(k1) == Some(v1)) + assert((pair(k1, v1).orElse(pair(k1, v2)))(k1) == Some(v1)) + assert((pair(k1, v2).orElse(pair(k1, v1)))(k1) == Some(v2)) + assert((pair(k2, v1).orElse(pair(k1, v2)))(k1) == Some(v2)) + assert((pair(k2, v2).orElse(pair(k1, v1)))(k1) == Some(v1)) - assert(((pair(k1, v1) orElse pair(k1, v2)) orElse pair(k1, v3))(k1) == Some(v1)) - assert(((pair(k1, v2) orElse pair(k1, v1)) orElse pair(k1, v3))(k1) == Some(v2)) - assert(((pair(k1, v1) orElse pair(k1, v2)) orElse pair(k2, v3))(k2) == Some(v3)) + assert(((pair(k1, v1).orElse(pair(k1, v2))).orElse(pair(k1, v3)))(k1) == Some(v1)) + assert(((pair(k1, v2).orElse(pair(k1, v1))).orElse(pair(k1, v3)))(k1) == Some(v2)) + assert(((pair(k1, v1).orElse(pair(k1, v2))).orElse(pair(k2, v3)))(k2) == Some(v3)) assert(custom(k1) == Some(v3)) assert(custom(k2) == None) - assert((custom orElse pair(k1, v2))(k1) == Some(v3)) - assert((custom orElse pair(k2, v2))(k2) == Some(v2)) - assert((pair(k1, v2) orElse custom)(k1) == Some(v2)) - assert((pair(k2, v2) orElse custom)(k1) == Some(v3)) - assert((pair(k2, v2) orElse custom)(k2) == Some(v2)) + 
assert((custom.orElse(pair(k1, v2)))(k1) == Some(v3)) + assert((custom.orElse(pair(k2, v2)))(k2) == Some(v2)) + assert((pair(k1, v2).orElse(custom))(k1) == Some(v2)) + assert((pair(k2, v2).orElse(custom))(k1) == Some(v3)) + assert((pair(k2, v2).orElse(custom))(k2) == Some(v2)) } test("test remapping with andThen") { - val remap = Resolver.pair(k1, k2) orElse Resolver.pair(k2, k3) orElse Resolver.pair(k3, k1) + val remap = Resolver.pair(k1, k2).orElse(Resolver.pair(k2, k3)).orElse(Resolver.pair(k3, k1)) - assert((remap andThen (custom orElse pair(k1, v2)))(k1) == None) - assert((remap andThen (custom orElse pair(k2, v2)))(k2) == None) - assert((remap andThen (pair(k1, v2) orElse custom))(k3) == Some(v2)) - assert((remap andThen (pair(k2, v2) orElse custom))(k3) == Some(v3)) - assert((remap andThen (pair(k2, v2) orElse custom))(k1) == Some(v2)) + assert((remap.andThen(custom.orElse(pair(k1, v2))))(k1) == None) + assert((remap.andThen(custom.orElse(pair(k2, v2))))(k2) == None) + assert((remap.andThen(pair(k1, v2).orElse(custom)))(k3) == Some(v2)) + assert((remap.andThen(pair(k2, v2).orElse(custom)))(k3) == Some(v3)) + assert((remap.andThen(pair(k2, v2).orElse(custom)))(k1) == Some(v2)) } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/TypedPipeDiffTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/TypedPipeDiffTest.scala index 1cb23a8313..e5c2dde262 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/TypedPipeDiffTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/TypedPipeDiffTest.scala @@ -2,9 +2,9 @@ package com.twitter.scalding.typed import com.twitter.algebird.MapAlgebra import com.twitter.scalding.TypedPipeChecker.InMemoryToListEnrichment -import org.scalacheck.{ Arbitrary, Prop } -import org.scalatest.prop.{ Checkers, PropertyChecks } -import org.scalatest.{ FunSuite, PropSpec } +import org.scalacheck.{Arbitrary, Prop} +import org.scalatest.prop.{Checkers, PropertyChecks} +import org.scalatest.{FunSuite, PropSpec} import scala.reflect.ClassTag @@ -12,7 +12,7 @@ class NoOrdering(val x: String) { override def equals(other: Any): Boolean = other match { case that: NoOrdering => x.equals(that.x) - case _ => false + case _ => false } override def hashCode(): Int = x.hashCode @@ -22,7 +22,7 @@ class NoOrderingHashCollisions(val x: String) { override def equals(other: Any): Boolean = other match { case that: NoOrderingHashCollisions => x.equals(that.x) - case _ => false + case _ => false } override def hashCode(): Int = 0 @@ -34,21 +34,16 @@ class TypedPipeDiffTest extends FunSuite { val right = List("hi", "bye", "foo", "baz") val expectedSortedDiff = List(("bar", (1, 0)), ("baz", (0, 1)), ("hi", (2, 1))).sorted - val leftArr = List( - Array[Byte](3, 3, 5, 3, 2), - Array[Byte](2, 2, 2), - Array[Byte](0, 1, 0)) + val leftArr = List(Array[Byte](3, 3, 5, 3, 2), Array[Byte](2, 2, 2), Array[Byte](0, 1, 0)) - val rightArr = List( - Array[Byte](2, 2, 2), - Array[Byte](2, 2, 2), - Array[Byte](3, 3, 5, 3, 2), - Array[Byte](0, 1, 1)) + val rightArr = + List(Array[Byte](2, 2, 2), Array[Byte](2, 2, 2), Array[Byte](3, 3, 5, 3, 2), Array[Byte](0, 1, 1)) val expectedSortedArrDiff = List( (Array[Byte](0, 1, 0).toSeq, (1, 0)), (Array[Byte](0, 1, 1).toSeq, (0, 1)), - (Array[Byte](2, 2, 2).toSeq, (1, 2))) + (Array[Byte](2, 2, 2).toSeq, (1, 2)) + ) test("diff works for objects with ordering and good hashcodes") { val pipe1 = TypedPipe.from(left) @@ -98,11 +93,13 @@ class TypedPipeDiffTest extends FunSuite { } 
test("diffArrayPipesWithoutOrdering works for arrays of objects with no ordering") { - val pipe1 = TypedPipe.from(leftArr.map { arr => arr.map { b => new NoOrdering(b.toString) } }) - val pipe2 = TypedPipe.from(rightArr.map { arr => arr.map { b => new NoOrdering(b.toString) } }) + val pipe1 = TypedPipe.from(leftArr.map(arr => arr.map(b => new NoOrdering(b.toString)))) + val pipe2 = TypedPipe.from(rightArr.map(arr => arr.map(b => new NoOrdering(b.toString)))) val diff = TypedPipeDiff.diffArrayPipes(pipe1, pipe2) - assert(expectedSortedArrDiff === sort(diff.inMemoryToList.map{ case (arr, counts) => (arr.map(_.x.toByte).toSeq, counts) })) + assert(expectedSortedArrDiff === sort(diff.inMemoryToList.map { case (arr, counts) => + (arr.map(_.x.toByte).toSeq, counts) + })) } } @@ -112,25 +109,33 @@ object TypedPipeDiffLaws { def checkDiff[T](left: List[T], right: List[T], diff: List[(T, (Long, Long))]): Boolean = { val noDuplicates = diff.size == diff.map(_._1).toSet.size - val expected = MapAlgebra.sumByKey(left.map((_, (1L, 0L))).iterator ++ right.map((_, (0L, 1L))).iterator) + val expected = MapAlgebra + .sumByKey(left.map((_, (1L, 0L))).iterator ++ right.map((_, (0L, 1L))).iterator) .filter { case (t, (rCount, lCount)) => rCount != lCount } noDuplicates && expected == diff.toMap } - def checkArrayDiff[T](left: List[Array[T]], right: List[Array[T]], diff: List[(Seq[T], (Long, Long))]): Boolean = { + def checkArrayDiff[T]( + left: List[Array[T]], + right: List[Array[T]], + diff: List[(Seq[T], (Long, Long))] + ): Boolean = checkDiff(left.map(_.toSeq), right.map(_.toSeq), diff) - } def diffLaw[T: Ordering: Arbitrary]: Prop = Prop.forAll { (left: List[T], right: List[T]) => val diff = TypedPipe.from(left).diff(TypedPipe.from(right)).toTypedPipe.inMemoryToList checkDiff(left, right, diff) } - def diffArrayLaw[T](implicit arb: Arbitrary[List[Array[T]]], ct: ClassTag[T]): Prop = Prop.forAll { (left: List[Array[T]], right: List[Array[T]]) => - val diff = TypedPipe.from(left).diffArrayPipes(TypedPipe.from(right)).inMemoryToList - .map { case (arr, counts) => (arr.toSeq, counts) } - checkArrayDiff(left, right, diff) + def diffArrayLaw[T](implicit arb: Arbitrary[List[Array[T]]], ct: ClassTag[T]): Prop = Prop.forAll { + (left: List[Array[T]], right: List[Array[T]]) => + val diff = TypedPipe + .from(left) + .diffArrayPipes(TypedPipe.from(right)) + .inMemoryToList + .map { case (arr, counts) => (arr.toSeq, counts) } + checkArrayDiff(left, right, diff) } def diffByGroupLaw[T: Arbitrary]: Prop = Prop.forAll { (left: List[T], right: List[T]) => @@ -141,7 +146,8 @@ object TypedPipeDiffLaws { } class TypedPipeDiffLaws extends PropSpec with PropertyChecks with Checkers { - override implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 5) + override implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 5) property("diffLaws") { check(TypedPipeDiffLaws.diffLaw[Int]) @@ -154,7 +160,7 @@ class TypedPipeDiffLaws extends PropSpec with PropertyChecks with Checkers { for { strs <- Arbitrary.arbitrary[Array[String]] } yield { - strs.map { new NoOrdering(_) } + strs.map(new NoOrdering(_)) } } @@ -162,7 +168,7 @@ class TypedPipeDiffLaws extends PropSpec with PropertyChecks with Checkers { for { strs <- Arbitrary.arbitrary[Array[String]] } yield { - strs.map { new NoOrderingHashCollisions(_) } + strs.map(new NoOrderingHashCollisions(_)) } } diff --git 
a/scalding-core/src/test/scala/com/twitter/scalding/typed/TypedPipeMonoidTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/TypedPipeMonoidTest.scala index a8b92748a7..ff473c5bc6 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/TypedPipeMonoidTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/TypedPipeMonoidTest.scala @@ -1,7 +1,7 @@ package com.twitter.scalding package typed -import com.twitter.algebird.Monoid.{ plus, sum, zero } +import com.twitter.algebird.Monoid.{plus, sum, zero} import org.scalatest.FunSuite import org.scalatest.prop.PropertyChecks diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/WritePartitionerTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/WritePartitionerTest.scala index 7a0c5c239a..5ad537650c 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/WritePartitionerTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/WritePartitionerTest.scala @@ -1,8 +1,8 @@ package com.twitter.scalding.typed import com.twitter.algebird.Monoid -import com.twitter.scalding.{ Config, Execution, Local, TupleConverter, TupleGetter } -import com.twitter.scalding.source.{ TypedText, NullSink } +import com.twitter.scalding.{Config, Execution, Local, TupleConverter, TupleGetter} +import com.twitter.scalding.source.{NullSink, TypedText} import com.twitter.scalding.typed.cascading_backend.CascadingBackend import com.stripe.dagon.Dag import org.scalatest.FunSuite @@ -13,12 +13,12 @@ class WritePartitionerTest extends FunSuite with PropertyChecks { TypedText.tsv[String](s"source_$id").asInstanceOf[TypedSource[T]] case class WriteState( - writes: List[WritePartitioner.PairK[TypedPipe, TypedSink, _]], - materializations: List[WritePartitioner.PairK[TypedPipe, TypedSource, _]]) { + writes: List[WritePartitioner.PairK[TypedPipe, TypedSink, _]], + materializations: List[WritePartitioner.PairK[TypedPipe, TypedSource, _]] + ) { def ++(that: WriteState): WriteState = - WriteState(writes ::: that.writes, - materializations ::: that.materializations) + WriteState(writes ::: that.writes, materializations ::: that.materializations) } object WriteState { @@ -49,33 +49,35 @@ class WritePartitionerTest extends FunSuite with PropertyChecks { } def sequence_[A](as: Seq[State[A]]): State[Unit] = // just merge them all together: - State(as.foldLeft(WriteState.empty) { (old, n) => - n.writes ++ old - }, ()) + State( + as.foldLeft(WriteState.empty) { (old, n) => + n.writes ++ old + }, + () + ) } } test("When we break at forks we have at most 2 + hashJoin steps") { - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 100) + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 100) def afterPartitioningEachStepIsSize1[T](init: TypedPipe[T]) = { val phases = CascadingBackend.defaultOptimizationRules(Config.empty) val writes = WritePartitioner.materialize[State](phases, List((init, NullSink))).writes - writes.writes.foreach { - case (tp, _) => - val (dag, _) = Dag(tp, OptimizationRules.toLiteral) - val hcg = dag.allNodes.collect { case h: TypedPipe.HashCoGroup[_, _, _, _] => 1 }.sum - // we can have at most 2 + hcg jobs - assert(TypedPipeGen.steps(tp) <= 2 + hcg, s"optimized: ${tp.toString}") + writes.writes.foreach { case (tp, _) => + val (dag, _) = Dag(tp, OptimizationRules.toLiteral) + val hcg = dag.allNodes.collect { case h: TypedPipe.HashCoGroup[_, _, _, _] => 1 }.sum 
+ // we can have at most 2 + hcg jobs + assert(TypedPipeGen.steps(tp) <= 2 + hcg, s"optimized: ${tp.toString}") } - writes.materializations.foreach { - case (tp, _) => - val (dag, _) = Dag(tp, OptimizationRules.toLiteral) - val hcg = dag.allNodes.collect { case h: TypedPipe.HashCoGroup[_, _, _, _] => 1 }.sum - // we can have at most 1 + hcg jobs - assert(TypedPipeGen.steps(tp) <= 2 + hcg, s"optimized: ${tp.toString}") + writes.materializations.foreach { case (tp, _) => + val (dag, _) = Dag(tp, OptimizationRules.toLiteral) + val hcg = dag.allNodes.collect { case h: TypedPipe.HashCoGroup[_, _, _, _] => 1 }.sum + // we can have at most 1 + hcg jobs + assert(TypedPipeGen.steps(tp) <= 2 + hcg, s"optimized: ${tp.toString}") } } @@ -88,11 +90,11 @@ class WritePartitionerTest extends FunSuite with PropertyChecks { val writes = WritePartitioner.materialize[State](phases, List((t, NullSink))).writes - val writeSteps = writes.writes.map { - case (tp, _) => TypedPipeGen.steps(tp) + val writeSteps = writes.writes.map { case (tp, _) => + TypedPipeGen.steps(tp) }.sum - val matSteps = writes.materializations.map { - case (tp, _) => TypedPipeGen.steps(tp) + val matSteps = writes.materializations.map { case (tp, _) => + TypedPipeGen.steps(tp) }.sum val (dag, id) = Dag(t, OptimizationRules.toLiteral) val optDag = dag.applySeq(phases) @@ -103,18 +105,51 @@ class WritePartitionerTest extends FunSuite with PropertyChecks { { import TypedPipe._ - val pipe = WithDescriptionTypedPipe(Mapped(ReduceStepPipe(ValueSortedReduce[Int, Int, Int](implicitly[Ordering[Int]], - WithDescriptionTypedPipe(WithDescriptionTypedPipe(Mapped(WithDescriptionTypedPipe(MergedTypedPipe( - WithDescriptionTypedPipe(Fork(WithDescriptionTypedPipe(TrappedPipe(SourcePipe(TypedText.tsv[Int]("oyg")), - TypedText.tsv[Int]("a3QasphTfqhd1namjb"), - TupleConverter.Single(implicitly[TupleGetter[Int]])), List(("org.scalacheck.Gen$R $class.map(Gen.scala:237)", true)))), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - IterablePipe(List(-930762680, -1495455462, -1, -903011942, -2147483648, 1539778843, -2147483648))), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - implicitly[Ordering[Int]], null /**/ , Some(2), List())), - null /**/ ), List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))) + val pipe = WithDescriptionTypedPipe( + Mapped( + ReduceStepPipe( + ValueSortedReduce[Int, Int, Int]( + implicitly[Ordering[Int]], + WithDescriptionTypedPipe( + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + MergedTypedPipe( + WithDescriptionTypedPipe( + Fork( + WithDescriptionTypedPipe( + TrappedPipe( + SourcePipe(TypedText.tsv[Int]("oyg")), + TypedText.tsv[Int]("a3QasphTfqhd1namjb"), + TupleConverter.Single(implicitly[TupleGetter[Int]]) + ), + List(("org.scalacheck.Gen$R $class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + IterablePipe( + List(-930762680, -1495455462, -1, -903011942, -2147483648, 1539778843, -2147483648) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + implicitly[Ordering[Int]], + null /**/, + Some(2), + List() + ) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", 
true)) + ) notMoreSteps(pipe) } @@ -122,30 +157,72 @@ class WritePartitionerTest extends FunSuite with PropertyChecks { { import TypedPipe._ - val pipe = WithDescriptionTypedPipe(ForceToDisk(WithDescriptionTypedPipe(Mapped( - ReduceStepPipe(ValueSortedReduce[Int, Int, Int](implicitly[Ordering[Int]], - WithDescriptionTypedPipe(WithDescriptionTypedPipe( - Mapped(WithDescriptionTypedPipe(MergedTypedPipe(WithDescriptionTypedPipe( - Mapped(WithDescriptionTypedPipe(CrossValue( - SourcePipe(TypedText.tsv[Int]("yumwd")), LiteralValue(2)), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - WithDescriptionTypedPipe(Mapped(WithDescriptionTypedPipe(FilterKeys( - WithDescriptionTypedPipe(SumByLocalKeys( - WithDescriptionTypedPipe(FlatMapped( - IterablePipe(List(943704575)), null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - implicitly[Monoid[Int]]), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)))), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - implicitly[Ordering[Int]], null /**/ , None, List())), - null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)))), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))) + val pipe = WithDescriptionTypedPipe( + ForceToDisk( + WithDescriptionTypedPipe( + Mapped( + ReduceStepPipe( + ValueSortedReduce[Int, Int, Int]( + implicitly[Ordering[Int]], + WithDescriptionTypedPipe( + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + MergedTypedPipe( + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + CrossValue(SourcePipe(TypedText.tsv[Int]("yumwd")), LiteralValue(2)), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + FilterKeys( + WithDescriptionTypedPipe( + SumByLocalKeys( + WithDescriptionTypedPipe( + FlatMapped(IterablePipe(List(943704575)), null /**/ ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + implicitly[Monoid[Int]] + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + implicitly[Ordering[Int]], + null /**/, + None, + List() + ) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) notMoreSteps(pipe) } @@ -154,45 +231,94 @@ class WritePartitionerTest extends FunSuite with PropertyChecks { import TypedPipe._ val pipe = WithDescriptionTypedPipe( - Fork(WithDescriptionTypedPipe(Mapped(WithDescriptionTypedPipe(CrossValue( - WithDescriptionTypedPipe(TrappedPipe(WithDescriptionTypedPipe(ForceToDisk(WithDescriptionTypedPipe( - 
Mapped(ReduceStepPipe(ValueSortedReduce[Int, Int, Int](implicitly[Ordering[Int]], - WithDescriptionTypedPipe(WithDescriptionTypedPipe(FilterKeys(WithDescriptionTypedPipe(FlatMapValues( - WithDescriptionTypedPipe(Mapped(IterablePipe(List(1533743286, 0, -1, 0, 1637692751)), - null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - implicitly[Ordering[Int]], null /**/ , Some(2), List())), - null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)))), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - TypedText.tsv[Int]("mndlSTwuEmwqhJk7ac"), - TupleConverter.Single(implicitly[TupleGetter[Int]])), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - LiteralValue(2)), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))), - null /**/ ), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)))), - List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true))) + Fork( + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + CrossValue( + WithDescriptionTypedPipe( + TrappedPipe( + WithDescriptionTypedPipe( + ForceToDisk( + WithDescriptionTypedPipe( + Mapped( + ReduceStepPipe( + ValueSortedReduce[Int, Int, Int]( + implicitly[Ordering[Int]], + WithDescriptionTypedPipe( + WithDescriptionTypedPipe( + FilterKeys( + WithDescriptionTypedPipe( + FlatMapValues( + WithDescriptionTypedPipe( + Mapped( + IterablePipe(List(1533743286, 0, -1, 0, 1637692751)), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + implicitly[Ordering[Int]], + null /**/, + Some(2), + List() + ) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + TypedText.tsv[Int]("mndlSTwuEmwqhJk7ac"), + TupleConverter.Single(implicitly[TupleGetter[Int]]) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + LiteralValue(2) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) notMoreSteps(pipe) } - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 100) + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 100) forAll(TypedPipeGen.genWithFakeSources)(notMoreSteps(_)) } test("breaking things up does not change the results") { - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 100) + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 100) def partitioningDoesNotChange[T: Ordering](init: TypedPipe[T]) = { val phases = CascadingBackend.defaultOptimizationRules(Config.empty) // We don't want any further optimization on this job val ex: Execution[TypedPipe[T]] = WritePartitioner.partitionSingle(phases, init) - 
assert(ex.flatMap(TypedPipeDiff.diff[T](init, _).toIterableExecution) - .waitFor(Config.empty, Local(true)).get.isEmpty) + assert( + ex.flatMap(TypedPipeDiff.diff[T](init, _).toIterableExecution) + .waitFor(Config.empty, Local(true)) + .get + .isEmpty + ) } forAll(TypedPipeGen.genWithIterableSources)(partitioningDoesNotChange(_)) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/memory_backend/MemoryTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/memory_backend/MemoryTest.scala index ffdbb00f61..6e573e4786 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/typed/memory_backend/MemoryTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/memory_backend/MemoryTest.scala @@ -2,7 +2,7 @@ package com.twitter.scalding.typed.memory_backend import org.scalatest.FunSuite import org.scalatest.prop.PropertyChecks -import com.twitter.scalding.{ TypedPipe, Execution, Config, Local } +import com.twitter.scalding.{Config, Execution, Local, TypedPipe} import com.twitter.scalding.typed.TypedPipeGen class MemoryTest extends FunSuite with PropertyChecks { @@ -38,7 +38,8 @@ class MemoryTest extends FunSuite with PropertyChecks { } test("basic word count") { - val x = TypedPipe.from(0 until 100) + val x = TypedPipe + .from(0 until 100) .groupBy(_ % 2) .sum .toIterableExecution @@ -47,9 +48,10 @@ class MemoryTest extends FunSuite with PropertyChecks { } test("mapGroup works") { - val x = TypedPipe.from(0 until 100) + val x = TypedPipe + .from(0 until 100) .groupBy(_ % 2) - .mapGroup { (k, vs) => Iterator.single(vs.foldLeft(k)(_ + _)) } + .mapGroup((k, vs) => Iterator.single(vs.foldLeft(k)(_ + _))) .toIterableExecution mapMatch(x) @@ -57,30 +59,32 @@ class MemoryTest extends FunSuite with PropertyChecks { test("hashJoin works") { val input = TypedPipe.from(0 until 100) - val left = input.map { k => (k, k % 2) } - val right = input.map { k => (k, k % 3) } + val left = input.map(k => (k, k % 2)) + val right = input.map(k => (k, k % 3)) mapMatch(left.hashJoin(right).toIterableExecution) } test("join works") { val input = TypedPipe.from(0 until 100) - val left = input.map { k => (k, k % 2) } - val right = input.map { k => (k, k % 3) } + val left = input.map(k => (k, k % 2)) + val right = input.map(k => (k, k % 3)) mapMatch(left.join(right).toIterableExecution) } test("scalding memory mode matches cascading local mode") { import TypedPipeGen.genWithIterableSources - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 50) - forAll(genWithIterableSources) { pipe => sortMatch(pipe.toIterableExecution) } + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 50) + forAll(genWithIterableSources)(pipe => sortMatch(pipe.toIterableExecution)) } test("writing gives the same result as toIterableExecution") { import TypedPipeGen.genWithIterableSources // we can afford to test a lot more in just memory mode because it is faster than cascading - implicit val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 500) + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) forAll(genWithIterableSources) { pipe => val sink = new MemorySink.LocalVar[Int] @@ -99,7 +103,7 @@ class MemoryTest extends FunSuite with PropertyChecks { test("using sources work") { val srctag = SourceT[Int]("some_source") - val job = TypedPipe.from(srctag).map { i => (i % 31, i) 
}.sumByKey.toIterableExecution + val job = TypedPipe.from(srctag).map(i => (i % 31, i)).sumByKey.toIterableExecution val jobRes = job.waitFor(Config.empty, MemoryMode.empty.addSourceIterable(srctag, (0 to 10000))) diff --git a/scalding-date/src/main/scala/com/twitter/scalding/AbsoluteDuration.scala b/scalding-date/src/main/scala/com/twitter/scalding/AbsoluteDuration.scala index 23fc883f3a..63902137dd 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/AbsoluteDuration.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/AbsoluteDuration.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.util.Calendar @@ -32,10 +32,8 @@ object AbsoluteDuration extends java.io.Serializable { val SEC_IN_MS = 1000 val MIN_IN_MS = 60 * SEC_IN_MS val HOUR_IN_MS = 60 * MIN_IN_MS - val UTC_UNITS = List[TimeCons]((Hours, HOUR_IN_MS), - (Minutes, MIN_IN_MS), - (Seconds, SEC_IN_MS), - (Millisecs, 1)).reverse + val UTC_UNITS = + List[TimeCons]((Hours, HOUR_IN_MS), (Minutes, MIN_IN_MS), (Seconds, SEC_IN_MS), (Millisecs, 1)).reverse def exact(fnms: TimeCons): (Long) => Option[AbsoluteDuration] = { ms: Long => if (ms % fnms._2 == 0) { @@ -51,14 +49,17 @@ object AbsoluteDuration extends java.io.Serializable { def fromMillisecs(diffInMs: Long): AbsoluteDuration = fromMillisecs(diffInMs, UTC_UNITS, Nil) @tailrec - private def fromMillisecs(diffInMs: Long, units: List[TimeCons], acc: List[AbsoluteDuration]): AbsoluteDuration = { - + private def fromMillisecs( + diffInMs: Long, + units: List[TimeCons], + acc: List[AbsoluteDuration] + ): AbsoluteDuration = if (diffInMs == 0L) { //We are done: acc match { - case Nil => units.head._1(0) + case Nil => units.head._1(0) case (h :: Nil) => h - case _ => AbsoluteDurationList(acc) + case _ => AbsoluteDurationList(acc) } } else { units match { @@ -79,9 +80,11 @@ object AbsoluteDuration extends java.io.Serializable { // We can't go any further, try to jam the rest into this unit: val (fn, cnt) = tc val theseUnits = diffInMs / cnt - require((theseUnits <= Int.MaxValue) && (theseUnits >= Int.MinValue), + require( + (theseUnits <= Int.MaxValue) && (theseUnits >= Int.MinValue), "diff not representable in an Int: " + theseUnits + AbsoluteDurationList(acc) + - "total: " + (diffInMs + AbsoluteDurationList(acc).toMillisecs)) + "total: " + (diffInMs + AbsoluteDurationList(acc).toMillisecs) + ) val thisPart = fn(theseUnits.toInt) if (acc.isEmpty) thisPart @@ -94,7 +97,6 @@ object AbsoluteDuration extends java.io.Serializable { } } } - } } sealed trait AbsoluteDuration extends Duration with Ordered[AbsoluteDuration] { @@ -119,50 +121,48 @@ sealed trait AbsoluteDuration extends Duration with Ordered[AbsoluteDuration] { AbsoluteDuration.fromMillisecs(this.toMillisecs * that) /** - * Returns the number of times that divides this and the remainder - * The law is: that * result_.1 + result._2 == this + * Returns the number of times that divides this and the remainder The law is: that * result_.1 + result._2 + * == this */ def /(that: AbsoluteDuration): (Long, AbsoluteDuration) = { - val divs = (this.toMillisecs / that.toMillisecs) + val divs = this.toMillisecs / that.toMillisecs val rem = this - (that * divs) (divs, rem) } - override def equals(eq: Any): Boolean = { + override def equals(eq: Any): Boolean = eq match { case 
eqo: AbsoluteDuration => (eqo.toMillisecs) == this.toMillisecs - case _ => false + case _ => false } - } override def hashCode: Int = toMillisecs.hashCode } -final case class Millisecs(cnt: Int) extends Duration(Calendar.MILLISECOND, cnt, DateOps.UTC) - with AbsoluteDuration { +final case class Millisecs(cnt: Int) + extends Duration(Calendar.MILLISECOND, cnt, DateOps.UTC) + with AbsoluteDuration { override def toSeconds = cnt / 1000.0 override def toMillisecs = cnt.toLong } -final case class Seconds(cnt: Int) extends Duration(Calendar.SECOND, cnt, DateOps.UTC) - with AbsoluteDuration { +final case class Seconds(cnt: Int) extends Duration(Calendar.SECOND, cnt, DateOps.UTC) with AbsoluteDuration { override def toSeconds = cnt.toDouble override def toMillisecs = (cnt.toLong) * 1000L } -final case class Minutes(cnt: Int) extends Duration(Calendar.MINUTE, cnt, DateOps.UTC) - with AbsoluteDuration { +final case class Minutes(cnt: Int) extends Duration(Calendar.MINUTE, cnt, DateOps.UTC) with AbsoluteDuration { override def toSeconds = cnt * 60.0 override def toMillisecs = cnt.toLong * 60L * 1000L } -final case class Hours(cnt: Int) extends Duration(Calendar.HOUR, cnt, DateOps.UTC) - with AbsoluteDuration { +final case class Hours(cnt: Int) extends Duration(Calendar.HOUR, cnt, DateOps.UTC) with AbsoluteDuration { override def toSeconds = cnt * 60.0 * 60.0 override def toMillisecs = cnt.toLong * 60L * 60L * 1000L } final case class AbsoluteDurationList(parts: List[AbsoluteDuration]) - extends AbstractDurationList[AbsoluteDuration](parts) with AbsoluteDuration { - override def toSeconds = parts.map{ _.toSeconds }.sum - override def toMillisecs: Long = parts.map{ _.toMillisecs }.sum + extends AbstractDurationList[AbsoluteDuration](parts) + with AbsoluteDuration { + override def toSeconds = parts.map(_.toSeconds).sum + override def toMillisecs: Long = parts.map(_.toMillisecs).sum } diff --git a/scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala b/scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala index a0d43ab272..ff3ada1f0f 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala @@ -1,24 +1,20 @@ package com.twitter.scalding -import java.util.{ Date, Calendar } +import java.util.{Calendar, Date} import scala.annotation.tailrec /** - * */ object CalendarOps { def truncate(date: Calendar, field: Int): Calendar = { @tailrec - def truncateIter(cal: Calendar, field: Int, currentField: Int): Calendar = { + def truncateIter(cal: Calendar, field: Int, currentField: Int): Calendar = if (currentField > field) { currentField match { case Calendar.DAY_OF_MONTH => cal.set(currentField, 1) - case Calendar.DAY_OF_WEEK_IN_MONTH | - Calendar.DAY_OF_WEEK | - Calendar.DAY_OF_YEAR | - Calendar.WEEK_OF_MONTH | - Calendar.WEEK_OF_YEAR | - Calendar.HOUR_OF_DAY => () // Skip + case Calendar.DAY_OF_WEEK_IN_MONTH | Calendar.DAY_OF_WEEK | Calendar.DAY_OF_YEAR | + Calendar.WEEK_OF_MONTH | Calendar.WEEK_OF_YEAR | Calendar.HOUR_OF_DAY => + () // Skip case _ => cal.set(currentField, 0) } @@ -26,7 +22,6 @@ object CalendarOps { } else { cal } - } val cloned = date.clone().asInstanceOf[Calendar] diff --git a/scalding-date/src/main/scala/com/twitter/scalding/DateOps.scala b/scalding-date/src/main/scala/com/twitter/scalding/DateOps.scala index 88ee18c489..ae0906deb4 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/DateOps.scala +++ 
b/scalding-date/src/main/scala/com/twitter/scalding/DateOps.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.util.TimeZone @@ -47,19 +47,37 @@ object DateOps extends java.io.Serializable { private val emptyBegin = """^\s*""" private val emptyEnd = """\s*$""" - case object DATE_WITHOUT_DASH extends Format(DateOps.DATE_WITHOUT_DASH, new Regex(emptyBegin + """\d{8}""" + emptyEnd)) + case object DATE_WITHOUT_DASH + extends Format(DateOps.DATE_WITHOUT_DASH, new Regex(emptyBegin + """\d{8}""" + emptyEnd)) case object DATE_WITH_DASH extends Format(DateOps.DATE_WITH_DASH, new Regex(emptyBegin + date + emptyEnd)) - case object DATEHOUR_WITHOUT_DASH extends Format(DateOps.DATEHOUR_WITHOUT_DASH, new Regex(emptyBegin + """\d{10}""" + emptyEnd)) - case object DATEHOUR_WITH_DASH extends Format(DateOps.DATEHOUR_WITH_DASH, new Regex(emptyBegin + date + sep + """\d\d""" + emptyEnd)) - case object DATETIME_WITHOUT_DASH extends Format(DateOps.DATETIME_WITHOUT_DASH, new Regex(emptyBegin + """\d{12}""" + emptyEnd)) - case object DATETIME_WITH_DASH extends Format(DateOps.DATETIME_WITH_DASH, new Regex(emptyBegin + date + sep + """\d\d:\d\d""" + emptyEnd)) - case object DATETIME_HMS_WITHOUT_DASH extends Format(DateOps.DATETIME_HMS_WITHOUT_DASH, new Regex(emptyBegin + """\d{14}""" + emptyEnd)) - case object DATETIME_HMS_WITH_DASH extends Format(DateOps.DATETIME_HMS_WITH_DASH, new Regex(emptyBegin + date + sep + """\d\d:\d\d:\d\d""" + emptyEnd)) - case object DATETIME_HMSM_WITH_DASH extends Format(DateOps.DATETIME_HMSM_WITH_DASH, new Regex(emptyBegin + date + sep + """\d\d:\d\d:\d\d\.\d{1,3}""" + emptyEnd)) + case object DATEHOUR_WITHOUT_DASH + extends Format(DateOps.DATEHOUR_WITHOUT_DASH, new Regex(emptyBegin + """\d{10}""" + emptyEnd)) + case object DATEHOUR_WITH_DASH + extends Format(DateOps.DATEHOUR_WITH_DASH, new Regex(emptyBegin + date + sep + """\d\d""" + emptyEnd)) + case object DATETIME_WITHOUT_DASH + extends Format(DateOps.DATETIME_WITHOUT_DASH, new Regex(emptyBegin + """\d{12}""" + emptyEnd)) + case object DATETIME_WITH_DASH + extends Format( + DateOps.DATETIME_WITH_DASH, + new Regex(emptyBegin + date + sep + """\d\d:\d\d""" + emptyEnd) + ) + case object DATETIME_HMS_WITHOUT_DASH + extends Format(DateOps.DATETIME_HMS_WITHOUT_DASH, new Regex(emptyBegin + """\d{14}""" + emptyEnd)) + case object DATETIME_HMS_WITH_DASH + extends Format( + DateOps.DATETIME_HMS_WITH_DASH, + new Regex(emptyBegin + date + sep + """\d\d:\d\d:\d\d""" + emptyEnd) + ) + case object DATETIME_HMSM_WITH_DASH + extends Format( + DateOps.DATETIME_HMSM_WITH_DASH, + new Regex(emptyBegin + date + sep + """\d\d:\d\d:\d\d\.\d{1,3}""" + emptyEnd) + ) } private val prepare: String => String = { (str: String) => - str.replace("T", " ") //We allow T to separate dates and times, just remove it and then validate + str + .replace("T", " ") //We allow T to separate dates and times, just remove it and then validate .replaceAll("[/_]", "-") // Allow for slashes and underscores } @@ -76,9 +94,10 @@ object DateOps extends java.io.Serializable { Format.DATE_WITHOUT_DASH, Format.DATEHOUR_WITHOUT_DASH, Format.DATETIME_WITHOUT_DASH, - Format.DATETIME_HMS_WITHOUT_DASH) + Format.DATETIME_HMS_WITHOUT_DASH + ) - formats.find { _.matches(prepare(s)) } + formats.find(_.matches(prepare(s))) } /** @@ -87,9 
+106,9 @@ object DateOps extends java.io.Serializable { def getFormat(s: String): Option[String] = getFormatObject(s).map(_.pattern) /** - * The DateParser returned here is based on SimpleDateFormat, which is not thread-safe. - * Do not share the result across threads. + * The DateParser returned here is based on SimpleDateFormat, which is not thread-safe. Do not share the + * result across threads. */ def getDateParser(s: String): Option[DateParser] = - getFormat(s).map { fmt => DateParser.from(new SimpleDateFormat(fmt)).contramap(prepare) } + getFormat(s).map(fmt => DateParser.from(new SimpleDateFormat(fmt)).contramap(prepare)) } diff --git a/scalding-date/src/main/scala/com/twitter/scalding/DateParser.scala b/scalding-date/src/main/scala/com/twitter/scalding/DateParser.scala index 5a20c344b6..e8a0cc4385 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/DateParser.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/DateParser.scala @@ -1,4 +1,3 @@ - /* Copyright 2012 Twitter, Inc. @@ -13,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import scala.util.{ Try, Failure } +import scala.util.{Failure, Try} import java.util.TimeZone import java.text.DateFormat @@ -31,37 +30,36 @@ trait DateParser extends java.io.Serializable { self => def rescueWith(second: DateParser): DateParser = new DateParser { - def parse(s: String)(implicit tz: TimeZone) = { - self.parse(s) orElse second.parse(s) - } + def parse(s: String)(implicit tz: TimeZone) = + self.parse(s).orElse(second.parse(s)) } } object DateParser { + /** - * This is scalding's default date parser. You can choose this - * by setting an implicit val DateParser. - * Note that DateParsers using SimpleDateFormat from Java are - * not thread-safe, thus the def here. You can cache the result - * if you are sure + * This is scalding's default date parser. You can choose this by setting an implicit val DateParser. Note + * that DateParsers using SimpleDateFormat from Java are not thread-safe, thus the def here. You can cache + * the result if you are sure */ def default: DateParser = new DateParser { def parse(s: String)(implicit tz: TimeZone) = - DateOps.getDateParser(s) - .map { p => p.parse(s) } + DateOps + .getDateParser(s) + .map(p => p.parse(s)) .getOrElse(Failure(new IllegalArgumentException("Could not find parser for: " + s))) } /** Try these Parsers in order */ def apply(items: Iterable[DateParser]): DateParser = - items.reduce { _.rescueWith(_) } + items.reduce(_.rescueWith(_)) /** Using the type-class pattern */ def parse(s: String)(implicit tz: TimeZone, p: DateParser): Try[RichDate] = p.parse(s)(tz) /** - * Note that DateFormats in Java are generally not thread-safe, - * so you should not share the result here across threads + * Note that DateFormats in Java are generally not thread-safe, so you should not share the result here + * across threads */ implicit def from(df: DateFormat): DateParser = new DateParser { def parse(s: String)(implicit tz: TimeZone) = Try { @@ -83,21 +81,11 @@ object DateParser { /** * //Scalding used to support Natty, this is removed. 
To add it back, use something like this in your code, - * //possibly with: - * //implicit val myParser = DateParser(Seq(DateParser.default, NattyParser)) - * - * object NattyParser extends DateParser { - * def parse(s: String)(implicit tz: TimeZone) = Try { - * val timeParser = new natty.Parser(tz) - * val dateGroups = timeParser.parse(s) - * if (dateGroups.size == 0) { - * throw new IllegalArgumentException("Could not convert string: '" + str + "' into a date.") - * } - * // a DateGroup can have more than one Date (e.g. if you do "Sept. 11th or 12th"), - * // but we're just going to take the first - * val dates = dateGroups.get(0).getDates() - * RichDate(dates.get(0)) - * } - * } + * //possibly with: //implicit val myParser = DateParser(Seq(DateParser.default, NattyParser)) * + * object NattyParser extends DateParser { def parse(s: String)(implicit tz: TimeZone) = Try { val timeParser + * = new natty.Parser(tz) val dateGroups = timeParser.parse(s) if (dateGroups.size == 0) { throw new + * IllegalArgumentException("Could not convert string: '" + str + "' into a date.") } // a DateGroup can have + * more than one Date (e.g. if you do "Sept. 11th or 12th"), // but we're just going to take the first val + * dates = dateGroups.get(0).getDates() RichDate(dates.get(0)) } } */ diff --git a/scalding-date/src/main/scala/com/twitter/scalding/DateRange.scala b/scalding-date/src/main/scala/com/twitter/scalding/DateRange.scala index 7d2c6be508..0360ee99d5 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/DateRange.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/DateRange.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import scala.annotation.tailrec @@ -20,26 +20,26 @@ import scala.annotation.tailrec import java.util.TimeZone object DateRange extends java.io.Serializable { + /** - * Parse this string into a range. - * 2009-10-01 is interpetted as the whole day - * 2009-10-01T12 is interpetted as the whole hour - * 2009-10-01T12:00 is interpetted as a single minute - * 2009-10-01T12:00:02 is interpretted as a single second + * Parse this string into a range. 2009-10-01 is interpetted as the whole day 2009-10-01T12 is interpetted + * as the whole hour 2009-10-01T12:00 is interpetted as a single minute 2009-10-01T12:00:02 is interpretted + * as a single second * - * This is called parse to avoid a collision with implicit conversions - * from String to RichDate + * This is called parse to avoid a collision with implicit conversions from String to RichDate */ def parse(truncatediso8601: String)(implicit tz: TimeZone, dp: DateParser): DateRange = DateRange(RichDate(truncatediso8601), RichDate.upperBound(truncatediso8601)) /** - * We take the upper bound of the second parameter, so we take the latest time that - * could be construed as matching the string passed, e.g. - * ("2011-01-02T04", "2011-01-02T05") includes two full hours (all of 4 and all of 5) + * We take the upper bound of the second parameter, so we take the latest time that could be construed as + * matching the string passed, e.g. 
("2011-01-02T04", "2011-01-02T05") includes two full hours (all of 4 and + * all of 5) */ - def parse(iso8601start: String, - iso8601inclusiveUpper: String)(implicit tz: TimeZone, dp: DateParser): DateRange = { + def parse(iso8601start: String, iso8601inclusiveUpper: String)(implicit + tz: TimeZone, + dp: DateParser + ): DateRange = { val start = RichDate(iso8601start) val end = RichDate.upperBound(iso8601inclusiveUpper) @@ -53,13 +53,13 @@ object DateRange extends java.io.Serializable { */ def parse(fromArgs: Seq[String])(implicit tz: TimeZone, dp: DateParser): DateRange = fromArgs match { case Seq(s, e) => parse(s, e) - case Seq(o) => parse(o) - case x => sys.error("--date must have exactly one or two date[time]s. Got: " + x.toString) + case Seq(o) => parse(o) + case x => sys.error("--date must have exactly one or two date[time]s. Got: " + x.toString) } /** - * DateRanges are inclusive. Use this to create a DateRange that excludes - * the last millisecond from the second argument. + * DateRanges are inclusive. Use this to create a DateRange that excludes the last millisecond from the + * second argument. */ def exclusiveUpper(include: RichDate, exclude: RichDate): DateRange = DateRange(include, exclude - Millisecs(1)) @@ -68,11 +68,11 @@ object DateRange extends java.io.Serializable { /** * represents a closed interval of time. * - * TODO: This should be Range[RichDate, Duration] for an appropriate notion - * of Range + * TODO: This should be Range[RichDate, Duration] for an appropriate notion of Range */ case class DateRange(val start: RichDate, val end: RichDate) { - require(start <= end, s"""The start "${start}" must be before or on the end "${end}".""") + require(start <= end, s"""The start "$start" must be before or on the end "$end".""") + /** * shift this by the given unit */ @@ -81,34 +81,33 @@ case class DateRange(val start: RichDate, val end: RichDate) { def isBefore(d: RichDate) = end < d def isAfter(d: RichDate) = d < start + /** - * make the range wider by delta on each side. Good to catch events which - * might spill over. + * make the range wider by delta on each side. Good to catch events which might spill over. */ def embiggen(delta: Duration) = DateRange(start - delta, end + delta) + /** - * Extend the length by moving the end. We can keep the party going, but we - * can't start it earlier. + * Extend the length by moving the end. We can keep the party going, but we can't start it earlier. */ def extend(delta: Duration) = DateRange(start, end + delta) /** - * Extend the length by moving the start. - * Turns out, we can start the party early. + * Extend the length by moving the start. Turns out, we can start the party early. */ def prepend(delta: Duration) = DateRange(start - delta, end) def contains(point: RichDate) = (start <= point) && (point <= end) + /** * Is the given Date range a (non-strict) subset of the given range */ def contains(dr: DateRange) = start <= dr.start && dr.end <= end /** - * produce a contiguous non-overlapping set of DateRanges - * whose union is equivalent to this. - * If it is passed an integral unit of time (not a DurationList), it stops at boundaries - * which are set by the start timezone, else break at start + k * span. + * produce a contiguous non-overlapping set of DateRanges whose union is equivalent to this. If it is passed + * an integral unit of time (not a DurationList), it stops at boundaries which are set by the start + * timezone, else break at start + k * span. 
*/ def each(span: Duration): Iterable[DateRange] = { //tail recursive method which produces output (as a stack, so it is diff --git a/scalding-date/src/main/scala/com/twitter/scalding/Duration.scala b/scalding-date/src/main/scala/com/twitter/scalding/Duration.scala index c1aa49bfd0..8616a0c542 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/Duration.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/Duration.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.util.Calendar @@ -21,8 +21,8 @@ import java.util.TimeZone import scala.annotation.tailrec /** - * Represents millisecond based duration (non-calendar based): seconds, minutes, hours - * calField should be a java.util.Calendar field + * Represents millisecond based duration (non-calendar based): seconds, minutes, hours calField should be a + * java.util.Calendar field */ object Duration extends java.io.Serializable { // TODO: remove this in 0.9.0 @@ -30,11 +30,15 @@ object Duration extends java.io.Serializable { val MIN_IN_MS: Int = 60 * SEC_IN_MS val HOUR_IN_MS: Int = 60 * MIN_IN_MS val UTC_UNITS: List[(Int => AbsoluteDuration, Int)] = - List[(Int => AbsoluteDuration, Int)]((Hours, HOUR_IN_MS), (Minutes, MIN_IN_MS), (Seconds, SEC_IN_MS), (Millisecs, 1)) + List[(Int => AbsoluteDuration, Int)]( + (Hours, HOUR_IN_MS), + (Minutes, MIN_IN_MS), + (Seconds, SEC_IN_MS), + (Millisecs, 1) + ) } -abstract class Duration(val calField: Int, val count: Int, val tz: TimeZone) - extends java.io.Serializable { +abstract class Duration(val calField: Int, val count: Int, val tz: TimeZone) extends java.io.Serializable { protected def calAdd(that: RichDate, steps: Int) = { val cal = that.toCalendar(tz) cal.setLenient(true) @@ -55,39 +59,32 @@ abstract class Duration(val calField: Int, val count: Int, val tz: TimeZone) } } -case class Days(cnt: Int)(implicit tz: TimeZone) - extends Duration(Calendar.DAY_OF_MONTH, cnt, tz) +case class Days(cnt: Int)(implicit tz: TimeZone) extends Duration(Calendar.DAY_OF_MONTH, cnt, tz) -case class Weeks(cnt: Int)(implicit tz: TimeZone) - extends Duration(Calendar.WEEK_OF_YEAR, cnt, tz) { +case class Weeks(cnt: Int)(implicit tz: TimeZone) extends Duration(Calendar.WEEK_OF_YEAR, cnt, tz) { // The library we are using can't handle week truncation... 
override def floorOf(that: RichDate) = { val step = Days(1) - @tailrec def recentMonday(rd: RichDate): RichDate = { + @tailrec def recentMonday(rd: RichDate): RichDate = rd.toCalendar(tz).get(Calendar.DAY_OF_WEEK) match { case Calendar.MONDAY => rd - case _ => recentMonday(step.subtractFrom(rd)) + case _ => recentMonday(step.subtractFrom(rd)) } - } //Set it to the earliest point in the day: step.floorOf(recentMonday(that)) } } -case class Months(cnt: Int)(implicit tz: TimeZone) - extends Duration(Calendar.MONTH, cnt, tz) +case class Months(cnt: Int)(implicit tz: TimeZone) extends Duration(Calendar.MONTH, cnt, tz) -case class Years(cnt: Int)(implicit tz: TimeZone) - extends Duration(Calendar.YEAR, cnt, tz) +case class Years(cnt: Int)(implicit tz: TimeZone) extends Duration(Calendar.YEAR, cnt, tz) abstract class AbstractDurationList[T <: Duration](parts: List[T]) extends Duration(-1, -1, null) { - override def addTo(that: RichDate) = { - parts.foldLeft(that) { (curdate, next) => next.addTo(curdate) } - } - override def subtractFrom(that: RichDate) = { - parts.foldLeft(that) { (curdate, next) => next.subtractFrom(curdate) } - } + override def addTo(that: RichDate) = + parts.foldLeft(that)((curdate, next) => next.addTo(curdate)) + override def subtractFrom(that: RichDate) = + parts.foldLeft(that)((curdate, next) => next.subtractFrom(curdate)) //This does not make sense for a DurationList interval, pass through override def floorOf(that: RichDate) = that } diff --git a/scalding-date/src/main/scala/com/twitter/scalding/Globifier.scala b/scalding-date/src/main/scala/com/twitter/scalding/Globifier.scala index 0425476ceb..3ea2aee5f7 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/Globifier.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/Globifier.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.util.TimeZone @@ -25,8 +25,13 @@ import java.util.regex.Pattern * current range. This children must be ordered from largest * to smallest in size. */ -class BaseGlobifier(dur: Duration, val sym: String, pattern: String, tz: TimeZone, child: Option[BaseGlobifier]) - extends java.io.Serializable { +class BaseGlobifier( + dur: Duration, + val sym: String, + pattern: String, + tz: TimeZone, + child: Option[BaseGlobifier] +) extends java.io.Serializable { // result <= rd private def greatestLowerBound(rd: RichDate) = dur.floorOf(rd) // rd <= result @@ -38,7 +43,7 @@ class BaseGlobifier(dur: Duration, val sym: String, pattern: String, tz: TimeZon // Generate a lazy list of all children final def children: Stream[BaseGlobifier] = child match { case Some(c) => Stream.cons(c, c.children) - case None => Stream.empty + case None => Stream.empty } final def asteriskChildren(rd: RichDate): String = { @@ -63,9 +68,9 @@ class BaseGlobifier(dur: Duration, val sym: String, pattern: String, tz: TimeZon List(sstr) case Some(c) => /* - * Two cases: we should asterisk our children, or we need - * to recurse. If we fill this entire range, just asterisk, - */ + * Two cases: we should asterisk our children, or we need + * to recurse. 
If we fill this entire range, just asterisk, + */ val bottom = children.last val fillsright = format(leastUpperBound(dr.end)) == format(bottom.leastUpperBound(dr.end)) @@ -119,17 +124,17 @@ class BaseGlobifier(dur: Duration, val sym: String, pattern: String, tz: TimeZon } case class HourGlob(pat: String)(implicit tz: TimeZone) - extends BaseGlobifier(Hours(1), "%1$tH", pat, tz, None) + extends BaseGlobifier(Hours(1), "%1$tH", pat, tz, None) case class DayGlob(pat: String)(implicit tz: TimeZone) - extends BaseGlobifier(Days(1)(tz), "%1$td", pat, tz, Some(HourGlob(pat))) + extends BaseGlobifier(Days(1)(tz), "%1$td", pat, tz, Some(HourGlob(pat))) case class MonthGlob(pat: String)(implicit tz: TimeZone) - extends BaseGlobifier(Months(1)(tz), "%1$tm", pat, tz, Some(DayGlob(pat))) + extends BaseGlobifier(Months(1)(tz), "%1$tm", pat, tz, Some(DayGlob(pat))) /* * This is the outermost globifier and should generally be used to globify */ case class Globifier(pat: String)(implicit tz: TimeZone) - extends BaseGlobifier(Years(1)(tz), "%1$tY", pat, tz, Some(MonthGlob(pat))) - with java.io.Serializable + extends BaseGlobifier(Years(1)(tz), "%1$tY", pat, tz, Some(MonthGlob(pat))) + with java.io.Serializable diff --git a/scalding-date/src/main/scala/com/twitter/scalding/RichDate.scala b/scalding-date/src/main/scala/com/twitter/scalding/RichDate.scala index 95d978b680..3b73831dd0 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/RichDate.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/RichDate.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.text.SimpleDateFormat @@ -22,9 +22,8 @@ import java.util.Date import java.util.TimeZone /** - * RichDate adds some nice convenience functions to the Java date/calendar classes - * We commonly do Date/Time work in analysis jobs, so having these operations convenient - * is very helpful. + * RichDate adds some nice convenience functions to the Java date/calendar classes We commonly do Date/Time + * work in analysis jobs, so having these operations convenient is very helpful. */ object RichDate { // Implicits to Java types: @@ -37,10 +36,11 @@ object RichDate { implicit def apply(d: Date): RichDate = RichDate(d.getTime) implicit def apply(d: Calendar): RichDate = RichDate(d.getTime) + /** - * Parse the string with one of the value DATE_FORMAT_VALIDATORS in the order listed in DateOps. - * We allow either date, date with time in minutes, date with time down to seconds. - * The separator between date and time can be a space or "T". + * Parse the string with one of the value DATE_FORMAT_VALIDATORS in the order listed in DateOps. We allow + * either date, date with time in minutes, date with time down to seconds. The separator between date and + * time can be a space or "T". 
*/ implicit def apply(str: String)(implicit tz: TimeZone, dp: DateParser): RichDate = dp.parse(str).get @@ -51,16 +51,16 @@ object RichDate { def upperBound(s: String)(implicit tz: TimeZone, dp: DateParser) = { val end = apply(s) (DateOps.getFormatObject(s) match { - case Some(DateOps.Format.DATE_WITHOUT_DASH) => end + Days(1) - case Some(DateOps.Format.DATE_WITH_DASH) => end + Days(1) - case Some(DateOps.Format.DATEHOUR_WITHOUT_DASH) => end + Hours(1) - case Some(DateOps.Format.DATEHOUR_WITH_DASH) => end + Hours(1) - case Some(DateOps.Format.DATETIME_WITHOUT_DASH) => end + Minutes(1) - case Some(DateOps.Format.DATETIME_WITH_DASH) => end + Minutes(1) + case Some(DateOps.Format.DATE_WITHOUT_DASH) => end + Days(1) + case Some(DateOps.Format.DATE_WITH_DASH) => end + Days(1) + case Some(DateOps.Format.DATEHOUR_WITHOUT_DASH) => end + Hours(1) + case Some(DateOps.Format.DATEHOUR_WITH_DASH) => end + Hours(1) + case Some(DateOps.Format.DATETIME_WITHOUT_DASH) => end + Minutes(1) + case Some(DateOps.Format.DATETIME_WITH_DASH) => end + Minutes(1) case Some(DateOps.Format.DATETIME_HMS_WITHOUT_DASH) => end + Seconds(1) - case Some(DateOps.Format.DATETIME_HMS_WITH_DASH) => end + Seconds(1) - case Some(DateOps.Format.DATETIME_HMSM_WITH_DASH) => end + Millisecs(2) - case None => Days(1).floorOf(end + Days(1)) + case Some(DateOps.Format.DATETIME_HMS_WITH_DASH) => end + Seconds(1) + case Some(DateOps.Format.DATETIME_HMSM_WITH_DASH) => end + Millisecs(2) + case None => Days(1).floorOf(end + Days(1)) }) - Millisecs(1) } @@ -72,8 +72,8 @@ object RichDate { } /** - * A value class wrapper for milliseconds since the epoch. Its tempting to extend - * this with AnyVal but this causes problem with Java code. + * A value class wrapper for milliseconds since the epoch. Its tempting to extend this with AnyVal but this + * causes problem with Java code. */ case class RichDate(val timestamp: Long) extends Ordered[RichDate] { // these are mutable, don't keep them around @@ -91,9 +91,9 @@ case class RichDate(val timestamp: Long) extends Ordered[RichDate] { //True of the other is a RichDate with equal value, or a Date equal to value override def equals(that: Any) = that match { - case d: Date => d.getTime == timestamp + case d: Date => d.getTime == timestamp case RichDate(ts) => ts == timestamp - case _ => false + case _ => false } def before(that: RichDate): Boolean = compare(that) < 0 @@ -105,8 +105,8 @@ case class RichDate(val timestamp: Long) extends Ordered[RichDate] { def format(pattern: String)(implicit tz: TimeZone): String = String.format(pattern, toCalendar(tz)) /** - * Make sure the hashCode is the same as Date for the (questionable) choice - * to make them equal. This is the same as what java does (and only sane thing). + * Make sure the hashCode is the same as Date for the (questionable) choice to make them equal. This is the + * same as what java does (and only sane thing). 
*/ override def hashCode = (timestamp.toInt) ^ ((timestamp >> 32).toInt) @@ -128,4 +128,3 @@ case class RichDate(val timestamp: Long) extends Ordered[RichDate] { sdfmt.format(cal.getTime) } } - diff --git a/scalding-date/src/test/scala/com/twitter/scalding/CalendarOpsTest.scala b/scalding-date/src/test/scala/com/twitter/scalding/CalendarOpsTest.scala index 606a8a71f1..a929071ca7 100644 --- a/scalding-date/src/test/scala/com/twitter/scalding/CalendarOpsTest.scala +++ b/scalding-date/src/test/scala/com/twitter/scalding/CalendarOpsTest.scala @@ -18,70 +18,102 @@ class CalendarOpsTest extends WordSpec { } "truncate to a year" in { - assert(dateParser.parse("January 1, 2002") === - CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.YEAR)) - - assert(dateParser.parse("January 1, 2001") === - CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.YEAR)) + assert( + dateParser.parse("January 1, 2002") === + CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.YEAR) + ) + + assert( + dateParser.parse("January 1, 2001") === + CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.YEAR) + ) } "truncate to a month" in { - assert(dateParser.parse("February 1, 2002") === - CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.MONTH)) - - assert(dateParser.parse("November 1, 2001") === - CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.MONTH)) + assert( + dateParser.parse("February 1, 2002") === + CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.MONTH) + ) + + assert( + dateParser.parse("November 1, 2001") === + CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.MONTH) + ) } "truncate to a date" in { - assert(dateParser.parse("February 12, 2002") == - CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.DATE)) - - assert(dateParser.parse("November 18, 2001") === - CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.DATE)) + assert( + dateParser.parse("February 12, 2002") == + CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.DATE) + ) + + assert( + dateParser.parse("November 18, 2001") === + CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.DATE) + ) } "truncate to a minute" in { - assert(dateTimeParser.parse("February 12, 2002 12:34:00.000") === - CalendarOps.truncate(dateTimeParser.parse("February 12, 2002 12:34:56.789"), Calendar.MINUTE)) - - assert(dateTimeParser.parse("November 18, 2001 1:23:00.000") === - CalendarOps.truncate(dateTimeParser.parse("November 18, 2001 1:23:11.321"), Calendar.MINUTE)) + assert( + dateTimeParser.parse("February 12, 2002 12:34:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 12, 2002 12:34:56.789"), Calendar.MINUTE) + ) + + assert( + dateTimeParser.parse("November 18, 2001 1:23:00.000") === + CalendarOps.truncate(dateTimeParser.parse("November 18, 2001 1:23:11.321"), Calendar.MINUTE) + ) } "truncate to a second" in { - assert(dateTimeParser.parse("February 12, 2002 12:34:56.000") === - CalendarOps.truncate(dateTimeParser.parse("February 12, 2002 12:34:56.789"), Calendar.SECOND)) - - assert(dateTimeParser.parse("November 18, 2001 1:23:11.000") === - CalendarOps.truncate(dateTimeParser.parse("November 18, 2001 1:23:11.321"), Calendar.SECOND)) + assert( + dateTimeParser.parse("February 12, 2002 
12:34:56.000") === + CalendarOps.truncate(dateTimeParser.parse("February 12, 2002 12:34:56.789"), Calendar.SECOND) + ) + + assert( + dateTimeParser.parse("November 18, 2001 1:23:11.000") === + CalendarOps.truncate(dateTimeParser.parse("November 18, 2001 1:23:11.321"), Calendar.SECOND) + ) } "truncate to AM" in { - assert(dateTimeParser.parse("February 3, 2002 00:00:00.000") === - CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 01:10:00.000"), Calendar.AM_PM)) - - assert(dateTimeParser.parse("February 3, 2002 00:00:00.000") === - CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 11:10:00.000"), Calendar.AM_PM)) + assert( + dateTimeParser.parse("February 3, 2002 00:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 01:10:00.000"), Calendar.AM_PM) + ) + + assert( + dateTimeParser.parse("February 3, 2002 00:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 11:10:00.000"), Calendar.AM_PM) + ) } "truncate to PM" in { - assert(dateTimeParser.parse("February 3, 2002 12:00:00.000") === - CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 13:10:00.000"), Calendar.AM_PM)) - - assert(dateTimeParser.parse("February 3, 2002 12:00:00.000") === - CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 19:10:00.000"), Calendar.AM_PM)) + assert( + dateTimeParser.parse("February 3, 2002 12:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 13:10:00.000"), Calendar.AM_PM) + ) + + assert( + dateTimeParser.parse("February 3, 2002 12:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 19:10:00.000"), Calendar.AM_PM) + ) } "truncate respects DST" in { TimeZone.setDefault(TimeZone.getTimeZone("MET")) dateTimeParser.setTimeZone(TimeZone.getTimeZone("MET")) - assert(dateTimeParser.parse("March 30, 2003 00:00:00.000") === - CalendarOps.truncate(dateTimeParser.parse("March 30, 2003 05:30:45.000"), Calendar.DATE)) + assert( + dateTimeParser.parse("March 30, 2003 00:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("March 30, 2003 05:30:45.000"), Calendar.DATE) + ) - assert(dateTimeParser.parse("October 26, 2003 00:00:00.000") === - CalendarOps.truncate(dateTimeParser.parse("October 26, 2003 05:30:45.000"), Calendar.DATE)) + assert( + dateTimeParser.parse("October 26, 2003 00:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("October 26, 2003 05:30:45.000"), Calendar.DATE) + ) } } } diff --git a/scalding-date/src/test/scala/com/twitter/scalding/DateProperties.scala b/scalding-date/src/test/scala/com/twitter/scalding/DateProperties.scala index 3380d6f42d..72082504f0 100644 --- a/scalding-date/src/test/scala/com/twitter/scalding/DateProperties.scala +++ b/scalding-date/src/test/scala/com/twitter/scalding/DateProperties.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
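The truncation tests above pin down the behaviour of CalendarOps.truncate; a small sketch of a direct call, assuming (as the assertions imply) that it takes a java.util.Date plus a java.util.Calendar field constant and returns the truncated Date:

import java.text.SimpleDateFormat
import java.util.{Calendar, Locale}
import com.twitter.scalding.CalendarOps

object TruncateSketch {
  // Hypothetical formatter matching the literals used in the tests.
  val fmt = new SimpleDateFormat("MMMM d, yyyy H:mm:ss.SSS", Locale.US)
  val d = fmt.parse("February 12, 2002 12:34:56.789")

  val startOfDay   = CalendarOps.truncate(d, Calendar.DATE)  // February 12, 2002 00:00:00.000
  val startOfMonth = CalendarOps.truncate(d, Calendar.MONTH) // February 1, 2002 00:00:00.000
}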
-*/ + */ package com.twitter.scalding import org.scalacheck.Arbitrary @@ -28,25 +28,24 @@ object DateProperties extends Properties("Date Properties") { implicit def dateParser: DateParser = DateParser.default implicit val durationArb: Arbitrary[Duration] = - Arbitrary { choose(0, 10000).map { Millisecs(_) } } + Arbitrary(choose(0, 10000).map(Millisecs(_))) implicit val richDateArb: Arbitrary[RichDate] = Arbitrary { for (v <- choose(0L, 1L << 32)) yield RichDate(v) } implicit val dateRangeArb: Arbitrary[DateRange] = Arbitrary { - for ( - v1 <- choose(0L, 1L << 33); + for { + v1 <- choose(0L, 1L << 33) v2 <- choose(v1, 1L << 33) - ) yield DateRange(RichDate(v1), RichDate(v2)) + } yield DateRange(RichDate(v1), RichDate(v2)) } implicit val absdur: Arbitrary[AbsoluteDuration] = Arbitrary { - implicitly[Arbitrary[Long]] - .arbitrary + implicitly[Arbitrary[Long]].arbitrary // Ignore Longs that are too big to fit, and make sure we can add any random 3 together // Long.MaxValue / 1200 ms is the biggest that will fit, we divide by 3 to make sure // we can add three together in tests - .map { ms => fromMillisecs(ms / (1200 * 3)) } + .map(ms => fromMillisecs(ms / (1200 * 3))) } property("Shifting DateRanges breaks containment") = forAll { (dr: DateRange, r: Duration) => @@ -56,7 +55,7 @@ object DateProperties extends Properties("Date Properties") { property("Arithmetic works as expected") = forAll { (dr: DateRange, r: Duration) => (dr + r) - r == dr && - (dr.start + r) - r == dr.start + (dr.start + r) - r == dr.start } property("fromMillisecs toMillisecs") = forAll { (ad: AbsoluteDuration) => val ms = ad.toMillisecs @@ -67,20 +66,20 @@ object DateProperties extends Properties("Date Properties") { property("Before/After works") = forAll { (dr: DateRange, rd: RichDate) => (asInt(dr.contains(rd)) + asInt(dr.isBefore(rd)) + asInt(dr.isAfter(rd)) == 1) && - (dr.isBefore(dr.end + (dr.end - dr.start))) && - (dr.isAfter(dr.start - (dr.end - dr.start))) + (dr.isBefore(dr.end + (dr.end - dr.start))) && + (dr.isAfter(dr.start - (dr.end - dr.start))) } def divDur(ad: AbsoluteDuration, div: Int) = fromMillisecs(ad.toMillisecs / div) property("each output is contained") = forAll { (dr: DateRange) => val r = divDur(dr.end - dr.start, 10) - dr.each(r).forall { dr.contains(_) } + dr.each(r).forall(dr.contains(_)) } property("Embiggen/extend always contains") = forAll { (dr: DateRange, d: Duration) => dr.embiggen(d).contains(dr) && - dr.extend(d).contains(dr) + dr.extend(d).contains(dr) } property("RichDate subtraction Roundtrip") = forAll { (timestamp0: Long, delta: AbsoluteDuration) => @@ -92,21 +91,20 @@ object DateProperties extends Properties("Date Properties") { Millisecs(ms).toMillisecs.toInt == ms } - property("AbsoluteDuration group properties") = - forAll { (a: AbsoluteDuration, b: AbsoluteDuration, c: AbsoluteDuration) => + property("AbsoluteDuration group properties") = forAll { + (a: AbsoluteDuration, b: AbsoluteDuration, c: AbsoluteDuration) => (a + b) - c == a + (b - c) && - (a + b) + c == a + (b + c) && - (a - a) == fromMillisecs(0) && - (b - b) == fromMillisecs(0) && - (c - c) == fromMillisecs(0) && - { - b.toMillisecs == 0 || { - // Don't divide by zero: - val (d, rem) = (a / b) - a == b * d + rem && (rem.toMillisecs.abs < b.toMillisecs.abs) - } + (a + b) + c == a + (b + c) && + (a - a) == fromMillisecs(0) && + (b - b) == fromMillisecs(0) && + (c - c) == fromMillisecs(0) && { + b.toMillisecs == 0 || { + // Don't divide by zero: + val (d, rem) = a / b + a == b * d + rem && (rem.toMillisecs.abs < 
b.toMillisecs.abs) } - } + } + } property("DateRange.length is correct") = forAll { (dr: DateRange) => dr.start + dr.length - AbsoluteDuration.fromMillisecs(1L) == dr.end @@ -120,14 +118,14 @@ object DateProperties extends Properties("Date Properties") { val upperPred = upper - Millisecs(1) (false == ex.contains(upper)) && - (ex.contains(upperPred) || (lower == upper)) + (ex.contains(upperPred) || (lower == upper)) } - def toRegex(glob: String) = (glob.flatMap { c => if (c == '*') ".*" else c.toString }).r + def toRegex(glob: String) = glob.flatMap { c => if (c == '*') ".*" else c.toString }.r def matches(l: List[String], arg: String): Int = l - .map { toRegex _ } - .map { _.findFirstMatchIn(arg).map { _ => 1 }.getOrElse(0) } + .map(toRegex _) + .map(_.findFirstMatchIn(arg).map(_ => 1).getOrElse(0)) .sum // Make sure globifier always contains: @@ -136,7 +134,8 @@ object DateProperties extends Properties("Date Properties") { property("Globifying produces matching patterns") = forAll { (dr: DateRange) => val globbed = glob.globify(dr) // Brute force - dr.each(Hours(1)).map { _.start.format(pattern)(DateOps.UTC) } - .forall { matches(globbed, _) == 1 } + dr.each(Hours(1)) + .map(_.start.format(pattern)(DateOps.UTC)) + .forall(matches(globbed, _) == 1) } } diff --git a/scalding-date/src/test/scala/com/twitter/scalding/DateTest.scala b/scalding-date/src/test/scala/com/twitter/scalding/DateTest.scala index f3500883a3..da29a63c70 100644 --- a/scalding-date/src/test/scala/com/twitter/scalding/DateTest.scala +++ b/scalding-date/src/test/scala/com/twitter/scalding/DateTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import org.scalatest.WordSpec @@ -70,8 +70,8 @@ class DateTest extends WordSpec { } "throw an exception when trying to parse illegal strings" in { // Natty is *really* generous about what it accepts - intercept[IllegalArgumentException] { RichDate("jhbjhvhjv") } - intercept[IllegalArgumentException] { RichDate("99-99-99") } + intercept[IllegalArgumentException](RichDate("jhbjhvhjv")) + intercept[IllegalArgumentException](RichDate("99-99-99")) } "be able to deal with arithmetic operations with whitespace" in { val rd1: RichDate = RichDate("2010-10-02") + Seconds(1) @@ -157,12 +157,36 @@ class DateTest extends WordSpec { } } "correctly calculate upperBound" in { - assert(Seconds(1).floorOf(RichDate.upperBound("20101001")) === Seconds(1).floorOf(RichDate("2010-10-01 23:59:59"))) - assert(Seconds(1).floorOf(RichDate.upperBound("2010100114")) === Seconds(1).floorOf(RichDate("2010-10-01 14:59:59"))) - assert(Seconds(1).floorOf(RichDate.upperBound("201010011415")) === Seconds(1).floorOf(RichDate("2010-10-01 14:15:59"))) - assert(Seconds(1).floorOf(RichDate.upperBound("2010-10-01")) === Seconds(1).floorOf(RichDate("2010-10-01 23:59:59"))) - assert(Seconds(1).floorOf(RichDate.upperBound("2010-10-01 14")) === Seconds(1).floorOf(RichDate("2010-10-01 14:59:59"))) - assert(Seconds(1).floorOf(RichDate.upperBound("2010-10-01 14:15")) === Seconds(1).floorOf(RichDate("2010-10-01 14:15:59"))) + assert( + Seconds(1).floorOf(RichDate.upperBound("20101001")) === Seconds(1).floorOf( + RichDate("2010-10-01 23:59:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("2010100114")) === Seconds(1).floorOf( + RichDate("2010-10-01 14:59:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("201010011415")) === Seconds(1).floorOf( + RichDate("2010-10-01 14:15:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("2010-10-01")) === Seconds(1).floorOf( + RichDate("2010-10-01 23:59:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("2010-10-01 14")) === Seconds(1).floorOf( + RichDate("2010-10-01 14:59:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("2010-10-01 14:15")) === Seconds(1).floorOf( + RichDate("2010-10-01 14:15:59") + ) + ) } "Have an implicit Ordering" in { implicitly[Ordering[RichDate]] @@ -171,9 +195,8 @@ class DateTest extends WordSpec { } "A DateRange" should { "correctly iterate on each duration" in { - def rangeContainTest(d1: DateRange, dur: Duration) = { + def rangeContainTest(d1: DateRange, dur: Duration) = assert(d1.each(dur).forall((d1r: DateRange) => d1.contains(d1r))) - } rangeContainTest(DateRange("2010-10-01", "2010-10-13"), Weeks(1)) rangeContainTest(DateRange("2010-10-01", "2010-10-13"), Weeks(2)) rangeContainTest(DateRange("2010-10-01", "2010-10-13"), Days(1)) @@ -190,14 +213,15 @@ class DateTest extends WordSpec { assert(DateRange("2010-10-01", RichDate.upperBound("2010-10-01")).each(Hours(1)).size === 24) assert(DateRange("2010-10-31", RichDate.upperBound("2010-10-31")).each(Hours(1)).size === 24) assert(DateRange("2010-10-31", RichDate.upperBound("2010-10-31")).each(Days(1)).size === 1) - assert(DateRange("2010-10-31 12:00", RichDate.upperBound("2010-10-31 13")).each(Minutes(1)).size === 120) + assert( + DateRange("2010-10-31 12:00", RichDate.upperBound("2010-10-31 13")).each(Minutes(1)).size === 120 + ) } "have each partition disjoint and adjacent" in { def eachIsDisjoint(d: DateRange, dur: Duration): Unit = { val dl = d.each(dur) - assert(dl.zip(dl.tail).forall { - case (da, db) => - 
da.isBefore(db.start) && db.isAfter(da.end) && ((da.end + Millisecs(1)) == db.start) + assert(dl.zip(dl.tail).forall { case (da, db) => + da.isBefore(db.start) && db.isAfter(da.end) && ((da.end + Millisecs(1)) == db.start) }) } eachIsDisjoint(DateRange("2010-10-01", "2010-10-03"), Days(1)) @@ -210,20 +234,21 @@ class DateTest extends WordSpec { eachIsDisjoint(DateRange("2010-10-01", "2010-10-03"), Minutes(1)) } "reject an end that is before its start" in { - intercept[IllegalArgumentException] { DateRange("2010-10-02", "2010-10-01") } + intercept[IllegalArgumentException](DateRange("2010-10-02", "2010-10-01")) } "correctly add time in either or both directions" in { assert(DateRange("2010-10-01", "2010-10-02").extend(Days(3)).each(Days(1)).size === 5) assert(DateRange("2010-10-01", "2010-10-02").prepend(Days(3)).each(Days(1)).size === 5) assert(DateRange("2010-10-01", "2010-10-02").embiggen(Days(3)).each(Days(1)).size === 8) - assert(DateRange("2010-10-01", "2010-10-10").extend(Days(1)).prepend(Days(1)) == - DateRange("2010-10-01", "2010-10-10").embiggen(Days(1))) + assert( + DateRange("2010-10-01", "2010-10-10").extend(Days(1)).prepend(Days(1)) == + DateRange("2010-10-01", "2010-10-10").embiggen(Days(1)) + ) } } "Time units" should { - def isSame(d1: Duration, d2: Duration) = { + def isSame(d1: Duration, d2: Duration) = (RichDate("2011-12-01") + d1) == (RichDate("2011-12-01") + d2) - } "have 1000 milliseconds in a sec" in { assert(isSame(Millisecs(1000), Seconds(1))) assert(Seconds(1).toMillisecs === 1000L) @@ -266,29 +291,49 @@ class DateTest extends WordSpec { val t2 = Globifier("/%1$tY/%1$tm/%1$td/") val testcases = - (t1.globify(DateRange("2011-12-01T14", "2011-12-04")), - List("/2011/12/01/14", "/2011/12/01/15", "/2011/12/01/16", "/2011/12/01/17", "/2011/12/01/18", - "/2011/12/01/19", "/2011/12/01/20", "/2011/12/01/21", "/2011/12/01/22", "/2011/12/01/23", - "/2011/12/02/*", "/2011/12/03/*", "/2011/12/04/00")) :: - (t1.globify(DateRange("2011-12-01", "2011-12-01T23:59")), - List("/2011/12/01/*")) :: - (t1.globify(DateRange("2014-06-30T00", "2014-07-01T00")), - List("/2014/06/30/*", "/2014/07/01/00")) :: - (t1.globify(DateRange("2011-12-01T12", "2011-12-01T12:59")), - List("/2011/12/01/12")) :: - (t1.globify(DateRange("2011-12-01T12", "2011-12-01T14")), - List("/2011/12/01/12", "/2011/12/01/13", "/2011/12/01/14")) :: - (t2.globify(DateRange("2011-12-01T14", "2011-12-04")), - List("/2011/12/01/", "/2011/12/02/", "/2011/12/03/", "/2011/12/04/")) :: - (t2.globify(DateRange("2011-12-01", "2011-12-01T23:59")), - List("/2011/12/01/")) :: - (t2.globify(DateRange("2011-12-01T12", "2011-12-01T12:59")), - List("/2011/12/01/")) :: - (t2.globify(DateRange("2011-12-01T12", "2012-01-02T14")), - List("/2011/12/*/", "/2012/01/01/", "/2012/01/02/")) :: - (t2.globify(DateRange("2011-11-01T12", "2011-12-02T14")), - List("/2011/11/*/", "/2011/12/01/", "/2011/12/02/")) :: - Nil + ( + t1.globify(DateRange("2011-12-01T14", "2011-12-04")), + List( + "/2011/12/01/14", + "/2011/12/01/15", + "/2011/12/01/16", + "/2011/12/01/17", + "/2011/12/01/18", + "/2011/12/01/19", + "/2011/12/01/20", + "/2011/12/01/21", + "/2011/12/01/22", + "/2011/12/01/23", + "/2011/12/02/*", + "/2011/12/03/*", + "/2011/12/04/00" + ) + ) :: + (t1.globify(DateRange("2011-12-01", "2011-12-01T23:59")), List("/2011/12/01/*")) :: + ( + t1.globify(DateRange("2014-06-30T00", "2014-07-01T00")), + List("/2014/06/30/*", "/2014/07/01/00") + ) :: + (t1.globify(DateRange("2011-12-01T12", "2011-12-01T12:59")), List("/2011/12/01/12")) :: + ( + 
t1.globify(DateRange("2011-12-01T12", "2011-12-01T14")), + List("/2011/12/01/12", "/2011/12/01/13", "/2011/12/01/14") + ) :: + ( + t2.globify(DateRange("2011-12-01T14", "2011-12-04")), + List("/2011/12/01/", "/2011/12/02/", "/2011/12/03/", "/2011/12/04/") + ) :: + (t2.globify(DateRange("2011-12-01", "2011-12-01T23:59")), List("/2011/12/01/")) :: + (t2.globify(DateRange("2011-12-01T12", "2011-12-01T12:59")), List("/2011/12/01/")) :: + ( + t2.globify(DateRange("2011-12-01T12", "2012-01-02T14")), + List("/2011/12/*/", "/2012/01/01/", "/2012/01/02/") + ) :: + ( + t2.globify(DateRange("2011-11-01T12", "2011-12-02T14")), + List("/2011/11/*/", "/2011/12/01/", "/2011/12/02/") + ) :: + Nil testcases.foreach { case (l, r) => assert(l === r) } } @@ -301,7 +346,8 @@ class DateTest extends WordSpec { DateRange("2011-12-01", "2011-12-01T23:59"), DateRange("2014-06-30T00", "2014-07-01T00"), DateRange("2011-12-01T12", "2011-12-01T12:59"), - DateRange("2011-12-01T12", "2011-12-01T14")) + DateRange("2011-12-01T12", "2011-12-01T14") + ) hourlyTestCases.foreach { dr => val resultantDR = globifierOps.hourlyRtGlobifier(dr) @@ -313,7 +359,8 @@ class DateTest extends WordSpec { DateRange("2011-12-01", "2011-12-01T23:59"), DateRange("2011-12-01T12", "2011-12-01T12:59"), DateRange("2011-12-01T12", "2012-01-02T14"), - DateRange("2011-11-01T12", "2011-12-02T14")) + DateRange("2011-11-01T12", "2011-12-02T14") + ) dailyTestCases.foreach { dr => val resultantDR = globifierOps.dailyRtGlobifier(dr) @@ -322,13 +369,11 @@ class DateTest extends WordSpec { } def eachElementDistinct(dates: List[String]) = dates.size == dates.toSet.size - def globMatchesDate(glob: String)(date: String) = { + def globMatchesDate(glob: String)(date: String) = java.util.regex.Pattern.matches(glob.replaceAll("\\*", "[0-9]*"), date) - } - def bruteForce(pattern: String, dr: DateRange, dur: Duration)(implicit tz: java.util.TimeZone) = { + def bruteForce(pattern: String, dr: DateRange, dur: Duration)(implicit tz: java.util.TimeZone) = dr.each(dur) - .map { (dr: DateRange) => String.format(pattern, dr.start.toCalendar(tz)) } - } + .map((dr: DateRange) => String.format(pattern, dr.start.toCalendar(tz))) "handle random test cases" in { // This kind of implicit is not safe (what does the long mean?) @@ -337,7 +382,7 @@ class DateTest extends WordSpec { val t1 = Globifier(pattern) val r = new java.util.Random() - (0 until 100) foreach { step => + (0 until 100).foreach { step => val start = RichDate("2011-08-03").value.getTime + r.nextInt(Int.MaxValue) val dr = DateRange(start, start + r.nextInt(Int.MaxValue)) val splits = bruteForce(pattern, dr, Hours(1)) @@ -345,8 +390,11 @@ class DateTest extends WordSpec { assert(eachElementDistinct(globed)) //See that each path is matched by exactly one glob: - assert(splits.map { path => globed.filter { globMatchesDate(_)(path) }.size } - .forall { _ == 1 }) + assert( + splits + .map(path => globed.filter(globMatchesDate(_)(path)).size) + .forall(_ == 1) + ) } } } diff --git a/scalding-date/src/test/scala/com/twitter/scalding/GlobifierOps.scala b/scalding-date/src/test/scala/com/twitter/scalding/GlobifierOps.scala index 55fca6a397..0220a55e3d 100644 --- a/scalding-date/src/test/scala/com/twitter/scalding/GlobifierOps.scala +++ b/scalding-date/src/test/scala/com/twitter/scalding/GlobifierOps.scala @@ -12,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
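The glob test table above pairs Globifier patterns with the paths they expand to; the sketch below reproduces one of those cases, with the hourly pattern inferred from the expected paths (an assumption, since t1's definition sits outside this hunk):

import java.util.TimeZone
import com.twitter.scalding._

object GlobifierSketch {
  implicit val tz: TimeZone = TimeZone.getTimeZone("UTC") // assumed, as in the tests
  implicit val dp: DateParser = DateParser.default

  val hourly = Globifier("/%1$tY/%1$tm/%1$td/%1$tH")

  // Per the test table: List("/2011/12/01/12", "/2011/12/01/13", "/2011/12/01/14")
  val paths = hourly.globify(DateRange(RichDate("2011-12-01T12"), RichDate("2011-12-01T14")))
}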
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.util.TimeZone -import scala.util.{ Try, Success, Failure } +import scala.util.{Failure, Success, Try} case class GlobifierOps(implicit tz: TimeZone, dp: DateParser) { val yearMonthDayHourDurations = List(Years(1), Months(1), Days(1), Hours(1)) @@ -42,56 +42,62 @@ case class GlobifierOps(implicit tz: TimeZone, dp: DateParser) { def rtGlobifier(globifier: Globifier, durationList: List[Duration])(inputDr: DateRange): DateRange = { val p = globifier.globify(inputDr) - val drList = p.map { pattern => - val (lists, _, _) = pattern.split("/").tail.foldLeft((List[(Duration, Duration)](), durationList, true)) { - case ((durationLists, mappings, shouldContinue), current) => - val curMapping = mappings.head - if (shouldContinue) { - val tryDuration: Try[Duration] = Try(current.toInt).map { indx => - curMapping match { - case t if mappings.tail == Nil => t - case _ => Millisecs(0) - } - } - - val (duration, doContinue) = tryDuration match { - case Success(d) => (d, true) - case Failure(e) => - val dur: Duration = curMapping match { - case Years(_) => sys.error("Current is " + current + ", parsed as all years?") - case Months(_) => Years(1) - case Days(_) => Months(1) - case Hours(_) => Days(1) + val drList = p + .map { pattern => + val (lists, _, _) = + pattern.split("/").tail.foldLeft((List[(Duration, Duration)](), durationList, true)) { + case ((durationLists, mappings, shouldContinue), current) => + val curMapping = mappings.head + if (shouldContinue) { + val tryDuration: Try[Duration] = Try(current.toInt).map { indx => + curMapping match { + case t if mappings.tail == Nil => t + case _ => Millisecs(0) + } + } + + val (duration, doContinue) = tryDuration match { + case Success(d) => (d, true) + case Failure(e) => + val dur: Duration = curMapping match { + case Years(_) => sys.error("Current is " + current + ", parsed as all years?") + case Months(_) => Years(1) + case Days(_) => Months(1) + case Hours(_) => Days(1) + } + (dur, false) } - (dur, false) - } - - val base: Duration = Try(current.toInt).map { indx => - curMapping match { - case Years(_) => Years(indx - 1970) - case Months(_) => Months(indx - 1) // months and days are 1 offsets not 0 - case Days(_) => Days(indx - 1) - case Hours(_) => Hours(indx) + + val base: Duration = Try(current.toInt) + .map { indx => + curMapping match { + case Years(_) => Years(indx - 1970) + case Months(_) => Months(indx - 1) // months and days are 1 offsets not 0 + case Days(_) => Days(indx - 1) + case Hours(_) => Hours(indx) + } + } + .getOrElse(Hours(0)) + (durationLists :+ (base, duration), mappings.tail, doContinue) + } else { + (durationLists, mappings.tail, false) } - }.getOrElse(Hours(0)) - (durationLists :+ (base, duration), mappings.tail, doContinue) - } else { - (durationLists, mappings.tail, false) } - } - val baseDate = lists.foldLeft(RichDate("1970-01-01T00")) { - case (curDate, (base, _)) => + val baseDate = lists.foldLeft(RichDate("1970-01-01T00")) { case (curDate, (base, _)) => base.addTo(curDate) - } - val endDate = lists.foldLeft(baseDate) { - case (curDate, (_, dur)) => + } + val endDate = lists.foldLeft(baseDate) { case (curDate, (_, dur)) => dur.addTo(curDate) + } + DateRange(baseDate, endDate - Millisecs(1)) } - DateRange(baseDate, endDate - Millisecs(1)) - }.sortBy(_.start) + .sortBy(_.start) def combineDR(existing: DateRange, next: DateRange): DateRange = { - require(existing.end == next.start 
- Millisecs(1), "Not contigious range: \n" + existing + "\n" + next + "...From:\n" + p.mkString(",\n")) + require( + existing.end == next.start - Millisecs(1), + "Not contigious range: \n" + existing + "\n" + next + "...From:\n" + p.mkString(",\n") + ) DateRange(existing.start, next.end) } diff --git a/scalding-date/src/test/scala/com/twitter/scalding/GlobifierProperties.scala b/scalding-date/src/test/scala/com/twitter/scalding/GlobifierProperties.scala index e79f6dc3dc..f976504f5d 100644 --- a/scalding-date/src/test/scala/com/twitter/scalding/GlobifierProperties.scala +++ b/scalding-date/src/test/scala/com/twitter/scalding/GlobifierProperties.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import org.scalacheck.Arbitrary @@ -29,13 +29,13 @@ object GlobifierProperties extends Properties("Globifier Properties") { implicit def tz: TimeZone = TimeZone.getTimeZone("UTC") implicit val hourArb: Arbitrary[Hours] = - Arbitrary { choose(0, 10000).map { Hours(_) } } + Arbitrary(choose(0, 10000).map(Hours(_))) implicit val dayArb: Arbitrary[Days] = - Arbitrary { choose(0, 100).map { Days(_) } } + Arbitrary(choose(0, 100).map(Days(_))) implicit val yearArb: Arbitrary[Years] = - Arbitrary { choose(0, 100).map { Years(_) } } + Arbitrary(choose(0, 100).map(Years(_))) implicit val richDateArb: Arbitrary[RichDate] = Arbitrary { for (v <- choose(0L, 1L << 32)) yield RichDate(v) diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefiner.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefiner.scala index 8063cd0002..adf80b11b6 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefiner.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefiner.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.db @@ -39,33 +39,83 @@ case object NotNullable extends IsNullable("NOT NULL") trait ColumnDefiner { // Some helper methods that we can use to generate column definitions - protected def bigint(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def bigint( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(BIGINT, ColumnName(name), nullable, sizeOpt, defaultValue) - protected def int(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def int( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(INT, ColumnName(name), nullable, sizeOpt, defaultValue) - protected def smallint(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def smallint( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(SMALLINT, ColumnName(name), nullable, sizeOpt, defaultValue) - protected def tinyint(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def tinyint( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(TINYINT, ColumnName(name), nullable, sizeOpt, defaultValue) - protected def boolean(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def boolean( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(BOOLEAN, ColumnName(name), nullable, sizeOpt, defaultValue) - protected def varchar(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def varchar( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(VARCHAR, ColumnName(name), nullable, sizeOpt, defaultValue) - protected def date(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def date( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(DATE, ColumnName(name), nullable, sizeOpt, defaultValue) - protected def datetime(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def datetime( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(DATETIME, ColumnName(name), nullable, sizeOpt, defaultValue) - protected def text(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def text( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(TEXT, ColumnName(name), nullable, sizeOpt, defaultValue) - protected def 
double(name: String, nullable: IsNullable = NotNullable, sizeOpt: Option[Int] = None, defaultValue: Option[String] = None) = + protected def double( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = ColumnDefinition(DOUBLE, ColumnName(name), nullable, sizeOpt, defaultValue) } diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefinition.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefinition.scala index 5dbed942d2..20c738de3d 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefinition.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefinition.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.db @@ -21,11 +21,13 @@ import com.twitter.scalding.TupleConverter case class ColumnName(toStr: String) extends AnyVal case class SqlTypeName(toStr: String) extends AnyVal -case class ColumnDefinition(jdbcType: SqlType, - name: ColumnName, - nullable: IsNullable, - sizeOpt: Option[Int], - defaultValue: Option[String]) extends Serializable +case class ColumnDefinition( + jdbcType: SqlType, + name: ColumnName, + nullable: IsNullable, + sizeOpt: Option[Int], + defaultValue: Option[String] +) extends Serializable trait ColumnDefinitionProvider[T] extends Serializable { def columns: Iterable[ColumnDefinition] diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/DBColumnTransformer.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/DBColumnTransformer.scala index fd3c7dc9d6..1d37e15f25 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/DBColumnTransformer.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/DBColumnTransformer.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.db @@ -20,47 +20,57 @@ package com.twitter.scalding.db case class Definition(toStr: String) extends AnyVal object DBColumnDefinition { - def apply(col: ColumnDefinition): DBColumnDefinition = DBColumnDefinition(col.jdbcType, + def apply(col: ColumnDefinition): DBColumnDefinition = DBColumnDefinition( + col.jdbcType, col.name, col.nullable, col.sizeOpt, col.defaultValue, - SqlTypeName(col.jdbcType.toString)) + SqlTypeName(col.jdbcType.toString) + ) } -case class DBColumnDefinition(jdbcType: SqlType, - name: ColumnName, - nullable: IsNullable, - sizeOpt: Option[Int], - defaultValue: Option[String], - sqlType: SqlTypeName) +case class DBColumnDefinition( + jdbcType: SqlType, + name: ColumnName, + nullable: IsNullable, + sizeOpt: Option[Int], + defaultValue: Option[String], + sqlType: SqlTypeName +) object DBColumnTransformer { - def columnDefnToDefinition(col: ColumnDefinition, - columnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition]): Definition = { + def columnDefnToDefinition( + col: ColumnDefinition, + columnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition] + ): Definition = { val preparedCol = columnMutator(DBColumnDefinition(col)) - val sizeStr = preparedCol.sizeOpt.map { siz => s"($siz)" }.getOrElse("") - val defStr = preparedCol.defaultValue.map { default => s" DEFAULT '${default}' " }.getOrElse(" ") + val sizeStr = preparedCol.sizeOpt.map(siz => s"($siz)").getOrElse("") + val defStr = preparedCol.defaultValue.map(default => s" DEFAULT '$default' ").getOrElse(" ") val sqlType = preparedCol.sqlType.toStr Definition(sqlType + sizeStr + defStr + preparedCol.nullable.toStr) } private def defaultColumnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition] = { - case t @ DBColumnDefinition(BIGINT, _, _, None, _, _) => t.copy(sizeOpt = Some(20)) - case t @ DBColumnDefinition(INT, _, _, None, _, _) => t.copy(sizeOpt = Some(11)) + case t @ DBColumnDefinition(BIGINT, _, _, None, _, _) => t.copy(sizeOpt = Some(20)) + case t @ DBColumnDefinition(INT, _, _, None, _, _) => t.copy(sizeOpt = Some(11)) case t @ DBColumnDefinition(SMALLINT, _, _, None, _, _) => t.copy(sizeOpt = Some(6)) - case t @ DBColumnDefinition(TINYINT, _, _, None, _, _) => t.copy(sizeOpt = Some(6)) - case t @ DBColumnDefinition(VARCHAR, _, _, None, _, _) => t.copy(sizeOpt = Some(255)) - case t => t + case t @ DBColumnDefinition(TINYINT, _, _, None, _, _) => t.copy(sizeOpt = Some(6)) + case t @ DBColumnDefinition(VARCHAR, _, _, None, _, _) => t.copy(sizeOpt = Some(255)) + case t => t } - def mutateColumns(columnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition], - columns: Iterable[ColumnDefinition]): Iterable[DBColumnDefinition] = + def mutateColumns( + columnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition], + columns: Iterable[ColumnDefinition] + ): Iterable[DBColumnDefinition] = columns.map(c => columnMutator.orElse(defaultColumnMutator)(DBColumnDefinition(c))) - def columnDefnsToCreate(columnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition], - columns: Iterable[ColumnDefinition]): Iterable[Definition] = + def columnDefnsToCreate( + columnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition], + columns: Iterable[ColumnDefinition] + ): Iterable[Definition] = columns.map(c => columnDefnToDefinition(c, columnMutator.orElse(defaultColumnMutator))) def columnDefnsToCreate(columns: Iterable[ColumnDefinition]): Iterable[Definition] = diff --git 
a/scalding-db/src/main/scala/com/twitter/scalding/db/DBOptions.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/DBOptions.scala index cf35d99731..140001c6a5 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/DBOptions.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/DBOptions.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.db @@ -41,11 +41,12 @@ case class StringEncoding(toStr: String) extends AnyVal * Pass your DB credentials to this class in a preferred secure way */ case class ConnectionConfig( - connectUrl: ConnectUrl, - userName: UserName, - password: Password, - adapter: Adapter, - encoding: StringEncoding) + connectUrl: ConnectUrl, + userName: UserName, + password: Password, + adapter: Adapter, + encoding: StringEncoding +) case class Database(toStr: String) extends AnyVal diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/DBTypeDescriptor.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/DBTypeDescriptor.scala index 731bc6077f..fc5a0066cf 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/DBTypeDescriptor.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/DBTypeDescriptor.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.db import com.twitter.scalding._ diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/extensions/VerticaExtensions.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/extensions/VerticaExtensions.scala index 02600ab6dd..c1d9904fa1 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/extensions/VerticaExtensions.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/extensions/VerticaExtensions.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
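The signatures in this hunk (ColumnDefinition, Definition, DBColumnTransformer.columnDefnsToCreate, and defaultColumnMutator) are enough for a small hand-rolled sketch; the column names are illustrative only, and in practice the macros derive these definitions from a case class:

import com.twitter.scalding.db._

object ColumnSketch {
  val columns: Iterable[ColumnDefinition] = List(
    // BIGINT with no explicit size: defaultColumnMutator fills in (20).
    ColumnDefinition(BIGINT, ColumnName("user_id"), NotNullable, None, None),
    // VARCHAR with an explicit size is left as given.
    ColumnDefinition(VARCHAR, ColumnName("user_name"), NotNullable, Some(64), None)
  )

  // Renders definitions such as "BIGINT(20) NOT NULL" and "VARCHAR(64) NOT NULL".
  val defs: Iterable[Definition] = DBColumnTransformer.columnDefnsToCreate(columns)
}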
-*/ + */ package com.twitter.scalding.db.extensions @@ -20,11 +20,11 @@ import com.twitter.scalding.db._ object VerticaExtensions { def verticaMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition] = { - case t @ DBColumnDefinition(BIGINT, _, _, None, _, _) => t.copy(sizeOpt = None) - case t @ DBColumnDefinition(INT, _, _, None, _, _) => t.copy(sizeOpt = None) + case t @ DBColumnDefinition(BIGINT, _, _, None, _, _) => t.copy(sizeOpt = None) + case t @ DBColumnDefinition(INT, _, _, None, _, _) => t.copy(sizeOpt = None) case t @ DBColumnDefinition(SMALLINT, _, _, None, _, _) => t.copy(sizeOpt = None) - case t @ DBColumnDefinition(BOOLEAN, _, _, None, _, _) => t.copy(sizeOpt = None) - case t @ DBColumnDefinition(TINYINT, _, _, None, _, _) => t.copy(sizeOpt = None) + case t @ DBColumnDefinition(BOOLEAN, _, _, None, _, _) => t.copy(sizeOpt = None) + case t @ DBColumnDefinition(TINYINT, _, _, None, _, _) => t.copy(sizeOpt = None) case t @ DBColumnDefinition(DOUBLE, _, _, _, _, _) => t.copy(sqlType = SqlTypeName("DOUBLE PRECISION")) } } diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/DBMacro.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/DBMacro.scala index 9e1a8f1f8b..5b055eda11 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/DBMacro.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/DBMacro.scala @@ -2,7 +2,7 @@ package com.twitter.scalding.db.macros import scala.language.experimental.macros import com.twitter.scalding.db.macros.impl._ -import com.twitter.scalding.db.{ ColumnDefinitionProvider, DBTypeDescriptor } +import com.twitter.scalding.db.{ColumnDefinitionProvider, DBTypeDescriptor} // This is the sealed base trait for scala runtime annotiations used by the JDBC macros. // These will read from these macros as a means to annotate fields to make up for the missing diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/ColumnDefinitionProviderImpl.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/ColumnDefinitionProviderImpl.scala index 9e5a4c6fcf..4e1f5bdde9 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/ColumnDefinitionProviderImpl.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/ColumnDefinitionProviderImpl.scala @@ -2,10 +2,10 @@ package com.twitter.scalding.db.macros.impl import scala.annotation.tailrec import scala.reflect.macros.Context -import scala.util.{ Success, Failure } +import scala.util.{Failure, Success} import com.twitter.bijection.macros.impl.IsCaseClassImpl -import com.twitter.scalding.db.{ ColumnDefinition, ColumnDefinitionProvider, ResultSetExtractor } +import com.twitter.scalding.db.{ColumnDefinition, ColumnDefinitionProvider, ResultSetExtractor} import com.twitter.scalding.db.macros.impl.handler._ // Simple wrapper to pass around the string name format of fields @@ -22,7 +22,10 @@ object ColumnDefinitionProviderImpl { val classSym = tpe.typeSymbol val moduleSym = classSym.companionSymbol if (moduleSym == NoSymbol) { - c.abort(c.enclosingPosition, s"No companion for case class ${tpe} available. Possibly a nested class? These do not work with this macro.") + c.abort( + c.enclosingPosition, + s"No companion for case class $tpe available. Possibly a nested class? These do not work with this macro." 
+ ) } // pick the last apply method which (anecdotally) gives us the defaults // set in the case class declaration, not the companion object @@ -30,59 +33,100 @@ object ColumnDefinitionProviderImpl { val apply = applyList.last.asMethod // can handle only default parameters from the first parameter list // because subsequent parameter lists might depend on previous parameters - apply.paramss.head.map(_.asTerm).zipWithIndex.flatMap{ - case (p, i) => + apply.paramss.head + .map(_.asTerm) + .zipWithIndex + .flatMap { case (p, i) => if (!p.isParamWithDefault) None else { val getterName = newTermName("apply$default$" + (i + 1)) - Some(p.name.toString -> c.Expr(q"${moduleSym}.$getterName.toString")) + Some(p.name.toString -> c.Expr(q"$moduleSym.$getterName.toString")) } - }.toMap + } + .toMap } - private[scalding] def getColumnFormats[T](c: Context)(implicit T: c.WeakTypeTag[T]): List[ColumnFormat[c.type]] = { + private[scalding] def getColumnFormats[T]( + c: Context + )(implicit T: c.WeakTypeTag[T]): List[ColumnFormat[c.type]] = { import c.universe._ if (!IsCaseClassImpl.isCaseClassType(c)(T.tpe)) - c.abort(c.enclosingPosition, s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. - This will mean the macro is operating on a non-resolved type.""") + c.abort( + c.enclosingPosition, + s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. + This will mean the macro is operating on a non-resolved type.""" + ) // Field To JDBCColumn @tailrec - def matchField(accessorTree: List[MethodSymbol], - oTpe: Type, - fieldName: FieldName, - defaultValOpt: Option[c.Expr[String]], - annotationInfo: List[(Type, Option[Int])], - nullable: Boolean): scala.util.Try[List[ColumnFormat[c.type]]] = { + def matchField( + accessorTree: List[MethodSymbol], + oTpe: Type, + fieldName: FieldName, + defaultValOpt: Option[c.Expr[String]], + annotationInfo: List[(Type, Option[Int])], + nullable: Boolean + ): scala.util.Try[List[ColumnFormat[c.type]]] = oTpe match { // String handling - case tpe if tpe =:= typeOf[String] => StringTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable) - case tpe if tpe =:= typeOf[Array[Byte]] => BlobTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable) - case tpe if tpe =:= typeOf[Byte] => NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "TINYINT") - case tpe if tpe =:= typeOf[Short] => NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "SMALLINT") - case tpe if tpe =:= typeOf[Int] => NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "INT") - case tpe if tpe =:= typeOf[Long] => NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "BIGINT") - case tpe if tpe =:= typeOf[Double] => NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "DOUBLE") - case tpe if tpe =:= typeOf[Boolean] => NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "BOOLEAN") - case tpe if tpe =:= typeOf[java.util.Date] => DateTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable) + case tpe if tpe =:= typeOf[String] => + StringTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable) + case tpe if tpe =:= typeOf[Array[Byte]] => + BlobTypeHandler(c)(accessorTree, 
fieldName, defaultValOpt, annotationInfo, nullable) + case tpe if tpe =:= typeOf[Byte] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "TINYINT") + case tpe if tpe =:= typeOf[Short] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "SMALLINT") + case tpe if tpe =:= typeOf[Int] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "INT") + case tpe if tpe =:= typeOf[Long] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "BIGINT") + case tpe if tpe =:= typeOf[Double] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "DOUBLE") + case tpe if tpe =:= typeOf[Boolean] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "BOOLEAN") + case tpe if tpe =:= typeOf[java.util.Date] => + DateTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable) case tpe if tpe.erasure =:= typeOf[Option[Any]] && nullable == true => - Failure(new Exception(s"Case class ${T.tpe} has field ${fieldName} which contains a nested option. This is not supported by this macro.")) + Failure( + new Exception( + s"Case class ${T.tpe} has field $fieldName which contains a nested option. This is not supported by this macro." + ) + ) case tpe if tpe.erasure =:= typeOf[Option[Any]] && nullable == false => if (defaultValOpt.isDefined) - Failure(new Exception(s"Case class ${T.tpe} has field ${fieldName}: ${oTpe.toString}, with a default value. Options cannot have default values")) + Failure( + new Exception( + s"Case class ${T.tpe} has field $fieldName: ${oTpe.toString}, with a default value. Options cannot have default values" + ) + ) else { - matchField(accessorTree, tpe.asInstanceOf[TypeRefApi].args.head, fieldName, None, annotationInfo, true) + matchField( + accessorTree, + tpe.asInstanceOf[TypeRefApi].args.head, + fieldName, + None, + annotationInfo, + true + ) } case tpe if IsCaseClassImpl.isCaseClassType(c)(tpe) => expandMethod(accessorTree, tpe) // default - case _ => Failure(new Exception(s"Case class ${T.tpe} has field ${fieldName}: ${oTpe.toString}, which is not supported for talking to JDBC")) + case _ => + Failure( + new Exception( + s"Case class ${T.tpe} has field $fieldName: ${oTpe.toString}, which is not supported for talking to JDBC" + ) + ) } - } - def expandMethod(outerAccessorTree: List[MethodSymbol], outerTpe: Type): scala.util.Try[List[ColumnFormat[c.type]]] = { + def expandMethod( + outerAccessorTree: List[MethodSymbol], + outerTpe: Type + ): scala.util.Try[List[ColumnFormat[c.type]]] = { val defaultArgs = getDefaultArgs(c)(outerTpe) // Intializes the type info @@ -90,44 +134,57 @@ object ColumnDefinitionProviderImpl { // We have to build this up front as if the case class definition moves to another file // the annotation moves from the value onto the getter method? 
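The type dispatch in matchField above is easiest to follow against a concrete case class. The sketch below mirrors the User class and the column definitions asserted by the unit tests later in this patch; it illustrates what the macro derives and is not part of the diff itself:

    import com.twitter.scalding.db._
    import com.twitter.scalding.db.macros._

    case class User(
      date_id: Int,                     // -> INT, NotNullable
      @size(64) user_name: String,      // -> VARCHAR(64), NotNullable
      age: Option[Int],                 // -> INT, Nullable (Option unwraps to the inner handler)
      @size(22) gender: String = "male" // -> VARCHAR(22), NotNullable, default "male"
    )

    // The implicit materializer in the macros package object expands
    // ColumnDefinitionProviderImpl at compile time.
    val columns = implicitly[ColumnDefinitionProvider[User]].columns
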
- val annotationData: Map[String, List[(Type, List[Tree])]] = outerTpe - .declarations + val annotationData: Map[String, List[(Type, List[Tree])]] = outerTpe.declarations .map { m => val mappedAnnotations = m.annotations.map(t => (t.tpe, t.scalaArgs)) m.name.toString.trim -> mappedAnnotations - }.groupBy(_._1).map { - case (k, l) => - (k, l.map(_._2).reduce(_ ++ _)) - }.filter { - case (_, v) => - v.nonEmpty + } + .groupBy(_._1) + .map { case (k, l) => + (k, l.map(_._2).reduce(_ ++ _)) + } + .filter { case (_, v) => + v.nonEmpty } - outerTpe - .declarations + outerTpe.declarations .collect { case m: MethodSymbol if m.isCaseAccessor => m } .map { m => val fieldName = m.name.toString.trim val defaultVal = defaultArgs.get(fieldName) - val annotationInfo: List[(Type, Option[Int])] = annotationData.getOrElse(m.name.toString.trim, Nil) + val annotationInfo: List[(Type, Option[Int])] = annotationData + .getOrElse(m.name.toString.trim, Nil) .collect { - case (tpe, List(Literal(Constant(siz: Int)))) if tpe =:= typeOf[com.twitter.scalding.db.macros.size] => (tpe, Some(siz)) - case (tpe, _) if tpe =:= typeOf[com.twitter.scalding.db.macros.size] => c.abort(c.enclosingPosition, "Hit a size macro where we couldn't parse the value. Probably not a literal constant. Only literal constants are supported.") - case (tpe, _) if tpe <:< typeOf[com.twitter.scalding.db.macros.ScaldingDBAnnotation] => (tpe, None) + case (tpe, List(Literal(Constant(siz: Int)))) + if tpe =:= typeOf[com.twitter.scalding.db.macros.size] => + (tpe, Some(siz)) + case (tpe, _) if tpe =:= typeOf[com.twitter.scalding.db.macros.size] => + c.abort( + c.enclosingPosition, + "Hit a size macro where we couldn't parse the value. Probably not a literal constant. Only literal constants are supported." + ) + case (tpe, _) if tpe <:< typeOf[com.twitter.scalding.db.macros.ScaldingDBAnnotation] => + (tpe, None) } - matchField(outerAccessorTree :+ m, m.returnType, FieldName(fieldName), defaultVal, annotationInfo, false) + matchField( + outerAccessorTree :+ m, + m.returnType, + FieldName(fieldName), + defaultVal, + annotationInfo, + false + ) } .toList // This algorithm returns the error from the first exception we run into. - .foldLeft(scala.util.Try[List[ColumnFormat[c.type]]](Nil)) { - case (pTry, nxt) => - (pTry, nxt) match { - case (Success(l), Success(r)) => Success(l ::: r) - case (f @ Failure(_), _) => f - case (_, f @ Failure(_)) => f - } + .foldLeft(scala.util.Try[List[ColumnFormat[c.type]]](Nil)) { case (pTry, nxt) => + (pTry, nxt) match { + case (Success(l), Success(r)) => Success(l ::: r) + case (f @ Failure(_), _) => f + case (_, f @ Failure(_)) => f + } } } @@ -143,11 +200,13 @@ object ColumnDefinitionProviderImpl { .keys if (duplicateFields.nonEmpty) { - c.abort(c.enclosingPosition, + c.abort( + c.enclosingPosition, s""" Duplicate field names found: ${duplicateFields.mkString(",")}. Please check your nested case classes. 
- """) + """ + ) } else { formats } @@ -158,21 +217,21 @@ object ColumnDefinitionProviderImpl { val columnFormats = getColumnFormats[T](c) - columnFormats.map { - case cf: ColumnFormat[_] => - val nullableVal = if (cf.nullable) + columnFormats.map { case cf: ColumnFormat[_] => + val nullableVal = + if (cf.nullable) q"_root_.com.twitter.scalding.db.Nullable" else q"_root_.com.twitter.scalding.db.NotNullable" - val fieldTypeSelect = Select(q"_root_.com.twitter.scalding.db", newTermName(cf.fieldType)) - val res = q"""new _root_.com.twitter.scalding.db.ColumnDefinition( + val fieldTypeSelect = Select(q"_root_.com.twitter.scalding.db", newTermName(cf.fieldType)) + val res = q"""new _root_.com.twitter.scalding.db.ColumnDefinition( $fieldTypeSelect, _root_.com.twitter.scalding.db.ColumnName(${cf.fieldName.toStr}), $nullableVal, ${cf.sizeOpt}, ${cf.defaultValue}) """ - c.Expr[ColumnDefinition](res) + c.Expr[ColumnDefinition](res) } } @@ -185,50 +244,49 @@ object ColumnDefinitionProviderImpl { // we validate two things from ResultSetMetadata // 1. the column types match with actual DB schema // 2. all non-nullable fields are indeed non-nullable in DB schema - val checks = columnFormats.zipWithIndex.map { - case (cf: ColumnFormat[_], pos: Int) => - val fieldName = cf.fieldName.toStr - val typeNameTerm = newTermName(c.fresh(s"colTypeName_$pos")) - // MySQL uses names like `DATE`, `INTEGER` and `VARCHAR`; - // Vertica uses names like `Date`, `Integer` and `Varchar` - val typeName = q""" + val checks = columnFormats.zipWithIndex.map { case (cf: ColumnFormat[_], pos: Int) => + val fieldName = cf.fieldName.toStr + val typeNameTerm = newTermName(c.fresh(s"colTypeName_$pos")) + // MySQL uses names like `DATE`, `INTEGER` and `VARCHAR`; + // Vertica uses names like `Date`, `Integer` and `Varchar` + val typeName = q""" val $typeNameTerm = $rsmdTerm.getColumnTypeName(${pos + 1}).toUpperCase(java.util.Locale.US) """ - // certain types have synonyms, so we group them together here - // note: this is mysql specific - // http://dev.mysql.com/doc/refman/5.0/en/numeric-type-overview.html - val typeValidation = cf.fieldType match { - case "VARCHAR" => q"""List("VARCHAR", "CHAR").contains($typeNameTerm)""" - case "BOOLEAN" | "TINYINT" => q"""List("BOOLEAN", "BOOL", "TINYINT").contains($typeNameTerm)""" - case "INT" => q"""List("INTEGER", "INT").contains($typeNameTerm)""" - // In Vertica, `INTEGER`, `INT`, `BIGINT`, `INT8`, `SMALLINT`, and `TINYINT` are all 64 bits - // https://www.vertica.com/docs/8.1.x/HTML/index.htm#Authoring/SQLReferenceManual/DataTypes/Numeric/INTEGER.htm - // In MySQL, `TINYINT`, `SMALLINT`, `MEDIUMINT`, `INT`, and `BIGINT` are all <= 64 bits - // https://dev.mysql.com/doc/refman/5.7/en/integer-types.html - // As the user has told us this field can store a `BIGINT`, we can safely accept any of these - // types from the database. 
- case "BIGINT" => - q"""List("INTEGER", "INT", "BIGINT", "INT8", "SMALLINT", + // certain types have synonyms, so we group them together here + // note: this is mysql specific + // http://dev.mysql.com/doc/refman/5.0/en/numeric-type-overview.html + val typeValidation = cf.fieldType match { + case "VARCHAR" => q"""List("VARCHAR", "CHAR").contains($typeNameTerm)""" + case "BOOLEAN" | "TINYINT" => q"""List("BOOLEAN", "BOOL", "TINYINT").contains($typeNameTerm)""" + case "INT" => q"""List("INTEGER", "INT").contains($typeNameTerm)""" + // In Vertica, `INTEGER`, `INT`, `BIGINT`, `INT8`, `SMALLINT`, and `TINYINT` are all 64 bits + // https://www.vertica.com/docs/8.1.x/HTML/index.htm#Authoring/SQLReferenceManual/DataTypes/Numeric/INTEGER.htm + // In MySQL, `TINYINT`, `SMALLINT`, `MEDIUMINT`, `INT`, and `BIGINT` are all <= 64 bits + // https://dev.mysql.com/doc/refman/5.7/en/integer-types.html + // As the user has told us this field can store a `BIGINT`, we can safely accept any of these + // types from the database. + case "BIGINT" => + q"""List("INTEGER", "INT", "BIGINT", "INT8", "SMALLINT", "TINYINT", "SMALLINT", "MEDIUMINT").contains($typeNameTerm)""" - case "DATETIME" => q"""List("DATE","DATETIME","TIMESTAMP").contains($typeNameTerm)""" - case f => q"""$f == $typeNameTerm""" - } - val typeAssert = q""" + case "DATETIME" => q"""List("DATE","DATETIME","TIMESTAMP").contains($typeNameTerm)""" + case f => q"""$f == $typeNameTerm""" + } + val typeAssert = q""" if (!$typeValidation) { throw new _root_.com.twitter.scalding.db.JdbcValidationException( "Mismatched type for column '" + $fieldName + "'. Expected " + ${cf.fieldType} + " but set to " + $typeNameTerm + " in DB.") } """ - val nullableTerm = newTermName(c.fresh(s"isNullable_$pos")) - val nullableValidation = q""" + val nullableTerm = newTermName(c.fresh(s"isNullable_$pos")) + val nullableValidation = q""" val $nullableTerm = $rsmdTerm.isNullable(${pos + 1}) if ($nullableTerm == _root_.java.sql.ResultSetMetaData.columnNoNulls && ${cf.nullable}) { throw new _root_.com.twitter.scalding.db.JdbcValidationException( "Column '" + $fieldName + "' is not nullable in DB.") } """ - q""" + q""" $typeName $typeAssert $nullableValidation @@ -248,7 +306,10 @@ object ColumnDefinitionProviderImpl { case "TINYINT" => (Some(q"""_root_.java.lang.Byte.valueOf"""), q"""$rsTerm.getByte($fieldName)""") case "DATE" | "DATETIME" => - (None, q"""Option($rsTerm.getTimestamp($fieldName)).map { ts => new java.util.Date(ts.getTime) }.orNull""") + ( + None, + q"""Option($rsTerm.getTimestamp($fieldName)).map { ts => new java.util.Date(ts.getTime) }.orNull""" + ) // dates set to null are populated as None by tuple converter // if the corresponding case class field is an Option[Date] case "DOUBLE" => @@ -258,13 +319,16 @@ object ColumnDefinitionProviderImpl { case "INT" | "SMALLINT" => (Some(q"""_root_.java.lang.Integer.valueOf"""), q"""$rsTerm.getInt($fieldName)""") case "BLOB" => - (None, q"""Option($rsTerm.getBlob($fieldName)).map ( blob => blob.getBytes(1,blob.length().toInt)).orNull """) + ( + None, + q"""Option($rsTerm.getBlob($fieldName)).map ( blob => blob.getBytes(1,blob.length().toInt)).orNull """ + ) case f => (None, q"""sys.error("Invalid format " + $f + " for " + $fieldName)""") } // note: UNSIGNED BIGINT is currently unsupported val valueTerm = newTermName(c.fresh("colValue")) - val boxed = box.map { b => q"""$b($valueTerm)""" }.getOrElse(q"""$valueTerm""") + val boxed = box.map(b => q"""$b($valueTerm)""").getOrElse(q"""$valueTerm""") // primitiveGetter needs to 
be invoked before we can use wasNull // to check if the column value that was read is null or not q""" diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/DBTypeDescriptorImpl.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/DBTypeDescriptorImpl.scala index 82006fc39d..e0ccfdd710 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/DBTypeDescriptorImpl.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/DBTypeDescriptorImpl.scala @@ -3,7 +3,7 @@ package com.twitter.scalding.db.macros.impl import scala.reflect.macros.Context import com.twitter.bijection.macros.impl.IsCaseClassImpl -import com.twitter.scalding.macros.impl.{ FieldsProviderImpl, TupleConverterImpl, TupleSetterImpl } +import com.twitter.scalding.macros.impl.{FieldsProviderImpl, TupleConverterImpl, TupleSetterImpl} import com.twitter.scalding.db.DBTypeDescriptor object DBTypeDescriptorImpl { @@ -12,8 +12,11 @@ object DBTypeDescriptorImpl { import c.universe._ if (!IsCaseClassImpl.isCaseClassType(c)(T.tpe)) - c.abort(c.enclosingPosition, s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. - This will mean the macro is operating on a non-resolved type.""") + c.abort( + c.enclosingPosition, + s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. + This will mean the macro is operating on a non-resolved type.""" + ) val columnDefn = ColumnDefinitionProviderImpl[T](c) val converter = TupleConverterImpl.caseClassTupleConverterWithUnknownImpl[T](c) diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcFieldSetter.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcFieldSetter.scala index b4b4e1f482..0752f9ab71 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcFieldSetter.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcFieldSetter.scala @@ -35,21 +35,23 @@ private[macros] object JdbcFieldSetter extends CaseClassFieldSetter { q"""$container.setObject($idx + 1, $fieldValue)""" } - override def from(c: Context)(fieldType: c.Type, idx: Int, container: c.TermName, fieldValue: c.Tree): Try[c.Tree] = Try { + override def from( + c: Context + )(fieldType: c.Type, idx: Int, container: c.TermName, fieldValue: c.Tree): Try[c.Tree] = Try { import c.universe._ // jdbc Statement indexes are one-based, hence +1 here - def simpleType(accessor: Tree) = q"""${accessor}(${idx + 1}, $fieldValue)""" + def simpleType(accessor: Tree) = q"""$accessor(${idx + 1}, $fieldValue)""" fieldType match { - case tpe if tpe =:= typeOf[String] => simpleType(q"$container.setString") + case tpe if tpe =:= typeOf[String] => simpleType(q"$container.setString") case tpe if tpe =:= typeOf[Boolean] => simpleType(q"$container.setBoolean") - case tpe if tpe =:= typeOf[Short] => simpleType(q"$container.setShort") - case tpe if tpe =:= typeOf[Int] => simpleType(q"$container.setInt") - case tpe if tpe =:= typeOf[Long] => simpleType(q"$container.setLong") - case tpe if tpe =:= typeOf[Float] => simpleType(q"$container.setFloat") - case tpe if tpe =:= typeOf[Double] => simpleType(q"$container.setDouble") - case _ => sys.error(s"Unsupported primitive type ${fieldType}") + case tpe if tpe =:= typeOf[Short] => simpleType(q"$container.setShort") + case tpe if tpe =:= typeOf[Int] => simpleType(q"$container.setInt") + case tpe if tpe =:= 
typeOf[Long] => simpleType(q"$container.setLong") + case tpe if tpe =:= typeOf[Float] => simpleType(q"$container.setFloat") + case tpe if tpe =:= typeOf[Double] => simpleType(q"$container.setDouble") + case _ => sys.error(s"Unsupported primitive type $fieldType") } } } diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcStatementSetterImpl.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcStatementSetterImpl.scala index 9d5a621721..2d5d963c0b 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcStatementSetterImpl.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcStatementSetterImpl.scala @@ -25,8 +25,9 @@ import com.twitter.scalding.db.JdbcStatementSetter */ private[macros] object JdbcStatementSetterImpl { - def caseClassJdbcSetterCommonImpl[T](c: Context, - allowUnknownTypes: Boolean)(implicit T: c.WeakTypeTag[T]): c.Expr[JdbcStatementSetter[T]] = { + def caseClassJdbcSetterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[JdbcStatementSetter[T]] = { import c.universe._ val stmtTerm = newTermName(c.fresh("stmt")) @@ -42,4 +43,3 @@ private[macros] object JdbcStatementSetterImpl { c.Expr[JdbcStatementSetter[T]](res) } } - diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/AnnotationHelper.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/AnnotationHelper.scala index cabe14f100..ce02769e97 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/AnnotationHelper.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/AnnotationHelper.scala @@ -1,7 +1,7 @@ package com.twitter.scalding.db.macros.impl.handler import scala.reflect.macros.Context -import scala.util.{ Success, Failure } +import scala.util.{Failure, Success} import com.twitter.scalding.db.macros.impl.FieldName @@ -28,7 +28,9 @@ private[handler] abstract class AnnotationHelper { import ctx.universe._ def sizeAnnotation: scala.util.Try[(AnnotationHelper, SizeAnno)] = - consume[SizeAnno](typeOf[com.twitter.scalding.db.macros.size])(_.flatten.map(o => WithSize(o)).getOrElse(WithoutSize)) + consume[SizeAnno](typeOf[com.twitter.scalding.db.macros.size])( + _.flatten.map(o => WithSize(o)).getOrElse(WithoutSize) + ) def textAnnotation: scala.util.Try[(AnnotationHelper, TextAnno)] = consume(typeOf[com.twitter.scalding.db.macros.text])(_.map(_ => WithText).getOrElse(WithoutText)) @@ -39,9 +41,11 @@ private[handler] abstract class AnnotationHelper { def dateAnnotation: scala.util.Try[(AnnotationHelper, DateAnno)] = consume(typeOf[com.twitter.scalding.db.macros.date])(_.map(_ => WithDate).getOrElse(WithoutDate)) - def consume[T](t: ctx.universe.Type)(fn: Option[Option[Int]] => T): scala.util.Try[(AnnotationHelper, T)] = { - val (matchedAnnotations, remainingAnnotations) = cannotationInfo.partition { - case (tpe, _) => tpe =:= t + def consume[T]( + t: ctx.universe.Type + )(fn: Option[Option[Int]] => T): scala.util.Try[(AnnotationHelper, T)] = { + val (matchedAnnotations, remainingAnnotations) = cannotationInfo.partition { case (tpe, _) => + tpe =:= t } val newHelper = new { @@ -52,12 +56,12 @@ private[handler] abstract class AnnotationHelper { matchedAnnotations match { case h :: Nil => Success((newHelper, fn(Some(h._2)))) - case h :: t => Failure(new Exception(s"Error more than one annotation when looking for $t")) - case Nil => Success((newHelper, fn(None))) + case h :: t => 
Failure(new Exception(s"Error more than one annotation when looking for $t")) + case Nil => Success((newHelper, fn(None))) } } - def validateFinished: scala.util.Try[Unit] = { + def validateFinished: scala.util.Try[Unit] = if (cannotationInfo.isEmpty) { Success(()) } else { @@ -67,5 +71,4 @@ private[handler] abstract class AnnotationHelper { """ Failure(new Exception(msg)) } - } } diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/BlobTypeHandler.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/BlobTypeHandler.scala index 3bf41ab6c4..4d4e5afb8c 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/BlobTypeHandler.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/BlobTypeHandler.scala @@ -2,20 +2,22 @@ package com.twitter.scalding.db.macros.impl.handler import com.twitter.scalding.db.macros.impl.FieldName import scala.reflect.macros.Context -import scala.util.{ Failure, Success } +import scala.util.{Failure, Success} object BlobTypeHandler { - def apply[T](c: Context)( - implicit - accessorTree: List[c.universe.MethodSymbol], - fieldName: FieldName, - defaultValue: Option[c.Expr[String]], - annotationInfo: List[(c.universe.Type, Option[Int])], - nullable: Boolean): scala.util.Try[List[ColumnFormat[c.type]]] = { - + def apply[T](c: Context)(implicit + accessorTree: List[c.universe.MethodSymbol], + fieldName: FieldName, + defaultValue: Option[c.Expr[String]], + annotationInfo: List[(c.universe.Type, Option[Int])], + nullable: Boolean + ): scala.util.Try[List[ColumnFormat[c.type]]] = if (defaultValue.nonEmpty || annotationInfo.nonEmpty) - Failure(new Exception(s"Default values and annotation info are not supported: defaultValue = $defaultValue annotationInfo = $annotationInfo")) + Failure( + new Exception( + s"Default values and annotation info are not supported: defaultValue = $defaultValue annotationInfo = $annotationInfo" + ) + ) else Success(List(ColumnFormat(c)(accessorTree, "BLOB", None))) - } } diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/ColumnFormat.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/ColumnFormat.scala index 2ed8dbca40..fed2e42935 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/ColumnFormat.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/ColumnFormat.scala @@ -5,9 +5,11 @@ import scala.reflect.macros.Context import com.twitter.scalding.db.macros.impl.FieldName object ColumnFormat { - def apply(c: Context)(fAccessor: List[c.universe.MethodSymbol], fType: String, size: Option[Int])(implicit fName: FieldName, - isNullable: Boolean, defaultV: Option[c.Expr[String]]): ColumnFormat[c.type] = { - + def apply(c: Context)(fAccessor: List[c.universe.MethodSymbol], fType: String, size: Option[Int])(implicit + fName: FieldName, + isNullable: Boolean, + defaultV: Option[c.Expr[String]] + ): ColumnFormat[c.type] = new ColumnFormat[c.type](c) { val fieldAccessor = fAccessor val fieldType = fType @@ -16,14 +18,12 @@ object ColumnFormat { val sizeOpt = size val defaultValue = defaultV } - } } /** * Contains data format information for a column as defined in the case class. * - * Used by the ColumnDefinitionProvider macro too generate columns definitions and - * JDBC ResultSet extractor. + * Used by the ColumnDefinitionProvider macro too generate columns definitions and JDBC ResultSet extractor. 
*/ abstract class ColumnFormat[C <: Context](val ctx: C) { def fieldAccessor: List[ctx.universe.MethodSymbol] @@ -33,4 +33,3 @@ abstract class ColumnFormat[C <: Context](val ctx: C) { def sizeOpt: Option[Int] def defaultValue: Option[ctx.Expr[String]] } - diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/DateTypeHandler.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/DateTypeHandler.scala index 736ae6cdda..26430c9670 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/DateTypeHandler.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/DateTypeHandler.scala @@ -7,11 +7,13 @@ import com.twitter.scalding.db.macros.impl.FieldName object DateTypeHandler { - def apply[T](c: Context)(implicit accessorTree: List[c.universe.MethodSymbol], - fieldName: FieldName, - defaultValue: Option[c.Expr[String]], - annotationInfo: List[(c.universe.Type, Option[Int])], - nullable: Boolean): scala.util.Try[List[ColumnFormat[c.type]]] = { + def apply[T](c: Context)(implicit + accessorTree: List[c.universe.MethodSymbol], + fieldName: FieldName, + defaultValue: Option[c.Expr[String]], + annotationInfo: List[(c.universe.Type, Option[Int])], + nullable: Boolean + ): scala.util.Try[List[ColumnFormat[c.type]]] = { val helper = new { val ctx: c.type = c @@ -22,10 +24,10 @@ object DateTypeHandler { val extracted = for { (nextHelper, dateAnno) <- helper.dateAnnotation _ <- nextHelper.validateFinished - } yield (dateAnno) + } yield dateAnno extracted.flatMap { - case WithDate => Success(List(ColumnFormat(c)(accessorTree, "DATE", None))) + case WithDate => Success(List(ColumnFormat(c)(accessorTree, "DATE", None))) case WithoutDate => Success(List(ColumnFormat(c)(accessorTree, "DATETIME", None))) } } diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/NumericTypeHandler.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/NumericTypeHandler.scala index 43058feae6..492e2fad92 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/NumericTypeHandler.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/NumericTypeHandler.scala @@ -1,17 +1,19 @@ package com.twitter.scalding.db.macros.impl.handler import scala.reflect.macros.Context -import scala.util.{ Success, Failure } +import scala.util.{Failure, Success} import com.twitter.scalding.db.macros.impl.FieldName object NumericTypeHandler { - def apply[T](c: Context)(implicit accessorTree: List[c.universe.MethodSymbol], - fieldName: FieldName, - defaultValue: Option[c.Expr[String]], - annotationInfo: List[(c.universe.Type, Option[Int])], - nullable: Boolean, - numericType: String): scala.util.Try[List[ColumnFormat[c.type]]] = { + def apply[T](c: Context)(implicit + accessorTree: List[c.universe.MethodSymbol], + fieldName: FieldName, + defaultValue: Option[c.Expr[String]], + annotationInfo: List[(c.universe.Type, Option[Int])], + nullable: Boolean, + numericType: String + ): scala.util.Try[List[ColumnFormat[c.type]]] = { val helper = new { val ctx: c.type = c @@ -22,7 +24,7 @@ object NumericTypeHandler { val extracted = for { (nextHelper, sizeAnno) <- helper.sizeAnnotation _ <- nextHelper.validateFinished - } yield (sizeAnno) + } yield sizeAnno extracted.flatMap { case WithSize(s) if s > 0 => Success(List(ColumnFormat(c)(accessorTree, numericType, Some(s)))) diff --git 
a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/StringTypeHandler.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/StringTypeHandler.scala index 83864fcc3e..a1777e35b3 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/StringTypeHandler.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/StringTypeHandler.scala @@ -1,16 +1,18 @@ package com.twitter.scalding.db.macros.impl.handler import scala.reflect.macros.Context -import scala.util.{ Success, Failure } +import scala.util.{Failure, Success} import com.twitter.scalding.db.macros.impl.FieldName object StringTypeHandler { - def apply[T](c: Context)(implicit accessorTree: List[c.universe.MethodSymbol], - fieldName: FieldName, - defaultValue: Option[c.Expr[String]], - annotationInfo: List[(c.universe.Type, Option[Int])], - nullable: Boolean): scala.util.Try[List[ColumnFormat[c.type]]] = { + def apply[T](c: Context)(implicit + accessorTree: List[c.universe.MethodSymbol], + fieldName: FieldName, + defaultValue: Option[c.Expr[String]], + annotationInfo: List[(c.universe.Type, Option[Int])], + nullable: Boolean + ): scala.util.Try[List[ColumnFormat[c.type]]] = { val helper = new { val ctx: c.type = c @@ -26,13 +28,32 @@ object StringTypeHandler { } yield (sizeAnno, varcharAnno, textAnno) extracted.flatMap { - case (_, WithVarchar, WithText) => Failure(new Exception(s"String field $fieldName, has mutually exclusive annotations @text and @varchar")) - case (WithoutSize, WithVarchar, WithoutText) => Failure(new Exception(s"String field $fieldName, is forced varchar but has no size annotation. size is required in the presence of varchar.")) - case (WithoutSize, WithoutVarchar, WithoutText) => Failure(new Exception(s"String field $fieldName, at least one of size, varchar, text must be present.")) - case (WithSize(siz), _, _) if siz <= 0 => Failure(new Exception(s"String field $fieldName, has a size $siz which is <= 0. Doesn't make sense for a string.")) - case (WithSize(siz), WithoutVarchar, WithoutText) if siz <= 255 => Success(List(ColumnFormat(c)(accessorTree, "VARCHAR", Some(siz)))) - case (WithSize(siz), WithoutVarchar, WithoutText) if siz > 255 => Success(List(ColumnFormat(c)(accessorTree, "TEXT", None))) - case (WithSize(siz), WithVarchar, WithoutText) => Success(List(ColumnFormat(c)(accessorTree, "VARCHAR", Some(siz)))) + case (_, WithVarchar, WithText) => + Failure( + new Exception(s"String field $fieldName, has mutually exclusive annotations @text and @varchar") + ) + case (WithoutSize, WithVarchar, WithoutText) => + Failure( + new Exception( + s"String field $fieldName, is forced varchar but has no size annotation. size is required in the presence of varchar." + ) + ) + case (WithoutSize, WithoutVarchar, WithoutText) => + Failure( + new Exception(s"String field $fieldName, at least one of size, varchar, text must be present.") + ) + case (WithSize(siz), _, _) if siz <= 0 => + Failure( + new Exception( + s"String field $fieldName, has a size $siz which is <= 0. Doesn't make sense for a string." 
+ ) + ) + case (WithSize(siz), WithoutVarchar, WithoutText) if siz <= 255 => + Success(List(ColumnFormat(c)(accessorTree, "VARCHAR", Some(siz)))) + case (WithSize(siz), WithoutVarchar, WithoutText) if siz > 255 => + Success(List(ColumnFormat(c)(accessorTree, "TEXT", None))) + case (WithSize(siz), WithVarchar, WithoutText) => + Success(List(ColumnFormat(c)(accessorTree, "VARCHAR", Some(siz)))) case (_, WithoutVarchar, WithText) => Success(List(ColumnFormat(c)(accessorTree, "TEXT", None))) } } diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/package.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/package.scala index 6cd3aa5497..6a87b12d6c 100644 --- a/scalding-db/src/main/scala/com/twitter/scalding/db/package.scala +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/package.scala @@ -12,17 +12,18 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.db -import scala.language.experimental.{ macros => sMacros } +import scala.language.experimental.{macros => sMacros} -import com.twitter.scalding.db.macros.impl.{ ColumnDefinitionProviderImpl, DBTypeDescriptorImpl } +import com.twitter.scalding.db.macros.impl.{ColumnDefinitionProviderImpl, DBTypeDescriptorImpl} // The implicits in the jdbc.macro's package // These are to allow us to auto provide our Type Classes without the user possibly knowing // all of the various ways we could build it. package object macros { - implicit def toColumnDefinitionProvider[T]: ColumnDefinitionProvider[T] = macro ColumnDefinitionProviderImpl[T] + implicit def toColumnDefinitionProvider[T]: ColumnDefinitionProvider[T] = + macro ColumnDefinitionProviderImpl[T] implicit def toDBTypeDescriptor[T]: DBTypeDescriptor[T] = macro DBTypeDescriptorImpl[T] } diff --git a/scalding-db/src/test/scala/com/twitter/scalding/db/DBOptionsTest.scala b/scalding-db/src/test/scala/com/twitter/scalding/db/DBOptionsTest.scala index d598249bb9..15968a11b8 100644 --- a/scalding-db/src/test/scala/com/twitter/scalding/db/DBOptionsTest.scala +++ b/scalding-db/src/test/scala/com/twitter/scalding/db/DBOptionsTest.scala @@ -6,6 +6,6 @@ import org.scalacheck.Prop._ object DBOptionsTest extends Properties("DBOptions") { property("password") = forAll { x: String => ("Password toString should not be equal to x" |: Password(x).toString != x) && - ("Password toStr should be equal to x" |: Password(x).toStr == x) + ("Password toStr should be equal to x" |: Password(x).toStr == x) } } diff --git a/scalding-db/src/test/scala/com/twitter/scalding/db/macros/MacrosUnitTests.scala b/scalding-db/src/test/scala/com/twitter/scalding/db/macros/MacrosUnitTests.scala index 0919af1a1d..bae1a42110 100644 --- a/scalding-db/src/test/scala/com/twitter/scalding/db/macros/MacrosUnitTests.scala +++ b/scalding-db/src/test/scala/com/twitter/scalding/db/macros/MacrosUnitTests.scala @@ -1,14 +1,14 @@ package com.twitter.scalding.db.macros import org.mockito.Mockito.when -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import org.scalatest.Inside._ import org.scalatest.exceptions.TestFailedException import org.scalatest.mock.MockitoSugar -import cascading.tuple.{ Fields, Tuple, TupleEntry } +import cascading.tuple.{Fields, Tuple, TupleEntry} import com.twitter.bijection.macros.MacroGenerated import com.twitter.scalding.db._ -import 
java.sql.{ Blob, ResultSet, ResultSetMetaData } +import java.sql.{Blob, ResultSet, ResultSetMetaData} import java.util.Date import javax.sql.rowset.serial.SerialBlob @@ -21,19 +21,15 @@ object User { } case class User( - date_id: Int, - @size(64) user_name: String, - age: Option[Int], - @size(22) gender: String = "male") + date_id: Int, + @size(64) user_name: String, + age: Option[Int], + @size(22) gender: String = "male" +) -case class Demographics( - age: Option[Int], - @size(22) gender: String = "male") +case class Demographics(age: Option[Int], @size(22) gender: String = "male") -case class User2( - date_id: Int, - @size(64) user_name: String, - demographics: Demographics) +case class User2(date_id: Int, @size(64) user_name: String, demographics: Demographics) case class BadUser1(user_name: String, age: Int = 13) case class BadUser2(@size(-1) user_name: String, age: Int) @@ -46,54 +42,57 @@ object Consts { } case class BadUser7(@size(Consts.cInt) age: Int) case class BadUser8(age: Option[Option[Int]]) -case class BadUser9(@size(15)@text age: Option[Option[Int]]) -case class BadUser10(@size(2)@size(4) age: Option[Option[Int]]) +case class BadUser9(@size(15) @text age: Option[Option[Int]]) +case class BadUser10(@size(2) @size(4) age: Option[Option[Int]]) case class ExhaustiveJdbcCaseClass( - bigInt: Long, // 8 bytes - smallerAgainInt: Int, // 4 bytes - @size(5) normalIntWithSize: Int, // Sizes on numerics seem to just be for display. Not sure if its worth allowing. - evenSmallerInt: Short, // 2 bytes - numberFun: Double, - booleanFlag: Boolean, // 1 byte -- tinyint - @size(20) smallString: String, // Should goto varchar - @size(200) smallishString: String, // Should goto varchar - @size(2048) largeString: String, // Should goto TEXT - @text forceTextString: String, // Force smaller to text, stored out of the table. So row query speed possibly faster - @size(2051)@varchar forcedVarChar: String, // Forced inline to table -- only some sql version support > 255 for varchar - myDateWithTime: Date, // Default goes to MySQL DateTime/Timestamp so its not lossy - @date myDateWithoutTime: Date, - optiLong: Option[Long], // Nullable long - byteArr: Array[Byte], - tinyInt: Byte) + bigInt: Long, // 8 bytes + smallerAgainInt: Int, // 4 bytes + @size( + 5 + ) normalIntWithSize: Int, // Sizes on numerics seem to just be for display. Not sure if its worth allowing. + evenSmallerInt: Short, // 2 bytes + numberFun: Double, + booleanFlag: Boolean, // 1 byte -- tinyint + @size(20) smallString: String, // Should goto varchar + @size(200) smallishString: String, // Should goto varchar + @size(2048) largeString: String, // Should goto TEXT + @text forceTextString: String, // Force smaller to text, stored out of the table. 
So row query speed possibly faster + @size( + 2051 + ) @varchar forcedVarChar: String, // Forced inline to table -- only some sql version support > 255 for varchar + myDateWithTime: Date, // Default goes to MySQL DateTime/Timestamp so its not lossy + @date myDateWithoutTime: Date, + optiLong: Option[Long], // Nullable long + byteArr: Array[Byte], + tinyInt: Byte +) private final case class VerticaCaseClass( - verticaLong: Long, - @date verticaDate: Date, - @varchar @size(size = 1) verticaVarchar1: String) + verticaLong: Long, + @date verticaDate: Date, + @varchar @size(size = 1) verticaVarchar1: String +) -case class CaseClassWithDate( - id: Long, - myDateWithTime: Date, - @date myDateWithoutTime: Date) +case class CaseClassWithDate(id: Long, myDateWithTime: Date, @date myDateWithoutTime: Date) case class CaseClassWithOptions( - id: Option[Int], - @size(20) name: Option[String], - date_id: Option[Date], - boolean_value: Option[Boolean], - short_value: Option[Short], - long_value: Option[Long], - double_value: Option[Double]) - -case class InnerWithBadNesting( - age: Int, - id: Long) + id: Option[Int], + @size(20) name: Option[String], + date_id: Option[Date], + boolean_value: Option[Boolean], + short_value: Option[Short], + long_value: Option[Long], + double_value: Option[Double] +) + +case class InnerWithBadNesting(age: Int, id: Long) case class OuterWithBadNesting( - id: Int, // duplicate in nested case class - @text name: String, - details: InnerWithBadNesting) + id: Int, // duplicate in nested case class + @text name: String, + details: InnerWithBadNesting +) class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { @@ -102,12 +101,16 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { override val resultSetExtractor = null } - def isColumnDefinitionAvailable[T](implicit proof: ColumnDefinitionProvider[T] = dummy.asInstanceOf[ColumnDefinitionProvider[T]]): Unit = { + def isColumnDefinitionAvailable[T](implicit + proof: ColumnDefinitionProvider[T] = dummy.asInstanceOf[ColumnDefinitionProvider[T]] + ): Unit = { proof shouldBe a[MacroGenerated] proof.columns.isEmpty shouldBe false } - def isJDBCTypeInfoAvailable[T](implicit proof: DBTypeDescriptor[T] = dummy.asInstanceOf[DBTypeDescriptor[T]]): Unit = { + def isJDBCTypeInfoAvailable[T](implicit + proof: DBTypeDescriptor[T] = dummy.asInstanceOf[DBTypeDescriptor[T]] + ): Unit = { proof shouldBe a[MacroGenerated] proof.columnDefn.columns.isEmpty shouldBe false } @@ -157,7 +160,8 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { ColumnDefinition(INT, ColumnName("date_id"), NotNullable, None, None), ColumnDefinition(VARCHAR, ColumnName("user_name"), NotNullable, Some(64), None), ColumnDefinition(INT, ColumnName("age"), Nullable, None, None), - ColumnDefinition(VARCHAR, ColumnName("gender"), NotNullable, Some(22), Some("male"))) + ColumnDefinition(VARCHAR, ColumnName("gender"), NotNullable, Some(22), Some("male")) + ) val typeDesc = DBMacro.toDBTypeDescriptor[User] val columnDef = typeDesc.columnDefn @@ -167,24 +171,26 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { assert(typeDesc.fields.equalsFields(expectedFields)) val rsmd = mock[ResultSetMetaData] - when(rsmd.getColumnTypeName(1)) thenReturn ("INT") - when(rsmd.isNullable(1)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(2)) thenReturn ("VARCHAR") - when(rsmd.isNullable(2)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(3)) 
thenReturn ("INT") - when(rsmd.isNullable(3)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(4)) thenReturn ("VARCHAR") - when(rsmd.isNullable(4)) thenReturn (ResultSetMetaData.columnNullableUnknown) + when(rsmd.getColumnTypeName(1)).thenReturn("INT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(2)).thenReturn("VARCHAR") + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(3)).thenReturn("INT") + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(4)).thenReturn("VARCHAR") + when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNullableUnknown) assert(columnDef.resultSetExtractor.validate(rsmd).isSuccess) val rs = mock[ResultSet] - when(rs.getInt("date_id")) thenReturn (123) - when(rs.getString("user_name")) thenReturn ("alice") - when(rs.getInt("age")) thenReturn (26) - when(rs.getString("gender")) thenReturn ("F") - - assert(columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) == User(123, "alice", Some(26), "F")) + when(rs.getInt("date_id")).thenReturn(123) + when(rs.getString("user_name")).thenReturn("alice") + when(rs.getInt("age")).thenReturn(26) + when(rs.getString("gender")).thenReturn("F") + + assert( + columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) == User(123, "alice", Some(26), "F") + ) () // Need this till: https://github.com/scalatest/scalatest/issues/1107 } @@ -196,7 +202,8 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { ColumnDefinition(INT, ColumnName("date_id"), NotNullable, None, None), ColumnDefinition(VARCHAR, ColumnName("user_name"), NotNullable, Some(64), None), ColumnDefinition(INT, ColumnName("age"), Nullable, None, None), - ColumnDefinition(VARCHAR, ColumnName("gender"), NotNullable, Some(22), Some("male"))) + ColumnDefinition(VARCHAR, ColumnName("gender"), NotNullable, Some(22), Some("male")) + ) val typeDesc = DBMacro.toDBTypeDescriptor[User2] val columnDef = typeDesc.columnDefn @@ -206,12 +213,15 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { assert(typeDesc.fields.equalsFields(expectedFields)) val rs = mock[ResultSet] - when(rs.getInt("date_id")) thenReturn (123) - when(rs.getString("user_name")) thenReturn ("alice") - when(rs.getInt("age")) thenReturn (26) - when(rs.getString("gender")) thenReturn ("F") - - assert(columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) == User2(123, "alice", Demographics(Some(26), "F"))) + when(rs.getInt("date_id")).thenReturn(123) + when(rs.getString("user_name")).thenReturn("alice") + when(rs.getInt("age")).thenReturn(26) + when(rs.getString("gender")).thenReturn("F") + + assert( + columnDef.resultSetExtractor + .toCaseClass(rs, typeDesc.converter) == User2(123, "alice", Demographics(Some(26), "F")) + ) () // Need this till: https://github.com/scalatest/scalatest/issues/1107 } @@ -225,7 +235,8 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { ColumnDefinition(INT, ColumnName("date_id"), NotNullable, None, None), ColumnDefinition(VARCHAR, ColumnName("user_name"), NotNullable, Some(64), None), ColumnDefinition(INT, ColumnName("age"), Nullable, None, None), - ColumnDefinition(VARCHAR, ColumnName("gender"), NotNullable, Some(22), Some("male"))) + ColumnDefinition(VARCHAR, ColumnName("gender"), NotNullable, Some(22), Some("male")) + ) assert(DBMacro.toDBTypeDescriptor[User].columnDefn.columns.toList === 
expectedColumns) () // Need this till: https://github.com/scalatest/scalatest/issues/1107 @@ -236,27 +247,28 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { val expectedColumns = List( ColumnDefinition(BIGINT, ColumnName("verticaLong"), NotNullable, None, None), ColumnDefinition(DATE, ColumnName("verticaDate"), NotNullable, None, None), - ColumnDefinition(VARCHAR, ColumnName("verticaVarchar1"), NotNullable, Some(1), None)) + ColumnDefinition(VARCHAR, ColumnName("verticaVarchar1"), NotNullable, Some(1), None) + ) assert(typeDescriptor.columnDefn.columns.toList === expectedColumns) // Vertica uses `Integer` - val int64TypeNames = List("Integer", "INTEGER", "INT", "BIGINT", "INT8", "SMALLINT", - "TINYINT", "SMALLINT", "MEDIUMINT") + val int64TypeNames = + List("Integer", "INTEGER", "INT", "BIGINT", "INT8", "SMALLINT", "TINYINT", "SMALLINT", "MEDIUMINT") // Vertica uses `Date` val dateTypeNames = List("Date", "DATE") // Vertica uses `Varchar` val varcharTypeNames = List("Varchar", "VARCHAR") - int64TypeNames foreach { int64TypeName => - dateTypeNames foreach { dateTypeName => - varcharTypeNames foreach { varcharTypeName => + int64TypeNames.foreach { int64TypeName => + dateTypeNames.foreach { dateTypeName => + varcharTypeNames.foreach { varcharTypeName => val resultSetMetaData = mock[ResultSetMetaData] - when(resultSetMetaData.getColumnTypeName(1)) thenReturn (int64TypeName) - when(resultSetMetaData.isNullable(1)) thenReturn (ResultSetMetaData.columnNoNulls) - when(resultSetMetaData.getColumnTypeName(2)) thenReturn (dateTypeName) - when(resultSetMetaData.isNullable(2)) thenReturn (ResultSetMetaData.columnNoNulls) - when(resultSetMetaData.getColumnTypeName(3)) thenReturn (varcharTypeName) - when(resultSetMetaData.isNullable(3)) thenReturn (ResultSetMetaData.columnNoNulls) + when(resultSetMetaData.getColumnTypeName(1)).thenReturn(int64TypeName) + when(resultSetMetaData.isNullable(1)).thenReturn(ResultSetMetaData.columnNoNulls) + when(resultSetMetaData.getColumnTypeName(2)).thenReturn(dateTypeName) + when(resultSetMetaData.isNullable(2)).thenReturn(ResultSetMetaData.columnNoNulls) + when(resultSetMetaData.getColumnTypeName(3)).thenReturn(varcharTypeName) + when(resultSetMetaData.isNullable(3)).thenReturn(ResultSetMetaData.columnNoNulls) val validationResult = typeDescriptor.columnDefn.resultSetExtractor.validate(resultSetMetaData) @@ -292,45 +304,46 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { ColumnDefinition(DATE, ColumnName("myDateWithoutTime"), NotNullable, None, None), ColumnDefinition(BIGINT, ColumnName("optiLong"), Nullable, None, None), ColumnDefinition(BLOB, ColumnName("byteArr"), NotNullable, None, None), - ColumnDefinition(TINYINT, ColumnName("tinyInt"), NotNullable, None, None)) + ColumnDefinition(TINYINT, ColumnName("tinyInt"), NotNullable, None, None) + ) val typeDesc = DBMacro.toDBTypeDescriptor[ExhaustiveJdbcCaseClass] val columnDef = typeDesc.columnDefn assert(columnDef.columns.toList === expectedColumns) val rsmd = mock[ResultSetMetaData] - when(rsmd.getColumnTypeName(1)) thenReturn ("BIGINT") - when(rsmd.isNullable(1)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(2)) thenReturn ("INT") - when(rsmd.isNullable(2)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(3)) thenReturn ("INTEGER") // synonym of INT - when(rsmd.isNullable(3)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(4)) thenReturn ("SMALLINT") - when(rsmd.isNullable(4)) 
thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(5)) thenReturn ("DOUBLE") - when(rsmd.isNullable(5)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(6)) thenReturn ("TINYINT") - when(rsmd.isNullable(6)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(7)) thenReturn ("VARCHAR") - when(rsmd.isNullable(7)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(8)) thenReturn ("CHAR") // synonym of VARCHAR - when(rsmd.isNullable(8)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(9)) thenReturn ("TEXT") - when(rsmd.isNullable(9)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(10)) thenReturn ("TEXT") - when(rsmd.isNullable(10)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(11)) thenReturn ("VARCHAR") - when(rsmd.isNullable(11)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(12)) thenReturn ("DATETIME") - when(rsmd.isNullable(12)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(13)) thenReturn ("DATE") - when(rsmd.isNullable(13)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(14)) thenReturn ("BIGINT") - when(rsmd.isNullable(14)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(15)) thenReturn ("BLOB") - when(rsmd.isNullable(15)) thenReturn (ResultSetMetaData.columnNoNulls) - when(rsmd.getColumnTypeName(16)) thenReturn ("TINYINT") - when(rsmd.isNullable(16)) thenReturn (ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(1)).thenReturn("BIGINT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(2)).thenReturn("INT") + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(3)).thenReturn("INTEGER") // synonym of INT + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(4)).thenReturn("SMALLINT") + when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(5)).thenReturn("DOUBLE") + when(rsmd.isNullable(5)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(6)).thenReturn("TINYINT") + when(rsmd.isNullable(6)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(7)).thenReturn("VARCHAR") + when(rsmd.isNullable(7)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(8)).thenReturn("CHAR") // synonym of VARCHAR + when(rsmd.isNullable(8)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(9)).thenReturn("TEXT") + when(rsmd.isNullable(9)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(10)).thenReturn("TEXT") + when(rsmd.isNullable(10)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(11)).thenReturn("VARCHAR") + when(rsmd.isNullable(11)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(12)).thenReturn("DATETIME") + when(rsmd.isNullable(12)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(13)).thenReturn("DATE") + when(rsmd.isNullable(13)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(14)).thenReturn("BIGINT") + when(rsmd.isNullable(14)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(15)).thenReturn("BLOB") + when(rsmd.isNullable(15)).thenReturn(ResultSetMetaData.columnNoNulls) + 
when(rsmd.getColumnTypeName(16)).thenReturn("TINYINT") + when(rsmd.isNullable(16)).thenReturn(ResultSetMetaData.columnNoNulls) assert(columnDef.resultSetExtractor.validate(rsmd).isSuccess) @@ -338,57 +351,58 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { val byteArr: Array[Byte] = byteArrStr.getBytes val blob: Blob = new SerialBlob(byteArr) val rs = mock[ResultSet] - when(rs.getLong("bigInt")) thenReturn (12345678L) - when(rs.getInt("smallerAgainInt")) thenReturn (123) - when(rs.getInt("normalIntWithSize")) thenReturn (12) - when(rs.getInt("evenSmallerInt")) thenReturn (1) - when(rs.getDouble("numberFun")) thenReturn (1.1) - when(rs.getBoolean("booleanFlag")) thenReturn (true) - when(rs.getString("smallString")) thenReturn ("small_string") - when(rs.getString("smallishString")) thenReturn ("smallish_string") - when(rs.getString("largeString")) thenReturn ("large_string") - when(rs.getString("forceTextString")) thenReturn ("force_text_string") - when(rs.getString("forcedVarChar")) thenReturn ("forced_var_char") - when(rs.getTimestamp("myDateWithTime")) thenReturn (new java.sql.Timestamp(1111L)) - when(rs.getTimestamp("myDateWithoutTime")) thenReturn (new java.sql.Timestamp(1112L)) - when(rs.getLong("optiLong")) thenReturn (1113L) - when(rs.getBlob("byteArr")) thenReturn (blob) - when(rs.getByte("tinyInt")) thenReturn (12.toByte) + when(rs.getLong("bigInt")).thenReturn(12345678L) + when(rs.getInt("smallerAgainInt")).thenReturn(123) + when(rs.getInt("normalIntWithSize")).thenReturn(12) + when(rs.getInt("evenSmallerInt")).thenReturn(1) + when(rs.getDouble("numberFun")).thenReturn(1.1) + when(rs.getBoolean("booleanFlag")).thenReturn(true) + when(rs.getString("smallString")).thenReturn("small_string") + when(rs.getString("smallishString")).thenReturn("smallish_string") + when(rs.getString("largeString")).thenReturn("large_string") + when(rs.getString("forceTextString")).thenReturn("force_text_string") + when(rs.getString("forcedVarChar")).thenReturn("forced_var_char") + when(rs.getTimestamp("myDateWithTime")).thenReturn(new java.sql.Timestamp(1111L)) + when(rs.getTimestamp("myDateWithoutTime")).thenReturn(new java.sql.Timestamp(1112L)) + when(rs.getLong("optiLong")).thenReturn(1113L) + when(rs.getBlob("byteArr")).thenReturn(blob) + when(rs.getByte("tinyInt")).thenReturn(12.toByte) val actual: ExhaustiveJdbcCaseClass = columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) inside(actual) { case ExhaustiveJdbcCaseClass( - bigInt, - smallerAgainInt, - normalIntWithSize, - evenSmallerInt, - numberFun, - booleanFlag, - smallString, - smallishString, - largeString, - forceTextString, - forcedVarChar, - myDateWithTime, - myDateWithoutTime, - optiLong, - bArr, - tinyInt) => - bigInt should be (12345678L) - smallerAgainInt should be (123) - normalIntWithSize should be (12) - evenSmallerInt should be (1) - numberFun should be (1.1) - booleanFlag should be (true) - smallString should be ("small_string") - smallishString should be ("smallish_string") - largeString should be ("large_string") - forceTextString should be ("force_text_string") - forcedVarChar should be ("forced_var_char") - myDateWithTime should be (new Date(1111L)) - myDateWithoutTime should be (new Date(1112L)) - optiLong.get should be (1113L) + bigInt, + smallerAgainInt, + normalIntWithSize, + evenSmallerInt, + numberFun, + booleanFlag, + smallString, + smallishString, + largeString, + forceTextString, + forcedVarChar, + myDateWithTime, + myDateWithoutTime, + optiLong, + bArr, + tinyInt + ) => + 
bigInt should be(12345678L) + smallerAgainInt should be(123) + normalIntWithSize should be(12) + evenSmallerInt should be(1) + numberFun should be(1.1) + booleanFlag should be(true) + smallString should be("small_string") + smallishString should be("smallish_string") + largeString should be("large_string") + forceTextString should be("force_text_string") + forcedVarChar should be("forced_var_char") + myDateWithTime should be(new Date(1111L)) + myDateWithoutTime should be(new Date(1112L)) + optiLong.get should be(1113L) bArr shouldEqual byteArr tinyInt shouldEqual 12.toByte } @@ -414,20 +428,20 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { val columnDef = typeDesc.columnDefn val rsmd = mock[ResultSetMetaData] - when(rsmd.getColumnTypeName(1)) thenReturn ("INT") - when(rsmd.isNullable(1)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(2)) thenReturn ("VARCHAR") - when(rsmd.isNullable(2)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(3)) thenReturn ("DATETIME") - when(rsmd.isNullable(3)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(4)) thenReturn ("BOOLEAN") - when(rsmd.isNullable(4)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(5)) thenReturn ("SMALLINT") - when(rsmd.isNullable(5)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(6)) thenReturn ("BIGINT") - when(rsmd.isNullable(6)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(7)) thenReturn ("DOUBLE") - when(rsmd.isNullable(7)) thenReturn (ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(1)).thenReturn("INT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(2)).thenReturn("VARCHAR") + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(3)).thenReturn("DATETIME") + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(4)).thenReturn("BOOLEAN") + when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(5)).thenReturn("SMALLINT") + when(rsmd.isNullable(5)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(6)).thenReturn("BIGINT") + when(rsmd.isNullable(6)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(7)).thenReturn("DOUBLE") + when(rsmd.isNullable(7)).thenReturn(ResultSetMetaData.columnNullable) assert(columnDef.resultSetExtractor.validate(rsmd).isSuccess) () // Need this till: https://github.com/scalatest/scalatest/issues/1107 @@ -438,23 +452,32 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { val columnDef = typeDesc.columnDefn val rs = mock[ResultSet] - when(rs.getInt("id")) thenReturn (26) - when(rs.wasNull) thenReturn (false) - when(rs.getString("name")) thenReturn ("alice") - when(rs.wasNull) thenReturn (false) - when(rs.getTimestamp("date_id")) thenReturn (new java.sql.Timestamp(1111L)) - when(rs.wasNull) thenReturn (false) - when(rs.getBoolean("boolean_value")) thenReturn (true) - when(rs.wasNull) thenReturn (false) - when(rs.getInt("short_value")) thenReturn (2) - when(rs.wasNull) thenReturn (false) - when(rs.getLong("long_value")) thenReturn (2000L) - when(rs.wasNull) thenReturn (false) - when(rs.getDouble("double_value")) thenReturn (2.2) - when(rs.wasNull) thenReturn (false) - assert(columnDef.resultSetExtractor.toCaseClass(rs, 
typeDesc.converter) == - CaseClassWithOptions(Some(26), Some("alice"), Some(new Date(1111L)), - Some(true), Some(2), Some(2000L), Some(2.2))) + when(rs.getInt("id")).thenReturn(26) + when(rs.wasNull).thenReturn(false) + when(rs.getString("name")).thenReturn("alice") + when(rs.wasNull).thenReturn(false) + when(rs.getTimestamp("date_id")).thenReturn(new java.sql.Timestamp(1111L)) + when(rs.wasNull).thenReturn(false) + when(rs.getBoolean("boolean_value")).thenReturn(true) + when(rs.wasNull).thenReturn(false) + when(rs.getInt("short_value")).thenReturn(2) + when(rs.wasNull).thenReturn(false) + when(rs.getLong("long_value")).thenReturn(2000L) + when(rs.wasNull).thenReturn(false) + when(rs.getDouble("double_value")).thenReturn(2.2) + when(rs.wasNull).thenReturn(false) + assert( + columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) == + CaseClassWithOptions( + Some(26), + Some("alice"), + Some(new Date(1111L)), + Some(true), + Some(2), + Some(2000L), + Some(2.2) + ) + ) () // Need this till: https://github.com/scalatest/scalatest/issues/1107 } @@ -463,22 +486,23 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { val columnDef = typeDesc.columnDefn val rs = mock[ResultSet] - when(rs.getInt("id")) thenReturn (0) // jdbc returns 0 for null numeric values - when(rs.wasNull) thenReturn (true) - when(rs.getString("name")) thenReturn (null) - when(rs.wasNull) thenReturn (true) - when(rs.getString("date_id")) thenReturn (null) - when(rs.getBoolean("boolean_value")) thenReturn (false) // jdbc returns false for null boolean values - when(rs.wasNull) thenReturn (true) - when(rs.getInt("short_value")) thenReturn (0) - when(rs.wasNull) thenReturn (true) - when(rs.getLong("long_value")) thenReturn (0L) - when(rs.wasNull) thenReturn (true) - when(rs.getDouble("double_value")) thenReturn (0) - when(rs.wasNull) thenReturn (true) - assert(columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) == - CaseClassWithOptions(None, None, None, - None, None, None, None)) + when(rs.getInt("id")).thenReturn(0) // jdbc returns 0 for null numeric values + when(rs.wasNull).thenReturn(true) + when(rs.getString("name")).thenReturn(null) + when(rs.wasNull).thenReturn(true) + when(rs.getString("date_id")).thenReturn(null) + when(rs.getBoolean("boolean_value")).thenReturn(false) // jdbc returns false for null boolean values + when(rs.wasNull).thenReturn(true) + when(rs.getInt("short_value")).thenReturn(0) + when(rs.wasNull).thenReturn(true) + when(rs.getLong("long_value")).thenReturn(0L) + when(rs.wasNull).thenReturn(true) + when(rs.getDouble("double_value")).thenReturn(0) + when(rs.wasNull).thenReturn(true) + assert( + columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) == + CaseClassWithOptions(None, None, None, None, None, None, None) + ) () // Need this till: https://github.com/scalatest/scalatest/issues/1107 } @@ -487,20 +511,20 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { val columnDef = typeDesc.columnDefn val rsmd = mock[ResultSetMetaData] - when(rsmd.getColumnTypeName(1)) thenReturn ("INT") - when(rsmd.isNullable(1)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(2)) thenReturn ("TINYINT") // mismatch - when(rsmd.isNullable(2)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(3)) thenReturn ("DATETIME") - when(rsmd.isNullable(3)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(4)) thenReturn ("BOOLEAN") - when(rsmd.isNullable(4)) thenReturn 
(ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(5)) thenReturn ("SMALLINT") - when(rsmd.isNullable(5)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(6)) thenReturn ("BIGINT") - when(rsmd.isNullable(6)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(7)) thenReturn ("DOUBLE") - when(rsmd.isNullable(7)) thenReturn (ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(1)).thenReturn("INT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(2)).thenReturn("TINYINT") // mismatch + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(3)).thenReturn("DATETIME") + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(4)).thenReturn("BOOLEAN") + when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(5)).thenReturn("SMALLINT") + when(rsmd.isNullable(5)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(6)).thenReturn("BIGINT") + when(rsmd.isNullable(6)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(7)).thenReturn("DOUBLE") + when(rsmd.isNullable(7)).thenReturn(ResultSetMetaData.columnNullable) assert(columnDef.resultSetExtractor.validate(rsmd).isFailure) } @@ -510,20 +534,20 @@ class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { val columnDef = typeDesc.columnDefn val rsmd = mock[ResultSetMetaData] - when(rsmd.getColumnTypeName(1)) thenReturn ("INT") - when(rsmd.isNullable(1)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(2)) thenReturn ("VARCHAR") - when(rsmd.isNullable(2)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(3)) thenReturn ("DATETIME") - when(rsmd.isNullable(3)) thenReturn (ResultSetMetaData.columnNoNulls) // mismatch - when(rsmd.getColumnTypeName(4)) thenReturn ("BOOLEAN") - when(rsmd.isNullable(4)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(5)) thenReturn ("SMALLINT") - when(rsmd.isNullable(5)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(6)) thenReturn ("BIGINT") - when(rsmd.isNullable(6)) thenReturn (ResultSetMetaData.columnNullable) - when(rsmd.getColumnTypeName(7)) thenReturn ("DOUBLE") - when(rsmd.isNullable(7)) thenReturn (ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(1)).thenReturn("INT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(2)).thenReturn("VARCHAR") + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(3)).thenReturn("DATETIME") + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNoNulls) // mismatch + when(rsmd.getColumnTypeName(4)).thenReturn("BOOLEAN") + when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(5)).thenReturn("SMALLINT") + when(rsmd.isNullable(5)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(6)).thenReturn("BIGINT") + when(rsmd.isNullable(6)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(7)).thenReturn("DOUBLE") + when(rsmd.isNullable(7)).thenReturn(ResultSetMetaData.columnNullable) assert(columnDef.resultSetExtractor.validate(rsmd).isFailure) } diff --git 
a/scalding-estimators-test/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorTest.scala b/scalding-estimators-test/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorTest.scala index ee4d65b123..f805956d2c 100644 --- a/scalding-estimators-test/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorTest.scala +++ b/scalding-estimators-test/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorTest.scala @@ -1,13 +1,13 @@ package com.twitter.scalding.estimation.memory import com.twitter.scalding.Config -import com.twitter.scalding.estimation.{ FlowStepHistory, FlowStrategyInfo, HistoryService, Task } -import com.twitter.scalding.platform.{ HadoopPlatformJobTest, HadoopSharedPlatformTest } +import com.twitter.scalding.estimation.{FlowStepHistory, FlowStrategyInfo, HistoryService, Task} +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} import com.twitter.scalding.reducer_estimation._ import org.apache.hadoop.mapred.JobConf -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.collection.JavaConverters._ -import scala.util.{ Success, Try } +import scala.util.{Success, Try} class MemoryEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatformTest { "Single-step job with memory estimator" should { @@ -142,49 +142,45 @@ object EmptyHistoryService extends HistoryService { class CustomHistoryService(val history: JobConf => Seq[(String, Long)]) extends HistoryService { import Utils._ - override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = { + override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = if (info.step.getStepNum == 1) { makeHistory(info.step.getConfig, history) } else if (info.step.getStepNum == 2) { Success(Nil) } else { - makeHistory(info.step.getConfig, _ => Seq( - "MAP" -> 512.megabyte, - "REDUCE" -> 512.megabyte)) + makeHistory(info.step.getConfig, _ => Seq("MAP" -> 512.megabyte, "REDUCE" -> 512.megabyte)) } - } def makeHistory(conf: JobConf, history: JobConf => Seq[(String, Long)]): Success[Seq[FlowStepHistory]] = - Success(history(conf).map { - case (taskType, memory) => - val task = Task( - details = Map( - Task.TaskType -> taskType), - counters = Map( - SmoothedHistoryMemoryEstimator.CommittedHeapBytes -> memory)) - val tasks = Seq(task) - FlowStepHistory( - keys = null, - submitTimeMillis = 0, - launchTimeMillis = 0L, - finishTimeMillis = 0L, - totalMaps = 0L, - totalReduces = 0L, - finishedMaps = 0L, - finishedReduces = 0L, - failedMaps = 0L, - failedReduces = 0L, - mapFileBytesRead = 0L, - mapFileBytesWritten = 0L, - mapOutputBytes = 0l, - reduceFileBytesRead = 0l, - hdfsBytesRead = 0l, - hdfsBytesWritten = 0L, - mapperTimeMillis = 0L, - reducerTimeMillis = 0L, - reduceShuffleBytes = 0L, - cost = 1.1, - tasks = tasks) + Success(history(conf).map { case (taskType, memory) => + val task = Task( + details = Map(Task.TaskType -> taskType), + counters = Map(SmoothedHistoryMemoryEstimator.CommittedHeapBytes -> memory) + ) + val tasks = Seq(task) + FlowStepHistory( + keys = null, + submitTimeMillis = 0, + launchTimeMillis = 0L, + finishTimeMillis = 0L, + totalMaps = 0L, + totalReduces = 0L, + finishedMaps = 0L, + finishedReduces = 0L, + failedMaps = 0L, + failedReduces = 0L, + mapFileBytesRead = 0L, + mapFileBytesWritten = 0L, + mapOutputBytes = 0L, + reduceFileBytesRead = 0L, + hdfsBytesRead = 0L, + hdfsBytesWritten = 0L, + mapperTimeMillis = 
0L, + reducerTimeMillis = 0L, + reduceShuffleBytes = 0L, + cost = 1.1, + tasks = tasks + ) }) } @@ -195,31 +191,40 @@ class EmptySmoothedMemoryEstimator extends SmoothedHistoryMemoryEstimator { class SmoothedMemoryEstimatorWithData extends SmoothedHistoryMemoryEstimator { import Utils._ - override def historyService: HistoryService = new CustomHistoryService(_ => Seq( - "MAP" -> 800.megabytes, - "REDUCE" -> 800.megabytes, - "MAP" -> 1024.megabytes, - "REDUCE" -> 1024.megabytes, - "MAP" -> 1300.megabytes, - "REDUCE" -> 1300.megabytes, - "MAP" -> 723.megabytes, - "REDUCE" -> 723.megabytes)) + override def historyService: HistoryService = new CustomHistoryService(_ => + Seq( + "MAP" -> 800.megabytes, + "REDUCE" -> 800.megabytes, + "MAP" -> 1024.megabytes, + "REDUCE" -> 1024.megabytes, + "MAP" -> 1300.megabytes, + "REDUCE" -> 1300.megabytes, + "MAP" -> 723.megabytes, + "REDUCE" -> 723.megabytes + ) + ) } class SmoothedMemoryEstimatorWithMoreThanMaxCap extends SmoothedHistoryMemoryEstimator { import Utils._ - override def historyService: HistoryService = new CustomHistoryService(conf => Seq( - "MAP" -> (MemoryEstimatorConfig.getMaxContainerMemory(conf).megabyte + 1.gigabyte), - "REDUCE" -> (MemoryEstimatorConfig.getMaxContainerMemory(conf).megabyte + 1.gigabyte))) + override def historyService: HistoryService = new CustomHistoryService(conf => + Seq( + "MAP" -> (MemoryEstimatorConfig.getMaxContainerMemory(conf).megabyte + 1.gigabyte), + "REDUCE" -> (MemoryEstimatorConfig.getMaxContainerMemory(conf).megabyte + 1.gigabyte) + ) + ) } class SmoothedMemoryEstimatorWithLessThanMinCap extends SmoothedHistoryMemoryEstimator { import Utils._ - override def historyService: HistoryService = new CustomHistoryService(conf => Seq( - "MAP" -> (MemoryEstimatorConfig.getMinContainerMemory(conf).megabyte - 500.megabyte), - "REDUCE" -> (MemoryEstimatorConfig.getMinContainerMemory(conf).megabyte - 500.megabyte))) + override def historyService: HistoryService = new CustomHistoryService(conf => + Seq( + "MAP" -> (MemoryEstimatorConfig.getMinContainerMemory(conf).megabyte - 500.megabyte), + "REDUCE" -> (MemoryEstimatorConfig.getMinContainerMemory(conf).megabyte - 500.megabyte) + ) + ) } class ErrorHistoryBasedMemoryEstimator extends SmoothedHistoryMemoryEstimator { @@ -240,4 +245,4 @@ object Utils { } implicit def doubleToLong(value: Double): StorageUnit = new StorageUnit(value.toLong) -} \ No newline at end of file +} diff --git a/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimatorTest.scala b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimatorTest.scala index d487fefbde..ae1b25a601 100644 --- a/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimatorTest.scala +++ b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimatorTest.scala @@ -1,18 +1,19 @@ package com.twitter.scalding.reducer_estimation import com.twitter.scalding._ -import com.twitter.scalding.estimation.{ FlowStepHistory, FlowStrategyInfo, HistoryService, Task } -import com.twitter.scalding.platform.{ HadoopPlatformJobTest, HadoopSharedPlatformTest } -import org.scalatest.{ Matchers, WordSpec } +import com.twitter.scalding.estimation.{FlowStepHistory, FlowStrategyInfo, HistoryService, Task} +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} +import org.scalatest.{Matchers, WordSpec} import scala.collection.JavaConverters._ -import 
scala.util.{ Failure, Success, Try } +import scala.util.{Failure, Success, Try} class SimpleJobWithNoSetReducers(args: Args, customConfig: Config) extends Job(args) { import HipJob._ override def config = super.config ++ customConfig.toMap.toMap - TypedPipe.from(inSrc) + TypedPipe + .from(inSrc) .flatMap(_.split("[^\\w]+")) .map(_.toLowerCase -> 1) .group @@ -36,7 +37,11 @@ object HistoryServiceWithData { def makeHistory(inputHdfsBytesRead: Long, mapOutputBytes: Long): FlowStepHistory = makeHistory(inputHdfsBytesRead, mapOutputBytes, Seq()) - def makeHistory(inputHdfsBytesRead: Long, mapOutputBytes: Long, taskRuntimes: Seq[Long]): FlowStepHistory = { + def makeHistory( + inputHdfsBytesRead: Long, + mapOutputBytes: Long, + taskRuntimes: Seq[Long] + ): FlowStepHistory = { val random = new scala.util.Random(123) val tasks = taskRuntimes.map { time => val startTime = random.nextLong @@ -45,8 +50,10 @@ object HistoryServiceWithData { Task.TaskType -> "REDUCE", Status -> "SUCCEEDED", StartTime -> startTime, - FinishTime -> (startTime + time)), - Map.empty) + FinishTime -> (startTime + time) + ), + Map.empty + ) } FlowStepHistory( @@ -63,14 +70,15 @@ object HistoryServiceWithData { mapFileBytesRead = 0L, mapFileBytesWritten = 0L, mapOutputBytes = mapOutputBytes, - reduceFileBytesRead = 0l, + reduceFileBytesRead = 0L, hdfsBytesRead = inputHdfsBytesRead, hdfsBytesWritten = 0L, mapperTimeMillis = 0L, reducerTimeMillis = 0L, reduceShuffleBytes = 0L, cost = 1.1, - tasks = tasks) + tasks = tasks + ) } def inputSize = HipJob.InSrcFileSize @@ -86,7 +94,9 @@ object ValidHistoryService extends HistoryService { makeHistory(10, 1), // below threshold, ignored makeHistory(inputSize, inputSize / 2), makeHistory(inputSize, inputSize / 2), - makeHistory(inputSize, inputSize / 2))) + makeHistory(inputSize, inputSize / 2) + ) + ) } object SmallDataExplosionHistoryService extends HistoryService { @@ -98,10 +108,8 @@ object SmallDataExplosionHistoryService extends HistoryService { val outSize = inputSize * 1000 Success( - Seq( - makeHistory(inputSize, outSize), - makeHistory(inputSize, outSize), - makeHistory(inputSize, outSize))) + Seq(makeHistory(inputSize, outSize), makeHistory(inputSize, outSize), makeHistory(inputSize, outSize)) + ) } } @@ -110,11 +118,7 @@ object InvalidHistoryService extends HistoryService { def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = // all entries below the 10% threshold for past input size - Success( - Seq( - makeHistory(10, 1), - makeHistory(10, 1), - makeHistory(10, 1))) + Success(Seq(makeHistory(10, 1), makeHistory(10, 1), makeHistory(10, 1))) } class EmptyHistoryBasedEstimator extends RatioBasedEstimator { @@ -151,7 +155,7 @@ class RatioBasedReducerEstimatorTest extends WordSpec with Matchers with HadoopS steps should have size 1 val conf = steps.head.getConfig - conf.getNumReduceTasks should equal (1) // default + conf.getNumReduceTasks should equal(1) // default } .run() } @@ -167,7 +171,7 @@ class RatioBasedReducerEstimatorTest extends WordSpec with Matchers with HadoopS steps should have size 1 val conf = steps.head.getConfig - conf.getNumReduceTasks should equal (1) // default + conf.getNumReduceTasks should equal(1) // default } .run() } @@ -187,7 +191,7 @@ class RatioBasedReducerEstimatorTest extends WordSpec with Matchers with HadoopS // reducer ratio from history = 0.5 // final estimate = ceil(3 * 0.5) = 2 val conf = steps.head.getConfig - conf.getNumReduceTasks should equal (2) + conf.getNumReduceTasks should equal(2) } .run() } @@ 
-215,8 +219,9 @@ class RatioBasedReducerEstimatorTest extends WordSpec with Matchers with HadoopS steps should have size 1 val conf = steps.head.getConfig - conf.getNumReduceTasks should equal (2) // used to pick 1000 with the rounding error - }.run() + conf.getNumReduceTasks should equal(2) // used to pick 1000 with the rounding error + } + .run() } "not set reducers when there is no valid history" in { @@ -230,7 +235,7 @@ class RatioBasedReducerEstimatorTest extends WordSpec with Matchers with HadoopS steps should have size 1 val conf = steps.head.getConfig - conf.getNumReduceTasks should equal (1) // default + conf.getNumReduceTasks should equal(1) // default } .run() } diff --git a/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorTest.scala b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorTest.scala index 40d8887112..cee846e53b 100644 --- a/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorTest.scala +++ b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorTest.scala @@ -2,9 +2,9 @@ package com.twitter.scalding.reducer_estimation import cascading.flow.FlowException import com.twitter.scalding._ -import com.twitter.scalding.platform.{ HadoopPlatformJobTest, HadoopSharedPlatformTest } +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} import java.io.FileNotFoundException -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.collection.JavaConverters._ object HipJob { @@ -12,7 +12,8 @@ object HipJob { val inPath = getClass.getResource("/hipster.txt") // file size is 2496 bytes val inSrc = TextLine(inPath.toString) val InScoresFileSize = 174L - val inScores = TypedTsv[(String, Double)](getClass.getResource("/scores.tsv").toString) // file size is 174 bytes + val inScores = + TypedTsv[(String, Double)](getClass.getResource("/scores.tsv").toString) // file size is 174 bytes val out = TypedTsv[Double]("output") val counts = TypedTsv[(String, Int)]("counts.tsv") val size = TypedTsv[Long]("size.tsv") @@ -28,7 +29,8 @@ class HipJob(args: Args, customConfig: Config) extends Job(args) { .replaceAll("[^a-zA-Z0-9\\s]", "") .split("\\s+") - val wordCounts = TypedPipe.from(inSrc) + val wordCounts = TypedPipe + .from(inSrc) .flatMap(tokenize) .map(_ -> 1) .group @@ -36,14 +38,18 @@ class HipJob(args: Args, customConfig: Config) extends Job(args) { val scores = TypedPipe.from(inScores).group - wordCounts.leftJoin(scores) - .mapValues{ case (count, score) => (count, score.getOrElse(0.0)) } + wordCounts + .leftJoin(scores) + .mapValues { case (count, score) => (count, score.getOrElse(0.0)) } // force another M/R step - should use reducer estimation .toTypedPipe - .map{ case (word, (count, score)) => (count, score) } - .group.sum + .map { case (word, (count, score)) => (count, score) } + .group + .sum // force another M/R step - this should force 1 reducer because it is essentially a groupAll - .toTypedPipe.values.sum + .toTypedPipe + .values + .sum .write(out) } @@ -53,7 +59,8 @@ class SimpleJob(args: Args, customConfig: Config) extends Job(args) { override def config = super.config ++ customConfig.toMap.toMap - TypedPipe.from(inSrc) + TypedPipe + .from(inSrc) .flatMap(_.split("[^\\w]+")) .map(_.toLowerCase -> 1) .group @@ -71,7 +78,8 @@ class SimpleGlobJob(args: Args, customConfig: Config) extends Job(args) { override def config = 
super.config ++ customConfig.toMap.toMap - TypedPipe.from(inSrc) + TypedPipe + .from(inSrc) .flatMap(_.split("[^\\w]+")) .map(_.toLowerCase -> 1) .group @@ -84,13 +92,17 @@ class SimpleGlobJob(args: Args, customConfig: Config) extends Job(args) { class SimpleMemoryJob(args: Args, customConfig: Config) extends Job(args) { import HipJob._ - val inSrc = IterableSource(List( - "Direct trade American Apparel squid umami tote bag. Lo-fi XOXO gluten-free meh literally, typewriter readymade wolf salvia whatever drinking vinegar organic. Four loko literally bicycle rights drinking vinegar Cosby sweater hella stumptown. Dreamcatcher iPhone 90's organic chambray cardigan, wolf fixie gluten-free Brooklyn four loko. Mumblecore ennui twee, 8-bit food truck sustainable tote bag Williamsburg mixtape biodiesel. Semiotics Helvetica put a bird on it, roof party fashion axe organic post-ironic readymade Wes Anderson Pinterest keffiyeh. Craft beer meggings sartorial, butcher Marfa kitsch art party mustache Brooklyn vinyl.", - "Wolf flannel before they sold out vinyl, selfies four loko Bushwick Banksy Odd Future. Chillwave banh mi iPhone, Truffaut shabby chic craft beer keytar DIY. Scenester selvage deep v YOLO paleo blog photo booth fap. Sustainable wolf mixtape small batch skateboard, pop-up brunch asymmetrical seitan butcher Thundercats disrupt twee Etsy. You probably haven't heard of them freegan skateboard before they sold out, mlkshk pour-over Echo Park keytar retro farm-to-table. Tattooed sustainable beard, Helvetica Wes Anderson pickled vinyl yr pop-up Vice. Wolf bespoke lomo photo booth ethnic cliche.")) + val inSrc = IterableSource( + List( + "Direct trade American Apparel squid umami tote bag. Lo-fi XOXO gluten-free meh literally, typewriter readymade wolf salvia whatever drinking vinegar organic. Four loko literally bicycle rights drinking vinegar Cosby sweater hella stumptown. Dreamcatcher iPhone 90's organic chambray cardigan, wolf fixie gluten-free Brooklyn four loko. Mumblecore ennui twee, 8-bit food truck sustainable tote bag Williamsburg mixtape biodiesel. Semiotics Helvetica put a bird on it, roof party fashion axe organic post-ironic readymade Wes Anderson Pinterest keffiyeh. Craft beer meggings sartorial, butcher Marfa kitsch art party mustache Brooklyn vinyl.", + "Wolf flannel before they sold out vinyl, selfies four loko Bushwick Banksy Odd Future. Chillwave banh mi iPhone, Truffaut shabby chic craft beer keytar DIY. Scenester selvage deep v YOLO paleo blog photo booth fap. Sustainable wolf mixtape small batch skateboard, pop-up brunch asymmetrical seitan butcher Thundercats disrupt twee Etsy. You probably haven't heard of them freegan skateboard before they sold out, mlkshk pour-over Echo Park keytar retro farm-to-table. Tattooed sustainable beard, Helvetica Wes Anderson pickled vinyl yr pop-up Vice. Wolf bespoke lomo photo booth ethnic cliche." 
+ ) + ) override def config = super.config ++ customConfig.toMap.toMap - TypedPipe.from(inSrc) + TypedPipe + .from(inSrc) .flatMap(_.split("[^\\w]+")) .map(_.toLowerCase -> 1) .group @@ -107,7 +119,8 @@ class SimpleFileNotFoundJob(args: Args, customConfig: Config) extends Job(args) override def config = super.config ++ customConfig.toMap.toMap - TypedPipe.from(inSrc) + TypedPipe + .from(inSrc) .flatMap(_.split("[^\\w]+")) .map(_.toLowerCase -> 1) .group @@ -122,7 +135,8 @@ class GroupAllJob(args: Args, customConfig: Config) extends Job(args) { import HipJob._ override def config = super.config ++ customConfig.toMap.toMap - TypedPipe.from(inSrc) + TypedPipe + .from(inSrc) .flatMap(_.split("[^\\w]+")) .groupAll .size @@ -136,7 +150,8 @@ class SimpleMapOnlyJob(args: Args, customConfig: Config) extends Job(args) { override def config = super.config ++ customConfig.toMap.toMap // simple job with no reduce phase - TypedPipe.from(inSrc) + TypedPipe + .from(inSrc) .flatMap(_.split("[^\\w]+")) .write(TypedTsv[String]("mapped_output")) } @@ -155,8 +170,8 @@ class ReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatf steps should have size 1 val conf = Config.fromHadoop(steps.head.getConfig) - conf.getNumReducers should contain (2) - conf.get(ReducerEstimatorConfig.originalNumReducers) should be (None) + conf.getNumReducers should contain(2) + conf.get(ReducerEstimatorConfig.originalNumReducers) should be(None) } .run() } @@ -172,8 +187,8 @@ class ReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatf steps should have size 1 val conf = Config.fromHadoop(steps.head.getConfig) - conf.getNumReducers should contain (3) - conf.get(ReducerEstimatorConfig.originalNumReducers) should contain ("2") + conf.getNumReducers should contain(3) + conf.get(ReducerEstimatorConfig.originalNumReducers) should contain("2") } .run() } @@ -189,8 +204,8 @@ class ReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatf steps should have size 1 val conf = Config.fromHadoop(steps.head.getConfig) - conf.getNumReducers should contain (3) - conf.get(ReducerEstimatorConfig.originalNumReducers) should contain ("2") + conf.getNumReducers should contain(3) + conf.get(ReducerEstimatorConfig.originalNumReducers) should contain("2") } .run() } @@ -208,9 +223,9 @@ class ReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatf steps should have size 1 val conf = Config.fromHadoop(steps.head.getConfig) - conf.get(ReducerEstimatorConfig.estimatedNumReducers) should contain ("2496") - conf.get(ReducerEstimatorConfig.cappedEstimatedNumReducersKey) should contain ("10") - conf.getNumReducers should contain (10) + conf.get(ReducerEstimatorConfig.estimatedNumReducers) should contain("2496") + conf.get(ReducerEstimatorConfig.cappedEstimatedNumReducersKey) should contain("10") + conf.getNumReducers should contain(10) } .run() } @@ -226,8 +241,8 @@ class ReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatf steps should have size 1 val conf = Config.fromHadoop(steps.head.getConfig) - conf.getNumReducers should contain (2) - conf.get(ReducerEstimatorConfig.originalNumReducers) should contain ("2") + conf.getNumReducers should contain(2) + conf.get(ReducerEstimatorConfig.originalNumReducers) should contain("2") } .run() } @@ -238,9 +253,8 @@ class ReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatf (Config.ReducerEstimatorOverride -> "true") HadoopPlatformJobTest(new SimpleFileNotFoundJob(_, customConfig), cluster) - 
.runExpectFailure { - case error: FlowException => - error.getCause.getClass should be(classOf[FileNotFoundException]) + .runExpectFailure { case error: FlowException => + error.getCause.getClass should be(classOf[FileNotFoundException]) } } } @@ -256,7 +270,7 @@ class ReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatf steps should have size 1 val conf = Config.fromHadoop(steps.head.getConfig) - conf.getNumReducers should contain (1) + conf.getNumReducers should contain(1) } .run() } @@ -296,4 +310,3 @@ class ReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatf } } } - diff --git a/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimatorTest.scala b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimatorTest.scala index b41cd95b7e..1d0a60f282 100644 --- a/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimatorTest.scala +++ b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimatorTest.scala @@ -1,12 +1,12 @@ package com.twitter.scalding.reducer_estimation import com.twitter.scalding._ -import RuntimeReducerEstimator.{ EstimationScheme, IgnoreInputSize, RuntimePerReducer } -import com.twitter.scalding.estimation.{ Estimator, FlowStepHistory, FlowStrategyInfo, HistoryService } -import com.twitter.scalding.platform.{ HadoopPlatformJobTest, HadoopSharedPlatformTest } -import org.scalatest.{ Matchers, WordSpec } +import RuntimeReducerEstimator.{EstimationScheme, IgnoreInputSize, RuntimePerReducer} +import com.twitter.scalding.estimation.{Estimator, FlowStepHistory, FlowStrategyInfo, HistoryService} +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} +import org.scalatest.{Matchers, WordSpec} import scala.collection.JavaConverters._ -import scala.util.{ Success, Try } +import scala.util.{Success, Try} object HistoryService1 extends HistoryService { import HistoryServiceWithData._ @@ -16,7 +16,9 @@ object HistoryService1 extends HistoryService { Seq( makeHistory(inputSize * 2, 0, List(10, 1000, 3000)), makeHistory(inputSize / 2, 0, List(10, 200, 400)), - makeHistory(inputSize * 4, 0, List(10, 2400, 3000)))) + makeHistory(inputSize * 4, 0, List(10, 2400, 3000)) + ) + ) } class Estimator1 extends RuntimeReducerEstimator { diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatform.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatform.scala index 580bbf24f9..74120040a1 100644 --- a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatform.scala +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatform.scala @@ -3,7 +3,7 @@ package com.twitter.scalding.platform import com.twitter.scalding._ import com.twitter.scalding.source.TypedText -import java.io.{ BufferedWriter, File, FileWriter } +import java.io.{BufferedWriter, File, FileWriter} import org.slf4j.LoggerFactory @@ -36,35 +36,34 @@ trait HadoopPlatform[P, R, T <: HadoopPlatform[P, R, T]] { def run(): Unit def runExpectFailure[K](fn: Throwable => K): K = - fn(Try { run() }.failed.get) + fn(Try(run()).failed.get) def init(cons: P => R): R def execute(unit: R): Unit protected def createSources(): Unit = { - dataToCreate foreach { - case (location, lines) => - val tmpFile = File.createTempFile("hadoop_platform", "job_test") - tmpFile.deleteOnExit() - if 
(lines.nonEmpty) { - val os = new BufferedWriter(new FileWriter(tmpFile)) - os.write(lines.head) - lines.tail.foreach { str => - os.newLine() - os.write(str) - } - os.close() + dataToCreate.foreach { case (location, lines) => + val tmpFile = File.createTempFile("hadoop_platform", "job_test") + tmpFile.deleteOnExit() + if (lines.nonEmpty) { + val os = new BufferedWriter(new FileWriter(tmpFile)) + os.write(lines.head) + lines.tail.foreach { str => + os.newLine() + os.write(str) } - cluster.putFile(tmpFile, location) - tmpFile.delete() + os.close() + } + cluster.putFile(tmpFile, location) + tmpFile.delete() } - sourceWriters.foreach { cons => execute(init(cons)) } + sourceWriters.foreach(cons => execute(init(cons))) } protected def checkSinks(): Unit = { LOG.debug("Executing sinks") - sourceReaders.foreach { _(cluster.mode) } + sourceReaders.foreach(_(cluster.mode)) } -} \ No newline at end of file +} diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformExecutionTest.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformExecutionTest.scala index e0254f84e9..fe2f9d9844 100644 --- a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformExecutionTest.scala +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformExecutionTest.scala @@ -3,16 +3,17 @@ package com.twitter.scalding.platform import cascading.flow.Flow import com.twitter.scalding._ import org.apache.hadoop.mapred.JobConf -import scala.util.{ Failure, Success } +import scala.util.{Failure, Success} case class HadoopPlatformExecutionTest( - cons: (Config) => Execution[_], - cluster: LocalCluster, - parameters: Map[String, String] = Map.empty, - dataToCreate: Seq[(String, Seq[String])] = Vector(), - sourceWriters: Seq[Config => Execution[_]] = Vector.empty, - sourceReaders: Seq[Mode => Unit] = Vector.empty, - flowCheckers: Seq[Flow[JobConf] => Unit] = Vector.empty) extends HadoopPlatform[Config, Execution[_], HadoopPlatformExecutionTest] { + cons: (Config) => Execution[_], + cluster: LocalCluster, + parameters: Map[String, String] = Map.empty, + dataToCreate: Seq[(String, Seq[String])] = Vector(), + sourceWriters: Seq[Config => Execution[_]] = Vector.empty, + sourceReaders: Seq[Mode => Unit] = Vector.empty, + flowCheckers: Seq[Flow[JobConf] => Unit] = Vector.empty +) extends HadoopPlatform[Config, Execution[_], HadoopPlatformExecutionTest] { def config: Config = Config.defaultFrom(cluster.mode) ++ Config.from(parameters) diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformJobTest.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformJobTest.scala index 84fad089fd..638c09bdb6 100644 --- a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformJobTest.scala +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformJobTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.platform import cascading.flow.Flow @@ -21,18 +21,18 @@ import com.twitter.scalding._ import org.apache.hadoop.mapred.JobConf /** - * This class is used to construct unit tests in scalding which - * use Hadoop's MiniCluster to more fully simulate and test - * the logic which is deployed in a job. + * This class is used to construct unit tests in scalding which use Hadoop's MiniCluster to more fully + * simulate and test the logic which is deployed in a job. */ case class HadoopPlatformJobTest( - cons: (Args) => Job, - cluster: LocalCluster, - argsMap: Map[String, List[String]] = Map.empty, - dataToCreate: Seq[(String, Seq[String])] = Vector(), - sourceWriters: Seq[Args => Job] = Vector.empty, - sourceReaders: Seq[Mode => Unit] = Vector.empty, - flowCheckers: Seq[Flow[JobConf] => Unit] = Vector.empty) extends HadoopPlatform[Args, Job, HadoopPlatformJobTest] { + cons: (Args) => Job, + cluster: LocalCluster, + argsMap: Map[String, List[String]] = Map.empty, + dataToCreate: Seq[(String, Seq[String])] = Vector(), + sourceWriters: Seq[Args => Job] = Vector.empty, + sourceReaders: Seq[Mode => Unit] = Vector.empty, + flowCheckers: Seq[Flow[JobConf] => Unit] = Vector.empty +) extends HadoopPlatform[Args, Job, HadoopPlatformJobTest] { override def arg(key: String, value: String): HadoopPlatformJobTest = copy(argsMap = argsMap + (key -> List(value))) @@ -43,12 +43,14 @@ case class HadoopPlatformJobTest( override def source[T](out: TypedSink[T], data: Seq[T]): HadoopPlatformJobTest = copy(sourceWriters = sourceWriters :+ { args: Args => new Job(args) { - TypedPipe.from(List("")).flatMap { _ => data }.write(out) + TypedPipe.from(List("")).flatMap(_ => data).write(out) } }) override def sink[T](in: Mappable[T])(toExpect: (Seq[T]) => Unit): HadoopPlatformJobTest = - copy(sourceReaders = sourceReaders :+ { m: Mode => toExpect(in.toIterator(Config.defaultFrom(m), m).toSeq) }) + copy(sourceReaders = sourceReaders :+ { m: Mode => + toExpect(in.toIterator(Config.defaultFrom(m), m).toSeq) + }) def inspectCompletedFlow(checker: Flow[JobConf] => Unit): HadoopPlatformJobTest = copy(flowCheckers = flowCheckers :+ checker) @@ -62,8 +64,8 @@ case class HadoopPlatformJobTest( execute(job) checkSinks() flowCheckers.foreach { checker => - job.completedFlow.collect { - case f: Flow[JobConf @unchecked] => checker(f) + job.completedFlow.collect { case f: Flow[JobConf @unchecked] => + checker(f) } } } @@ -76,7 +78,7 @@ case class HadoopPlatformJobTest( job.clear() job.next match { // linter:ignore:UseOptionForeachNotPatMatch case Some(nextJob) => execute(nextJob) - case None => () + case None => () } } } diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopSharedPlatformTest.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopSharedPlatformTest.scala index f4275fe0e6..6560c7eb5c 100644 --- a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopSharedPlatformTest.scala +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopSharedPlatformTest.scala @@ -12,10 +12,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.platform -import org.scalatest.{ BeforeAndAfterAll, Suite } +import org.scalatest.{BeforeAndAfterAll, Suite} trait HadoopSharedPlatformTest extends BeforeAndAfterAll { this: Suite => org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(org.apache.log4j.Level.ERROR) @@ -35,7 +35,7 @@ trait HadoopSharedPlatformTest extends BeforeAndAfterAll { this: Suite => //TODO is there a way to buffer such that we see test results AFTER afterEach? Otherwise the results // get lost in the logging - override def afterAll(): Unit = { + override def afterAll(): Unit = try super.afterAll() finally { // Necessary because afterAll can be called from a different thread and we want to make sure that the state @@ -45,5 +45,4 @@ trait HadoopSharedPlatformTest extends BeforeAndAfterAll { this: Suite => cluster.shutdown() } } - } } diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/LocalCluster.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/LocalCluster.scala index 4ec2395be7..32c820455d 100644 --- a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/LocalCluster.scala +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/LocalCluster.scala @@ -12,19 +12,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.platform import com.twitter.scalding._ -import java.io.{ File, RandomAccessFile } +import java.io.{File, RandomAccessFile} import java.nio.channels.FileLock import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.filecache.DistributedCache -import org.apache.hadoop.fs.{ FileUtil, Path } +import org.apache.hadoop.fs.{FileUtil, Path} import org.apache.hadoop.hdfs.MiniDFSCluster -import org.apache.hadoop.mapred.{ JobConf, MiniMRCluster } +import org.apache.hadoop.mapred.{JobConf, MiniMRCluster} import org.slf4j.LoggerFactory import org.slf4j.impl.Log4jLoggerAdapter @@ -66,7 +66,7 @@ class LocalCluster(mutex: Boolean = true) { private[this] def releaseMutex(): Unit = { LOG.debug("Releasing mutex") - lock.foreach { _.release() } + lock.foreach(_.release()) LOG.debug("Mutex released") lock = None } @@ -74,7 +74,8 @@ class LocalCluster(mutex: Boolean = true) { /** * Start up the local cluster instance. 
* - * @param inConf override default configuration + * @param inConf + * override default configuration */ def initialize(inConf: Config = Config.empty): this.type = { if (mutex) { @@ -113,7 +114,7 @@ class LocalCluster(mutex: Boolean = true) { fileSystem.mkdirs(LocalCluster.HADOOP_CLASSPATH_DIR) // merge in input configuration - inConf.toMap.foreach{ case (k, v) => mrJobConf.set(k, v) } + inConf.toMap.foreach { case (k, v) => mrJobConf.set(k, v) } hadoop = Some(dfs, cluster, mrJobConf) @@ -145,13 +146,13 @@ class LocalCluster(mutex: Boolean = true) { classOf[com.esotericsoftware.kryo.KryoSerializable], classOf[com.twitter.chill.hadoop.KryoSerialization], classOf[com.twitter.maple.tap.TupleMemoryInputFormat], - classOf[org.apache.commons.configuration.Configuration]).foreach { addClassSourceToClassPath(_) } + classOf[org.apache.commons.configuration.Configuration] + ).foreach(addClassSourceToClassPath(_)) this } - def addClassSourceToClassPath[T](clazz: Class[T]): Unit = { + def addClassSourceToClassPath[T](clazz: Class[T]): Unit = addFileToHadoopClassPath(getFileForClass(clazz)) - } def addFileToHadoopClassPath(resourceDir: File): Boolean = if (classpath.contains(resourceDir)) { @@ -182,10 +183,9 @@ class LocalCluster(mutex: Boolean = true) { //TODO is there a way to know if we need to wait on anything to shut down, etc? def shutdown(): Unit = { - hadoop.foreach { - case (dfs, mr, _) => - dfs.shutdown() - mr.shutdown() + hadoop.foreach { case (dfs, mr, _) => + dfs.shutdown() + mr.shutdown() } hadoop = None if (mutex) { diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/MakeJar.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/MakeJar.scala index 438efb6e33..b8b90c3e28 100644 --- a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/MakeJar.scala +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/MakeJar.scala @@ -12,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.platform -import java.io.{ BufferedInputStream, File, FileInputStream, FileOutputStream } -import java.util.jar.{ Attributes, JarEntry, JarOutputStream, Manifest => JarManifest } +import java.io.{BufferedInputStream, File, FileInputStream, FileOutputStream} +import java.util.jar.{Attributes, JarEntry, JarOutputStream, Manifest => JarManifest} import org.slf4j.LoggerFactory @@ -26,7 +26,8 @@ object MakeJar { def apply(classDir: File, jarName: Option[String] = None): File = { val syntheticJar = new File( System.getProperty("java.io.tmpdir"), - jarName.getOrElse(classDir.getAbsolutePath.replace("/", "_") + ".jar")) + jarName.getOrElse(classDir.getAbsolutePath.replace("/", "_") + ".jar") + ) LOG.debug("Creating synthetic jar: " + syntheticJar.getAbsolutePath) val manifest = new JarManifest manifest.getMainAttributes.put(Attributes.Name.MANIFEST_VERSION, "1.0") @@ -45,7 +46,7 @@ object MakeJar { target.putNextEntry(entry) target.closeEntry() } - source.listFiles.foreach { add(parent, _, target) } + source.listFiles.foreach(add(parent, _, target)) } else { val entry = new JarEntry(name) entry.setTime(source.lastModified) @@ -65,14 +66,17 @@ object MakeJar { // Note that this assumes that parent and source are in absolute form if that's what we want @annotation.tailrec private[this] def getRelativeFileBetween( - parent: File, source: File, result: List[String] = List.empty): Option[File] = + parent: File, + source: File, + result: List[String] = List.empty + ): Option[File] = Option(source) match { // linter:disable:UseOptionFlatMapNotPatMatch // need as is for tailrec case Some(src) => { if (parent == src) { result.foldLeft(None: Option[File]) { (cum, part) => Some(cum match { case Some(p) => new File(p, part) - case None => new File(part) + case None => new File(part) }) } } else { diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/Scalatest.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/Scalatest.scala index 44076cb5d8..72b4d8f609 100644 --- a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/Scalatest.scala +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/Scalatest.scala @@ -12,14 +12,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.platform -import org.scalatest.{ BeforeAndAfterEach, Suite } +import org.scalatest.{BeforeAndAfterEach, Suite} /** - * This is a mixin fixture for scalatest which makes it easy to use a LocalCluster and will manage - * the lifecycle of one appropriately. + * This is a mixin fixture for scalatest which makes it easy to use a LocalCluster and will manage the + * lifecycle of one appropriately. */ trait HadoopPlatformTest extends BeforeAndAfterEach { this: Suite => org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(org.apache.log4j.Level.ERROR) @@ -39,7 +39,7 @@ trait HadoopPlatformTest extends BeforeAndAfterEach { this: Suite => //TODO is there a way to buffer such that we see test results AFTER afterEach? 
Otherwise the results // get lost in the logging - override def afterEach(): Unit = { + override def afterEach(): Unit = try super.afterEach() finally { // Necessary because afterAll can be called from a different thread and we want to make sure that the state @@ -49,5 +49,4 @@ trait HadoopPlatformTest extends BeforeAndAfterEach { this: Suite => cluster.shutdown() } } - } } diff --git a/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformExecutionTest.scala b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformExecutionTest.scala index 42c92f0119..dff9366ba7 100644 --- a/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformExecutionTest.scala +++ b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformExecutionTest.scala @@ -1,7 +1,7 @@ package com.twitter.scalding.platform -import com.twitter.scalding.{ Config, Execution, TypedPipe, TypedTsv } -import org.scalatest.{ Matchers, WordSpec } +import com.twitter.scalding.{Config, Execution, TypedPipe, TypedTsv} +import org.scalatest.{Matchers, WordSpec} import scala.io.Source object InAndOutExecution extends Function[Config, Execution[Unit]] { @@ -72,7 +72,7 @@ class PlatformExecutionTest extends WordSpec with Matchers with HadoopSharedPlat "reading then writing shouldn't change the data" in { HadoopPlatformExecutionTest(InAndOutExecution, cluster) .source("input", inAndOut) - .sink[String]("output") { _.toSet shouldBe inAndOut.toSet } + .sink[String]("output")(_.toSet shouldBe inAndOut.toSet) .run() } } @@ -84,7 +84,7 @@ class PlatformExecutionTest extends WordSpec with Matchers with HadoopSharedPlat HadoopPlatformExecutionTest(OneDistributedCacheExecution, cluster) .data(one) .source("input", input) - .sink[String]("output") { _ shouldBe output } + .sink[String]("output")(_ shouldBe output) .run() } @@ -95,7 +95,7 @@ class PlatformExecutionTest extends WordSpec with Matchers with HadoopSharedPlat .data(first) .data(second) .source("input", input) - .sink[String]("output") { _ shouldBe output } + .sink[String]("output")(_ shouldBe output) .run() } } diff --git a/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformTest.scala b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformTest.scala index 002dc6eaae..b8519580ba 100644 --- a/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformTest.scala +++ b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformTest.scala @@ -12,20 +12,20 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.platform -import java.util.{ Iterator => JIterator } +import java.util.{Iterator => JIterator} import cascading.flow.FlowException -import cascading.pipe.joiner.{ InnerJoin, JoinerClosure } +import cascading.pipe.joiner.{InnerJoin, JoinerClosure} import cascading.tap.Tap -import cascading.tuple.{ Fields, Tuple } +import cascading.tuple.{Fields, Tuple} import com.twitter.scalding._ import com.twitter.scalding.serialization.OrderedSerialization -import com.twitter.scalding.source.{ FixedTypedText, NullSink, TypedText } -import org.scalacheck.{ Arbitrary, Gen } -import org.scalatest.{ Matchers, WordSpec } +import com.twitter.scalding.source.{FixedTypedText, NullSink, TypedText} +import org.scalacheck.{Arbitrary, Gen} +import org.scalatest.{Matchers, WordSpec} import scala.collection.JavaConverters._ import scala.language.experimental.macros @@ -54,7 +54,7 @@ class TinyJoinAndMergeJob(args: Args) extends Job(args) { .mapTo(0 -> 'id) { v: Int => v } .joinWithTiny('id -> 'id, people) - (messages ++ people).groupBy('id) { _.size('count) }.write(output) + (messages ++ people).groupBy('id)(_.size('count)).write(output) } object TsvNoCacheJob { @@ -70,11 +70,11 @@ object TsvNoCacheJob { class TsvNoCacheJob(args: Args) extends Job(args) { import TsvNoCacheJob._ dataInput.read - .flatMap(new cascading.tuple.Fields(Integer.valueOf(0)) -> 'word){ line: String => line.split("\\s") } - .groupBy('word){ group => group.size } - .mapTo('word -> 'num) { (w: String) => w.toFloat } + .flatMap(new cascading.tuple.Fields(Integer.valueOf(0)) -> 'word) { line: String => line.split("\\s") } + .groupBy('word)(group => group.size) + .mapTo('word -> 'num)((w: String) => w.toFloat) .write(throwAwayOutput) - .groupAll { _.sortBy('num) } + .groupAll(_.sortBy('num)) .write(realOuput) } @@ -101,7 +101,7 @@ class NormalDistinctJob(args: Args) extends Job(args) { object MultipleGroupByJobData { val data: List[String] = { val rnd = new scala.util.Random(22) - (0 until 20).map { _ => rnd.nextLong.toString }.toList + (0 until 20).map(_ => rnd.nextLong.toString).toList }.distinct } @@ -109,16 +109,17 @@ class MultipleGroupByJob(args: Args) extends Job(args) { import com.twitter.scalding.serialization._ import MultipleGroupByJobData._ implicit val stringOrdSer: OrderedSerialization[String] = new StringOrderedSerialization() - implicit val stringTup2OrdSer: OrderedSerialization[(String, String)] = new OrderedSerialization2(stringOrdSer, stringOrdSer) - val otherStream = TypedPipe.from(data).map{ k => (k, k) }.group + implicit val stringTup2OrdSer: OrderedSerialization[(String, String)] = + new OrderedSerialization2(stringOrdSer, stringOrdSer) + val otherStream = TypedPipe.from(data).map(k => (k, k)).group - TypedPipe.from(data) - .map{ k => (k, 1L) } + TypedPipe + .from(data) + .map(k => (k, 1L)) .group(stringOrdSer) .sum - .map { - case (k, _) => - ((k, k), 1L) + .map { case (k, _) => + ((k, k), 1L) } .sumByKey(stringTup2OrdSer, implicitly) .map(_._1._1) @@ -206,7 +207,7 @@ class TypedPipeHashJoinWithGroupByJob(args: Args) extends Job(args) { val x = TypedPipe.from[(String, Int)](Tsv("input1", ('x1, 'y1)), Fields.ALL) val y = Tsv("input2", ('x2, 'y2)) - val yGroup = y.groupBy('x2){ p => p } + val yGroup = y.groupBy('x2)(p => p) val yTypedPipe = TypedPipe.from[(String, Int)](yGroup, Fields.ALL) x.hashJoin(yTypedPipe) @@ -226,7 +227,7 @@ class TypedPipeHashJoinWithCoGroupJob(args: Args) extends Job(args) { } val coGroupTypedPipe = TypedPipe.from[(Int, Int, Int)](coGroupPipe, Fields.ALL) - val 
coGroupTuplePipe = coGroupTypedPipe.map{ case (a, b, c) => (a, (b, c)) } + val coGroupTuplePipe = coGroupTypedPipe.map { case (a, b, c) => (a, (b, c)) } x.hashJoin(coGroupTuplePipe) .withDescription("hashJoin") .write(TypedTsv[(Int, (Int, (Int, Int)))]("output")) @@ -237,7 +238,7 @@ class TypedPipeHashJoinWithEveryJob(args: Args) extends Job(args) { val x = TypedPipe.from[(Int, String)](Tsv("input1", ('x1, 'y1)), Fields.ALL) val y = Tsv("input2", ('x2, 'y2)).groupBy('x2) { - _.foldLeft('y2 -> 'y2)(0){ (b: Int, a: Int) => b + a } + _.foldLeft('y2 -> 'y2)(0)((b: Int, a: Int) => b + a) } val yTypedPipe = TypedPipe.from[(Int, Int)](y, Fields.ALL) @@ -248,9 +249,10 @@ class TypedPipeHashJoinWithEveryJob(args: Args) extends Job(args) { class TypedPipeForceToDiskWithDescriptionJob(args: Args) extends Job(args) { val writeWords = { - TypedPipe.from[String](List("word1 word2", "word1", "word2")) + TypedPipe + .from[String](List("word1 word2", "word1", "word2")) .withDescription("write words to disk") - .flatMap { _.split("\\s+") } + .flatMap(_.split("\\s+")) .forceToDisk } writeWords @@ -262,9 +264,10 @@ class TypedPipeForceToDiskWithDescriptionJob(args: Args) extends Job(args) { class GroupedLimitJobWithSteps(args: Args) extends Job(args) { val writeWords = - TypedPipe.from[String](List("word1 word2", "word1", "word2")) - .flatMap { _.split("\\s+") } - .map{ k => k -> 1L } + TypedPipe + .from[String](List("word1 word2", "word1", "word2")) + .flatMap(_.split("\\s+")) + .map(k => k -> 1L) .sumByKey .limit(3) @@ -285,7 +288,7 @@ object OrderedSerializationTest { implicit val genASGK: Arbitrary[NestedCaseClass] = Arbitrary { for { ts <- Arbitrary.arbitrary[Long] - b <- Gen.nonEmptyListOf(Gen.alphaNumChar).map (_.mkString) + b <- Gen.nonEmptyListOf(Gen.alphaNumChar).map(_.mkString) } yield NestedCaseClass(RichDate(ts), (b, b)) } @@ -297,7 +300,8 @@ case class NestedCaseClass(day: RichDate, key: (String, String)) // Need to define this in a separate companion object to work around Scala 2.12 compile issues object OrderedSerializationImplicitDefs { - implicit def primitiveOrderedBufferSupplier[T]: OrderedSerialization[T] = macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] + implicit def primitiveOrderedBufferSupplier[T]: OrderedSerialization[T] = + macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] } class ComplexJob(input: List[NestedCaseClass], args: Args) extends Job(args) { @@ -307,12 +311,12 @@ class ComplexJob(input: List[NestedCaseClass], args: Args) extends Job(args) { val ds2 = TypedPipe.from(input).map(_ -> 1L).distinct.group - ds2 - .keys + ds2.keys .map(s => s.toString) .write(TypedTsv[String](args("output1"))) - ds2.join(ds1) + ds2 + .join(ds1) .values .map(_.toString) .write(TypedTsv[String](args("output2"))) @@ -374,16 +378,21 @@ class CheckForFlowProcessInTypedJob(args: Args) extends Job(args) { val inA = TypedPipe.from(TypedTsv[(String, String)]("inputA")) val inB = TypedPipe.from(TypedTsv[(String, String)]("inputB")) - inA.group.join(inB.group).forceToReducers.mapGroup((key, valuesIter) => { - stat.inc() + inA.group + .join(inB.group) + .forceToReducers + .mapGroup { (key, valuesIter) => + stat.inc() - val flowProcess = RuntimeStats.getFlowProcessForUniqueId(uniqueID) - if (flowProcess == null) { - throw new NullPointerException("No active FlowProcess was available.") - } + val flowProcess = RuntimeStats.getFlowProcessForUniqueId(uniqueID) + if (flowProcess == null) { + throw new NullPointerException("No active 
FlowProcess was available.") + } - valuesIter.map({ case (a, b) => s"$a:$b" }) - }).toTypedPipe.write(TypedTsv[(String, String)]("output")) + valuesIter.map { case (a, b) => s"$a:$b" } + } + .toTypedPipe + .write(TypedTsv[(String, String)]("output")) } case class BypassValidationSource(path: String) extends FixedTypedText[Int](TypedText.TAB, path) { @@ -391,7 +400,7 @@ case class BypassValidationSource(path: String) extends FixedTypedText[Int](Type override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = (mode, readOrWrite) match { case (hdfsMode: Hdfs, Read) => new InvalidSourceTap(Seq(path)) - case _ => super.createTap(readOrWrite) + case _ => super.createTap(readOrWrite) } } @@ -402,17 +411,20 @@ class ReadPathJob(args: Args) extends Job(args) { } object PlatformTest { - def setAutoForceRight(mode: Mode, autoForce: Boolean): Unit = { + def setAutoForceRight(mode: Mode, autoForce: Boolean): Unit = mode match { case h: HadoopMode => val config = h.jobConf config.setBoolean(Config.HashJoinAutoForceRight, autoForce) case _ => () } - } } -class TestTypedEmptySource extends FileSource with TextSourceScheme with Mappable[(Long, String)] with SuccessFileSource { +class TestTypedEmptySource + extends FileSource + with TextSourceScheme + with Mappable[(Long, String)] + with SuccessFileSource { override def hdfsPaths: Iterable[String] = Iterable.empty override def localPaths: Iterable[String] = Iterable.empty override def converter[U >: (Long, String)] = @@ -423,7 +435,8 @@ class TestTypedEmptySource extends FileSource with TextSourceScheme with Mappabl // due to the directory being empty (but for a _SUCCESS file) // We test out that this shouldn't result in a Cascading planner error during {@link Job.buildFlow} class EmptyDataJob(args: Args) extends Job(args) { - TypedPipe.from(new TestTypedEmptySource) + TypedPipe + .from(new TestTypedEmptySource) .map { case (offset, line) => line } .write(TypedTsv[String]("output")) } @@ -438,7 +451,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest "reading then writing shouldn't change the data" in { HadoopPlatformJobTest(new InAndOutJob(_), cluster) .source("input", inAndOut) - .sink[String]("output") { _.toSet shouldBe (inAndOut.toSet) } + .sink[String]("output")(_.toSet shouldBe (inAndOut.toSet)) .run() } } @@ -450,7 +463,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest HadoopPlatformJobTest(new TinyJoinAndMergeJob(_), cluster) .source(peopleInput, peopleData) .source(messageInput, messageData) - .sink(output) { _.toSet shouldBe (outputData.toSet) } + .sink(output)(_.toSet shouldBe (outputData.toSet)) .run() } } @@ -461,8 +474,12 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest "Writing to a tsv in a flow shouldn't effect the output" in { HadoopPlatformJobTest(new TsvNoCacheJob(_), cluster) .source(dataInput, data) - .sink(typedThrowAwayOutput) { _.toSet should have size 4 } - .sink(typedRealOutput) { _.map{ f: Float => (f * 10).toInt }.toList shouldBe (outputData.map{ f: Float => (f * 10).toInt }.toList) } + .sink(typedThrowAwayOutput)(_.toSet should have size 4) + .sink(typedRealOutput) { + _.map { f: Float => (f * 10).toInt }.toList shouldBe (outputData.map { f: Float => + (f * 10).toInt + }.toList) + } .run() } } @@ -473,7 +490,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest "do some ops and not stamp on each other ordered serializations" in { HadoopPlatformJobTest(new 
MultipleGroupByJob(_), cluster) .source[String]("input", data) - .sink[String]("output") { _.toSet shouldBe data.map(_.toString).toSet } + .sink[String]("output")(_.toSet shouldBe data.map(_.toString).toSet) .run() } @@ -488,10 +505,10 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val secondStep = steps.filter(_.getName.startsWith("(2/2")) val lab1 = firstStep.map(_.getConfig.get(Config.StepDescriptions)) lab1 should have size 1 - lab1(0) should include ("write words to disk") + lab1(0) should include("write words to disk") val lab2 = secondStep.map(_.getConfig.get(Config.StepDescriptions)) lab2 should have size 1 - lab2(0) should include ("output frequency by length") + lab2(0) should include("output frequency by length") } .run() } @@ -519,8 +536,8 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val steps = flow.getFlowSteps.asScala steps should have size 1 val firstStep = steps.headOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") - firstStep should include ("leftJoin") - firstStep should include ("hashJoin") + firstStep should include("leftJoin") + firstStep should include("hashJoin") steps.map(_.getConfig.get(Config.StepDescriptions)).foreach(s => info(s)) } .run() @@ -535,7 +552,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val steps = flow.getFlowSteps.asScala steps should have size 2 val secondStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") - secondStep should include ("hashJoin") + secondStep should include("hashJoin") } .run() } @@ -549,7 +566,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val steps = flow.getFlowSteps.asScala steps should have size 3 val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") - lastStep should include ("hashJoin") + lastStep should include("hashJoin") } .run() } @@ -563,7 +580,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val steps = flow.getFlowSteps.asScala steps should have size 2 val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") - lastStep should include ("hashJoin") + lastStep should include("hashJoin") } .run() } @@ -577,7 +594,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val steps = flow.getFlowSteps.asScala steps should have size 2 val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") - lastStep should include ("hashJoin") + lastStep should include("hashJoin") } .run() } @@ -591,7 +608,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val steps = flow.getFlowSteps.asScala steps should have size 3 val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") - lastStep should include ("hashJoin") + lastStep should include("hashJoin") } .run() } @@ -601,12 +618,15 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest "have a custom step name from withDescription and no extra forceToDisk after groupBy on hashJoin's rhs" in { HadoopPlatformJobTest(new TypedPipeHashJoinWithGroupByJob(_), cluster) .source(TypedTsv[(String, Int)]("input1"), Seq(("first", 45))) - .source(TypedTsv[(String, Int)]("input2"), Seq(("first", 1), ("first", 2), ("first", 3), ("second", 1), ("second", 2))) + .source( + TypedTsv[(String, Int)]("input2"), + Seq(("first", 1), ("first", 2), 
("first", 3), ("second", 1), ("second", 2)) + ) .inspectCompletedFlow { flow => val steps = flow.getFlowSteps.asScala steps should have size 2 val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") - lastStep should include ("hashJoin") + lastStep should include("hashJoin") } .run() } @@ -621,7 +641,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val steps = flow.getFlowSteps.asScala steps should have size 2 val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") - lastStep should include ("hashJoin") + lastStep should include("hashJoin") } .run() } @@ -636,7 +656,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val steps = flow.getFlowSteps.asScala steps should have size 2 val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") - lastStep should include ("hashJoin") + lastStep should include("hashJoin") } .run() } @@ -647,19 +667,21 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest HadoopPlatformJobTest(new TypedPipeWithDescriptionJob(_), cluster) .inspectCompletedFlow { flow => val steps = flow.getFlowSteps.asScala - val descs = List("map stage - assign words to 1", + val descs = List( + "map stage - assign words to 1", "reduce stage - sum", "write", // should see the .group and the .write show up as line numbers "com.twitter.scalding.platform.TypedPipeWithDescriptionJob.(TestJobsWithDescriptions.scala:30)", - "com.twitter.scalding.platform.TypedPipeWithDescriptionJob.(TestJobsWithDescriptions.scala:34)") + "com.twitter.scalding.platform.TypedPipeWithDescriptionJob.(TestJobsWithDescriptions.scala:34)" + ) val foundDescs = steps.map(_.getConfig.get(Config.StepDescriptions)) descs.foreach { d => assert(foundDescs.size == 1) assert(foundDescs(0).contains(d)) } - //steps.map(_.getConfig.get(Config.StepDescriptions)).foreach(s => info(s)) + //steps.map(_.getConfig.get(Config.StepDescriptions)).foreach(s => info(s)) } .run() } @@ -671,19 +693,19 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest "distinct properly from normal data" in { HadoopPlatformJobTest(new NormalDistinctJob(_), cluster) .source[String]("input", data ++ data ++ data) - .sink[String]("output") { _.toList shouldBe data } + .sink[String]("output")(_.toList shouldBe data) .run() } "distinctBy(identity) properly from a list in memory" in { HadoopPlatformJobTest(new IterableSourceDistinctIdentityJob(_), cluster) - .sink[String]("output") { _.toList shouldBe data } + .sink[String]("output")(_.toList shouldBe data) .run() } "distinct properly from a list" in { HadoopPlatformJobTest(new IterableSourceDistinctJob(_), cluster) - .sink[String]("output") { _.toList shouldBe data } + .sink[String]("output")(_.toList shouldBe data) .run() } } @@ -709,8 +731,8 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest // Here we are just testing that we hit no exceptions in the course of this run // the previous issue would have caused OOM or other exceptions. If we get to the end // then we are good. 
- .sink[String](TypedTsv[String]("output2")) { x => () } - .sink[String](TypedTsv[String]("output1")) { x => () } + .sink[String](TypedTsv[String]("output2"))(x => ()) + .sink[String](TypedTsv[String]("output1"))(x => ()) .run() } @@ -722,8 +744,8 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest // Here we are just testing that we hit no exceptions in the course of this run // the previous issue would have caused OOM or other exceptions. If we get to the end // then we are good. - .sink[String](TypedTsv[String]("output2")) { x => () } - .sink[String](TypedTsv[String]("output1")) { x => () } + .sink[String](TypedTsv[String]("output2"))(x => ()) + .sink[String](TypedTsv[String]("output1"))(x => ()) .run() } } @@ -736,9 +758,9 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest .sink(TypedTsv[(String, String)]("output")) { _ => // The job will fail with an exception if the FlowProcess is unavailable. } - .inspectCompletedFlow({ flow => + .inspectCompletedFlow { flow => flow.getFlowStats.getCounterValue(Stats.ScaldingGroup, "joins") shouldBe 2 - }) + } .run() } @@ -749,9 +771,9 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest .sink[(String, String)](TypedTsv[(String, String)]("output")) { _ => // The job will fail with an exception if the FlowProcess is unavailable. } - .inspectCompletedFlow({ flow => + .inspectCompletedFlow { flow => flow.getFlowStats.getCounterValue(Stats.ScaldingGroup, "joins") shouldBe 2 - }) + } .run() } } diff --git a/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/TestJobsWithDescriptions.scala b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/TestJobsWithDescriptions.scala index b50c26e8e7..7a7198e5fb 100644 --- a/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/TestJobsWithDescriptions.scala +++ b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/TestJobsWithDescriptions.scala @@ -24,9 +24,10 @@ class TypedPipeJoinWithDescriptionJob(args: Args) extends Job(args) { } class TypedPipeWithDescriptionJob(args: Args) extends Job(args) { - TypedPipe.from[String](List("word1", "word1", "word2")) + TypedPipe + .from[String](List("word1", "word1", "word2")) .withDescription("map stage - assign words to 1") - .map { w => (w, 1L) } + .map(w => (w, 1L)) .group .withDescription("reduce stage - sum") .sum diff --git a/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryService.scala b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryService.scala index 084e5ad18b..534df99b02 100644 --- a/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryService.scala +++ b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryService.scala @@ -1,15 +1,15 @@ package com.twitter.scalding.hraven.estimation import cascading.flow.FlowStep -import com.twitter.hraven.JobDescFactory.{ JOBTRACKER_KEY, RESOURCE_MANAGER_KEY } +import com.twitter.hraven.JobDescFactory.{JOBTRACKER_KEY, RESOURCE_MANAGER_KEY} import com.twitter.hraven.rest.client.HRavenRestClient -import com.twitter.hraven.{ Constants, CounterMap, Flow, HadoopVersion, JobDetails, TaskDetails } -import com.twitter.scalding.estimation.{ FlowStepHistory, FlowStepKeys, FlowStrategyInfo, HistoryService, Task } +import com.twitter.hraven.{Constants, CounterMap, Flow, HadoopVersion, JobDetails, TaskDetails} +import com.twitter.scalding.estimation.{FlowStepHistory, 
FlowStepKeys, FlowStrategyInfo, HistoryService, Task} import java.io.IOException import org.apache.hadoop.mapred.JobConf import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ -import scala.util.{ Failure, Success, Try } +import scala.util.{Failure, Success, Try} object HRavenClient { import HRavenHistoryService.jobConfToRichConfig @@ -22,10 +22,15 @@ object HRavenClient { private final val clientReadTimeoutDefault = 30000 def apply(conf: JobConf): Try[HRavenRestClient] = - conf.getFirstKey(apiHostnameKey) - .map(new HRavenRestClient(_, - conf.getInt(clientConnectTimeoutKey, clientConnectTimeoutDefault), - conf.getInt(clientReadTimeoutKey, clientReadTimeoutDefault))) + conf + .getFirstKey(apiHostnameKey) + .map( + new HRavenRestClient( + _, + conf.getInt(clientConnectTimeoutKey, clientConnectTimeoutDefault), + conf.getInt(clientReadTimeoutKey, clientReadTimeoutDefault) + ) + ) } object HRavenHistoryService { @@ -52,16 +57,17 @@ object HRavenHistoryService { } /** - * Try fields in order until one returns a value. - * Logs a warning if nothing was found. + * Try fields in order until one returns a value. Logs a warning if nothing was found. */ def getFirstKey(fields: String*): Try[String] = - fields.collectFirst { - case f if conf.get(f) != null => Success(conf.get(f)) - }.getOrElse { - LOG.warn("Missing required config param: " + fields.mkString(" or ")) - Failure(MissingFieldsException(fields)) - } + fields + .collectFirst { + case f if conf.get(f) != null => Success(conf.get(f)) + } + .getOrElse { + LOG.warn("Missing required config param: " + fields.mkString(" or ")) + Failure(MissingFieldsException(fields)) + } } implicit def jobConfToRichConfig(conf: JobConf): RichConfig = RichConfig(conf) @@ -86,24 +92,25 @@ trait HRavenHistoryService extends HistoryService { def hRavenClient(conf: JobConf): Try[HRavenRestClient] = HRavenClient(conf) /** - * Fetch flows until it finds one that was successful - * (using "HdfsBytesRead > 0" as a marker for successful jobs since it seems - * that this is only set on completion of jobs) + * Fetch flows until it finds one that was successful (using "HdfsBytesRead > 0" as a marker for successful + * jobs since it seems that this is only set on completion of jobs) * - * TODO: query hRaven for successful jobs (first need to add ability to filter - * results in hRaven REST API) + * TODO: query hRaven for successful jobs (first need to add ability to filter results in hRaven REST API) */ private def fetchSuccessfulFlows( - client: HRavenRestClient, - cluster: String, - user: String, - batch: String, - signature: String, - stepNum: Int, - max: Int, - nFetch: Int): Try[Seq[Flow]] = - Try(client - .fetchFlowsWithConfig(cluster, user, batch, signature, nFetch, RequiredJobConfigs: _*)) + client: HRavenRestClient, + cluster: String, + user: String, + batch: String, + signature: String, + stepNum: Int, + max: Int, + nFetch: Int + ): Try[Seq[Flow]] = + Try( + client + .fetchFlowsWithConfig(cluster, user, batch, signature, nFetch, RequiredJobConfigs: _*) + ) .flatMap { flows => Try { // Ugly mutable code to add task info to flows @@ -117,7 +124,12 @@ trait HRavenHistoryService extends HistoryService { val tasks = if (counterFields.isEmpty) { client.fetchTaskDetails(flow.getCluster, job.getJobId, detailFields.asJava) } else { - client.fetchTaskDetails(flow.getCluster, job.getJobId, detailFields.asJava, counterFields.asJava) + client.fetchTaskDetails( + flow.getCluster, + job.getJobId, + detailFields.asJava, + counterFields.asJava + ) } 
job.addTasks(tasks) } @@ -129,35 +141,38 @@ trait HRavenHistoryService extends HistoryService { } successfulFlows } - }.recoverWith { - case e: IOException => - LOG.error("Error making API request to hRaven. HRavenHistoryService will be disabled.") - Failure(e) + } + .recoverWith { case e: IOException => + LOG.error("Error making API request to hRaven. HRavenHistoryService will be disabled.") + Failure(e) } /** - * Fetch info from hRaven for the last time the given JobStep ran. - * Finds the last successful complete flow and selects the corresponding - * step from it. + * Fetch info from hRaven for the last time the given JobStep ran. Finds the last successful complete flow + * and selects the corresponding step from it. * - * @param step FlowStep to get info for - * @return Details about the previous successful run. + * @param step + * FlowStep to get info for + * @return + * Details about the previous successful run. */ def fetchPastJobDetails(step: FlowStep[JobConf], max: Int): Try[Seq[JobDetails]] = { val conf = step.getConfig val stepNum = step.getStepNum def findMatchingJobStep(pastFlow: Flow) = - pastFlow.getJobs.asScala.find { step => - try { - step.getConfiguration.get("cascading.flow.step.num").toInt == stepNum - } catch { - case _: NumberFormatException => false + pastFlow.getJobs.asScala + .find { step => + try { + step.getConfiguration.get("cascading.flow.step.num").toInt == stepNum + } catch { + case _: NumberFormatException => false + } + } + .orElse { + LOG.warn("No matching job step in the retrieved hRaven flow.") + None } - } orElse { - LOG.warn("No matching job step in the retrieved hRaven flow.") - None - } def lookupClusterName(client: HRavenRestClient): Try[String] = { // regex for case matching URL to get hostname out @@ -198,18 +213,24 @@ trait HRavenHistoryService extends HistoryService { fetchPastJobDetails(info.step, maxHistory).map { history => for { step <- history // linter:disable:MergeMaps - keys = FlowStepKeys(step.getJobName, step.getUser, step.getPriority, step.getStatus, step.getVersion, "") + keys = FlowStepKeys( + step.getJobName, + step.getUser, + step.getPriority, + step.getStatus, + step.getVersion, + "" + ) // update HRavenHistoryService.TaskDetailFields when consuming additional task fields from hraven below tasks = step.getTasks.asScala.flatMap { taskDetails => - details(taskDetails).zip(counters(taskDetails.getCounters)).map { - case (details, counters) => - Task(details, counters) + details(taskDetails).zip(counters(taskDetails.getCounters)).map { case (details, counters) => + Task(details, counters) } } } yield toFlowStepHistory(keys, step, tasks) } - private def toFlowStepHistory(keys: FlowStepKeys, step: JobDetails, tasks: Seq[Task]) = { + private def toFlowStepHistory(keys: FlowStepKeys, step: JobDetails, tasks: Seq[Task]) = FlowStepHistory( keys = keys, submitTimeMillis = step.getSubmitTime, @@ -231,18 +252,21 @@ trait HRavenHistoryService extends HistoryService { reducerTimeMillis = step.getReduceSlotMillis, reduceShuffleBytes = step.getReduceShuffleBytes, cost = 0, - tasks = tasks) - } + tasks = tasks + ) - private def mapOutputBytes(step: JobDetails): Long = { + private def mapOutputBytes(step: JobDetails): Long = if (step.getHadoopVersion == HadoopVersion.TWO) { getCounterValueAsLong(step.getMapCounters, Constants.TASK_COUNTER_HADOOP2, MapOutputBytesKey) } else { getCounterValueAsLong(step.getMapCounters, Constants.TASK_COUNTER, MapOutputBytesKey) } - } - private def getCounterValueAsLong(counters: CounterMap, counterGroupName: String, 
counterName: String): Long = { + private def getCounterValueAsLong( + counters: CounterMap, + counterGroupName: String, + counterName: String + ): Long = { val counter = counters.getCounter(counterGroupName, counterName) if (counter != null) counter.getValue else 0L } diff --git a/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/memory/HRavenMemoryService.scala b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/memory/HRavenMemoryService.scala index f7eafed942..13fc335295 100644 --- a/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/memory/HRavenMemoryService.scala +++ b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/memory/HRavenMemoryService.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.hraven.estimation.memory -import com.twitter.hraven.{ CounterMap, TaskDetails } +import com.twitter.hraven.{CounterMap, TaskDetails} import com.twitter.scalding.estimation.Task import com.twitter.scalding.estimation.memory.SmoothedHistoryMemoryEstimator import com.twitter.scalding.hraven.estimation.HRavenHistoryService @@ -15,30 +15,32 @@ trait HRavenMemoryHistoryService extends HRavenHistoryService { "org.apache.hadoop.mapreduce.TaskCounter.COMMITTED_HEAP_BYTES", "org.apache.hadoop.mapreduce.TaskCounter.PHYSICAL_MEMORY_BYTES", "org.apache.hadoop.mapreduce.TaskCounter.GC_TIME_MILLIS", - "org.apache.hadoop.mapreduce.TaskCounter.CPU_MILLISECONDS") + "org.apache.hadoop.mapreduce.TaskCounter.CPU_MILLISECONDS" + ) - override protected def details(taskDetails: TaskDetails): Option[Map[String, Any]] = { + override protected def details(taskDetails: TaskDetails): Option[Map[String, Any]] = if (taskDetails.getType.nonEmpty) { Some(Map(Task.TaskType -> taskDetails.getType)) } else { None } - } - override protected def counters(taskCounters: CounterMap): Option[Map[String, Long]] = { + override protected def counters(taskCounters: CounterMap): Option[Map[String, Long]] = //sometimes get groups with only partial data if (taskCounters.getGroups.isEmpty || taskCounters.getGroup(TaskCounterGroup).size() < 4) { None } else { val group = taskCounters.getGroup(TaskCounterGroup) - Some(Map( - CommittedHeapBytes -> group.get(CommittedHeapBytes).getValue, - CpuMs -> group.get(CpuMs).getValue, - PhysicalMemoryBytes -> group.get(PhysicalMemoryBytes).getValue, - GCTimeMs -> group.get(GCTimeMs).getValue)) + Some( + Map( + CommittedHeapBytes -> group.get(CommittedHeapBytes).getValue, + CpuMs -> group.get(CpuMs).getValue, + PhysicalMemoryBytes -> group.get(PhysicalMemoryBytes).getValue, + GCTimeMs -> group.get(GCTimeMs).getValue + ) + ) } - } } object HRavenMemoryHistoryService extends HRavenMemoryHistoryService diff --git a/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/reducer_estimation/HRavenBasedEstimator.scala b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/reducer_estimation/HRavenBasedEstimator.scala index c12801e170..9e72ca57e8 100644 --- a/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/reducer_estimation/HRavenBasedEstimator.scala +++ b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/reducer_estimation/HRavenBasedEstimator.scala @@ -1,27 +1,26 @@ package com.twitter.scalding.hraven.reducer_estimation -import com.twitter.hraven.{ CounterMap, TaskDetails } +import com.twitter.hraven.{CounterMap, TaskDetails} import com.twitter.scalding.estimation.Task import com.twitter.scalding.hraven.estimation.HRavenHistoryService -import com.twitter.scalding.reducer_estimation.{ RatioBasedEstimator, 
RuntimeReducerEstimator } +import com.twitter.scalding.reducer_estimation.{RatioBasedEstimator, RuntimeReducerEstimator} trait HRavenReducerHistoryService extends HRavenHistoryService { override protected val counterFields: List[String] = List() - override protected val detailFields: List[String] = List( - Task.TaskType, - "status", - "startTime", - "finishTime") + override protected val detailFields: List[String] = List(Task.TaskType, "status", "startTime", "finishTime") override protected def counters(taskCounters: CounterMap): Option[Map[String, Long]] = Some(Map.empty) override protected def details(taskDetails: TaskDetails): Option[Map[String, Any]] = if (taskDetails.getType.nonEmpty) { - Some(Map( - Task.TaskType -> taskDetails.getType, - "status" -> taskDetails.getStatus, - "startTime" -> taskDetails.getStartTime, - "finishTime" -> taskDetails.getFinishTime)) + Some( + Map( + Task.TaskType -> taskDetails.getType, + "status" -> taskDetails.getStatus, + "startTime" -> taskDetails.getStartTime, + "finishTime" -> taskDetails.getFinishTime + ) + ) } else { None } diff --git a/scalding-hraven/src/test/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryServiceTest.scala b/scalding-hraven/src/test/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryServiceTest.scala index 734aa755e7..23bdf9ca07 100644 --- a/scalding-hraven/src/test/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryServiceTest.scala +++ b/scalding-hraven/src/test/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryServiceTest.scala @@ -4,7 +4,7 @@ import cascading.flow.FlowStep import com.twitter.hraven.JobDescFactory.RESOURCE_MANAGER_KEY import com.twitter.hraven.rest.client.HRavenRestClient import com.twitter.hraven.util.JSONUtil -import com.twitter.hraven.{ Flow, TaskDetails } +import com.twitter.hraven.{Flow, TaskDetails} import com.twitter.scalding.estimation.FlowStrategyInfo import com.twitter.scalding.hraven.estimation.memory.HRavenMemoryHistoryService import com.twitter.scalding.hraven.reducer_estimation.HRavenReducerHistoryService @@ -13,7 +13,7 @@ import org.apache.hadoop.mapred.JobConf import org.codehaus.jackson.`type`.TypeReference import org.mockito.Matchers._ import org.mockito.Mockito._ -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} import scala.collection.JavaConverters._ import scala.util.Try @@ -27,9 +27,7 @@ class HRavenHistoryServiceTest extends WordSpec with Matchers { HRavenMockedClient(super.hRavenClient(conf), detailFields, counterFields) } - val history = historyService.fetchHistory( - TestFlowStrategyInfo.dummy(), - HRavenMockedClient.nFetch) + val history = historyService.fetchHistory(TestFlowStrategyInfo.dummy(), HRavenMockedClient.nFetch) if (history.isFailure) { history.get @@ -53,9 +51,7 @@ class HRavenHistoryServiceTest extends WordSpec with Matchers { HRavenMockedClient(super.hRavenClient(conf), detailFields, counterFields) } - val history = historyService.fetchHistory( - TestFlowStrategyInfo.dummy(), - HRavenMockedClient.nFetch) + val history = historyService.fetchHistory(TestFlowStrategyInfo.dummy(), HRavenMockedClient.nFetch) if (history.isFailure) { history.get @@ -102,9 +98,10 @@ object HRavenMockedClient { val RequiredJobConfigs = Seq("cascading.flow.step.num") def apply( - hRaven: Try[HRavenRestClient], - detailFields: List[String], - counterFields: List[String]): Try[HRavenRestClient] = { + hRaven: Try[HRavenRestClient], + detailFields: List[String], + counterFields: List[String] + ): Try[HRavenRestClient] = 
hRaven.map { hRaven => val client = spy(hRaven) @@ -130,7 +127,6 @@ object HRavenMockedClient { client } - } def configure(conf: JobConf): Unit = { conf.set(HRavenClient.apiHostnameKey, "test") @@ -142,12 +138,18 @@ object HRavenMockedClient { } def flowsResponse: util.List[Flow] = - JSONUtil.readJson( - getClass.getResourceAsStream("../../../../../flowResponse.json"), - new TypeReference[util.List[Flow]] {}).asInstanceOf[util.List[Flow]] + JSONUtil + .readJson( + getClass.getResourceAsStream("../../../../../flowResponse.json"), + new TypeReference[util.List[Flow]] {} + ) + .asInstanceOf[util.List[Flow]] def jobResponse(jobId: String): util.List[TaskDetails] = - JSONUtil.readJson( - getClass.getResourceAsStream(s"../../../../../jobResponse_$jobId.json"), - new TypeReference[util.List[TaskDetails]] {}).asInstanceOf[util.List[TaskDetails]] + JSONUtil + .readJson( + getClass.getResourceAsStream(s"../../../../../jobResponse_$jobId.json"), + new TypeReference[util.List[TaskDetails]] {} + ) + .asInstanceOf[util.List[TaskDetails]] } diff --git a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/ColumnDefiner.scala b/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/ColumnDefiner.scala index 3ce8de2c34..4c6f75bcb4 100644 --- a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/ColumnDefiner.scala +++ b/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/ColumnDefiner.scala @@ -12,13 +12,31 @@ trait ColumnDefiner { protected def columnDefinitions: Array[Definition] = columns.map(_.definition).toArray // Some helper methods that we can use to generate column definitions - protected def bigint(name: String, nullable: IsNullable = NotNullable)(implicit coldef: DriverColumnDefiner[BIGINT.type]) = coldef(name, nullable) - protected def int(name: String, nullable: IsNullable = NotNullable)(implicit coldef: DriverColumnDefiner[INT.type]) = coldef(name, nullable) - protected def smallint(name: String, nullable: IsNullable = NotNullable)(implicit coldef: DriverColumnDefiner[SMALLINT.type]) = coldef(name, nullable) - protected def tinyint(name: String, nullable: IsNullable = NotNullable)(implicit coldef: DriverColumnDefiner[TINYINT.type]) = coldef(name, nullable) - protected def varchar(name: String, nullable: IsNullable = NotNullable)(implicit coldef: DriverColumnDefiner[VARCHAR.type]) = coldef(name, nullable) - protected def date(name: String, nullable: IsNullable = NotNullable)(implicit coldef: DriverColumnDefiner[DATE.type]) = coldef(name, nullable) - protected def datetime(name: String, nullable: IsNullable = NotNullable)(implicit coldef: DriverColumnDefiner[DATETIME.type]) = coldef(name, nullable) - protected def text(name: String, nullable: IsNullable = NotNullable)(implicit coldef: DriverColumnDefiner[TEXT.type]) = coldef(name, nullable) - protected def double(name: String, nullable: IsNullable = NotNullable)(implicit coldef: DriverColumnDefiner[DOUBLE.type]) = coldef(name, nullable) + protected def bigint(name: String, nullable: IsNullable = NotNullable)(implicit + coldef: DriverColumnDefiner[BIGINT.type] + ) = coldef(name, nullable) + protected def int(name: String, nullable: IsNullable = NotNullable)(implicit + coldef: DriverColumnDefiner[INT.type] + ) = coldef(name, nullable) + protected def smallint(name: String, nullable: IsNullable = NotNullable)(implicit + coldef: DriverColumnDefiner[SMALLINT.type] + ) = coldef(name, nullable) + protected def tinyint(name: String, nullable: IsNullable = NotNullable)(implicit + coldef: DriverColumnDefiner[TINYINT.type] + ) = coldef(name, 
nullable) + protected def varchar(name: String, nullable: IsNullable = NotNullable)(implicit + coldef: DriverColumnDefiner[VARCHAR.type] + ) = coldef(name, nullable) + protected def date(name: String, nullable: IsNullable = NotNullable)(implicit + coldef: DriverColumnDefiner[DATE.type] + ) = coldef(name, nullable) + protected def datetime(name: String, nullable: IsNullable = NotNullable)(implicit + coldef: DriverColumnDefiner[DATETIME.type] + ) = coldef(name, nullable) + protected def text(name: String, nullable: IsNullable = NotNullable)(implicit + coldef: DriverColumnDefiner[TEXT.type] + ) = coldef(name, nullable) + protected def double(name: String, nullable: IsNullable = NotNullable)(implicit + coldef: DriverColumnDefiner[DOUBLE.type] + ) = coldef(name, nullable) } diff --git a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/DriverColumnDefiner.scala b/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/DriverColumnDefiner.scala index 53195275fe..9b44eabb8f 100644 --- a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/DriverColumnDefiner.scala +++ b/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/DriverColumnDefiner.scala @@ -20,7 +20,8 @@ case object Nullable extends IsNullable("NULL") case object NotNullable extends IsNullable("NOT NULL") /** - * This is a mechanism by which different databases can control and configure the way in which statements are created. + * This is a mechanism by which different databases can control and configure the way in which statements are + * created. */ trait DriverColumnDefiner[Type <: JdbcType] { //TODO does this need to deal with sizes, or now that it's fixed per DB will that be fine? @@ -28,13 +29,14 @@ trait DriverColumnDefiner[Type <: JdbcType] { //TODO should use the fact that now we have more typed typeName protected def mkColumnDef( - name: String, - typeName: String, - nullable: IsNullable, - sizeOp: Option[Int] = None, - defOp: Option[String]) = { - val sizeStr = sizeOp.map { "(" + _.toString + ")" }.getOrElse("") - val defStr = defOp.map { " DEFAULT '" + _ + "' " }.getOrElse(" ") + name: String, + typeName: String, + nullable: IsNullable, + sizeOp: Option[Int] = None, + defOp: Option[String] + ) = { + val sizeStr = sizeOp.map("(" + _.toString + ")").getOrElse("") + val defStr = defOp.map(" DEFAULT '" + _ + "' ").getOrElse(" ") ColumnDefinition(ColumnName(name), Definition(typeName + sizeStr + defStr + nullable.get)) } diff --git a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCDriver.scala b/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCDriver.scala index 9a32922645..e125d400d5 100644 --- a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCDriver.scala +++ b/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCDriver.scala @@ -1,21 +1,23 @@ package com.twitter.scalding.jdbc -import cascading.jdbc.{ MySqlScheme, JDBCScheme, TableDesc } +import cascading.jdbc.{JDBCScheme, MySqlScheme, TableDesc} case class DriverClass(get: String) trait JdbcDriver { def driver: DriverClass def getTableDesc( - tableName: TableName, - columnNames: Array[ColumnName], - columnDefinitions: Array[Definition]) = + tableName: TableName, + columnNames: Array[ColumnName], + columnDefinitions: Array[Definition] + ) = new TableDesc(tableName.get, columnNames.map(_.get), columnDefinitions.map(_.get), null, null) def getJDBCScheme( - columnNames: Array[ColumnName], - filterCondition: Option[String], - updateBy: Iterable[String], - replaceOnInsert: Boolean) = { + columnNames: Array[ColumnName], + 
filterCondition: Option[String], + updateBy: Iterable[String], + replaceOnInsert: Boolean + ) = { if (replaceOnInsert) sys.error("replaceOnInsert functionality only supported by MySql") new JDBCScheme( null, // inputFormatClass @@ -23,35 +25,39 @@ trait JdbcDriver { columnNames.map(_.get), null, // orderBy filterCondition.orNull, - updateBy.toArray) + updateBy.toArray + ) } } trait MysqlDriver extends JdbcDriver with MysqlTableCreationImplicits { override val driver = DriverClass("com.mysql.jdbc.Driver") override def getTableDesc( - tableName: TableName, - columnNames: Array[ColumnName], - columnDefinitions: Array[Definition]) = + tableName: TableName, + columnNames: Array[ColumnName], + columnDefinitions: Array[Definition] + ) = new TableDesc( tableName.get, columnNames.map(_.get), columnDefinitions.map(_.get), null, - "SHOW TABLES LIKE '%s'") + "SHOW TABLES LIKE '%s'" + ) override def getJDBCScheme( - columnNames: Array[ColumnName], - filterCondition: Option[String], - updateBy: Iterable[String], - replaceOnInsert: Boolean) = { + columnNames: Array[ColumnName], + filterCondition: Option[String], + updateBy: Iterable[String], + replaceOnInsert: Boolean + ) = new MySqlScheme( null, // inputFormatClass columnNames.map(_.get), null, // orderBy filterCondition.orNull, updateBy.toArray, - replaceOnInsert) - } + replaceOnInsert + ) } trait HsqlDbDriver extends JdbcDriver { diff --git a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCSource.scala b/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCSource.scala index 7b48c77cce..cf35671d95 100644 --- a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCSource.scala +++ b/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCSource.scala @@ -12,38 +12,30 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.jdbc -import com.twitter.scalding.{ AccessMode, Hdfs, Mode, Source, TestTapFactory } +import com.twitter.scalding.{AccessMode, Hdfs, Mode, Source, TestTapFactory} import cascading.jdbc.JDBCTap import cascading.tap.Tap import cascading.tuple.Fields /** - * Extend this source to let scalding read from or write to a database. - * In order for this to work you need to specify the table name, column definitions and DB credentials. - * If you write to a DB, the fields in the final pipe have to correspond to the column names in the DB table. - * Example usage: - * case object YourTableSource extends JDBCSource { - * override val tableName = TableName("tableName") - * override val columns = List( - * varchar("col1", 64), - * date("col2"), - * tinyint("col3"), - * double("col4") - * ) - * override def currentConfig = ConnectionSpec( - * ConnectUrl("jdbc:mysql://mysql01.company.com:3306/production"), - * UserName("username"), Password("password"), - * MysqlDriver - * ) - * } + * Extend this source to let scalding read from or write to a database. In order for this to work you need to + * specify the table name, column definitions and DB credentials. If you write to a DB, the fields in the + * final pipe have to correspond to the column names in the DB table. 
Example usage: case object + * YourTableSource extends JDBCSource { override val tableName = TableName("tableName") override val columns = + * List( varchar("col1", 64), date("col2"), tinyint("col3"), double("col4") ) override def currentConfig = + * ConnectionSpec( ConnectUrl("jdbc:mysql://mysql01.company.com:3306/production"), UserName("username"), + * Password("password"), MysqlDriver ) } * - * @author Argyris Zymnis - * @author Oscar Boykin - * @author Kevin Lin + * @author + * Argyris Zymnis + * @author + * Oscar Boykin + * @author + * Kevin Lin */ abstract class JDBCSource extends Source with ColumnDefiner with JdbcDriver { @@ -82,7 +74,8 @@ abstract class JDBCSource extends Source with ColumnDefiner with JdbcDriver { passwd.get, driver.get, getTableDesc(tableName, columnNames, columnDefinitions), - getJDBCScheme(columnNames, filterCondition, updateBy, replaceOnInsert)) + getJDBCScheme(columnNames, filterCondition, updateBy, replaceOnInsert) + ) tap.setConcurrentReads(maxConcurrentReads) tap.setBatchSize(batchSize) tap @@ -109,4 +102,3 @@ abstract class JDBCSource extends Source with ColumnDefiner with JdbcDriver { } case class TableName(get: String) - diff --git a/scalding-jdbc/src/test/scala/com/twitter/scalding/jdbc/JDBCSourceCompileTest.scala b/scalding-jdbc/src/test/scala/com/twitter/scalding/jdbc/JDBCSourceCompileTest.scala index de2b015a1a..dcb1cd152c 100644 --- a/scalding-jdbc/src/test/scala/com/twitter/scalding/jdbc/JDBCSourceCompileTest.scala +++ b/scalding-jdbc/src/test/scala/com/twitter/scalding/jdbc/JDBCSourceCompileTest.scala @@ -11,7 +11,8 @@ class ExampleMysqlJdbcSource() extends JDBCSource with MysqlDriver { datetime("off"), text("of"), double("my"), - smallint("cloud")) + smallint("cloud") + ) override def currentConfig = ConnectionSpec(ConnectUrl("how"), UserName("are"), Password("you")) } @@ -24,7 +25,8 @@ class ExampleVerticaJdbcSource() extends JDBCSource with VerticaJdbcDriver { datetime("off"), text("of"), double("my"), - smallint("cloud")) + smallint("cloud") + ) override def currentConfig = ConnectionSpec(ConnectUrl("how"), UserName("are"), Password("you")) } diff --git a/scalding-json/src/main/scala/com/twitter/scalding/JsonLine.scala b/scalding-json/src/main/scala/com/twitter/scalding/JsonLine.scala index e9676d399c..cf05472bc0 100644 --- a/scalding-json/src/main/scala/com/twitter/scalding/JsonLine.scala +++ b/scalding-json/src/main/scala/com/twitter/scalding/JsonLine.scala @@ -12,63 +12,67 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.io.Serializable -import java.lang.reflect.{ Type, ParameterizedType } +import java.lang.reflect.{ParameterizedType, Type} import cascading.pipe.Pipe import cascading.tap.SinkMode -import cascading.tuple.{ Tuple, TupleEntry, Fields } +import cascading.tuple.{Fields, Tuple, TupleEntry} import com.fasterxml.jackson.core.`type`.TypeReference import com.fasterxml.jackson.module.scala._ import com.fasterxml.jackson.databind.ObjectMapper /** - * This Source writes out the TupleEntry as a simple JSON object, using the field - * names as keys and the string representation of the values. + * This Source writes out the TupleEntry as a simple JSON object, using the field names as keys and the string + * representation of the values. 
* - * TODO: it would be nice to have a way to add read/write transformations to pipes - * that doesn't require extending the sources and overriding methods. + * TODO: it would be nice to have a way to add read/write transformations to pipes that doesn't require + * extending the sources and overriding methods. * - * @param failOnEmptyLines When set to false, it just skips empty lines instead of failing the jobs. Defaults to true - * for backwards compatibility. + * @param failOnEmptyLines + * When set to false, it just skips empty lines instead of failing the jobs. Defaults to true for backwards + * compatibility. */ -case class JsonLine(p: String, fields: Fields = Fields.ALL, - override val sinkMode: SinkMode = SinkMode.REPLACE, - override val transformInTest: Boolean = false, - failOnEmptyLines: Boolean = true) - extends FixedPathSource(p) with TextLineScheme { +case class JsonLine( + p: String, + fields: Fields = Fields.ALL, + override val sinkMode: SinkMode = SinkMode.REPLACE, + override val transformInTest: Boolean = false, + failOnEmptyLines: Boolean = true +) extends FixedPathSource(p) + with TextLineScheme { import Dsl._ import JsonLine._ - override def transformForWrite(pipe: Pipe) = pipe.mapTo(fields -> 'json) { - t: TupleEntry => mapper.writeValueAsString(TupleConverter.ToMap(t)) + override def transformForWrite(pipe: Pipe) = pipe.mapTo(fields -> 'json) { t: TupleEntry => + mapper.writeValueAsString(TupleConverter.ToMap(t)) } override def transformForRead(pipe: Pipe) = { @scala.annotation.tailrec - def nestedRetrieval(node: Option[Map[String, AnyRef]], path: List[String]): AnyRef = { + def nestedRetrieval(node: Option[Map[String, AnyRef]], path: List[String]): AnyRef = (path, node) match { - case (_, None) => null + case (_, None) => null case (h :: Nil, Some(fs)) => fs.get(h).orNull - case (h :: tail, Some(fs)) => fs.get(h).orNull match { - case fs: Map[String @unchecked, AnyRef @unchecked] => nestedRetrieval(Option(fs), tail) - case _ => null - } + case (h :: tail, Some(fs)) => + fs.get(h).orNull match { + case fs: Map[String @unchecked, AnyRef @unchecked] => nestedRetrieval(Option(fs), tail) + case _ => null + } case (Nil, _) => null } - } val splitFields = (0 until fields.size).map { i: Int => fields.get(i).toString.split('.').toList } pipe.collectTo[String, Tuple]('line -> fields) { case line: String if failOnEmptyLines || line.trim.nonEmpty => val fs: Map[String, AnyRef] = mapper.readValue(line, mapTypeReference) - val values = splitFields.map { nestedRetrieval(Option(fs), _) } + val values = splitFields.map(nestedRetrieval(Option(fs), _)) new cascading.tuple.Tuple(values: _*) } } @@ -80,8 +84,10 @@ case class JsonLine(p: String, fields: Fields = Fields.ALL, * TODO: at the next binary incompatible version remove the AbstractFunction2/scala.Serializable jank which * was added to get mima to not report binary errors */ -object JsonLine extends scala.runtime.AbstractFunction5[String, Fields, SinkMode, Boolean, Boolean, JsonLine] - with Serializable with scala.Serializable { +object JsonLine + extends scala.runtime.AbstractFunction5[String, Fields, SinkMode, Boolean, Boolean, JsonLine] + with Serializable + with scala.Serializable { val mapTypeReference = typeReference[Map[String, AnyRef]] @@ -89,16 +95,16 @@ object JsonLine extends scala.runtime.AbstractFunction5[String, Fields, SinkMode override def getType = typeFromManifest(manifest[T]) } - private[this] def typeFromManifest(m: Manifest[_]): Type = { + private[this] def typeFromManifest(m: Manifest[_]): Type = if 
(m.typeArguments.isEmpty) { m.runtimeClass } - else new ParameterizedType { - def getRawType = m.runtimeClass + else + new ParameterizedType { + def getRawType = m.runtimeClass - def getActualTypeArguments = m.typeArguments.map(typeFromManifest).toArray + def getActualTypeArguments = m.typeArguments.map(typeFromManifest).toArray - def getOwnerType = null - } - } + def getOwnerType = null + } val mapper = new ObjectMapper() mapper.registerModule(DefaultScalaModule) diff --git a/scalding-json/src/main/scala/com/twitter/scalding/TypedJson.scala b/scalding-json/src/main/scala/com/twitter/scalding/TypedJson.scala index 66a4e2f216..546cc0e527 100644 --- a/scalding-json/src/main/scala/com/twitter/scalding/TypedJson.scala +++ b/scalding-json/src/main/scala/com/twitter/scalding/TypedJson.scala @@ -1,12 +1,12 @@ package com.twitter.scalding -import com.twitter.bijection.{ Injection, AbstractInjection } +import com.twitter.bijection.{AbstractInjection, Injection} import com.twitter.bijection.Inversion._ import com.twitter.elephantbird.cascading2.scheme.LzoTextLine import org.json4s._ import org.json4s.native.Serialization._ -import org.json4s.{ NoTypeHints, native } +import org.json4s.{native, NoTypeHints} import scala.collection.JavaConverters._ import scala.util.Try @@ -15,29 +15,31 @@ import cascading.pipe.Pipe /** * This type uses the structural type of a case class, but not it's name, to describe the Json using json4s. - * This is intended to be used for intermediate output from a REPL session. - * The intended use is to save adhoc data between sessions. - * The fully qualified class name of classes defined in a REPL is not stable between REPL sessions. + * This is intended to be used for intermediate output from a REPL session. The intended use is to save adhoc + * data between sessions. The fully qualified class name of classes defined in a REPL is not stable between + * REPL sessions. * - * We believe using a fixed schema, such as thrift or Avro is a much safer way to do long term productionized data - * pipelines to minimize risks of incompatible changes to schema that render old data unreadable. + * We believe using a fixed schema, such as thrift or Avro is a much safer way to do long term productionized + * data pipelines to minimize risks of incompatible changes to schema that render old data unreadable. 
*/ object TypedJson { private implicit val formats = native.Serialization.formats(NoTypeHints) - private def caseClass2Json[A <: AnyRef](implicit tt: Manifest[A], fmt: Formats): Injection[A, String] = new AbstractInjection[A, String] { - override def apply(a: A): String = write(a) + private def caseClass2Json[A <: AnyRef](implicit tt: Manifest[A], fmt: Formats): Injection[A, String] = + new AbstractInjection[A, String] { + override def apply(a: A): String = write(a) - override def invert(b: String): Try[A] = attempt(b)(read[A]) - } + override def invert(b: String): Try[A] = attempt(b)(read[A]) + } def apply[T <: AnyRef: Manifest](p: String) = new TypedJson(p) } -class TypedJson[T <: AnyRef: Manifest](p: String) extends FixedPathSource(p) - with TextSourceScheme - with SingleMappable[T] - with TypedSink[T] { +class TypedJson[T <: AnyRef: Manifest](p: String) + extends FixedPathSource(p) + with TextSourceScheme + with SingleMappable[T] + with TypedSink[T] { import Dsl._ import TypedJson._ @@ -46,16 +48,17 @@ class TypedJson[T <: AnyRef: Manifest](p: String) extends FixedPathSource(p) @transient private[this] lazy val inj = caseClass2Json[T] override def transformForWrite(pipe: Pipe) = - pipe.mapTo((0) -> (fieldSym)) { inj.apply(_: T) } + pipe.mapTo(0 -> fieldSym)(inj.apply(_: T)) override def transformForRead(pipe: Pipe) = - pipe.mapTo(('line) -> (fieldSym)) { (jsonStr: String) => inj.invert(jsonStr).get } + pipe.mapTo('line -> fieldSym)((jsonStr: String) => inj.invert(jsonStr).get) override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) override def toIterator(implicit config: Config, mode: Mode): Iterator[T] = { val tap = createTap(Read)(mode) - CascadingMode.cast(mode) + CascadingMode + .cast(mode) .openForRead(config, tap) .asScala .map { te => @@ -65,5 +68,7 @@ class TypedJson[T <: AnyRef: Manifest](p: String) extends FixedPathSource(p) } case class TypedJsonLzo[T <: AnyRef: Manifest](p: String) extends TypedJson[T](p) { - override def hdfsScheme = HadoopSchemeInstance(new LzoTextLine().asInstanceOf[cascading.scheme.Scheme[_, _, _, _, _]]) + override def hdfsScheme = HadoopSchemeInstance( + new LzoTextLine().asInstanceOf[cascading.scheme.Scheme[_, _, _, _, _]] + ) } diff --git a/scalding-json/src/test/scala/com/twitter/scalding/JsonLineTest.scala b/scalding-json/src/test/scala/com/twitter/scalding/JsonLineTest.scala index 72982ddc71..212a39b380 100644 --- a/scalding-json/src/test/scala/com/twitter/scalding/JsonLineTest.scala +++ b/scalding-json/src/test/scala/com/twitter/scalding/JsonLineTest.scala @@ -12,14 +12,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.json import cascading.flow.FlowException import cascading.tap.SinkMode import cascading.tuple.Fields -import com.twitter.scalding.{ JsonLine => StandardJsonLine, _ } +import com.twitter.scalding.{JsonLine => StandardJsonLine, _} import org.scalatest.WordSpec object JsonLine { @@ -27,9 +27,15 @@ object JsonLine { new JsonLine(p, fields, failOnEmptyLines) } -class JsonLine(p: String, fields: Fields, failOnEmptyLines: Boolean) extends StandardJsonLine(p, fields, SinkMode.REPLACE, - // We want to test the actual transformation here. 
- transformInTest = true, failOnEmptyLines = failOnEmptyLines) +class JsonLine(p: String, fields: Fields, failOnEmptyLines: Boolean) + extends StandardJsonLine( + p, + fields, + SinkMode.REPLACE, + // We want to test the actual transformation here. + transformInTest = true, + failOnEmptyLines = failOnEmptyLines + ) class JsonLineJob(args: Args) extends Job(args) { try { @@ -72,7 +78,7 @@ class JsonLineInputJobSkipEmptyLines(args: Args) extends Job(args) { class JsonLineNestedInputJob(args: Args) extends Job(args) { try { JsonLine("input0", (Symbol("foo.too"), 'bar)).read - .rename((Symbol("foo.too") -> ('foo))) + .rename((Symbol("foo.too") -> 'foo)) .project('foo, 'bar) .write(Tsv("output0")) @@ -111,11 +117,10 @@ class JsonLineTest extends WordSpec { JobTest(new JsonLineInputJob(_)) .source(JsonLine("input0", ('foo, 'bar)), List((0, json))) - .sink[(Int, String)](Tsv("output0")) { - outBuf => - "read json line input" in { - assert(outBuf.toList === List((3, "baz"))) - } + .sink[(Int, String)](Tsv("output0")) { outBuf => + "read json line input" in { + assert(outBuf.toList === List((3, "baz"))) + } } .run .finish() @@ -124,11 +129,10 @@ class JsonLineTest extends WordSpec { JobTest(new JsonLineInputJob(_)) .source(JsonLine("input0", ('foo, 'bar)), List((0, json), (1, json2))) - .sink[(Int, String)](Tsv("output0")) { - outBuf => - "handle missing fields" in { - assert(outBuf.toList === List((3, "baz"), (7, null))) - } + .sink[(Int, String)](Tsv("output0")) { outBuf => + "handle missing fields" in { + assert(outBuf.toList === List((3, "baz"), (7, null))) + } } .run .finish() @@ -137,11 +141,10 @@ class JsonLineTest extends WordSpec { JobTest(new JsonLineNestedInputJob(_)) .source(JsonLine("input0", (Symbol("foo.too"), 'bar)), List((0, json), (1, json3))) - .sink[(Int, String)](Tsv("output0")) { - outBuf => - "handle nested fields" in { - assert(outBuf.toList === List((0, "baz"), (9, null))) - } + .sink[(Int, String)](Tsv("output0")) { outBuf => + "handle nested fields" in { + assert(outBuf.toList === List((0, "baz"), (9, null))) + } } .run .finish() @@ -150,8 +153,8 @@ class JsonLineTest extends WordSpec { intercept[FlowException] { JobTest(new JsonLineInputJob(_)) .source(JsonLine("input0", ('foo, 'bar)), List((0, json), (1, json2), (2, ""), (3, " "))) - .sink[(Int, String)](Tsv("output0")) { - outBuf => outBuf.toList + .sink[(Int, String)](Tsv("output0")) { outBuf => + outBuf.toList } .run @@ -161,11 +164,10 @@ class JsonLineTest extends WordSpec { JobTest(new JsonLineInputJobSkipEmptyLines(_)) .source(JsonLine("input0", ('foo, 'bar)), List((0, json), (1, json2), (2, ""), (3, " "))) - .sink[(Int, String)](Tsv("output0")) { - outBuf => - "handle empty lines when `failOnEmptyLines` is set to false" in { - assert(outBuf.toList.size === 2) - } + .sink[(Int, String)](Tsv("output0")) { outBuf => + "handle empty lines when `failOnEmptyLines` is set to false" in { + assert(outBuf.toList.size === 2) + } } .run .finish() diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/Parquet346ScroogeScheme.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/Parquet346ScroogeScheme.scala index afc15ebc6b..925482a47b 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/Parquet346ScroogeScheme.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/Parquet346ScroogeScheme.scala @@ -4,36 +4,37 @@ import cascading.flow.FlowProcess import cascading.tap.Tap import 
com.twitter.scalding.parquet.ParquetValueScheme import com.twitter.scalding.parquet.thrift.Parquet346StructTypeRepairer -import com.twitter.scrooge.{ ThriftStruct, ThriftStructCodec } -import org.apache.hadoop.mapred.{ JobConf, OutputCollector, RecordReader } +import com.twitter.scrooge.{ThriftStruct, ThriftStructCodec} +import org.apache.hadoop.mapred.{JobConf, OutputCollector, RecordReader} import org.apache.parquet.hadoop.thrift.ThriftReadSupport import org.apache.parquet.schema.MessageType import org.apache.parquet.thrift.struct.ThriftType.StructType -import org.apache.parquet.thrift.{ ThriftReader, ThriftRecordConverter } +import org.apache.parquet.thrift.{ThriftReader, ThriftRecordConverter} import org.apache.thrift.protocol.TProtocol import scala.util.control.NonFatal /** - * This file contains workarounds for PARQUET-346, everything in it should - * be removed once that bug is fixed in upstream parquet. + * This file contains workarounds for PARQUET-346, everything in it should be removed once that bug is fixed + * in upstream parquet. * - * The root issue is that ScroogeRecordConverter passes a schema - * based on the file metadata to ThriftRecordConverter that may be missing - * structOrUnionType metadata. This metadata is not actually needed, but parquet - * currently throws if it's missing. The (temporary) "fix" is to populate this metadata - * by setting all structOrUnionType fields to UNION. + * The root issue is that ScroogeRecordConverter passes a schema based on the file metadata to + * ThriftRecordConverter that may be missing structOrUnionType metadata. This metadata is not actually needed, + * but parquet currently throws if it's missing. The (temporary) "fix" is to populate this metadata by setting + * all structOrUnionType fields to UNION. */ /** * The same as ParquetScroogeScheme, but sets the record convert to Parquet346ScroogeRecordConverter */ class Parquet346ScroogeScheme[T <: ThriftStruct](config: ParquetValueScheme.Config[T]) - extends ParquetScroogeScheme[T](config) { + extends ParquetScroogeScheme[T](config) { - override def sourceConfInit(fp: FlowProcess[JobConf], - tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], - jobConf: JobConf): Unit = { + override def sourceConfInit( + fp: FlowProcess[JobConf], + tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], + jobConf: JobConf + ): Unit = { super.sourceConfInit(fp, tap, jobConf) @@ -47,8 +48,7 @@ object Parquet346ScroogeRecordConverter { /** * Same as the (private) getCodec in ScroogeRecordConverter */ - def getCodec[T <: ThriftStruct](klass: Class[T]): ThriftStructCodec[T] = { - + def getCodec[T <: ThriftStruct](klass: Class[T]): ThriftStructCodec[T] = try { val companionClass = Class.forName(klass.getName + "$") val companionObject: AnyRef = companionClass.getField("MODULE$").get(null) @@ -57,30 +57,30 @@ object Parquet346ScroogeRecordConverter { case NonFatal(e) => throw new RuntimeException("Unable to create ThriftStructCodec", e) } - } } /** - * Same as ScroogeRecordConverter with one important (subtle) difference. - * It passes a repaired schema (StructType) to ThriftRecordConverter's - * constructor. This is important because older files don't contain all the metadata needed for - * ThriftSchemaConverter to not throw, but we can put dummy data in there because it's not actually - * used. + * Same as ScroogeRecordConverter with one important (subtle) difference. It passes a repaired schema + * (StructType) to ThriftRecordConverter's constructor. 
This is important because older files don't contain + * all the metadata needed for ThriftSchemaConverter to not throw, but we can put dummy data in there because + * it's not actually used. */ -class Parquet346ScroogeRecordConverter[T <: ThriftStruct](thriftClass: Class[T], - parquetSchema: MessageType, - thriftType: StructType) extends ThriftRecordConverter[T]( - // this is a little confusing because it's all being passed to the super constructor - - // this thrift reader is the same as what's in ScroogeRecordConverter's constructor - new ThriftReader[T] { - val codec: ThriftStructCodec[T] = Parquet346ScroogeRecordConverter.getCodec(thriftClass) - def readOneRecord(protocol: TProtocol): T = codec.decode(protocol) - }, +class Parquet346ScroogeRecordConverter[T <: ThriftStruct]( + thriftClass: Class[T], + parquetSchema: MessageType, + thriftType: StructType +) extends ThriftRecordConverter[T]( + // this is a little confusing because it's all being passed to the super constructor - thriftClass.getSimpleName, - parquetSchema, + // this thrift reader is the same as what's in ScroogeRecordConverter's constructor + new ThriftReader[T] { + val codec: ThriftStructCodec[T] = Parquet346ScroogeRecordConverter.getCodec(thriftClass) + def readOneRecord(protocol: TProtocol): T = codec.decode(protocol) + }, + thriftClass.getSimpleName, + parquetSchema, - // this is the fix -- we add in the missing structOrUnionType metadata - // before passing it along - Parquet346StructTypeRepairer.repair(thriftType)) + // this is the fix -- we add in the missing structOrUnionType metadata + // before passing it along + Parquet346StructTypeRepairer.repair(thriftType) + ) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetScrooge.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetScrooge.scala index f5c1256e13..2d999705e3 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetScrooge.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetScrooge.scala @@ -3,7 +3,7 @@ package com.twitter.scalding.parquet.scrooge import cascading.scheme.Scheme import com.twitter.scalding._ import com.twitter.scalding.parquet.thrift.ParquetThriftBaseFileSource -import com.twitter.scalding.source.{ DailySuffixSource, HourlySuffixSource } +import com.twitter.scalding.source.{DailySuffixSource, HourlySuffixSource} import com.twitter.scrooge.ThriftStruct import scala.reflect.ClassTag @@ -18,15 +18,16 @@ trait ParquetScrooge[T <: ThriftStruct] extends ParquetThriftBaseFileSource[T] { } -class DailySuffixParquetScrooge[T <: ThriftStruct]( - path: String, - dateRange: DateRange)(implicit override val ct: ClassTag[T]) - extends DailySuffixSource(path, dateRange) with ParquetScrooge[T] +class DailySuffixParquetScrooge[T <: ThriftStruct](path: String, dateRange: DateRange)(implicit + override val ct: ClassTag[T] +) extends DailySuffixSource(path, dateRange) + with ParquetScrooge[T] -class HourlySuffixParquetScrooge[T <: ThriftStruct]( - path: String, - dateRange: DateRange)(implicit override val ct: ClassTag[T]) - extends HourlySuffixSource(path, dateRange) with ParquetScrooge[T] +class HourlySuffixParquetScrooge[T <: ThriftStruct](path: String, dateRange: DateRange)(implicit + override val ct: ClassTag[T] +) extends HourlySuffixSource(path, dateRange) + with ParquetScrooge[T] class FixedPathParquetScrooge[T <: ThriftStruct](paths: String*)(implicit override val ct: ClassTag[T]) - 
extends FixedPathSource(paths: _*) with ParquetScrooge[T] + extends FixedPathSource(paths: _*) + with ParquetScrooge[T] diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSource.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSource.scala index 4f958a61a0..7667f85f9c 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSource.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSource.scala @@ -3,7 +3,7 @@ package com.twitter.scalding.parquet.scrooge import _root_.cascading.scheme.Scheme import com.twitter.scalding._ import com.twitter.scalding.parquet.thrift.ParquetThriftBase -import com.twitter.scalding.typed.{ PartitionSchemed, PartitionUtil } +import com.twitter.scalding.typed.{PartitionSchemed, PartitionUtil} import com.twitter.scrooge.ThriftStruct import scala.reflect.ClassTag @@ -11,9 +11,8 @@ import scala.reflect.ClassTag /** * Scalding source to read or write partitioned Parquet scrooge data. * - * For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and - * `T` is the scrooge object. `P` must be either a String or a tuple of Strings. - * Below is an example. + * For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and `T` is the + * scrooge object. `P` must be either a String or a tuple of Strings. Below is an example. * {{{ * val data: TypedPipe[MyScroogeObject] = ??? * data.map { obj => @@ -21,23 +20,30 @@ import scala.reflect.ClassTag * }.write(PartitionedParquetScroogeSource[(String, String), MyScroogeObject](path, "%s/%s")) * }}} * - * For reading it produces a pair `(P, T)` where `P` is the partition data, `T` is the corresponding - * scrooge object. Below is an example. + * For reading it produces a pair `(P, T)` where `P` is the partition data, `T` is the corresponding scrooge + * object. Below is an example. 
* {{{ * val in: TypedPipe[(String, String), MyScroogeObject] = * TypedPipe.from( PartitionedParquetScroogeSource[(String, String), MyScroogeObject](path, "%s/%s") ) * }}} - * */ -case class PartitionedParquetScroogeSource[P, T <: ThriftStruct](path: String, template: String)(implicit val ct: ClassTag[T], - val valueSetter: TupleSetter[T], val valueConverter: TupleConverter[T], val partitionSetter: TupleSetter[P], val partitionConverter: TupleConverter[P]) - extends FixedPathSource(path) with ParquetThriftBase[T] with PartitionSchemed[P, T] with Serializable { +case class PartitionedParquetScroogeSource[P, T <: ThriftStruct](path: String, template: String)(implicit + val ct: ClassTag[T], + val valueSetter: TupleSetter[T], + val valueConverter: TupleConverter[T], + val partitionSetter: TupleSetter[P], + val partitionConverter: TupleConverter[P] +) extends FixedPathSource(path) + with ParquetThriftBase[T] + with PartitionSchemed[P, T] + with Serializable { override val fields = PartitionUtil.toFields(0, implicitly[TupleSetter[T]].arity) assert( fields.size == valueSetter.arity, - "The number of fields needs to be the same as the arity of the value setter") + "The number of fields needs to be the same as the arity of the value setter" + ) // Create the underlying scheme and explicitly set the source, sink fields to be only the specified fields override def hdfsScheme = { diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetScroogeTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetScroogeTests.scala index e3c4c76cc2..fd985242cf 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetScroogeTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetScroogeTests.scala @@ -1,6 +1,10 @@ package com.twitter.scalding.parquet.scrooge -import com.twitter.scalding.parquet.{ StrictColumnProjectionString, DeprecatedColumnProjectionString, ParquetSourcesTestsBase } +import com.twitter.scalding.parquet.{ + DeprecatedColumnProjectionString, + ParquetSourcesTestsBase, + StrictColumnProjectionString +} import com.twitter.scrooge.ThriftStruct import org.apache.thrift.protocol.TProtocol import org.apache.parquet.filter2.predicate.FilterPredicate @@ -12,22 +16,25 @@ class ParquetScroogeTests extends ParquetSourcesTestsBase { testDefaultFilter(default) - testReturnProvidedFilter( - new DailySuffixParquetScrooge[MockThriftStruct](path, dateRange) { - override val withFilter: Option[FilterPredicate] = Some(filter1) - }) + testReturnProvidedFilter(new DailySuffixParquetScrooge[MockThriftStruct](path, dateRange) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) testDefaultColumns(default) testReturnProvidedColumns( new DailySuffixParquetScrooge[MockThriftStruct](path, dateRange) { override def withColumns: Set[String] = columnStrings - }, DeprecatedColumnProjectionString(columnStrings)) + }, + DeprecatedColumnProjectionString(columnStrings) + ) testReturnProvidedColumns( new DailySuffixParquetScrooge[MockThriftStruct](path, dateRange) { override def withColumnProjections: Set[String] = columnStrings - }, StrictColumnProjectionString(columnStrings)) + }, + StrictColumnProjectionString(columnStrings) + ) } @@ -36,22 +43,25 @@ class ParquetScroogeTests extends ParquetSourcesTestsBase { testDefaultFilter(default) - testReturnProvidedFilter( - new HourlySuffixParquetScrooge[MockThriftStruct](path, dateRange) { - override val 
withFilter: Option[FilterPredicate] = Some(filter1) - }) + testReturnProvidedFilter(new HourlySuffixParquetScrooge[MockThriftStruct](path, dateRange) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) testDefaultColumns(default) testReturnProvidedColumns( new HourlySuffixParquetScrooge[MockThriftStruct](path, dateRange) { override def withColumns: Set[String] = columnStrings - }, DeprecatedColumnProjectionString(columnStrings)) + }, + DeprecatedColumnProjectionString(columnStrings) + ) testReturnProvidedColumns( new HourlySuffixParquetScrooge[MockThriftStruct](path, dateRange) { override def withColumnProjections: Set[String] = columnStrings - }, StrictColumnProjectionString(columnStrings)) + }, + StrictColumnProjectionString(columnStrings) + ) } @@ -60,22 +70,25 @@ class ParquetScroogeTests extends ParquetSourcesTestsBase { testDefaultFilter(default) - testReturnProvidedFilter( - new FixedPathParquetScrooge[MockThriftStruct](path, path, path) { - override val withFilter: Option[FilterPredicate] = Some(filter1) - }) + testReturnProvidedFilter(new FixedPathParquetScrooge[MockThriftStruct](path, path, path) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) testDefaultColumns(default) testReturnProvidedColumns( new FixedPathParquetScrooge[MockThriftStruct](path, path, path) { override def withColumns: Set[String] = columnStrings - }, DeprecatedColumnProjectionString(columnStrings)) + }, + DeprecatedColumnProjectionString(columnStrings) + ) testReturnProvidedColumns( new FixedPathParquetScrooge[MockThriftStruct](path, path, path) { override def withColumnProjections: Set[String] = columnStrings - }, StrictColumnProjectionString(columnStrings)) + }, + StrictColumnProjectionString(columnStrings) + ) } @@ -83,4 +96,4 @@ class ParquetScroogeTests extends ParquetSourcesTestsBase { class MockThriftStruct extends ThriftStruct { override def write(oprot: TProtocol): Unit = () -} \ No newline at end of file +} diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSourceTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSourceTests.scala index 368a44eb92..649343e892 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSourceTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSourceTests.scala @@ -8,7 +8,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.ParquetReader -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} object PartitionedParquetScroogeTestSources { val path = "/a/path" @@ -17,9 +17,14 @@ object PartitionedParquetScroogeTestSources { class PartitionedParquetScroogeWriteJob(args: Args) extends Job(args) { import PartitionedParquetScroogeTestSources._ - val input = Seq(Address("123 Embarcadero", "94111"), Address("123 E 79th St", "10075"), Address("456 W 80th St", "10075")) + val input = Seq( + Address("123 Embarcadero", "94111"), + Address("123 E 79th St", "10075"), + Address("456 W 80th St", "10075") + ) - TypedPipe.from(input) + TypedPipe + .from(input) .map { case Address(street, zipcode) => (zipcode, Address(street, zipcode)) } .write(partitionSource) } @@ -31,7 +36,8 @@ class PartitionedParquetScroogeSourceTests extends WordSpec with Matchers { val conf: Configuration = new Configuration 
conf.set("parquet.thrift.converter.class", classOf[ScroogeRecordConverter[Address]].getName) val parquetReader: ParquetReader[Address] = - ParquetReader.builder[Address](new ScroogeReadSupport[Address], path) + ParquetReader + .builder[Address](new ScroogeReadSupport[Address], path) .withConf(conf) .build() @@ -45,21 +51,25 @@ class PartitionedParquetScroogeSourceTests extends WordSpec with Matchers { job = new PartitionedParquetScroogeWriteJob(args) job } - JobTest(buildJob(_)) - .runHadoop + JobTest(buildJob(_)).runHadoop .finish() val testMode = job.mode.asInstanceOf[HadoopTest] val directory = new File(testMode.getWritePathFor(partitionSource)) - directory.listFiles().map({ _.getName() }).toSet shouldBe Set("94111", "10075") + directory.listFiles().map { _.getName() }.toSet shouldBe Set("94111", "10075") // check that the partitioning is done correctly by zipcode - validate(new Path(directory.getPath + "/94111/part-00000-00000-m-00000.parquet"), - Address("123 Embarcadero", "94111")) - validate(new Path(directory.getPath + "/10075/part-00000-00001-m-00000.parquet"), - Address("123 E 79th St", "10075"), Address("456 W 80th St", "10075")) + validate( + new Path(directory.getPath + "/94111/part-00000-00000-m-00000.parquet"), + Address("123 Embarcadero", "94111") + ) + validate( + new Path(directory.getPath + "/10075/part-00000-00001-m-00000.parquet"), + Address("123 E 79th St", "10075"), + Address("456 W 80th St", "10075") + ) } } } diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PlanningTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PlanningTests.scala index 77a18516d5..6de6010011 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PlanningTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PlanningTests.scala @@ -25,8 +25,8 @@ class PlanningTests extends FunSuite { val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") - val pipe = (TypedPipe.from(src1) ++ - TypedPipe.from(src2).map(_ => null.asInstanceOf[MockThriftStruct])) + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).map(_ => null.asInstanceOf[MockThriftStruct]) assert(steps(pipe) == 1) assert(steps(pipe, false) == 1) @@ -66,8 +66,8 @@ class PlanningTests extends FunSuite { val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") - val pipe = (TypedPipe.from(src1) ++ - TypedPipe.from(src2).filter(_ => true)) + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).filter(_ => true) assert(steps(pipe) == 1) assert(steps(pipe, false) == 1) @@ -77,8 +77,8 @@ class PlanningTests extends FunSuite { val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") - val pipe = (TypedPipe.from(src1) ++ - TypedPipe.from(src2).forceToDisk.filter(_ => true)) + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).forceToDisk.filter(_ => true) assert(steps(pipe) == 1) assert(steps(pipe, false) == 2) @@ -88,8 +88,8 @@ class PlanningTests extends FunSuite { val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") - val pipe = (TypedPipe.from(src1) ++ - TypedPipe.from(src2).forceToDisk) + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).forceToDisk assert(steps(pipe) == 1) 
assert(steps(pipe, false) == 2) @@ -99,8 +99,8 @@ class PlanningTests extends FunSuite { val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") - val pipe = (TypedPipe.from(src1) ++ - TypedPipe.from(src2).onComplete(() => println("done")).filter(_ => true)) + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).onComplete(() => println("done")).filter(_ => true) assert(steps(pipe) == 1) assert(steps(pipe, false) == 1) @@ -121,8 +121,8 @@ class PlanningTests extends FunSuite { val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") - val pipe = (TypedPipe.from(src1) ++ - TypedPipe.from(src2).withDescription("foo").filter(_ => true)) + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).withDescription("foo").filter(_ => true) assert(steps(pipe) == 1) assert(steps(pipe, false) == 1) @@ -132,8 +132,8 @@ class PlanningTests extends FunSuite { val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") - val pipe = (TypedPipe.from(src1) ++ - TypedPipe.from(src2).debug.filter(_ => true)) + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).debug.filter(_ => true) assert(steps(pipe) == 1) assert(steps(pipe, false) == 1) @@ -143,8 +143,8 @@ class PlanningTests extends FunSuite { val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") - val pipe = (TypedPipe.from(src1) ++ - TypedPipe.from(src2).filter(_ => true).map(_ => null.asInstanceOf[MockThriftStruct])) + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).filter(_ => true).map(_ => null.asInstanceOf[MockThriftStruct]) assert(steps(pipe) == 1) assert(steps(pipe, false) == 1) @@ -154,10 +154,11 @@ class PlanningTests extends FunSuite { val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") - val pipe = (TypedPipe.from(src1) ++ - TypedPipe.from(src2) - .map(_ => null.asInstanceOf[MockThriftStruct]) - .filter(_ => true)) + val pipe = TypedPipe.from(src1) ++ + TypedPipe + .from(src2) + .map(_ => null.asInstanceOf[MockThriftStruct]) + .filter(_ => true) assert(steps(pipe) == 1) assert(steps(pipe, false) == 1) diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala index 79b5f327ed..34c5585a1c 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala @@ -2,26 +2,24 @@ package com.twitter.scalding.parquet.scrooge import com.twitter.scalding.parquet.scrooge.thrift_scala.test.Address import com.twitter.scalding.parquet.tuple.macros.Macros._ -import com.twitter.scalding.parquet.tuple.{ TypedParquet, TypedParquetSink } -import com.twitter.scalding.platform.{ HadoopPlatformJobTest, HadoopSharedPlatformTest } +import com.twitter.scalding.parquet.tuple.{TypedParquet, TypedParquetSink} +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding.{ Args, Job } +import com.twitter.scalding.{Args, Job} import 
org.apache.parquet.io.InvalidRecordException import org.apache.parquet.schema.MessageTypeParser -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPlatformTest { "ScroogeReadSupport getSchemaForRead" should { "project extra optional field" in { - val fileType = MessageTypeParser.parseMessageType( - """ + val fileType = MessageTypeParser.parseMessageType(""" |message SampleClass { | required int32 x; |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ + val requestedProjection = MessageTypeParser.parseMessageType(""" |message SampleProjection { | required int32 x; | optional int32 extra; @@ -33,14 +31,12 @@ class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPl } "fail projecting extra required field" in { - val fileType = MessageTypeParser.parseMessageType( - """ + val fileType = MessageTypeParser.parseMessageType(""" |message SampleClass { | required int32 x; |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ + val requestedProjection = MessageTypeParser.parseMessageType(""" |message SampleProjection { | required int32 x; | required int32 extra; @@ -53,14 +49,12 @@ class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPl } "project required field using optional" in { - val fileType = MessageTypeParser.parseMessageType( - """ + val fileType = MessageTypeParser.parseMessageType(""" |message SampleClass { | required int32 x; |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ + val requestedProjection = MessageTypeParser.parseMessageType(""" |message SampleProjection { | optional int32 x; |} @@ -71,14 +65,12 @@ class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPl } "fail projecting optional using required" in { - val fileType = MessageTypeParser.parseMessageType( - """ + val fileType = MessageTypeParser.parseMessageType(""" |message SampleClass { | optional int32 x; |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ + val requestedProjection = MessageTypeParser.parseMessageType(""" |message SampleProjection { | required int32 x; |} @@ -94,25 +86,29 @@ class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPl "write using typedparquet and read using parquet scrooge" in { HadoopPlatformJobTest(new WriteToTypedParquetTupleJob(_), cluster) .arg("output", "output1") - .sink[AddressCaseClass](TypedParquet[AddressCaseClass](Seq("output1"))) { - in => - in should contain theSameElementsAs TypedParquetTestSources.caseClassValues - }.run() + .sink[AddressCaseClass](TypedParquet[AddressCaseClass](Seq("output1"))) { in => + in should contain theSameElementsAs TypedParquetTestSources.caseClassValues + } + .run() HadoopPlatformJobTest(new ReadWithParquetScrooge(_), cluster) .arg("input", "output1") .arg("output", "output2") - .sink[Address](new FixedPathParquetScrooge[Address]("output2")) { - out => - out should contain theSameElementsAs TypedParquetTestSources.thriftValues - }.run() + .sink[Address](new FixedPathParquetScrooge[Address]("output2")) { out => + out should contain theSameElementsAs TypedParquetTestSources.thriftValues + } + .run() } } } object TypedParquetTestSources { - val thriftValues = Seq(Address("123 Embarcadero", "94111"), Address("123 E 79th St", "10075"), Address("456 W 80th St", "10075")) + val thriftValues = Seq( + 
Address("123 Embarcadero", "94111"), + Address("123 E 79th St", "10075"), + Address("456 W 80th St", "10075") + ) val caseClassValues = thriftValues.map(a => AddressCaseClass(a.street, a.zip)) } @@ -131,4 +127,4 @@ class ReadWithParquetScrooge(args: Args) extends Job(args) { val input = new FixedPathParquetScrooge[Address](inputPath) val sink = new FixedPathParquetScrooge[Address](outputPath) TypedPipe.from(input).write(sink) -} \ No newline at end of file +} diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/HasColumnProjection.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/HasColumnProjection.scala index 3a14111a48..320310b963 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/HasColumnProjection.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/HasColumnProjection.scala @@ -5,9 +5,8 @@ import org.slf4j.LoggerFactory object HasColumnProjection { val LOG = LoggerFactory.getLogger(this.getClass) - def requireNoSemiColon(glob: String) = { + def requireNoSemiColon(glob: String) = require(!glob.contains(";"), "A column projection glob cannot contain a ; character") - } } trait HasColumnProjection { @@ -42,14 +41,15 @@ trait HasColumnProjection { val deprecated = withColumns val strict = withColumnProjections - require(deprecated.isEmpty || strict.isEmpty, - "Cannot provide both withColumns and withColumnProjections") + require(deprecated.isEmpty || strict.isEmpty, "Cannot provide both withColumns and withColumnProjections") deprecated.foreach(requireNoSemiColon) strict.foreach(requireNoSemiColon) if (deprecated.nonEmpty) { - LOG.warn("withColumns is deprecated. Please use withColumnProjections, which uses a different glob syntax") + LOG.warn( + "withColumns is deprecated. Please use withColumnProjections, which uses a different glob syntax" + ) Some(DeprecatedColumnProjectionString(deprecated)) } else if (strict.nonEmpty) { Some(StrictColumnProjectionString(strict)) diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/Parquet346TBaseScheme.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/Parquet346TBaseScheme.scala index d71623dcd4..fc75309e9b 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/Parquet346TBaseScheme.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/Parquet346TBaseScheme.scala @@ -4,39 +4,40 @@ import com.twitter.scalding.parquet.ParquetValueScheme import cascading.flow.FlowProcess import cascading.tap.Tap -import org.apache.hadoop.mapred.{ JobConf, OutputCollector, RecordReader } +import org.apache.hadoop.mapred.{JobConf, OutputCollector, RecordReader} import org.apache.parquet.hadoop.thrift.ThriftReadSupport import org.apache.parquet.io.ParquetDecodingException import org.apache.parquet.schema.MessageType import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType import org.apache.parquet.thrift.struct.ThriftType._ -import org.apache.parquet.thrift.struct.{ ThriftField, ThriftType } -import org.apache.parquet.thrift.{ ThriftReader, ThriftRecordConverter } +import org.apache.parquet.thrift.struct.{ThriftField, ThriftType} +import org.apache.parquet.thrift.{ThriftReader, ThriftRecordConverter} import org.apache.thrift.TBase import org.apache.thrift.protocol.TProtocol import scala.collection.JavaConverters._ /** - * This file contains workarounds for PARQUET-346, everything in it should - * be removed once that bug is fixed in upstream parquet. 
+ * This file contains workarounds for PARQUET-346, everything in it should be removed once that bug is fixed + * in upstream parquet. * - * The root issue is that TBaseRecordConverter passes a schema - * based on the file metadata to ThriftRecordConverter that may be missing - * structOrUnionType metadata. This metadata is not actually needed, but parquet - * currently throws if it's missing. The (temporary) "fix" is to populate this metadata - * by setting all structOrUnionType fields to UNION. + * The root issue is that TBaseRecordConverter passes a schema based on the file metadata to + * ThriftRecordConverter that may be missing structOrUnionType metadata. This metadata is not actually needed, + * but parquet currently throws if it's missing. The (temporary) "fix" is to populate this metadata by setting + * all structOrUnionType fields to UNION. */ /** * The same as ParquetTBaseScheme, but sets the record convert to Parquet346TBaseRecordConverter */ class Parquet346TBaseScheme[T <: TBase[_, _]](config: ParquetValueScheme.Config[T]) - extends ParquetTBaseScheme[T](config) { + extends ParquetTBaseScheme[T](config) { - override def sourceConfInit(fp: FlowProcess[JobConf], - tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], - jobConf: JobConf): Unit = { + override def sourceConfInit( + fp: FlowProcess[JobConf], + tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], + jobConf: JobConf + ): Unit = { super.sourceConfInit(fp, tap, jobConf) @@ -46,57 +47,54 @@ class Parquet346TBaseScheme[T <: TBase[_, _]](config: ParquetValueScheme.Config[ } /** - * Same as TBaseRecordConverter with one important (subtle) difference. - * It passes a repaired schema (StructType) to ThriftRecordConverter's - * constructor. This is important because older files don't contain all the metadata needed for - * ThriftSchemaConverter to not throw, but we can put dummy data in there because it's not actually - * used. + * Same as TBaseRecordConverter with one important (subtle) difference. It passes a repaired schema + * (StructType) to ThriftRecordConverter's constructor. This is important because older files don't contain + * all the metadata needed for ThriftSchemaConverter to not throw, but we can put dummy data in there because + * it's not actually used. 
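A minimal sketch of what the "repair" described above amounts to, using only names defined in this file (here `thriftType` stands for the StructType read from the file metadata):

    // Copy the StructType, filling in the structOrUnionType metadata that older
    // files lack: every struct in the copy carries StructOrUnionType.UNION, which
    // is enough to keep ThriftSchemaConverter from throwing. The value itself is
    // never consulted.
    val repaired: StructType = Parquet346StructTypeRepairer.repair(thriftType)
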
*/ -class Parquet346TBaseRecordConverter[T <: TBase[_, _]](thriftClass: Class[T], - requestedParquetSchema: MessageType, thriftType: ThriftType.StructType) extends ThriftRecordConverter[T]( - // this is a little confusing because it's all being passed to the super constructor - - // this thrift reader is the same as what's in ScroogeRecordConverter's constructor - new ThriftReader[T] { - override def readOneRecord(protocol: TProtocol): T = { - try { - val thriftObject: T = thriftClass.newInstance - thriftObject.read(protocol) - thriftObject - } catch { - case e: InstantiationException => - throw new ParquetDecodingException("Could not instantiate Thrift " + thriftClass, e) - case e: IllegalAccessException => - throw new ParquetDecodingException("Thrift class or constructor not public " + thriftClass, e) - } - } - }, - thriftClass.getSimpleName, - requestedParquetSchema, - - // this is the fix -- we add in the missing structOrUnionType metadata - // before passing it along - Parquet346StructTypeRepairer.repair(thriftType)) +class Parquet346TBaseRecordConverter[T <: TBase[_, _]]( + thriftClass: Class[T], + requestedParquetSchema: MessageType, + thriftType: ThriftType.StructType +) extends ThriftRecordConverter[T]( + // this is a little confusing because it's all being passed to the super constructor + + // this thrift reader is the same as what's in ScroogeRecordConverter's constructor + new ThriftReader[T] { + override def readOneRecord(protocol: TProtocol): T = + try { + val thriftObject: T = thriftClass.newInstance + thriftObject.read(protocol) + thriftObject + } catch { + case e: InstantiationException => + throw new ParquetDecodingException("Could not instantiate Thrift " + thriftClass, e) + case e: IllegalAccessException => + throw new ParquetDecodingException("Thrift class or constructor not public " + thriftClass, e) + } + }, + thriftClass.getSimpleName, + requestedParquetSchema, + + // this is the fix -- we add in the missing structOrUnionType metadata + // before passing it along + Parquet346StructTypeRepairer.repair(thriftType) + ) /** - * Takes a ThriftType with potentially missing structOrUnionType metadata, - * and makes a copy that sets all StructOrUnionType metadata to UNION + * Takes a ThriftType with potentially missing structOrUnionType metadata, and makes a copy that sets all + * StructOrUnionType metadata to UNION */ object Parquet346StructTypeRepairer extends StateVisitor[ThriftType, Unit] { - def repair(fromMetadata: StructType): StructType = { + def repair(fromMetadata: StructType): StructType = visit(fromMetadata, ()) - } - def copyRecurse(field: ThriftField): ThriftField = { + def copyRecurse(field: ThriftField): ThriftField = new ThriftField(field.getName, field.getFieldId, field.getRequirement, field.getType.accept(this, ())) - } override def visit(structType: StructType, state: Unit): StructType = { - val repairedChildren = structType - .getChildren - .asScala - .iterator + val repairedChildren = structType.getChildren.asScala.iterator .map(copyRecurse) new StructType(repairedChildren.toBuffer.asJava, StructOrUnionType.UNION) diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/ParquetThrift.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/ParquetThrift.scala index 077b9db1f1..c4981d4363 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/ParquetThrift.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/ParquetThrift.scala @@ -12,22 +12,22 @@ 
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.parquet.thrift import cascading.scheme.Scheme import com.twitter.scalding._ import com.twitter.scalding.parquet.{ - StrictColumnProjectionString, DeprecatedColumnProjectionString, HasColumnProjection, HasFilterPredicate, - ParquetValueScheme + ParquetValueScheme, + StrictColumnProjectionString } -import com.twitter.scalding.source.{ DailySuffixSource, HourlySuffixSource } +import com.twitter.scalding.source.{DailySuffixSource, HourlySuffixSource} import java.io.Serializable -import org.apache.thrift.{ TBase, TFieldIdEnum } +import org.apache.thrift.{TBase, TFieldIdEnum} import scala.reflect.ClassTag @@ -44,12 +44,14 @@ trait ParquetThriftBase[T] extends LocalTapSource with HasFilterPredicate with H val config = new ParquetValueScheme.Config[T].withRecordClass(clazz) val configWithFp = withFilter match { case Some(fp) => config.withFilterPredicate(fp) - case None => config + case None => config } val configWithProjection = columnProjectionString match { - case Some(s @ DeprecatedColumnProjectionString(_)) => configWithFp.withProjectionString(s.asSemicolonString) - case Some(s @ StrictColumnProjectionString(_)) => configWithFp.withStrictProjectionString(s.asSemicolonString) + case Some(s @ DeprecatedColumnProjectionString(_)) => + configWithFp.withProjectionString(s.asSemicolonString) + case Some(s @ StrictColumnProjectionString(_)) => + configWithFp.withStrictProjectionString(s.asSemicolonString) case None => configWithFp } @@ -57,7 +59,11 @@ trait ParquetThriftBase[T] extends LocalTapSource with HasFilterPredicate with H } } -trait ParquetThriftBaseFileSource[T] extends FileSource with ParquetThriftBase[T] with SingleMappable[T] with TypedSink[T] { +trait ParquetThriftBaseFileSource[T] + extends FileSource + with ParquetThriftBase[T] + with SingleMappable[T] + with TypedSink[T] { override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) } @@ -72,21 +78,20 @@ trait ParquetThrift[T <: ParquetThrift.ThriftBase] extends ParquetThriftBaseFile } /** - * When Using these sources or creating subclasses of them, you can - * provide a filter predicate and / or a set of fields (columns) to keep (project). + * When Using these sources or creating subclasses of them, you can provide a filter predicate and / or a set + * of fields (columns) to keep (project). * - * The filter predicate will be pushed down to the input format, potentially - * making the filter significantly more efficient than a filter applied to - * a TypedPipe (parquet push-down filters can skip reading entire chunks of data off disk). + * The filter predicate will be pushed down to the input format, potentially making the filter significantly + * more efficient than a filter applied to a TypedPipe (parquet push-down filters can skip reading entire + * chunks of data off disk). * - * For data with a large schema (many fields / columns), providing the set of columns - * you intend to use can also make your job significantly more efficient (parquet column projection - * push-down will skip reading unused columns from disk). 
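A minimal sketch of the override-style usage this comment describes, mirroring the pattern used in the test sources elsewhere in this patch (MyThriftStruct, the path, and the glob strings are placeholders):

    import com.twitter.scalding.DateRange
    import com.twitter.scalding.parquet.thrift.DailySuffixParquetThrift
    import org.apache.parquet.filter2.predicate.FilterPredicate

    // Fix the filter and the projected columns when defining the source by
    // overriding the members declared in HasFilterPredicate / HasColumnProjection.
    class MyEvents(dr: DateRange, fp: FilterPredicate)
        extends DailySuffixParquetThrift[MyThriftStruct]("/logs/events", dr) {
      override val withFilter: Option[FilterPredicate] = Some(fp)
      override def withColumnProjections: Set[String] = Set("a.b.c", "x.y")
    }
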
- * The columns are specified in the format described here: + * For data with a large schema (many fields / columns), providing the set of columns you intend to use can + * also make your job significantly more efficient (parquet column projection push-down will skip reading + * unused columns from disk). The columns are specified in the format described here: * https://github.com/apache/parquet-mr/blob/master/parquet_cascading.md#21-projection-pushdown-with-thriftscrooge-records * - * These settings are defined in the traits [[com.twitter.scalding.parquet.HasFilterPredicate]] - * and [[com.twitter.scalding.parquet.HasColumnProjection]] + * These settings are defined in the traits [[com.twitter.scalding.parquet.HasFilterPredicate]] and + * [[com.twitter.scalding.parquet.HasColumnProjection]] * * Here are two ways you can use these in a parquet source: * @@ -111,15 +116,17 @@ trait ParquetThrift[T <: ParquetThrift.ThriftBase] extends ParquetThriftBaseFile * val mySourceFilteredAndProjected = new MyParquetSource(dr, Some(myFp), Set("a.b.c", "x.y")) * }}} */ -class DailySuffixParquetThrift[T <: ParquetThrift.ThriftBase]( - path: String, - dateRange: DateRange)(implicit override val ct: ClassTag[T]) - extends DailySuffixSource(path, dateRange) with ParquetThrift[T] - -class HourlySuffixParquetThrift[T <: ParquetThrift.ThriftBase]( - path: String, - dateRange: DateRange)(implicit override val ct: ClassTag[T]) - extends HourlySuffixSource(path, dateRange) with ParquetThrift[T] - -class FixedPathParquetThrift[T <: ParquetThrift.ThriftBase](paths: String*)(implicit override val ct: ClassTag[T]) - extends FixedPathSource(paths: _*) with ParquetThrift[T] +class DailySuffixParquetThrift[T <: ParquetThrift.ThriftBase](path: String, dateRange: DateRange)(implicit + override val ct: ClassTag[T] +) extends DailySuffixSource(path, dateRange) + with ParquetThrift[T] + +class HourlySuffixParquetThrift[T <: ParquetThrift.ThriftBase](path: String, dateRange: DateRange)(implicit + override val ct: ClassTag[T] +) extends HourlySuffixSource(path, dateRange) + with ParquetThrift[T] + +class FixedPathParquetThrift[T <: ParquetThrift.ThriftBase](paths: String*)(implicit + override val ct: ClassTag[T] +) extends FixedPathSource(paths: _*) + with ParquetThrift[T] diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/PartitionedParquetThriftSource.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/PartitionedParquetThriftSource.scala index b24ca3cc3f..ab1dd97f6e 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/PartitionedParquetThriftSource.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/thrift/PartitionedParquetThriftSource.scala @@ -1,17 +1,16 @@ package com.twitter.scalding.parquet.thrift import cascading.scheme.Scheme -import com.twitter.scalding.typed.{ PartitionSchemed, PartitionUtil } -import com.twitter.scalding.{ FixedPathSource, HadoopSchemeInstance, TupleConverter, TupleSetter } +import com.twitter.scalding.typed.{PartitionSchemed, PartitionUtil} +import com.twitter.scalding.{FixedPathSource, HadoopSchemeInstance, TupleConverter, TupleSetter} import scala.reflect.ClassTag /** * Scalding source to read or write partitioned Parquet thrift data. * - * For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and - * `T` is the thrift object. `P` must be either a String or a tuple of Strings. - * Below is an example. 
+ * For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and `T` is the + * thrift object. `P` must be either a String or a tuple of Strings. Below is an example. * {{{ * val data: TypedPipe[MyThriftObject] = ??? * data.map{ obj => @@ -19,23 +18,31 @@ import scala.reflect.ClassTag * }.write(PartitionedParquetThriftSource[(String, String), MyThriftObject](path, "%s/%s")) * }}} * - * For reading it produces a pair `(P, T)` where `P` is the partition data, `T` is the corresponding - * thrift object. Below is an example. + * For reading it produces a pair `(P, T)` where `P` is the partition data, `T` is the corresponding thrift + * object. Below is an example. * {{{ * val in: TypedPipe[(String, String), MyThriftObject] = * TypedPipe.from( PartitionedParquetThriftSource[(String, String), MyThriftObject](path, "%s/%s") ) * }}} - * */ -case class PartitionedParquetThriftSource[P, T <: ParquetThrift.ThriftBase](path: String, template: String)(implicit val ct: ClassTag[T], - val valueSetter: TupleSetter[T], val valueConverter: TupleConverter[T], val partitionSetter: TupleSetter[P], val partitionConverter: TupleConverter[P]) - extends FixedPathSource(path) with ParquetThriftBase[T] with PartitionSchemed[P, T] with Serializable { +case class PartitionedParquetThriftSource[P, T <: ParquetThrift.ThriftBase](path: String, template: String)( + implicit + val ct: ClassTag[T], + val valueSetter: TupleSetter[T], + val valueConverter: TupleConverter[T], + val partitionSetter: TupleSetter[P], + val partitionConverter: TupleConverter[P] +) extends FixedPathSource(path) + with ParquetThriftBase[T] + with PartitionSchemed[P, T] + with Serializable { override val fields = PartitionUtil.toFields(0, implicitly[TupleSetter[T]].arity) assert( fields.size == valueSetter.arity, - "The number of fields needs to be the same as the arity of the value setter") + "The number of fields needs to be the same as the arity of the value setter" + ) // Create the underlying scheme and explicitly set the source, sink fields to be only the specified fields override def hdfsScheme = { diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/ParquetTuple.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/ParquetTuple.scala index dced2b1c88..229d94041d 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/ParquetTuple.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/ParquetTuple.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.parquet.tuple @@ -20,15 +20,16 @@ import cascading.scheme.Scheme import cascading.tuple.Fields import com.twitter.scalding._ import com.twitter.scalding.parquet.HasFilterPredicate -import com.twitter.scalding.source.{ DailySuffixSource, HourlySuffixSource } +import com.twitter.scalding.source.{DailySuffixSource, HourlySuffixSource} object ParquetTupleSource { def apply(fields: Fields, paths: String*) = new FixedPathParquetTuple(fields, paths: _*) } /** - * User should define their own source like: - * class MySource(path: String, dateRange: DateRange, requestedFields: Fields) extends DailySuffixParquetTuple(path, dateRange, requestedFields) with Mappable2[Int, Int] with TypedSink2[Int,Int] + * User should define their own source like: class MySource(path: String, dateRange: DateRange, + * requestedFields: Fields) extends DailySuffixParquetTuple(path, dateRange, requestedFields) with + * Mappable2[Int, Int] with TypedSink2[Int,Int] */ trait ParquetTupleSource extends FileSource with HasFilterPredicate { def fields: Fields @@ -37,7 +38,7 @@ trait ParquetTupleSource extends FileSource with HasFilterPredicate { val scheme = withFilter match { case Some(fp) => new ParquetTupleScheme(fp, fields) - case None => new ParquetTupleScheme(fields) + case None => new ParquetTupleScheme(fields) } HadoopSchemeInstance(scheme.asInstanceOf[Scheme[_, _, _, _, _]]) @@ -46,19 +47,17 @@ trait ParquetTupleSource extends FileSource with HasFilterPredicate { } /** - * See [[com.twitter.scalding.parquet.thrift.DailySuffixParquetThrift]] for documentation on - * how to specify filter predicates for these sources. + * See [[com.twitter.scalding.parquet.thrift.DailySuffixParquetThrift]] for documentation on how to specify + * filter predicates for these sources. 
*/ -class DailySuffixParquetTuple( - path: String, - dateRange: DateRange, - override val fields: Fields) extends DailySuffixSource(path, dateRange) with ParquetTupleSource +class DailySuffixParquetTuple(path: String, dateRange: DateRange, override val fields: Fields) + extends DailySuffixSource(path, dateRange) + with ParquetTupleSource -class HourlySuffixParquetTuple( - path: String, - dateRange: DateRange, - override val fields: Fields) extends HourlySuffixSource(path, dateRange) with ParquetTupleSource +class HourlySuffixParquetTuple(path: String, dateRange: DateRange, override val fields: Fields) + extends HourlySuffixSource(path, dateRange) + with ParquetTupleSource -class FixedPathParquetTuple( - override val fields: Fields, - paths: String*) extends FixedPathSource(paths: _*) with ParquetTupleSource +class FixedPathParquetTuple(override val fields: Fields, paths: String*) + extends FixedPathSource(paths: _*) + with ParquetTupleSource diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/TypedParquet.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/TypedParquet.scala index 4947cce86f..7ad55698b4 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/TypedParquet.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/TypedParquet.scala @@ -4,62 +4,77 @@ import org.apache.parquet.filter2.predicate.FilterPredicate import cascading.scheme.Scheme import com.twitter.scalding._ import com.twitter.scalding.parquet.HasFilterPredicate -import com.twitter.scalding.parquet.tuple.scheme.{ ParquetReadSupport, ParquetWriteSupport, TypedParquetTupleScheme } +import com.twitter.scalding.parquet.tuple.scheme.{ + ParquetReadSupport, + ParquetWriteSupport, + TypedParquetTupleScheme +} /** * Typed parquet tuple - * @author Jian Tang + * @author + * Jian Tang */ object TypedParquet { + /** - * Create readable typed parquet source. - * Here is an example: - * import com.twitter.scalding.parquet.tuple.macros.Macros._ - * val parquetTuple = TypedParquet[SampleClass](Seq(outputPath)) + * Create readable typed parquet source. Here is an example: import + * com.twitter.scalding.parquet.tuple.macros.Macros._ val parquetTuple = + * TypedParquet[SampleClass](Seq(outputPath)) * - * @param paths paths of parquet I/O - * @tparam T Tuple type - * @return a typed parquet source. + * @param paths + * paths of parquet I/O + * @tparam T + * Tuple type + * @return + * a typed parquet source. */ def apply[T](paths: Seq[String])(implicit readSupport: ParquetReadSupport[T]): TypedParquet[T] = new TypedFixedPathParquetTuple[T](paths, readSupport, null) - def apply[T](path: String)(implicit readSupport: ParquetReadSupport[T]): TypedParquet[T] = apply[T](Seq(path)) + def apply[T](path: String)(implicit readSupport: ParquetReadSupport[T]): TypedParquet[T] = + apply[T](Seq(path)) /** * Create readable typed parquet source with filter predicate. 
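A minimal end-to-end sketch of the typed API these factories expose, following the usage shown in the comments above (SampleClass, the job name, and the argument names are placeholders; the implicit read/write supports come from the Macros._ import):

    import com.twitter.scalding.{Args, Job}
    import com.twitter.scalding.parquet.tuple.macros.Macros._
    import com.twitter.scalding.parquet.tuple.{TypedParquet, TypedParquetSink}
    import com.twitter.scalding.typed.TypedPipe

    case class SampleClass(x: Int, y: String)

    // Copies SampleClass records from one parquet path to another through the
    // typed source and sink created by the factory methods above.
    class CopySampleClassJob(args: Args) extends Job(args) {
      val source = TypedParquet[SampleClass](Seq(args("input")))
      val sink = TypedParquetSink[SampleClass](Seq(args("output")))
      TypedPipe.from(source).write(sink)
    }
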
*/ - def apply[T](paths: Seq[String], fp: FilterPredicate)(implicit readSupport: ParquetReadSupport[T]): TypedParquet[T] = + def apply[T](paths: Seq[String], fp: FilterPredicate)(implicit + readSupport: ParquetReadSupport[T] + ): TypedParquet[T] = new TypedFixedPathParquetTuple[T](paths, readSupport, null) { override def withFilter = Some(fp) } - def apply[T](path: String, fp: FilterPredicate)(implicit readSupport: ParquetReadSupport[T]): TypedParquet[T] = + def apply[T](path: String, fp: FilterPredicate)(implicit + readSupport: ParquetReadSupport[T] + ): TypedParquet[T] = apply[T](Seq(path), fp) } object TypedParquetSink { + /** - * Create typed parquet sink. - * Here is an example: - * import com.twitter.scalding.parquet.tuple.macros.Macros._ - * val sink = TypedParquetSink[SampleClass](Seq(outputPath)) + * Create typed parquet sink. Here is an example: import com.twitter.scalding.parquet.tuple.macros.Macros._ + * val sink = TypedParquetSink[SampleClass](Seq(outputPath)) * - * @param paths paths of parquet I/O - * @tparam T Tuple type - * @return a typed parquet source. + * @param paths + * paths of parquet I/O + * @tparam T + * Tuple type + * @return + * a typed parquet source. */ def apply[T](paths: Seq[String])(implicit writeSupport: ParquetWriteSupport[T]): TypedParquet[T] = new TypedFixedPathParquetTuple[T](paths, null, writeSupport) - def apply[T](path: String)(implicit writeSupport: ParquetWriteSupport[T]): TypedParquet[T] = apply[T](Seq(path)) + def apply[T](path: String)(implicit writeSupport: ParquetWriteSupport[T]): TypedParquet[T] = + apply[T](Seq(path)) } /** * Typed Parquet tuple source/sink. */ -trait TypedParquet[T] extends FileSource with Mappable[T] - with TypedSink[T] with HasFilterPredicate { +trait TypedParquet[T] extends FileSource with Mappable[T] with TypedSink[T] with HasFilterPredicate { def readSupport: ParquetReadSupport[T] def writeSupport: ParquetWriteSupport[T] @@ -74,5 +89,9 @@ trait TypedParquet[T] extends FileSource with Mappable[T] } } -class TypedFixedPathParquetTuple[T](val paths: Seq[String], val readSupport: ParquetReadSupport[T], - val writeSupport: ParquetWriteSupport[T]) extends FixedPathSource(paths: _*) with TypedParquet[T] +class TypedFixedPathParquetTuple[T]( + val paths: Seq[String], + val readSupport: ParquetReadSupport[T], + val writeSupport: ParquetWriteSupport[T] +) extends FixedPathSource(paths: _*) + with TypedParquet[T] diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/Macros.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/Macros.scala index d29bce9d71..ee15f68e8f 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/Macros.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/Macros.scala @@ -1,16 +1,20 @@ package com.twitter.scalding.parquet.tuple.macros -import com.twitter.scalding.parquet.tuple.macros.impl.{ ParquetReadSupportProvider, ParquetSchemaProvider, WriteSupportProvider } -import com.twitter.scalding.parquet.tuple.scheme.{ ParquetReadSupport, ParquetWriteSupport } +import com.twitter.scalding.parquet.tuple.macros.impl.{ + ParquetReadSupportProvider, + ParquetSchemaProvider, + WriteSupportProvider +} +import com.twitter.scalding.parquet.tuple.scheme.{ParquetReadSupport, ParquetWriteSupport} import scala.reflect.macros.whitebox.Context import scala.language.experimental.macros /** - * Macros used to generate parquet tuple read/write support. 
- * These macros support only case class that contains primitive fields or nested case classes and also collection fields - * like scala List, Set, and Map. - * @author Jian TANG + * Macros used to generate parquet tuple read/write support. These macros support only case class that + * contains primitive fields or nested case classes and also collection fields like scala List, Set, and Map. + * @author + * Jian TANG */ class Impl(val c: Context) { @@ -29,26 +33,21 @@ class Impl(val c: Context) { } object Macros { + /** * Macro used to generate parquet schema for a given case class. For example if we have: * - * case class SampleClassA(x: Int, y: String) - * case class SampleClassB(a: SampleClassA, y: String) + * case class SampleClassA(x: Int, y: String) case class SampleClassB(a: SampleClassA, y: String) * * The macro will generate a parquet message type like this: * + * """ message SampleClassB { required group a { required int32 x; required binary y; } required binary y; } * """ - * message SampleClassB { - * required group a { - * required int32 x; - * required binary y; - * } - * required binary y; - * } - * """ * - * @tparam T Case class type that contains primitive fields or collection fields or nested case class. - * @return Generated case class parquet message type string + * @tparam T + * Case class type that contains primitive fields or collection fields or nested case class. + * @return + * Generated case class parquet message type string */ implicit def caseClassParquetSchema[T]: String = macro Impl.schema[T] @@ -59,8 +58,10 @@ object Macros { /** * Macro used to generate case class write support to parquet. - * @tparam T User defined case class tuple type. - * @return Generated case class tuple write support function. + * @tparam T + * User defined case class tuple type. + * @return + * Generated case class tuple write support function. */ implicit def caseClassParquetWriteSupport[T]: ParquetWriteSupport[T] = macro Impl.writeSupport[T] } diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/ParquetReadSupportProvider.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/ParquetReadSupportProvider.scala index 3c7cf49d97..b3e4bd91c4 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/ParquetReadSupportProvider.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/ParquetReadSupportProvider.scala @@ -14,17 +14,26 @@ class ParquetReadSupportProvider(schemaProvider: ParquetSchemaProvider) { private[this] case object SET extends CollectionType private[this] case object MAP extends CollectionType - def toParquetReadSupportImpl[T](ctx: Context)(implicit T: ctx.WeakTypeTag[T]): ctx.Expr[ParquetReadSupport[T]] = { + def toParquetReadSupportImpl[T]( + ctx: Context + )(implicit T: ctx.WeakTypeTag[T]): ctx.Expr[ParquetReadSupport[T]] = { import ctx.universe._ if (!IsCaseClassImpl.isCaseClassType(ctx)(T.tpe)) - ctx.abort(ctx.enclosingPosition, + ctx.abort( + ctx.enclosingPosition, s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. 
- This will mean the macro is operating on a non-resolved type.""") - - def buildGroupConverter(tpe: Type, converters: List[Tree], converterGetters: List[Tree], - converterResetCalls: List[Tree], valueBuilder: Tree): Tree = + This will mean the macro is operating on a non-resolved type.""" + ) + + def buildGroupConverter( + tpe: Type, + converters: List[Tree], + converterGetters: List[Tree], + converterResetCalls: List[Tree], + valueBuilder: Tree + ): Tree = q"""new _root_.com.twitter.scalding.parquet.tuple.scheme.ParquetTupleConverter[$tpe]{ ..$converters @@ -59,21 +68,27 @@ class ParquetReadSupportProvider(schemaProvider: ParquetSchemaProvider) { collectionType match { case OPTION => - val child = if (isPrimitive) primitiveCollectionElementConverter else caseClassFieldCollectionElementConverter + val child = + if (isPrimitive) primitiveCollectionElementConverter + else caseClassFieldCollectionElementConverter q""" val $converterName = new _root_.com.twitter.scalding.parquet.tuple.scheme.OptionConverter[$fieldType] { $child } """ case LIST => - val child = if (isPrimitive) primitiveCollectionElementConverter else caseClassFieldCollectionElementConverter + val child = + if (isPrimitive) primitiveCollectionElementConverter + else caseClassFieldCollectionElementConverter q""" val $converterName = new _root_.com.twitter.scalding.parquet.tuple.scheme.ListConverter[$fieldType] { $child } """ case SET => - val child = if (isPrimitive) primitiveCollectionElementConverter else caseClassFieldCollectionElementConverter + val child = + if (isPrimitive) primitiveCollectionElementConverter + else caseClassFieldCollectionElementConverter q""" val $converterName = new _root_.com.twitter.scalding.parquet.tuple.scheme.SetConverter[$fieldType] { @@ -81,13 +96,18 @@ class ParquetReadSupportProvider(schemaProvider: ParquetSchemaProvider) { } """ case MAP => converter - case _ => q"val $converterName = $converter" + case _ => q"val $converterName = $converter" } } - def createMapFieldConverter(converterName: TermName, K: Type, V: Type, keyConverter: Tree, - valueConverter: Tree): Tree = + def createMapFieldConverter( + converterName: TermName, + K: Type, + V: Type, + keyConverter: Tree, + valueConverter: Tree + ): Tree = q"""val $converterName = new _root_.com.twitter.scalding.parquet.tuple.scheme.MapConverter[$K, $V] { override val child: _root_.com.twitter.scalding.parquet.tuple.scheme.TupleFieldConverter[($K, $V)] = @@ -118,7 +138,12 @@ class ParquetReadSupportProvider(schemaProvider: ParquetSchemaProvider) { createFieldMatchResult(converterName, converter) } - def matchMapField(K: Type, V: Type, keyConverter: Tree, valueConverter: Tree): (Tree, Tree, Tree, Tree) = { + def matchMapField( + K: Type, + V: Type, + keyConverter: Tree, + valueConverter: Tree + ): (Tree, Tree, Tree, Tree) = { val converterName = newTermName(ctx.fresh("fieldConverter")) val mapConverter = createMapFieldConverter(converterName, K, V, keyConverter, valueConverter) createFieldMatchResult(converterName, mapConverter) @@ -156,27 +181,35 @@ class ParquetReadSupportProvider(schemaProvider: ParquetSchemaProvider) { val (valueConverter, _, _, _) = matchField(0, valueType, MAP) matchMapField(keyType, valueType, keyConverter, valueConverter) case tpe if IsCaseClassImpl.isCaseClassType(ctx)(tpe) => - val (innerConverters, innerConvertersGetters, innerConvertersResetCalls, innerFieldValues) = unzip(expandMethod(tpe)) + val (innerConverters, innerConvertersGetters, innerConvertersResetCalls, innerFieldValues) = unzip( + expandMethod(tpe) + ) 
val innerValueBuilderTree = buildTupleValue(tpe, innerFieldValues) - val converterTree: Tree = buildGroupConverter(tpe, innerConverters, innerConvertersGetters, - innerConvertersResetCalls, innerValueBuilderTree) + val converterTree: Tree = buildGroupConverter( + tpe, + innerConverters, + innerConvertersGetters, + innerConvertersResetCalls, + innerValueBuilderTree + ) matchCaseClassField(converterTree) case _ => ctx.abort(ctx.enclosingPosition, s"Case class $T has unsupported field type : $fieldType ") } } def expandMethod(outerTpe: Type): List[(Tree, Tree, Tree, Tree)] = - outerTpe - .declarations + outerTpe.declarations .collect { case m: MethodSymbol if m.isCaseAccessor => m } .zipWithIndex - .map { - case (accessorMethod, idx) => - val fieldType = accessorMethod.returnType - matchField(idx, fieldType, NOT_A_COLLECTION) - }.toList + .map { case (accessorMethod, idx) => + val fieldType = accessorMethod.returnType + matchField(idx, fieldType, NOT_A_COLLECTION) + } + .toList - def unzip(treeTuples: List[(Tree, Tree, Tree, Tree)]): (List[Tree], List[Tree], List[Tree], List[Tree]) = { + def unzip( + treeTuples: List[(Tree, Tree, Tree, Tree)] + ): (List[Tree], List[Tree], List[Tree], List[Tree]) = { val emptyTreeList = List[Tree]() treeTuples.foldRight(emptyTreeList, emptyTreeList, emptyTreeList, emptyTreeList) { case ((t1, t2, t3, t4), (l1, l2, l3, l4)) => @@ -192,8 +225,13 @@ class ParquetReadSupportProvider(schemaProvider: ParquetSchemaProvider) { } val (converters, converterGetters, convertersResetCalls, fieldValues) = unzip(expandMethod(T.tpe)) - val groupConverter = buildGroupConverter(T.tpe, converters, converterGetters, convertersResetCalls, - buildTupleValue(T.tpe, fieldValues)) + val groupConverter = buildGroupConverter( + T.tpe, + converters, + converterGetters, + convertersResetCalls, + buildTupleValue(T.tpe, fieldValues) + ) val schema = schemaProvider.toParquetSchemaImpl[T](ctx) val readSupport = q""" diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/ParquetSchemaProvider.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/ParquetSchemaProvider.scala index 78810dbb89..8a06cd45f7 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/ParquetSchemaProvider.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/ParquetSchemaProvider.scala @@ -10,8 +10,11 @@ class ParquetSchemaProvider(fieldRenamer: (String => String)) { import c.universe._ if (!IsCaseClassImpl.isCaseClassType(c)(T.tpe)) - c.abort(c.enclosingPosition, s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. - This will mean the macro is operating on a non-resolved type.""") + c.abort( + c.enclosingPosition, + s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. 
+ This will mean the macro is operating on a non-resolved type.""" + ) def matchField(fieldType: Type, originalFieldName: String, isOption: Boolean): Tree = { val fieldName = fieldRenamer(originalFieldName) @@ -28,7 +31,9 @@ class ParquetSchemaProvider(fieldRenamer: (String => String)) { case tpe if tpe =:= typeOf[String] => createPrimitiveTypeField(q"_root_.org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY") case tpe if tpe =:= typeOf[Boolean] => - createPrimitiveTypeField(q"_root_.org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN") + createPrimitiveTypeField( + q"_root_.org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN" + ) case tpe if tpe =:= typeOf[Short] || tpe =:= typeOf[Int] || tpe =:= typeOf[Byte] => createPrimitiveTypeField(q"_root_.org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32") case tpe if tpe =:= typeOf[Long] => @@ -55,16 +60,15 @@ class ParquetSchemaProvider(fieldRenamer: (String => String)) { } } - def expandMethod(outerTpe: Type): List[Tree] = { - outerTpe - .declarations + def expandMethod(outerTpe: Type): List[Tree] = + outerTpe.declarations .collect { case m: MethodSymbol if m.isCaseAccessor => m } .map { accessorMethod => val fieldName = accessorMethod.name.toString val fieldType = accessorMethod.returnType matchField(fieldType, fieldName, isOption = false) - }.toList - } + } + .toList val expanded = expandMethod(T.tpe) diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/WriteSupportProvider.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/WriteSupportProvider.scala index 2414907d35..ca108d0727 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/WriteSupportProvider.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/macros/impl/WriteSupportProvider.scala @@ -7,20 +7,27 @@ import scala.reflect.macros.whitebox.Context class WriteSupportProvider(schemaProvider: ParquetSchemaProvider) { - def toWriteSupportImpl[T](ctx: Context)(implicit T: ctx.WeakTypeTag[T]): ctx.Expr[ParquetWriteSupport[T]] = { + def toWriteSupportImpl[T]( + ctx: Context + )(implicit T: ctx.WeakTypeTag[T]): ctx.Expr[ParquetWriteSupport[T]] = { import ctx.universe._ if (!IsCaseClassImpl.isCaseClassType(ctx)(T.tpe)) - ctx.abort(ctx.enclosingPosition, + ctx.abort( + ctx.enclosingPosition, s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. 
- This will mean the macro is operating on a non-resolved type.""") + This will mean the macro is operating on a non-resolved type.""" + ) def matchField(idx: Int, fieldType: Type, fValue: Tree, groupName: TermName): (Int, Tree) = { def writePrimitiveField(wTree: Tree) = - (idx + 1, q"""rc.startField($groupName.getFieldName($idx), $idx) + ( + idx + 1, + q"""rc.startField($groupName.getFieldName($idx), $idx) $wTree - rc.endField($groupName.getFieldName($idx), $idx)""") + rc.endField($groupName.getFieldName($idx), $idx)""" + ) def writeGroupField(subTree: Tree) = q"""rc.startField($groupName.getFieldName($idx), $idx) @@ -57,29 +64,42 @@ class WriteSupportProvider(schemaProvider: ParquetSchemaProvider) { val cacheName = newTermName(ctx.fresh("optionIndex")) val innerType = tpe.asInstanceOf[TypeRefApi].args.head val (_, subTree) = matchField(idx, innerType, q"$cacheName", groupName) - (idx + 1, q"""if($fValue.isDefined) { + ( + idx + 1, + q"""if($fValue.isDefined) { val $cacheName = $fValue.get $subTree } - """) + """ + ) case tpe if tpe.erasure =:= typeOf[List[Any]] || tpe.erasure =:= typeOf[Set[_]] => val innerType = tpe.asInstanceOf[TypeRefApi].args.head val newGroupName = createGroupName() val (_, subTree) = matchField(0, innerType, q"element", newGroupName) - (idx + 1, writeCollectionField(newGroupName, q""" + ( + idx + 1, + writeCollectionField( + newGroupName, + q""" rc.startField("list", 0) $fValue.foreach{ element => rc.startGroup() $subTree rc.endGroup } - rc.endField("list", 0)""")) + rc.endField("list", 0)""" + ) + ) case tpe if tpe.erasure =:= typeOf[Map[_, Any]] => val List(keyType, valueType) = tpe.asInstanceOf[TypeRefApi].args val newGroupName = createGroupName() val (_, keySubTree) = matchField(0, keyType, q"key", newGroupName) val (_, valueSubTree) = matchField(1, valueType, q"value", newGroupName) - (idx + 1, writeCollectionField(newGroupName, q""" + ( + idx + 1, + writeCollectionField( + newGroupName, + q""" rc.startField("map", 0) $fValue.foreach{ case(key, value) => rc.startGroup() @@ -87,32 +107,36 @@ class WriteSupportProvider(schemaProvider: ParquetSchemaProvider) { $valueSubTree rc.endGroup } - rc.endField("map", 0)""")) + rc.endField("map", 0)""" + ) + ) case tpe if IsCaseClassImpl.isCaseClassType(ctx)(tpe) => val newGroupName = createGroupName() val (_, subTree) = expandMethod(tpe, fValue, newGroupName) - (idx + 1, + ( + idx + 1, q""" val $newGroupName = $groupName.getType($idx).asGroupType() - ${writeGroupField(subTree)}""") + ${writeGroupField(subTree)}""" + ) case _ => ctx.abort(ctx.enclosingPosition, s"Case class $T has unsupported field type : $fieldType") } } - def expandMethod(outerTpe: Type, pValueTree: Tree, groupName: TermName): (Int, Tree) = { - outerTpe - .declarations + def expandMethod(outerTpe: Type, pValueTree: Tree, groupName: TermName): (Int, Tree) = + outerTpe.declarations .collect { case m: MethodSymbol if m.isCaseAccessor => m } - .foldLeft((0, q"")) { - case ((idx, existingTree), getter) => - val (newIdx, subTree) = matchField(idx, getter.returnType, q"$pValueTree.$getter", groupName) - (newIdx, q""" + .foldLeft((0, q"")) { case ((idx, existingTree), getter) => + val (newIdx, subTree) = matchField(idx, getter.returnType, q"$pValueTree.$getter", groupName) + ( + newIdx, + q""" $existingTree $subTree - """) + """ + ) } - } def createGroupName(): TermName = newTermName(ctx.fresh("group")) diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/scheme/ParquetTupleConverter.scala 
b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/scheme/ParquetTupleConverter.scala index 5bce0a9594..6387d639f0 100644 --- a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/scheme/ParquetTupleConverter.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/scheme/ParquetTupleConverter.scala @@ -1,9 +1,10 @@ package com.twitter.scalding.parquet.tuple.scheme -import org.apache.parquet.io.api.{ Binary, Converter, GroupConverter, PrimitiveConverter } +import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter} import scala.util.Try trait TupleFieldConverter[+T] extends Converter with Serializable { + /** * Current value read from parquet column */ @@ -25,7 +26,8 @@ abstract class ParquetTupleConverter[T] extends GroupConverter with TupleFieldCo /** * Primitive fields converter - * @tparam T primitive types (String, Double, Float, Long, Int, Short, Byte, Boolean) + * @tparam T + * primitive types (String, Double, Float, Long, Int, Short, Byte, Boolean) */ trait PrimitiveFieldConverter[T] extends PrimitiveConverter with TupleFieldConverter[T] { val defaultValue: T @@ -43,13 +45,13 @@ class StringConverter extends PrimitiveFieldConverter[String] { } class DoubleConverter extends PrimitiveFieldConverter[Double] { - override val defaultValue: Double = 0D + override val defaultValue: Double = 0d override def addDouble(v: Double): Unit = value = v } class FloatConverter extends PrimitiveFieldConverter[Float] { - override val defaultValue: Float = 0F + override val defaultValue: Float = 0f override def addFloat(v: Float): Unit = value = v } @@ -86,7 +88,8 @@ class BooleanConverter extends PrimitiveFieldConverter[Boolean] { /** * Collection field converter, such as list(Scala Option is also seen as a collection). 
- * @tparam T collection element type(can be primitive types or nested types) + * @tparam T + * collection element type(can be primitive types or nested types) */ trait CollectionConverter[T] { val child: TupleFieldConverter[T] @@ -96,10 +99,12 @@ trait CollectionConverter[T] { /** * A wrapper of primitive converters for modeling primitive fields in a collection - * @tparam T primitive types (String, Double, Float, Long, Int, Short, Byte, Boolean) + * @tparam T + * primitive types (String, Double, Float, Long, Int, Short, Byte, Boolean) */ -abstract class CollectionElementPrimitiveConverter[T](val parent: CollectionConverter[T]) extends PrimitiveConverter - with TupleFieldConverter[T] { +abstract class CollectionElementPrimitiveConverter[T](val parent: CollectionConverter[T]) + extends PrimitiveConverter + with TupleFieldConverter[T] { val delegate: PrimitiveFieldConverter[T] override def addBinary(v: Binary) = { @@ -139,10 +144,12 @@ abstract class CollectionElementPrimitiveConverter[T](val parent: CollectionConv /** * A wrapper of group converters for modeling group type element in a collection - * @tparam T group tuple type(can be a collection type, such as list) + * @tparam T + * group tuple type(can be a collection type, such as list) */ -abstract class CollectionElementGroupConverter[T](val parent: CollectionConverter[T]) extends GroupConverter - with TupleFieldConverter[T] { +abstract class CollectionElementGroupConverter[T](val parent: CollectionConverter[T]) + extends GroupConverter + with TupleFieldConverter[T] { val delegate: TupleFieldConverter[T] @@ -162,7 +169,8 @@ abstract class CollectionElementGroupConverter[T](val parent: CollectionConverte /** * Option converter for modeling option field - * @tparam T option element type(can be primitive types or nested types) + * @tparam T + * option element type(can be primitive types or nested types) */ abstract class OptionConverter[T] extends TupleFieldConverter[Option[T]] with CollectionConverter[T] { var value: Option[T] = None @@ -184,9 +192,9 @@ abstract class OptionConverter[T] extends TupleFieldConverter[Option[T]] with Co } /** - * List in parquet is represented by 3-level structure. - * Check this https://github.com/apache/incubator-parquet-format/blob/master/LogicalTypes.md - * Helper class to wrap a converter for a list group converter + * List in parquet is represented by 3-level structure. 
Check this + * https://github.com/apache/incubator-parquet-format/blob/master/LogicalTypes.md Helper class to wrap a + * converter for a list group converter */ object ListElement { def wrapper(child: Converter): GroupConverter = new GroupConverter() { @@ -201,11 +209,16 @@ object ListElement { override def start(): Unit = () } } + /** * List converter for modeling list field - * @tparam T list element type(can be primitive types or nested types) + * @tparam T + * list element type(can be primitive types or nested types) */ -abstract class ListConverter[T] extends GroupConverter with TupleFieldConverter[List[T]] with CollectionConverter[T] { +abstract class ListConverter[T] + extends GroupConverter + with TupleFieldConverter[List[T]] + with CollectionConverter[T] { var value: List[T] = Nil @@ -243,9 +256,13 @@ abstract class ListConverter[T] extends GroupConverter with TupleFieldConverter[ /** * Set converter for modeling set field - * @tparam T list element type(can be primitive types or nested types) + * @tparam T + * list element type(can be primitive types or nested types) */ -abstract class SetConverter[T] extends GroupConverter with TupleFieldConverter[Set[T]] with CollectionConverter[T] { +abstract class SetConverter[T] + extends GroupConverter + with TupleFieldConverter[Set[T]] + with CollectionConverter[T] { var value: Set[T] = Set() @@ -274,10 +291,15 @@ abstract class SetConverter[T] extends GroupConverter with TupleFieldConverter[S /** * Map converter for modeling map field - * @tparam K map key type - * @tparam V map value type + * @tparam K + * map key type + * @tparam V + * map value type */ -abstract class MapConverter[K, V] extends GroupConverter with TupleFieldConverter[Map[K, V]] with CollectionConverter[(K, V)] { +abstract class MapConverter[K, V] + extends GroupConverter + with TupleFieldConverter[Map[K, V]] + with CollectionConverter[(K, V)] { var value: Map[K, V] = Map() @@ -302,13 +324,14 @@ abstract class MapConverter[K, V] extends GroupConverter with TupleFieldConverte } abstract class MapKeyValueConverter[K, V](parent: CollectionConverter[(K, V)]) - extends CollectionElementGroupConverter[(K, V)](parent) { + extends CollectionElementGroupConverter[(K, V)](parent) { val keyConverter: TupleFieldConverter[K] val valueConverter: TupleFieldConverter[V] - override lazy val delegate: TupleFieldConverter[(K, V)] = new GroupConverter with TupleFieldConverter[(K, V)] { + override lazy val delegate: TupleFieldConverter[(K, V)] = new GroupConverter + with TupleFieldConverter[(K, V)] { override def currentValue: (K, V) = (keyConverter.currentValue, valueConverter.currentValue) override def reset(): Unit = { @@ -316,15 +339,16 @@ abstract class MapKeyValueConverter[K, V](parent: CollectionConverter[(K, V)]) valueConverter.reset() } - override def getConverter(i: Int): Converter = { + override def getConverter(i: Int): Converter = if (i == 0) keyConverter else if (i == 1) valueConverter - else throw new IllegalArgumentException("key_value has only the key (0) and value (1) fields expected: " + i) - } + else + throw new IllegalArgumentException( + "key_value has only the key (0) and value (1) fields expected: " + i + ) override def end(): Unit = () override def start(): Unit = reset() } } - diff --git a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/scheme/TypedParquetTupleScheme.scala b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/scheme/TypedParquetTupleScheme.scala index 4d9e7382e4..d4e22b84c8 100644 --- 
a/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/scheme/TypedParquetTupleScheme.scala +++ b/scalding-parquet/src/main/scala/com/twitter/scalding/parquet/tuple/scheme/TypedParquetTupleScheme.scala @@ -2,31 +2,33 @@ package com.twitter.scalding.parquet.tuple.scheme import com.twitter.scalding.parquet.ScaldingDeprecatedParquetInputFormat -import java.util.{ HashMap => JHashMap, Map => JMap } +import java.util.{HashMap => JHashMap, Map => JMap} import org.apache.parquet.filter2.predicate.FilterPredicate import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import org.apache.parquet.hadoop.api.WriteSupport.WriteContext -import org.apache.parquet.hadoop.api.{ InitContext, WriteSupport, ReadSupport } +import org.apache.parquet.hadoop.api.{InitContext, ReadSupport, WriteSupport} import org.apache.parquet.io.api._ import cascading.flow.FlowProcess -import cascading.scheme.{ Scheme, SinkCall, SourceCall } +import cascading.scheme.{Scheme, SinkCall, SourceCall} import cascading.tap.Tap import cascading.tuple.Tuple -import com.twitter.bijection.{ Injection, GZippedBase64String } +import com.twitter.bijection.{GZippedBase64String, Injection} import com.twitter.chill.KryoInjection import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred._ -import org.apache.parquet.hadoop.mapred.{ Container, DeprecatedParquetOutputFormat } -import org.apache.parquet.hadoop.{ ParquetInputFormat, ParquetOutputFormat } +import org.apache.parquet.hadoop.mapred.{Container, DeprecatedParquetOutputFormat} +import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetOutputFormat} import org.apache.parquet.schema._ -import scala.util.{ Failure, Success, Try } +import scala.util.{Failure, Success, Try} /** * Parquet tuple materializer permits to create user defined type record from parquet tuple values - * @param converter root converter - * @tparam T User defined value type + * @param converter + * root converter + * @tparam T + * User defined value type */ class ParquetTupleMaterializer[T](val converter: ParquetTupleConverter[T]) extends RecordMaterializer[T] { override def getCurrentRecord: T = converter.currentValue @@ -35,23 +37,32 @@ class ParquetTupleMaterializer[T](val converter: ParquetTupleConverter[T]) exten } /** - * Parquet read support used by [[org.apache.parquet.hadoop.ParquetInputFormat]] to read values from parquet input. - * User must define record schema and parquet tuple converter that permits to convert parquet tuple to user defined type - * For case class types, we provide a macro to generate the schema and read support: - * [[com.twitter.scalding.parquet.tuple.macros.Macros.caseClassParquetReadSupport]] + * Parquet read support used by [[org.apache.parquet.hadoop.ParquetInputFormat]] to read values from parquet + * input. 
User must define record schema and parquet tuple converter that permits to convert parquet tuple to + * user defined type For case class types, we provide a macro to generate the schema and read support: + * [[com.twitter.scalding.parquet.tuple.macros.Macros.caseClassParquetReadSupport]] * - * @tparam T user defined value type + * @tparam T + * user defined value type */ abstract class ParquetReadSupport[T](val rootSchema: String) extends ReadSupport[T] with Serializable { val tupleConverter: ParquetTupleConverter[T] lazy val rootType: MessageType = MessageTypeParser.parseMessageType(rootSchema) - override def init(configuration: Configuration, map: JMap[String, String], messageType: MessageType): ReadContext = + override def init( + configuration: Configuration, + map: JMap[String, String], + messageType: MessageType + ): ReadContext = new ReadContext(rootType) - override def prepareForRead(configuration: Configuration, map: JMap[String, String], messageType: MessageType, - readContext: ReadContext): RecordMaterializer[T] = + override def prepareForRead( + configuration: Configuration, + map: JMap[String, String], + messageType: MessageType, + readContext: ReadContext + ): RecordMaterializer[T] = new ParquetTupleMaterializer(tupleConverter) } @@ -64,28 +75,33 @@ class ReadSupportInstanceProxy[T] extends ReadSupport[T] { readSupportInstance match { case Success(obj) => obj.asInstanceOf[ReadSupport[T]] - case Failure(e) => throw e + case Failure(e) => throw e } } - override def init(context: InitContext): ReadContext = { + override def init(context: InitContext): ReadContext = getDelegateInstance(context.getConfiguration).init(context) - } - override def prepareForRead(configuration: Configuration, keyValueMetaData: JMap[String, String], fileSchema: MessageType, readContext: ReadContext): RecordMaterializer[T] = { - getDelegateInstance(configuration).prepareForRead(configuration, keyValueMetaData, fileSchema, readContext) - } + override def prepareForRead( + configuration: Configuration, + keyValueMetaData: JMap[String, String], + fileSchema: MessageType, + readContext: ReadContext + ): RecordMaterializer[T] = + getDelegateInstance(configuration) + .prepareForRead(configuration, keyValueMetaData, fileSchema, readContext) } /** - * Parquet write support used by [[org.apache.parquet.hadoop.ParquetOutputFormat]] to write values to parquet output. - * User must provide record schema and a function which permits to write a used defined case class to parquet store with - * the record consumer and schema definition. + * Parquet write support used by [[org.apache.parquet.hadoop.ParquetOutputFormat]] to write values to parquet + * output. User must provide record schema and a function which permits to write a used defined case class to + * parquet store with the record consumer and schema definition. 
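 *
 * A minimal sketch of exercising the generated write support directly (it mirrors the macro unit
 * tests later in this patch; `SampleClassE` and `StringBuilderRecordConsumer` are test helpers
 * defined there, so this is illustrative rather than additional API surface):
 * {{{
 * import com.twitter.scalding.parquet.tuple.macros.Macros
 * import org.apache.parquet.schema.MessageTypeParser
 *
 * val writeSupport = Macros.caseClassParquetWriteSupport[SampleClassE]
 * val schema = MessageTypeParser.parseMessageType(Macros.caseClassParquetSchema[SampleClassE])
 * val rc = new StringBuilderRecordConsumer
 * writeSupport.writeRecord(SampleClassE(0, 1L, 2, d = true, 3f, 4d, "foo", 1), rc, schema)
 * }}}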
* * For case class value types, we provide a macro to generate the write support, please check - * [[com.twitter.scalding.parquet.tuple.macros.Macros.caseClassParquetWriteSupport]] + * [[com.twitter.scalding.parquet.tuple.macros.Macros.caseClassParquetWriteSupport]] * - * @tparam T user defined value type + * @tparam T + * user defined value type */ abstract class ParquetWriteSupport[T](val rootSchema: String) extends WriteSupport[T] with Serializable { @@ -106,7 +122,8 @@ abstract class ParquetWriteSupport[T](val rootSchema: String) extends WriteSuppo object ParquetInputOutputFormat { val READ_SUPPORT_INSTANCE = "scalding.parquet.read.support.instance" val WRITE_SUPPORT_INSTANCE = "scalding.parquet.write.support.instance" - val injection: Injection[Any, String] = KryoInjection.andThen(Injection.connect[Array[Byte], GZippedBase64String, String]) + val injection: Injection[Any, String] = + KryoInjection.andThen(Injection.connect[Array[Byte], GZippedBase64String, String]) } class ParquetOutputFormatFromWriteSupportInstance[T] extends ParquetOutputFormat[T] { @@ -116,7 +133,7 @@ class ParquetOutputFormatFromWriteSupportInstance[T] extends ParquetOutputFormat val writeSupportInstance: Try[Any] = ParquetInputOutputFormat.injection.invert(writeSupport) writeSupportInstance match { case Success(obj) => obj.asInstanceOf[WriteSupport[T]] - case Failure(e) => throw e + case Failure(e) => throw e } } } @@ -127,14 +144,22 @@ private class InnerDeprecatedParquetOutputFormat[T] extends DeprecatedParquetOut /** * Typed parquet tuple scheme. - * @param readSupport read support class - * @param writeSupport write support class - * @param fp filter predicate - * @tparam T tuple value type + * @param readSupport + * read support class + * @param writeSupport + * write support class + * @param fp + * filter predicate + * @tparam T + * tuple value type */ -class TypedParquetTupleScheme[T](val readSupport: ParquetReadSupport[T], val writeSupport: ParquetWriteSupport[T], - val fp: Option[FilterPredicate] = None) - extends Scheme[JobConf, RecordReader[AnyRef, Container[T]], OutputCollector[AnyRef, T], Array[AnyRef], Array[AnyRef]] { +class TypedParquetTupleScheme[T]( + val readSupport: ParquetReadSupport[T], + val writeSupport: ParquetWriteSupport[T], + val fp: Option[FilterPredicate] = None +) extends Scheme[JobConf, RecordReader[AnyRef, Container[T]], OutputCollector[AnyRef, T], Array[ + AnyRef + ], Array[AnyRef]] { type Output = OutputCollector[AnyRef, T] type Reader = RecordReader[AnyRef, Container[T]] @@ -145,7 +170,10 @@ class TypedParquetTupleScheme[T](val readSupport: ParquetReadSupport[T], val wri override def sourceConfInit(flowProcess: FlowProcess[JobConf], tap: TapType, jobConf: JobConf): Unit = { fp.map(ParquetInputFormat.setFilterPredicate(jobConf, _)) jobConf.setInputFormat(classOf[ScaldingDeprecatedParquetInputFormat[T]]) - jobConf.set(ParquetInputOutputFormat.READ_SUPPORT_INSTANCE, ParquetInputOutputFormat.injection(readSupport)) + jobConf.set( + ParquetInputOutputFormat.READ_SUPPORT_INSTANCE, + ParquetInputOutputFormat.injection(readSupport) + ) ParquetInputFormat.setReadSupportClass(jobConf, classOf[ReadSupportInstanceProxy[_]]) } @@ -165,13 +193,18 @@ class TypedParquetTupleScheme[T](val readSupport: ParquetReadSupport[T], val wri override def sinkConfInit(flowProcess: FlowProcess[JobConf], tap: TapType, jobConf: JobConf): Unit = { jobConf.setOutputFormat(classOf[InnerDeprecatedParquetOutputFormat[T]]) - jobConf.set(ParquetInputOutputFormat.WRITE_SUPPORT_INSTANCE, 
ParquetInputOutputFormat.injection(writeSupport)) + jobConf.set( + ParquetInputOutputFormat.WRITE_SUPPORT_INSTANCE, + ParquetInputOutputFormat.injection(writeSupport) + ) } override def sink(flowProcess: FlowProcess[JobConf], sinkCall: SinkCallType): Unit = { val tuple = sinkCall.getOutgoingEntry - require(tuple.size == 1, - "TypedParquetTupleScheme expects tuple with an arity of exactly 1, but found " + tuple.getFields) + require( + tuple.size == 1, + "TypedParquetTupleScheme expects tuple with an arity of exactly 1, but found " + tuple.getFields + ) val value = tuple.getObject(0).asInstanceOf[T] val outputCollector = sinkCall.getOutput outputCollector.collect(null, value) diff --git a/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/ParquetSourcesTests.scala b/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/ParquetSourcesTests.scala index db74f2d0ba..03e3519fdc 100644 --- a/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/ParquetSourcesTests.scala +++ b/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/ParquetSourcesTests.scala @@ -1,15 +1,23 @@ package com.twitter.scalding.parquet import cascading.tuple.Fields -import com.twitter.scalding.parquet.thrift.{ DailySuffixParquetThrift, FixedPathParquetThrift, HourlySuffixParquetThrift } -import com.twitter.scalding.parquet.tuple.{ DailySuffixParquetTuple, FixedPathParquetTuple, HourlySuffixParquetTuple } -import com.twitter.scalding.{ DateRange, RichDate, Source } -import java.lang.{ Integer => JInt } +import com.twitter.scalding.parquet.thrift.{ + DailySuffixParquetThrift, + FixedPathParquetThrift, + HourlySuffixParquetThrift +} +import com.twitter.scalding.parquet.tuple.{ + DailySuffixParquetTuple, + FixedPathParquetTuple, + HourlySuffixParquetTuple +} +import com.twitter.scalding.{DateRange, RichDate, Source} +import java.lang.{Integer => JInt} import org.apache.thrift.protocol.TProtocol -import org.apache.thrift.{ TBase, TFieldIdEnum } +import org.apache.thrift.{TBase, TFieldIdEnum} import org.scalatest.WordSpec import org.apache.parquet.filter2.predicate.FilterApi._ -import org.apache.parquet.filter2.predicate.{ FilterApi, FilterPredicate } +import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate} abstract class ParquetSourcesTestsBase extends WordSpec { @@ -19,27 +27,27 @@ abstract class ParquetSourcesTestsBase extends WordSpec { val fields = new Fields("foo", "bar") val columnStrings = Set("a", "b", "c") - def testDefaultFilter[S <: Source with HasFilterPredicate](src: S) = { + def testDefaultFilter[S <: Source with HasFilterPredicate](src: S) = "default to no filter predicate" in { assert(src.withFilter === None) } - } - def testReturnProvidedFilter[S <: Source with HasFilterPredicate](src: S) = { + def testReturnProvidedFilter[S <: Source with HasFilterPredicate](src: S) = "return the provided filter" in { assert(src.withFilter === Some(filter1)) } - } - def testDefaultColumns[S <: Source with HasColumnProjection](src: S) = { + def testDefaultColumns[S <: Source with HasColumnProjection](src: S) = "default to no column projection" in { assert(src.columnProjectionString === None) assert(src.withColumns === Set()) assert(src.withColumnProjections === Set()) } - } - def testReturnProvidedColumns[S <: Source with HasColumnProjection](src: S, expected: ColumnProjectionString) = { + def testReturnProvidedColumns[S <: Source with HasColumnProjection]( + src: S, + expected: ColumnProjectionString + ) = { "return the provided columns " + expected in { 
assert(src.columnProjectionString.get === expected) } @@ -49,9 +57,8 @@ abstract class ParquetSourcesTestsBase extends WordSpec { } } - private def verifyParquetStringFormat(s: String, expected: Set[String]) = { + private def verifyParquetStringFormat(s: String, expected: Set[String]) = assert(s.split(";").toSet === expected) - } } class ParquetSourcesTests extends ParquetSourcesTestsBase { @@ -61,22 +68,25 @@ class ParquetSourcesTests extends ParquetSourcesTestsBase { testDefaultFilter(default) - testReturnProvidedFilter( - new DailySuffixParquetThrift[MockTBase](path, dateRange) { - override val withFilter: Option[FilterPredicate] = Some(filter1) - }) + testReturnProvidedFilter(new DailySuffixParquetThrift[MockTBase](path, dateRange) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) testDefaultColumns(default) testReturnProvidedColumns( new DailySuffixParquetThrift[MockTBase](path, dateRange) { override def withColumns: Set[String] = columnStrings - }, DeprecatedColumnProjectionString(columnStrings)) + }, + DeprecatedColumnProjectionString(columnStrings) + ) testReturnProvidedColumns( new DailySuffixParquetThrift[MockTBase](path, dateRange) { override def withColumnProjections: Set[String] = columnStrings - }, StrictColumnProjectionString(columnStrings)) + }, + StrictColumnProjectionString(columnStrings) + ) } @@ -85,22 +95,25 @@ class ParquetSourcesTests extends ParquetSourcesTestsBase { testDefaultFilter(default) - testReturnProvidedFilter( - new HourlySuffixParquetThrift[MockTBase](path, dateRange) { - override val withFilter: Option[FilterPredicate] = Some(filter1) - }) + testReturnProvidedFilter(new HourlySuffixParquetThrift[MockTBase](path, dateRange) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) testDefaultColumns(default) testReturnProvidedColumns( new HourlySuffixParquetThrift[MockTBase](path, dateRange) { override def withColumns: Set[String] = columnStrings - }, DeprecatedColumnProjectionString(columnStrings)) + }, + DeprecatedColumnProjectionString(columnStrings) + ) testReturnProvidedColumns( new HourlySuffixParquetThrift[MockTBase](path, dateRange) { override def withColumnProjections: Set[String] = columnStrings - }, StrictColumnProjectionString(columnStrings)) + }, + StrictColumnProjectionString(columnStrings) + ) } @@ -109,22 +122,25 @@ class ParquetSourcesTests extends ParquetSourcesTestsBase { testDefaultFilter(default) - testReturnProvidedFilter( - new FixedPathParquetThrift[MockTBase](path, path, path) { - override val withFilter: Option[FilterPredicate] = Some(filter1) - }) + testReturnProvidedFilter(new FixedPathParquetThrift[MockTBase](path, path, path) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) testDefaultColumns(default) testReturnProvidedColumns( new FixedPathParquetThrift[MockTBase](path, path, path) { override def withColumns: Set[String] = columnStrings - }, DeprecatedColumnProjectionString(columnStrings)) + }, + DeprecatedColumnProjectionString(columnStrings) + ) testReturnProvidedColumns( new FixedPathParquetThrift[MockTBase](path, path, path) { override def withColumnProjections: Set[String] = columnStrings - }, StrictColumnProjectionString(columnStrings)) + }, + StrictColumnProjectionString(columnStrings) + ) } @@ -133,10 +149,9 @@ class ParquetSourcesTests extends ParquetSourcesTestsBase { testDefaultFilter(default) - testReturnProvidedFilter( - new DailySuffixParquetTuple(path, dateRange, fields) { - override val withFilter: Option[FilterPredicate] = Some(filter1) - }) + 
testReturnProvidedFilter(new DailySuffixParquetTuple(path, dateRange, fields) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) } "HourlySuffixParquetTuple" should { @@ -144,10 +159,9 @@ class ParquetSourcesTests extends ParquetSourcesTestsBase { testDefaultFilter(default) - testReturnProvidedFilter( - new HourlySuffixParquetTuple(path, dateRange, fields) { - override val withFilter: Option[FilterPredicate] = Some(filter1) - }) + testReturnProvidedFilter(new HourlySuffixParquetTuple(path, dateRange, fields) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) } "FixedPathParquetTuple" should { @@ -155,10 +169,9 @@ class ParquetSourcesTests extends ParquetSourcesTestsBase { testDefaultFilter(default) - testReturnProvidedFilter( - new FixedPathParquetTuple(fields, path, path, path) { - override val withFilter: Option[FilterPredicate] = Some(filter1) - }) + testReturnProvidedFilter(new FixedPathParquetTuple(fields, path, path, path) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) } } diff --git a/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/thrift/PartitionedParquetThriftSourceTests.scala b/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/thrift/PartitionedParquetThriftSourceTests.scala index e34da43d4e..f2d205a4a9 100644 --- a/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/thrift/PartitionedParquetThriftSourceTests.scala +++ b/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/thrift/PartitionedParquetThriftSourceTests.scala @@ -8,7 +8,7 @@ import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.ParquetReader import org.apache.parquet.thrift.ThriftParquetReader -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} object PartitionedParquetThriftTestSources { val path = "/a/path" @@ -17,10 +17,15 @@ object PartitionedParquetThriftTestSources { class PartitionedParquetThriftWriteJob(args: Args) extends Job(args) { import PartitionedParquetThriftTestSources._ - val input = Seq(new Address("123 Embarcadero", "94111"), new Address("123 E 79th St", "10075"), new Address("456 W 80th St", "10075")) + val input = Seq( + new Address("123 Embarcadero", "94111"), + new Address("123 E 79th St", "10075"), + new Address("456 W 80th St", "10075") + ) - TypedPipe.from(input) - .map { address => (address.getZip, address) } + TypedPipe + .from(input) + .map(address => (address.getZip, address)) .write(partitionSource) } @@ -40,21 +45,25 @@ class PartitionedParquetThriftSourceTests extends WordSpec with Matchers { job = new PartitionedParquetThriftWriteJob(args) job } - JobTest(buildJob(_)) - .runHadoop + JobTest(buildJob(_)).runHadoop .finish() val testMode = job.mode.asInstanceOf[HadoopTest] val directory = new File(testMode.getWritePathFor(partitionSource)) - directory.listFiles().map({ _.getName() }).toSet shouldBe Set("94111", "10075") + directory.listFiles().map { _.getName() }.toSet shouldBe Set("94111", "10075") // check that the partitioning is done correctly by zipcode - validate(new Path(directory.getPath + "/94111/part-00000-00000-m-00000.parquet"), - new Address("123 Embarcadero", "94111")) - validate(new Path(directory.getPath + "/10075/part-00000-00001-m-00000.parquet"), - new Address("123 E 79th St", "10075"), new Address("456 W 80th St", "10075")) + validate( + new Path(directory.getPath + "/94111/part-00000-00000-m-00000.parquet"), + new Address("123 Embarcadero", "94111") + ) + validate( + new Path(directory.getPath + 
"/10075/part-00000-00001-m-00000.parquet"), + new Address("123 E 79th St", "10075"), + new Address("456 W 80th St", "10075") + ) } } -} \ No newline at end of file +} diff --git a/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/tuple/TypedParquetTupleTest.scala b/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/tuple/TypedParquetTupleTest.scala index 67980622ac..d9a8d0853d 100644 --- a/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/tuple/TypedParquetTupleTest.scala +++ b/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/tuple/TypedParquetTupleTest.scala @@ -1,12 +1,12 @@ package com.twitter.scalding.parquet.tuple import com.twitter.scalding.parquet.tuple.macros.Macros._ -import com.twitter.scalding.platform.{ HadoopPlatformJobTest, HadoopPlatformTest } +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopPlatformTest} import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding.{ Args, Job, TypedTsv } -import org.scalatest.{ Matchers, WordSpec } +import com.twitter.scalding.{Args, Job, TypedTsv} +import org.scalatest.{Matchers, WordSpec} import org.apache.parquet.filter2.predicate.FilterApi.binaryColumn -import org.apache.parquet.filter2.predicate.{ FilterApi, FilterPredicate } +import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate} import org.apache.parquet.io.api.Binary class TypedParquetTupleTest extends WordSpec with Matchers with HadoopPlatformTest { @@ -21,12 +21,13 @@ class TypedParquetTupleTest extends WordSpec with Matchers with HadoopPlatformTe .arg("output", "output1") .sink[SampleClassB](TypedParquet[SampleClassB](Seq("output1"))) { toMap(_) shouldBe toMap(values) - }.run() + } + .run() HadoopPlatformJobTest(new ReadWithFilterPredicateJob(_), cluster) .arg("input", "output1") .arg("output", "output2") - .sink[Boolean]("output2") { toMap(_) shouldBe toMap(values.filter(_.string == "B1").map(_.a.bool)) } + .sink[Boolean]("output2")(toMap(_) shouldBe toMap(values.filter(_.string == "B1").map(_.a.bool))) .run() } } @@ -34,30 +35,71 @@ class TypedParquetTupleTest extends WordSpec with Matchers with HadoopPlatformTe object TestValues { val values = Seq( - SampleClassB("B1", Some(4.0D), SampleClassA(bool = true, 5, 1L, 1.2F, 1), List(1, 2), - List(SampleClassD(1, "1"), SampleClassD(2, "2")), Set(1D, 2D), Set(SampleClassF(1, 1F)), Map(1 -> "foo")), - SampleClassB("B2", Some(3.0D), SampleClassA(bool = false, 4, 2L, 2.3F, 2), List(3, 4), Nil, Set(3, 4), Set(), - Map(2 -> "bar"), Map(SampleClassD(0, "z") -> SampleClassF(0, 3), SampleClassD(0, "y") -> SampleClassF(2, 6))), - SampleClassB("B3", None, SampleClassA(bool = true, 6, 3L, 3.4F, 3), List(5, 6), - List(SampleClassD(3, "3"), SampleClassD(4, "4")), Set(5, 6), Set(SampleClassF(2, 2F))), - SampleClassB("B4", Some(5.0D), SampleClassA(bool = false, 7, 4L, 4.5F, 4), Nil, - List(SampleClassD(5, "5"), SampleClassD(6, "6")), Set(), Set(SampleClassF(3, 3F), SampleClassF(5, 4F)), - Map(3 -> "foo2"), Map(SampleClassD(0, "q") -> SampleClassF(4, 3)))) + SampleClassB( + "B1", + Some(4.0d), + SampleClassA(bool = true, 5, 1L, 1.2f, 1), + List(1, 2), + List(SampleClassD(1, "1"), SampleClassD(2, "2")), + Set(1d, 2d), + Set(SampleClassF(1, 1f)), + Map(1 -> "foo") + ), + SampleClassB( + "B2", + Some(3.0d), + SampleClassA(bool = false, 4, 2L, 2.3f, 2), + List(3, 4), + Nil, + Set(3, 4), + Set(), + Map(2 -> "bar"), + Map(SampleClassD(0, "z") -> SampleClassF(0, 3), SampleClassD(0, "y") -> SampleClassF(2, 6)) + ), + SampleClassB( + "B3", + None, + SampleClassA(bool 
= true, 6, 3L, 3.4f, 3), + List(5, 6), + List(SampleClassD(3, "3"), SampleClassD(4, "4")), + Set(5, 6), + Set(SampleClassF(2, 2f)) + ), + SampleClassB( + "B4", + Some(5.0d), + SampleClassA(bool = false, 7, 4L, 4.5f, 4), + Nil, + List(SampleClassD(5, "5"), SampleClassD(6, "6")), + Set(), + Set(SampleClassF(3, 3f), SampleClassF(5, 4f)), + Map(3 -> "foo2"), + Map(SampleClassD(0, "q") -> SampleClassF(4, 3)) + ) + ) } case class SampleClassA(bool: Boolean, short: Short, long: Long, float: Float, byte: Byte) -case class SampleClassB(string: String, double: Option[Double], a: SampleClassA, intList: List[Int], - dList: List[SampleClassD], doubleSet: Set[Double], fSet: Set[SampleClassF], intStringMap: Map[Int, String] = Map(), - dfMap: Map[SampleClassD, SampleClassF] = Map()) +case class SampleClassB( + string: String, + double: Option[Double], + a: SampleClassA, + intList: List[Int], + dList: List[SampleClassD], + doubleSet: Set[Double], + fSet: Set[SampleClassF], + intStringMap: Map[Int, String] = Map(), + dfMap: Map[SampleClassD, SampleClassF] = Map() +) case class SampleClassC(string: String, a: SampleClassA) case class SampleClassD(x: Int, y: String) case class SampleClassF(w: Byte, z: Float) /** - * Test job write a sequence of sample class values into a typed parquet tuple. - * To test typed parquet tuple can be used as sink + * Test job write a sequence of sample class values into a typed parquet tuple. To test typed parquet tuple + * can be used as sink */ class WriteToTypedParquetTupleJob(args: Args) extends Job(args) { import com.twitter.scalding.parquet.tuple.TestValues._ @@ -69,9 +111,9 @@ class WriteToTypedParquetTupleJob(args: Args) extends Job(args) { } /** - * Test job read from a typed parquet source with filter predicate and push down(SampleClassC takes only part of - * SampleClassB's data) - * To test typed parquet tuple can bse used as source and apply filter predicate and push down correctly + * Test job read from a typed parquet source with filter predicate and push down(SampleClassC takes only part + * of SampleClassB's data) To test typed parquet tuple can bse used as source and apply filter predicate and + * push down correctly */ class ReadWithFilterPredicateJob(args: Args) extends Job(args) { val fp: FilterPredicate = FilterApi.eq(binaryColumn("string"), Binary.fromString("B1")) @@ -83,4 +125,3 @@ class ReadWithFilterPredicateJob(args: Args) extends Job(args) { TypedPipe.from(input).map(_.a.bool).write(TypedTsv[Boolean](outputPath)) } - diff --git a/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/tuple/macros/MacroUnitTests.scala b/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/tuple/macros/MacroUnitTests.scala index defce8ec8f..babab39d19 100644 --- a/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/tuple/macros/MacroUnitTests.scala +++ b/scalding-parquet/src/test/scala/com/twitter/scalding/parquet/tuple/macros/MacroUnitTests.scala @@ -1,8 +1,8 @@ package com.twitter.scalding.parquet.tuple.macros import org.scalatest.mock.MockitoSugar -import org.scalatest.{ Matchers, WordSpec } -import org.apache.parquet.io.api.{ Binary, RecordConsumer } +import org.scalatest.{Matchers, WordSpec} +import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.parquet.schema.MessageTypeParser case class SampleClassA(x: Int, y: String) @@ -221,10 +221,10 @@ class MacroUnitTests extends WordSpec with Matchers with MockitoSugar { boolean.addBoolean(true) val float = converter.getConverter(4).asPrimitiveConverter() - float.addFloat(3F) + 
float.addFloat(3f) val double = converter.getConverter(5).asPrimitiveConverter() - double.addDouble(4D) + double.addDouble(4d) val string = converter.getConverter(6).asPrimitiveConverter() string.addBinary(Binary.fromString("foo")) @@ -232,7 +232,7 @@ class MacroUnitTests extends WordSpec with Matchers with MockitoSugar { val byte = converter.getConverter(7).asPrimitiveConverter() byte.addInt(1) converter.end() - converter.currentValue shouldEqual SampleClassE(0, 1L, 2, d = true, 3F, 4D, "foo", 1) + converter.currentValue shouldEqual SampleClassE(0, 1L, 2, d = true, 3f, 4d, "foo", 1) } "Generate converters for case class with nested class" in { @@ -276,9 +276,9 @@ class MacroUnitTests extends WordSpec with Matchers with MockitoSugar { b.end() val c = converter.getConverter(2).asPrimitiveConverter() - c.addDouble(4D) + c.addDouble(4d) converter.end() - converter.currentValue shouldEqual SampleClassF(0, Some(SampleClassB(SampleClassA(2, "foo"), "b1")), 4D) + converter.currentValue shouldEqual SampleClassF(0, Some(SampleClassB(SampleClassA(2, "foo"), "b1")), 4d) } "Generate converters for case class with list fields" in { @@ -338,8 +338,10 @@ class MacroUnitTests extends WordSpec with Matchers with MockitoSugar { keyValue.end() converter.end() - converter.currentValue shouldEqual SampleClassK("foo", - Map(SampleClassA(2, "bar") -> SampleClassB(SampleClassA(2, "bar"), "b1"))) + converter.currentValue shouldEqual SampleClassK( + "foo", + Map(SampleClassA(2, "bar") -> SampleClassB(SampleClassA(2, "bar"), "b1")) + ) } } @@ -347,7 +349,7 @@ class MacroUnitTests extends WordSpec with Matchers with MockitoSugar { "Generate write support for class with all the primitive type fields" in { val writeSupport = Macros.caseClassParquetWriteSupport[SampleClassE] - val e = SampleClassE(0, 1L, 2, d = true, 3F, 4D, "foo", 1) + val e = SampleClassE(0, 1L, 2, d = true, 3f, 4d, "foo", 1) val schema = Macros.caseClassParquetSchema[SampleClassE] val rc = new StringBuilderRecordConsumer writeSupport.writeRecord(e, rc, MessageTypeParser.parseMessageType(schema)) @@ -391,7 +393,7 @@ class MacroUnitTests extends WordSpec with Matchers with MockitoSugar { val schemaString: String = Macros.caseClassParquetSchema[SampleClassF] val writeSupport = Macros.caseClassParquetWriteSupport[SampleClassF] - val f = SampleClassF(0, Some(SampleClassB(SampleClassA(2, "foo"), "b1")), 4D) + val f = SampleClassF(0, Some(SampleClassB(SampleClassA(2, "foo"), "b1")), 4d) val schema = MessageTypeParser.parseMessageType(schemaString) val rc = new StringBuilderRecordConsumer @@ -424,7 +426,7 @@ class MacroUnitTests extends WordSpec with Matchers with MockitoSugar { |end message""".stripMargin //test write tuple with optional field = None - val f2 = SampleClassF(0, None, 4D) + val f2 = SampleClassF(0, None, 4d) val rc2 = new StringBuilderRecordConsumer writeSupport.writeRecord(f2, rc2, schema) rc2.writeScenario shouldEqual """start message @@ -626,4 +628,4 @@ class StringBuilderRecordConsumer extends RecordConsumer { override def addInteger(i: Int): Unit = sb.append(s"write INT32 $i\n") def writeScenario = sb.toString() -} \ No newline at end of file +} diff --git a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Liftables.scala b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Liftables.scala index 553f37d436..6c80bf093f 100644 --- a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Liftables.scala +++ b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Liftables.scala @@ -3,8 +3,7 
@@ package com.twitter.scalding.quotation import scala.reflect.macros.blackbox.Context /** - * These Liftables allows us to lift values into quasiquote trees. - * For example: + * These Liftables allows us to lift values into quasiquote trees. For example: * * def test(v: Source) => q"$v" * @@ -12,35 +11,36 @@ import scala.reflect.macros.blackbox.Context */ trait Liftables { val c: Context - import c.universe.{ TypeName => _, _ } + import c.universe.{TypeName => _, _} - protected implicit val sourceLiftable: Liftable[Source] = Liftable { - case Source(path, line) => q"_root_.com.twitter.scalding.quotation.Source($path, $line)" + protected implicit val sourceLiftable: Liftable[Source] = Liftable { case Source(path, line) => + q"_root_.com.twitter.scalding.quotation.Source($path, $line)" } - protected implicit val projectionsLiftable: Liftable[Projections] = Liftable { - case p => q"_root_.com.twitter.scalding.quotation.Projections(${p.set})" + protected implicit val projectionsLiftable: Liftable[Projections] = Liftable { case p => + q"_root_.com.twitter.scalding.quotation.Projections(${p.set})" } - protected implicit val typeNameLiftable: Liftable[TypeName] = Liftable { - case TypeName(name) => q"_root_.com.twitter.scalding.quotation.TypeName($name)" + protected implicit val typeNameLiftable: Liftable[TypeName] = Liftable { case TypeName(name) => + q"_root_.com.twitter.scalding.quotation.TypeName($name)" } - protected implicit val accessorLiftable: Liftable[Accessor] = Liftable { - case Accessor(name) => q"_root_.com.twitter.scalding.quotation.Accessor($name)" + protected implicit val accessorLiftable: Liftable[Accessor] = Liftable { case Accessor(name) => + q"_root_.com.twitter.scalding.quotation.Accessor($name)" } - protected implicit val quotedLiftable: Liftable[Quoted] = Liftable { - case Quoted(source, call, fa) => q"_root_.com.twitter.scalding.quotation.Quoted($source, $call, $fa)" + protected implicit val quotedLiftable: Liftable[Quoted] = Liftable { case Quoted(source, call, fa) => + q"_root_.com.twitter.scalding.quotation.Quoted($source, $call, $fa)" } protected implicit val projectionLiftable: Liftable[Projection] = Liftable { - case p: Property => q"$p" + case p: Property => q"$p" case p: TypeReference => q"$p" } protected implicit val propertyLiftable: Liftable[Property] = Liftable { - case Property(path, accessor, tpe) => q"_root_.com.twitter.scalding.quotation.Property($path, $accessor, $tpe)" + case Property(path, accessor, tpe) => + q"_root_.com.twitter.scalding.quotation.Property($path, $accessor, $tpe)" } protected implicit val typeReferenceLiftable: Liftable[TypeReference] = Liftable { diff --git a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Projection.scala b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Projection.scala index 512a4e602e..0c9a6e0839 100644 --- a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Projection.scala +++ b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Projection.scala @@ -13,7 +13,7 @@ sealed trait Projection { @tailrec def loop(p: Projection): TypeReference = p match { case p @ TypeReference(_) => p - case Property(p, _, _) => loop(p) + case Property(p, _, _) => loop(p) } loop(this) } @@ -21,20 +21,16 @@ sealed trait Projection { /** * Given a base projection, returns the projection based on it if applicable. 
* - * For instance, given a quoted function - * `val contact = Quoted.function { (c: Contact) => c.contact }` - * and a call - * `(p: Person) => contact(p.name)` - * produces the projection - * `Person.name.contact` + * For instance, given a quoted function `val contact = Quoted.function { (c: Contact) => c.contact }` and a + * call `(p: Person) => contact(p.name)` produces the projection `Person.name.contact` */ def basedOn(base: Projection): Option[Projection] = this match { case TypeReference(tpe) => base match { - case TypeReference(`tpe`) => Some(base) + case TypeReference(`tpe`) => Some(base) case Property(_, _, `tpe`) => Some(base) - case other => None + case other => None } case Property(path, name, tpe) => path.basedOn(base).map(Property(_, name, tpe)) @@ -43,22 +39,18 @@ sealed trait Projection { /** * Limits projections to only values of `superClass`. Example: * - * case class Person(name: String, contact: Contact) extends ThriftObject - * case class Contact(phone: Phone) extends ThriftObject - * case class Phone(number: String) + * case class Person(name: String, contact: Contact) extends ThriftObject case class Contact(phone: Phone) + * extends ThriftObject case class Phone(number: String) * * For the super class `ThriftObject`, it produces the transformations: * - * Person.contact.phone => Some(Person.contact.phone) - * Person.contact.phone.number => Some(Person.contact.phone) - * Person.name.isEmpty => Some(Person.name) - * Phone.number => None + * Person.contact.phone => Some(Person.contact.phone) Person.contact.phone.number => + * Some(Person.contact.phone) Person.name.isEmpty => Some(Person.name) Phone.number => None */ def bySuperClass(superClass: Class[_]): Option[Projection] = { def isSubclass(c: TypeName) = - try - superClass.isAssignableFrom(Class.forName(c.asString)) + try superClass.isAssignableFrom(Class.forName(c.asString)) catch { case _: ClassNotFoundException => false @@ -104,12 +96,13 @@ final case class Property(path: Projection, accessor: Accessor, typeName: TypeNa final class Projections private (val set: Set[Projection]) extends Serializable { /** - * Returns the projections that are based on `typeName` and limits projections - * to only properties that extend from `superClass`. + * Returns the projections that are based on `typeName` and limits projections to only properties that + * extend from `superClass`. */ def of(typeName: TypeName, superClass: Class[_]): Projections = Projections { - set.filter(_.rootProjection.typeName == typeName) + set + .filter(_.rootProjection.typeName == typeName) .flatMap(_.bySuperClass(superClass)) } @@ -129,7 +122,7 @@ final class Projections private (val set: Set[Projection]) extends Serializable override def equals(other: Any) = other match { case other: Projections => set == other.set - case other => false + case other => false } override def hashCode = @@ -140,9 +133,8 @@ object Projections { val empty = apply(Set.empty) /** - * Creates a normalized projections collection. For instance, - * given two projections `Person.contact` and `Person.contact.phone`, - * creates a collection with only `Person.contact`. + * Creates a normalized projections collection. For instance, given two projections `Person.contact` and + * `Person.contact.phone`, creates a collection with only `Person.contact`. 
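 *
 * A hedged sketch of that normalisation, reusing the `Person` projection fixtures defined in the
 * quotation tests later in this patch (illustrative only):
 * {{{
 * val ps = Projections(Set[Projection](Person.contactProjection, Person.phoneProjection))
 * // phoneProjection is Person.contact.phone, so only the shorter Person.contact survives:
 * // ps.set == Set(Person.contactProjection)
 * }}}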
*/ def apply(set: Set[Projection]) = { @tailrec def isNested(p: Projection): Boolean = @@ -157,4 +149,4 @@ object Projections { def flatten(list: Iterable[Projections]): Projections = list.foldLeft(empty)(_ ++ _) -} \ No newline at end of file +} diff --git a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/ProjectionMacro.scala b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/ProjectionMacro.scala index f4529ff2cb..5d094bf58d 100644 --- a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/ProjectionMacro.scala +++ b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/ProjectionMacro.scala @@ -4,7 +4,7 @@ import scala.reflect.macros.blackbox.Context trait ProjectionMacro extends TreeOps with Liftables { val c: Context - import c.universe.{ TypeName => _, _ } + import c.universe.{TypeName => _, _} def projections(params: List[Tree]): Tree = { @@ -18,13 +18,12 @@ trait ProjectionMacro extends TreeOps with Liftables { TypeReference(TypeName(tpe.typeSymbol.fullName)) def isFunction(t: Tree) = - Option(t.symbol).map { - _.typeSignature - .erasure - .typeSymbol - .fullName - .contains("scala.Function") - }.getOrElse(false) + Option(t.symbol) + .map { + _.typeSignature.erasure.typeSymbol.fullName + .contains("scala.Function") + } + .getOrElse(false) def functionBodyProjections(param: Tree, inputs: List[Tree], body: Tree): List[Tree] = { @@ -37,7 +36,6 @@ trait ProjectionMacro extends TreeOps with Liftables { case q"$v.$m(..$params)" => unapply(v) case q"$v.$m" if t.symbol.isMethod => - if (inputSymbols.contains(v.symbol)) { val p = TypeReference(typeName(v)) @@ -79,7 +77,8 @@ trait ProjectionMacro extends TreeOps with Liftables { def functionInstanceProjections(func: Tree): List[Tree] = { val paramProjections = - func.symbol.typeSignature.typeArgs.dropRight(1) + func.symbol.typeSignature.typeArgs + .dropRight(1) .map(typeReference) q""" $func match { diff --git a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Quoted.scala b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Quoted.scala index 805c174b5f..1abcfce4df 100644 --- a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Quoted.scala +++ b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Quoted.scala @@ -16,10 +16,14 @@ object Quoted { private[scalding] def internal: Quoted = macro QuotedMacro.internal def function[T1, U](f: T1 => U): Function1[T1, U] with QuotedFunction = macro QuotedMacro.function - def function[T1, T2, U](f: (T1, T2) => U): Function2[T1, T2, U] with QuotedFunction = macro QuotedMacro.function - def function[T1, T2, T3, U](f: (T1, T2, T3) => U): Function3[T1, T2, T3, U] with QuotedFunction = macro QuotedMacro.function - def function[T1, T2, T3, T4, U](f: (T1, T2, T3, T4) => U): Function4[T1, T2, T3, T4, U] with QuotedFunction = macro QuotedMacro.function - def function[T1, T2, T3, T4, T5, U](f: (T1, T2, T3, T4, T5) => U): Function5[T1, T2, T3, T4, T5, U] with QuotedFunction = macro QuotedMacro.function + def function[T1, T2, U](f: (T1, T2) => U): Function2[T1, T2, U] with QuotedFunction = + macro QuotedMacro.function + def function[T1, T2, T3, U](f: (T1, T2, T3) => U): Function3[T1, T2, T3, U] with QuotedFunction = + macro QuotedMacro.function + def function[T1, T2, T3, T4, U](f: (T1, T2, T3, T4) => U): Function4[T1, T2, T3, T4, U] + with QuotedFunction = macro QuotedMacro.function + def function[T1, T2, T3, T4, T5, U](f: (T1, T2, T3, T4, T5) => U): Function5[T1, T2, T3, T4, T5, U] + with 
QuotedFunction = macro QuotedMacro.function } case class Source(path: String, line: Int) { diff --git a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/QuotedMacro.scala b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/QuotedMacro.scala index 251aa77843..ea7fec6393 100644 --- a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/QuotedMacro.scala +++ b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/QuotedMacro.scala @@ -1,13 +1,9 @@ package com.twitter.scalding.quotation import scala.reflect.macros.blackbox.Context -import scala.reflect.macros.runtime.{ Context => ReflectContext } +import scala.reflect.macros.runtime.{Context => ReflectContext} -class QuotedMacro(val c: Context) - extends TreeOps - with TextMacro - with ProjectionMacro - with Liftables { +class QuotedMacro(val c: Context) extends TreeOps with TextMacro with ProjectionMacro with Liftables { import c.universe._ def internal: Tree = quoted @@ -23,7 +19,8 @@ class QuotedMacro(val c: Context) .callsiteTyper .context .tree - .asInstanceOf[Tree]) + .asInstanceOf[Tree] + ) val QuotedCompanion = q"_root_.com.twitter.scalding.quotation.Quoted" @@ -39,9 +36,7 @@ class QuotedMacro(val c: Context) case q"val $name = $body" => quoted(body) case q"$m.method" if m.symbol.fullName == classOf[Quoted].getName => - c.abort( - c.enclosingPosition, - "Quoted.method can be invoked only as an implicit parameter") + c.abort(c.enclosingPosition, "Quoted.method can be invoked only as an implicit parameter") case tree @ q"$instance.$method[..$t]" => q"${Quoted(source, Some(callText(method, t)), Projections.empty)}" @@ -93,7 +88,7 @@ class QuotedMacro(val c: Context) sym.fullName.startsWith("com.twitter.scalding") || { sym.owner match { case NoSymbol => false - case owner => isScalding(owner) + case owner => isScalding(owner) } } @@ -101,7 +96,7 @@ class QuotedMacro(val c: Context) c.abort( c.enclosingPosition, "The quotation must happen at the level of the user-facing API. Add an `implicit q: Quoted` to the enclosing method. " + - "If that's not possible and the transformation doesn't introduce projections, use Quoted.internal.") + "If that's not possible and the transformation doesn't introduce projections, use Quoted.internal." + ) } } - diff --git a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TextMacro.scala b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TextMacro.scala index f5538c9969..8d1e172f27 100644 --- a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TextMacro.scala +++ b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TextMacro.scala @@ -50,21 +50,16 @@ trait TextMacro { val firstParamStart = start(firstParam) val newStart = - paramsStartPosition( - fileContent.take(firstParamStart).reverse, - firstParamStart) + paramsStartPosition(fileContent.take(firstParamStart).reverse, firstParamStart) fileContent.drop(newStart).toList } val blockDelimiters = - Map( - '(' -> ')', - '{' -> '}', - '[' -> ']') + Map('(' -> ')', '{' -> '}', '[' -> ']') /* - * Reads the parameters block. It takes in consideration nested blocks like `map(v => { ... })` + * Reads the parameters block. It takes in consideration nested blocks like `map(v => { ... 
})` */ def readParams(chars: List[Char], open: List[Char], acc: List[Char] = Nil): (List[Char], List[Char]) = chars match { @@ -85,4 +80,4 @@ trait TextMacro { readParams(content, Nil)._1.mkString } -} \ No newline at end of file +} diff --git a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TreeOps.scala b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TreeOps.scala index 09c459e502..57a492a67c 100644 --- a/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TreeOps.scala +++ b/scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TreeOps.scala @@ -12,35 +12,32 @@ trait TreeOps { def find(tree: Tree)(f: Tree => Boolean): Option[Tree] = { var res: Option[Tree] = None val t = new Traverser { - override def traverse(t: Tree) = { + override def traverse(t: Tree) = if (res.isEmpty) if (f(t)) res = Some(t) else super.traverse(t) - } } t.traverse(tree) res } /** - * Similar to tree.collect but it doesn't collect the children of a - * collected tree. + * Similar to tree.collect but it doesn't collect the children of a collected tree. */ def collect[T](tree: Tree)(f: PartialFunction[Tree, T]): List[T] = { var res = List.newBuilder[T] val t = new Traverser { - override def traverse(t: Tree) = { + override def traverse(t: Tree) = f.lift(t) match { case Some(v) => res += v case None => super.traverse(t) } - } } t.traverse(tree) res.result() } -} \ No newline at end of file +} diff --git a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/LimitationsTest.scala b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/LimitationsTest.scala index e8c47bc201..29ec9e4984 100644 --- a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/LimitationsTest.scala +++ b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/LimitationsTest.scala @@ -10,16 +10,20 @@ class LimitationsTest extends Test { "nested transitive projection" in pendingUntilFixed { test.function[Person, Option[String]](_.alternativeContact.map(_.phone))._1.projections.set mustEqual - Set(Person.typeReference.andThen(Accessor("alternativeContact"), typeName[Option[Contact]]).andThen(Accessor("phone"), typeName[String])) + Set( + Person.typeReference + .andThen(Accessor("alternativeContact"), typeName[Option[Contact]]) + .andThen(Accessor("phone"), typeName[String]) + ) } "nested quoted function projection" in pendingUntilFixed { - val contactFunction = Quoted.function { - (p: Person) => p.contact + val contactFunction = Quoted.function { (p: Person) => + p.contact } - val phoneFunction = Quoted.function { - (p: Person) => contactFunction(p).phone + val phoneFunction = Quoted.function { (p: Person) => + contactFunction(p).phone } phoneFunction.quoted.projections.set mustEqual Set(Person.phoneProjection) } -} \ No newline at end of file +} diff --git a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/Person.scala b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/Person.scala index f578c407ec..c8643ba1be 100644 --- a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/Person.scala +++ b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/Person.scala @@ -8,4 +8,4 @@ object Person { val nameProjection = typeReference.andThen(Accessor("name"), typeName[String]) val contactProjection = typeReference.andThen(Accessor("contact"), typeName[Contact]) val phoneProjection = contactProjection.andThen(Accessor("phone"), typeName[String]) -} \ No newline at end of file +} diff --git 
a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/ProjectionMacroTest.scala b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/ProjectionMacroTest.scala index e576315ea5..337a6fd934 100644 --- a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/ProjectionMacroTest.scala +++ b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/ProjectionMacroTest.scala @@ -15,57 +15,57 @@ class ProjectionMacroTest extends Test { "method with params isn't considered as projection" in { test - .function[Person, String](_.name.substring(1))._1 - .projections.set mustEqual Set(Person.nameProjection) + .function[Person, String](_.name.substring(1)) + ._1 + .projections + .set mustEqual Set(Person.nameProjection) } "simple" in { - test.function[Person, String](_.name)._1 - .projections.set mustEqual Set(Person.nameProjection) + test.function[Person, String](_.name)._1.projections.set mustEqual Set(Person.nameProjection) } "nested" in { - test.function[Person, String](_.contact.phone)._1 - .projections.set mustEqual Set(Person.phoneProjection) + test.function[Person, String](_.contact.phone)._1.projections.set mustEqual Set(Person.phoneProjection) } "all properties" in { - test.function[Person, Person](p => p)._1 - .projections.set mustEqual Set(Person.typeReference) + test.function[Person, Person](p => p)._1.projections.set mustEqual Set(Person.typeReference) } "empty projection" in { - test.function[Person, Int](p => 1)._1 - .projections.set mustEqual Set.empty + test.function[Person, Int](p => 1)._1.projections.set mustEqual Set.empty } "function call" - { "implicit apply" - { "non-quoted" in { val function = (p: Person) => p.name - test.function[Person, String](p => function(p))._1 - .projections.set mustEqual Set(Person.typeReference) + test.function[Person, String](p => function(p))._1.projections.set mustEqual Set(Person.typeReference) } "quoted" in { - val function = Quoted.function { - (p: Person) => p.name + val function = Quoted.function { (p: Person) => + p.name } - test.function[Person, String](p => function(p))._1 - .projections.set mustEqual Set(Person.nameProjection) + test.function[Person, String](p => function(p))._1.projections.set mustEqual Set( + Person.nameProjection + ) } } "explicit apply" - { "non-quoted" in { val function = (p: Person) => p.name - test.function[Person, String](p => function.apply(p))._1 - .projections.set mustEqual Set(Person.typeReference) + test.function[Person, String](p => function.apply(p))._1.projections.set mustEqual Set( + Person.typeReference + ) } "quoted" in { - val function = Quoted.function { - (p: Person) => p.name + val function = Quoted.function { (p: Person) => + p.name } - test.function[Person, String](p => function.apply(p))._1 - .projections.set mustEqual Set(Person.nameProjection) + test.function[Person, String](p => function.apply(p))._1.projections.set mustEqual Set( + Person.nameProjection + ) } } } @@ -73,28 +73,24 @@ class ProjectionMacroTest extends Test { "function instance" - { "non-quoted" in { val function = (p: Person) => p.name - test.function[Person, String](function)._1 - .projections.set mustEqual Set(Person.typeReference) + test.function[Person, String](function)._1.projections.set mustEqual Set(Person.typeReference) } "quoted" in { - val function = Quoted.function { - (p: Person) => p.name + val function = Quoted.function { (p: Person) => + p.name } - test.function[Person, String](function)._1 - .projections.set mustEqual Set(Person.nameProjection) + test.function[Person, 
String](function)._1.projections.set mustEqual Set(Person.nameProjection) } } "method call" - { "in the function body" in { def method(p: Person) = p.name - test.function[Person, String](p => method(p))._1 - .projections.set mustEqual Set(Person.typeReference) + test.function[Person, String](p => method(p))._1.projections.set mustEqual Set(Person.typeReference) } "as function" in { def method(p: Person) = p.name - test.function[Person, String](method)._1 - .projections.set mustEqual Set(Person.typeReference) + test.function[Person, String](method)._1.projections.set mustEqual Set(Person.typeReference) } } } diff --git a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/ProjectionTest.scala b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/ProjectionTest.scala index a9d441dc02..f4fd41f812 100644 --- a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/ProjectionTest.scala +++ b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/ProjectionTest.scala @@ -68,15 +68,11 @@ class ProjectionTest extends Test { Projections.flatten(Nil).set mustEqual Set() } "non-empty" in { - val list = List( - Projections(Set(p1)), - Projections(Set(p2))) + val list = List(Projections(Set(p1)), Projections(Set(p2))) Projections.flatten(list).set mustEqual Set(p1, p2) } "non-empty with merge" in { - val list = List( - Projections(Set(t1)), - Projections(Set(p1))) + val list = List(Projections(Set(t1)), Projections(Set(p1))) Projections.flatten(list).set mustEqual Set(t1) } } @@ -87,9 +83,7 @@ class ProjectionTest extends Test { p.set mustEqual Set(p1, p2) } "with merge" in { - val list = List( - Projections(Set(p1)), - Projections(Set(t1))) + val list = List(Projections(Set(p1)), Projections(Set(t1))) Projections.flatten(list).set mustEqual Set(t1) } } @@ -162,4 +156,4 @@ class ProjectionTest extends Test { } } -} \ No newline at end of file +} diff --git a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/TextMacroTest.scala b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/TextMacroTest.scala index c2bf3c1bac..9e23988de6 100644 --- a/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/TextMacroTest.scala +++ b/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/TextMacroTest.scala @@ -65,10 +65,13 @@ class TextMacroTest extends Test { } } "with function" in { - (test.paramGroupsWithFunction(1) { - case 1 => 2 - case _ => 3 - })._1.text mustEqual + test + .paramGroupsWithFunction(1) { + case 1 => 2 + case _ => 3 + } + ._1 + .text mustEqual Some("""paramGroupsWithFunction(1) { case 1 => 2 case _ => 3 @@ -85,7 +88,7 @@ class TextMacroTest extends Test { test.function { case _ => 4 }._1.text mustEqual Some("function { case _ => 4 }") } "curly braces" in { - test.function { _ + 1 }._1.text mustEqual Some("function { _ + 1 }") + test.function(_ + 1)._1.text mustEqual Some("function { _ + 1 }") } } @@ -95,8 +98,7 @@ class TextMacroTest extends Test { _ + 1 } c._1.text mustEqual - Some( - """function { + Some("""function { def test = 1 _ + 1 }""") @@ -106,4 +108,4 @@ class TextMacroTest extends Test { test.tupleParam((1, 2))._1.text mustEqual Some("tupleParam((1, 2))") } -} \ No newline at end of file +} diff --git a/scalding-repl/src/main/scala/com/twitter/scalding/ILoopCompat.scala b/scalding-repl/src/main/scala/com/twitter/scalding/ILoopCompat.scala index 1d44b67093..9fbcf0640e 100644 --- a/scalding-repl/src/main/scala/com/twitter/scalding/ILoopCompat.scala +++ 
b/scalding-repl/src/main/scala/com/twitter/scalding/ILoopCompat.scala @@ -5,7 +5,6 @@ import java.io.BufferedReader import scala.tools.nsc.interpreter.ILoop import scala.tools.nsc.interpreter.JPrintWriter -class ILoopCompat(in: Option[BufferedReader], out: JPrintWriter) - extends ILoop(in, out) { +class ILoopCompat(in: Option[BufferedReader], out: JPrintWriter) extends ILoop(in, out) { def addThunk(f: => Unit): Unit = intp.initialize(f) } diff --git a/scalding-repl/src/main/scala/com/twitter/scalding/ReplImplicits.scala b/scalding-repl/src/main/scala/com/twitter/scalding/ReplImplicits.scala index 33006b447d..6b0a2590be 100644 --- a/scalding-repl/src/main/scala/com/twitter/scalding/ReplImplicits.scala +++ b/scalding-repl/src/main/scala/com/twitter/scalding/ReplImplicits.scala @@ -18,14 +18,14 @@ package com.twitter.scalding import cascading.flow.FlowDef import cascading.pipe.Pipe import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{ FsShell, FileSystem } +import org.apache.hadoop.fs.{FileSystem, FsShell} import typed.KeyedListLike -import scala.util.{ Failure, Success } -import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext } +import scala.util.{Failure, Success} +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} /** - * Object containing various implicit conversions required to create Scalding flows in the REPL. - * Most of these conversions come from the [[com.twitter.scalding.Job]] class. + * Object containing various implicit conversions required to create Scalding flows in the REPL. Most of these + * conversions come from the [[com.twitter.scalding.Job]] class. */ trait BaseReplState { @@ -36,11 +36,12 @@ trait BaseReplState { /** Implicit flowDef for this Scalding shell session. */ var flowDef: FlowDef = getEmptyFlowDef + /** Defaults to running in local mode if no mode is specified. */ var mode: Mode = com.twitter.scalding.Local(false) + /** - * If the repl is started in Hdfs mode, this field is used to preserve the settings - * when switching Modes. + * If the repl is started in Hdfs mode, this field is used to preserve the settings when switching Modes. */ var storedHdfsMode: Option[Hdfs] = None @@ -56,12 +57,14 @@ trait BaseReplState { } /** Switch to Hdfs mode */ - private def useHdfsMode_(): Unit = { + private def useHdfsMode_(): Unit = storedHdfsMode match { case Some(hdfsMode) => mode = hdfsMode - case None => println("To use HDFS/Hadoop mode, you must *start* the repl in hadoop mode to get the hadoop configuration from the hadoop command.") + case None => + println( + "To use HDFS/Hadoop mode, you must *start* the repl in hadoop mode to get the hadoop configuration from the hadoop command." 
+ ) } - } def useHdfsMode(): Unit = { useHdfsMode_() @@ -93,42 +96,40 @@ trait BaseReplState { (m, defaultFs.getWorkingDirectory.toString) } } - println(s"${Console.GREEN}#### Scalding mode: ${modeString}") - println(s"#### User home: ${homeDir}${Console.RESET}") + println(s"${Console.GREEN}#### Scalding mode: $modeString") + println(s"#### User home: $homeDir${Console.RESET}") } - private def modeHadoopConf: Configuration = { + private def modeHadoopConf: Configuration = mode match { case hdfsMode: Hdfs => hdfsMode.jobConf - case _ => new Configuration(false) + case _ => new Configuration(false) } - } /** * Access to Hadoop FsShell * - * @param cmdArgs list of command line parameters for FsShell, one per method argument + * @param cmdArgs + * list of command line parameters for FsShell, one per method argument * @return */ - def fsShellExp(cmdArgs: String*): Int = { + def fsShellExp(cmdArgs: String*): Int = new FsShell(modeHadoopConf).run(cmdArgs.toArray) - } /** * Access to Hadoop FsShell * - * @param cmdLine command line parameters for FsShell as a single string + * @param cmdLine + * command line parameters for FsShell as a single string * @return */ - def fsShell(cmdLine: String): Int = { + def fsShell(cmdLine: String): Int = new FsShell(modeHadoopConf).run(cmdLine.trim.split(" ")) - } /** * Configuration to use for REPL executions. * - * To make changes, don't forget to assign back to this var: - * config += "mapred.reduce.tasks" -> 2 + * To make changes, don't forget to assign back to this var: config += "mapred.reduce.tasks" -> 2 */ var customConfig = Config.empty @@ -149,7 +150,10 @@ trait BaseReplState { case Some(jar) => Map("tmpjars" -> { // Use tmpjars already in the configuration. - config.get("tmpjars").map(_ + ",").getOrElse("") + config + .get("tmpjars") + .map(_ + ",") + .getOrElse("") // And a jar of code compiled by the REPL. .concat("file://" + jar.getAbsolutePath) }) @@ -163,14 +167,14 @@ trait BaseReplState { /** * Sets the flow definition in implicit scope to an empty flow definition. */ - def resetFlowDef(): Unit = { + def resetFlowDef(): Unit = flowDef = getEmptyFlowDef - } /** * Gets a new, empty, flow definition. * - * @return a new, empty flow definition. + * @return + * a new, empty flow definition. */ def getEmptyFlowDef: FlowDef = { val fd = new FlowDef @@ -208,29 +212,34 @@ trait BaseReplState { object ReplImplicits extends FieldConversions { /** - * Converts a Cascading Pipe to a Scalding RichPipe. This method permits implicit conversions from - * Pipe to RichPipe. + * Converts a Cascading Pipe to a Scalding RichPipe. This method permits implicit conversions from Pipe to + * RichPipe. * - * @param pipe to convert to a RichPipe. - * @return a RichPipe wrapping the specified Pipe. + * @param pipe + * to convert to a RichPipe. + * @return + * a RichPipe wrapping the specified Pipe. */ implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe) /** - * Converts a Scalding RichPipe to a Cascading Pipe. This method permits implicit conversions from - * RichPipe to Pipe. + * Converts a Scalding RichPipe to a Cascading Pipe. This method permits implicit conversions from RichPipe + * to Pipe. * - * @param richPipe to convert to a Pipe. - * @return the Pipe wrapped by the specified RichPipe. + * @param richPipe + * to convert to a Pipe. + * @return + * the Pipe wrapped by the specified RichPipe. */ implicit def richPipeToPipe(richPipe: RichPipe): Pipe = richPipe.pipe /** - * Converts a Source to a RichPipe. 
This method permits implicit conversions from Source to - * RichPipe. + * Converts a Source to a RichPipe. This method permits implicit conversions from Source to RichPipe. * - * @param source to convert to a RichPipe. - * @return a RichPipe wrapping the result of reading the specified Source. + * @param source + * to convert to a RichPipe. + * @return + * a RichPipe wrapping the result of reading the specified Source. */ implicit def sourceToRichPipe(source: Source)(implicit flowDef: FlowDef, mode: Mode): RichPipe = RichPipe(source.read(flowDef, mode)) @@ -238,65 +247,77 @@ object ReplImplicits extends FieldConversions { /** * Converts an iterable into a Source with index (int-based) fields. * - * @param iterable to convert into a Source. - * @param setter implicitly retrieved and used to convert the specified iterable into a Source. - * @param converter implicitly retrieved and used to convert the specified iterable into a Source. - * @return a Source backed by the specified iterable. + * @param iterable + * to convert into a Source. + * @param setter + * implicitly retrieved and used to convert the specified iterable into a Source. + * @param converter + * implicitly retrieved and used to convert the specified iterable into a Source. + * @return + * a Source backed by the specified iterable. */ implicit def iterableToSource[T]( - iterable: Iterable[T])(implicit setter: TupleSetter[T], - converter: TupleConverter[T]): Source = { + iterable: Iterable[T] + )(implicit setter: TupleSetter[T], converter: TupleConverter[T]): Source = IterableSource[T](iterable)(setter, converter) - } /** * Converts an iterable into a Pipe with index (int-based) fields. * - * @param iterable to convert into a Pipe. - * @param setter implicitly retrieved and used to convert the specified iterable into a Pipe. - * @param converter implicitly retrieved and used to convert the specified iterable into a Pipe. - * @return a Pipe backed by the specified iterable. + * @param iterable + * to convert into a Pipe. + * @param setter + * implicitly retrieved and used to convert the specified iterable into a Pipe. + * @param converter + * implicitly retrieved and used to convert the specified iterable into a Pipe. + * @return + * a Pipe backed by the specified iterable. */ implicit def iterableToPipe[T]( - iterable: Iterable[T])(implicit setter: TupleSetter[T], - converter: TupleConverter[T], flowDef: FlowDef, mode: Mode): Pipe = { + iterable: Iterable[T] + )(implicit setter: TupleSetter[T], converter: TupleConverter[T], flowDef: FlowDef, mode: Mode): Pipe = iterableToSource(iterable)(setter, converter).read - } /** * Converts an iterable into a RichPipe with index (int-based) fields. * - * @param iterable to convert into a RichPipe. - * @param setter implicitly retrieved and used to convert the specified iterable into a RichPipe. - * @param converter implicitly retrieved and used to convert the specified iterable into a - * RichPipe. - * @return a RichPipe backed by the specified iterable. + * @param iterable + * to convert into a RichPipe. + * @param setter + * implicitly retrieved and used to convert the specified iterable into a RichPipe. + * @param converter + * implicitly retrieved and used to convert the specified iterable into a RichPipe. + * @return + * a RichPipe backed by the specified iterable. 
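// A minimal REPL-session sketch of the iterable conversions above, assuming the
// standard REPL imports are in scope and that single-field TupleSetter and
// TupleConverter instances resolve implicitly for String:
import com.twitter.scalding._
import com.twitter.scalding.ReplImplicits._
import com.twitter.scalding.ReplImplicitContext._

val asSource: Source = List("hello", "world") // via iterableToSource
val asRichPipe: RichPipe = List("hello", "world") // via iterableToRichPipe, using the implicit flowDef and mode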
*/ implicit def iterableToRichPipe[T]( - iterable: Iterable[T])(implicit setter: TupleSetter[T], - converter: TupleConverter[T], flowDef: FlowDef, mode: Mode): RichPipe = { + iterable: Iterable[T] + )(implicit setter: TupleSetter[T], converter: TupleConverter[T], flowDef: FlowDef, mode: Mode): RichPipe = RichPipe(iterableToPipe(iterable)(setter, converter, flowDef, mode)) - } /** - * Convert KeyedListLike to enriched ShellTypedPipe - * (e.g. allows .snapshot to be called on Grouped, CoGrouped, etc) + * Convert KeyedListLike to enriched ShellTypedPipe (e.g. allows .snapshot to be called on Grouped, + * CoGrouped, etc) */ - implicit def keyedListLikeToShellTypedPipe[K, V, T[K, +V] <: KeyedListLike[K, V, T]](kll: KeyedListLike[K, V, T])(implicit state: BaseReplState): ShellTypedPipe[(K, V)] = + implicit def keyedListLikeToShellTypedPipe[K, V, T[K, +V] <: KeyedListLike[K, V, T]]( + kll: KeyedListLike[K, V, T] + )(implicit state: BaseReplState): ShellTypedPipe[(K, V)] = new ShellTypedPipe(kll.toTypedPipe)(state) /** - * Enrich TypedPipe for the shell - * (e.g. allows .snapshot to be called on it) + * Enrich TypedPipe for the shell (e.g. allows .snapshot to be called on it) */ - implicit def typedPipeToShellTypedPipe[T](pipe: TypedPipe[T])(implicit state: BaseReplState): ShellTypedPipe[T] = + implicit def typedPipeToShellTypedPipe[T](pipe: TypedPipe[T])(implicit + state: BaseReplState + ): ShellTypedPipe[T] = new ShellTypedPipe[T](pipe)(state) /** - * Enrich ValuePipe for the shell - * (e.g. allows .toOption to be called on it) + * Enrich ValuePipe for the shell (e.g. allows .toOption to be called on it) */ - implicit def valuePipeToShellValuePipe[T](pipe: ValuePipe[T])(implicit state: BaseReplState): ShellValuePipe[T] = + implicit def valuePipeToShellValuePipe[T](pipe: ValuePipe[T])(implicit + state: BaseReplState + ): ShellValuePipe[T] = new ShellValuePipe[T](pipe)(state) } @@ -304,16 +325,19 @@ object ReplImplicits extends FieldConversions { object ReplState extends BaseReplState /** - * Implicit FlowDef and Mode, import in the REPL to have the global context implicitly - * used everywhere. + * Implicit FlowDef and Mode, import in the REPL to have the global context implicitly used everywhere. */ object ReplImplicitContext { + /** Implicit execution context for using the Execution monad */ implicit val executionContext: scala.concurrent.ExecutionContextExecutor = ConcurrentExecutionContext.global + /** Implicit repl state used for ShellPipes */ implicit def stateImpl: ReplState.type = ReplState + /** Implicit flowDef for this Scalding shell session. */ implicit def flowDefImpl: FlowDef = ReplState.flowDef + /** Defaults to running in local mode if no mode is specified. */ implicit def modeImpl: Mode = ReplState.mode implicit def configImpl: Config = ReplState.config diff --git a/scalding-repl/src/main/scala/com/twitter/scalding/ScaldingILoop.scala b/scalding-repl/src/main/scala/com/twitter/scalding/ScaldingILoop.scala index ed6bd81d95..4f5de2cf19 100644 --- a/scalding-repl/src/main/scala/com/twitter/scalding/ScaldingILoop.scala +++ b/scalding-repl/src/main/scala/com/twitter/scalding/ScaldingILoop.scala @@ -23,9 +23,9 @@ import scala.tools.nsc.interpreter.JPrintWriter import scala.tools.nsc.GenericRunnerSettings object ScaldingILoop { + /** - * Search for files with the given name in all directories from current directory - * up to root. + * Search for files with the given name in all directories from current directory up to root. 
*/ private[scalding] def findAllUpPath(currentDir: String)(filename: String): List[File] = { val matchingFiles = for { @@ -49,8 +49,7 @@ object ScaldingILoop { /** * A class providing Scalding specific commands for inclusion in the Scalding REPL. */ -class ScaldingILoop(in: Option[BufferedReader], out: JPrintWriter) - extends ILoopCompat(in, out) { +class ScaldingILoop(in: Option[BufferedReader], out: JPrintWriter) extends ILoopCompat(in, out) { def this() = this(None, new JPrintWriter(Console.out, true)) settings = new GenericRunnerSettings({ s => echo(s) }) @@ -59,31 +58,33 @@ class ScaldingILoop(in: Option[BufferedReader], out: JPrintWriter) val fc = Console.YELLOW val wc = Console.RED def wrapFlames(s: String) = s.replaceAll("[()]+", fc + "$0" + wc) - echo(fc + - " ( \n" + - " )\\ ) ( ( \n" + - "(()/( ) )\\ )\\ ) ( ( ( \n" + - " /(_)) ( ( /( ((_)(()/( )\\ ( )\\))( \n" + - "(_)) )\\ )( )) _ ((_)(( ) )\\ ) (( ))\\ \n".replaceAll("_", wc + "_" + fc) + wc + - wrapFlames("/ __|((_) ((_)_ | | _| | (_) _(_(( (_()_) \n") + - wrapFlames("\\__ \\/ _| / _` || |/ _` | | || ' \\))/ _` \\ \n") + - "|___/\\__| \\__,_||_|\\__,_| |_||_||_| \\__, | \n" + - " |___/ ") + echo( + fc + + " ( \n" + + " )\\ ) ( ( \n" + + "(()/( ) )\\ )\\ ) ( ( ( \n" + + " /(_)) ( ( /( ((_)(()/( )\\ ( )\\))( \n" + + "(_)) )\\ )( )) _ ((_)(( ) )\\ ) (( ))\\ \n".replaceAll("_", wc + "_" + fc) + wc + + wrapFlames("/ __|((_) ((_)_ | | _| | (_) _(_(( (_()_) \n") + + wrapFlames("\\__ \\/ _| / _` || |/ _` | | || ' \\))/ _` \\ \n") + + "|___/\\__| \\__,_||_|\\__,_| |_||_||_| \\__, | \n" + + " |___/ " + ) } /** - * Commands specific to the Scalding REPL. To define a new command use one of the following - * factory methods: - * - `LoopCommand.nullary` for commands that take no arguments - * - `LoopCommand.cmd` for commands that take one string argument - * - `LoopCommand.varargs` for commands that take multiple string arguments + * Commands specific to the Scalding REPL. To define a new command use one of the following factory methods: + * - `LoopCommand.nullary` for commands that take no arguments + * - `LoopCommand.cmd` for commands that take one string argument + * - `LoopCommand.varargs` for commands that take multiple string arguments */ private val scaldingCommands: List[LoopCommand] = List() /** * Change the shell prompt to read scalding> * - * @return a prompt string to use for this REPL. + * @return + * a prompt string to use for this REPL. */ override def prompt: String = Console.BLUE + "\nscalding> " + Console.RESET @@ -94,7 +95,8 @@ class ScaldingILoop(in: Option[BufferedReader], out: JPrintWriter) /** * Gets the list of commands that this REPL supports. * - * @return a list of the command supported by this REPL. + * @return + * a list of the command supported by this REPL. 
*/ override def commands: List[LoopCommand] = super.commands ++ scaldingCommands @@ -102,7 +104,8 @@ class ScaldingILoop(in: Option[BufferedReader], out: JPrintWriter) "com.twitter.scalding._", "com.twitter.scalding.ReplImplicits._", "com.twitter.scalding.ReplImplicitContext._", - "com.twitter.scalding.ReplState._") + "com.twitter.scalding.ReplState._" + ) override def createInterpreter(): Unit = { super.createInterpreter() diff --git a/scalding-repl/src/main/scala/com/twitter/scalding/ScaldingShell.scala b/scalding-repl/src/main/scala/com/twitter/scalding/ScaldingShell.scala index cd312010ea..7861a2b145 100644 --- a/scalding-repl/src/main/scala/com/twitter/scalding/ScaldingShell.scala +++ b/scalding-repl/src/main/scala/com/twitter/scalding/ScaldingShell.scala @@ -23,7 +23,7 @@ import java.util.jar.JarOutputStream import org.apache.hadoop.util.GenericOptionsParser import org.apache.hadoop.conf.Configuration -import scala.tools.nsc.{ GenericRunnerCommand, MainGenericRunner } +import scala.tools.nsc.{GenericRunnerCommand, MainGenericRunner} import scala.tools.nsc.interpreter.ILoop import scala.tools.nsc.io.VirtualDirectory @@ -32,8 +32,7 @@ import com.google.common.io.Files case class ShellArgs(cfg: Config, mode: Mode, cmdArgs: List[String]) /** - * A runner for a Scala REPL providing functionality extensions specific to working with - * Scalding. + * A runner for a Scala REPL providing functionality extensions specific to working with Scalding. */ trait BaseScaldingShell extends MainGenericRunner { @@ -54,12 +53,14 @@ trait BaseScaldingShell extends MainGenericRunner { /** * The main entry point for executing the REPL. * - * This method is lifted from [[scala.tools.nsc.MainGenericRunner]] and modified to allow - * for custom functionality, including determining at runtime if the REPL is running, - * and making custom REPL colon-commands available to the user. + * This method is lifted from [[scala.tools.nsc.MainGenericRunner]] and modified to allow for custom + * functionality, including determining at runtime if the REPL is running, and making custom REPL + * colon-commands available to the user. * - * @param args passed from the command line. - * @return `true` if execution was successful, `false` otherwise. + * @param args + * passed from the command line. + * @return + * `true` if execution was successful, `false` otherwise. */ override def process(args: Array[String]): Boolean = { // Get the mode (hdfs or local), and initialize the configuration @@ -86,13 +87,13 @@ trait BaseScaldingShell extends MainGenericRunner { replState.mode = mode replState.customConfig = replState.customConfig ++ (mode match { case _: HadoopMode => cfg - case _ => Config.empty + case _ => Config.empty }) // if in Hdfs mode, store the mode to enable switching between Local and Hdfs mode match { case m @ Hdfs(_, _) => replState.storedHdfsMode = Some(m) - case _ => () + case _ => () } replState.printModeBanner() @@ -106,11 +107,13 @@ trait BaseScaldingShell extends MainGenericRunner { (new GenericOptionsParser(conf, args)).getRemainingArgs /** - * Sets the mode for this job, updates jobConf with hadoop arguments - * and returns all the non-hadoop arguments. + * Sets the mode for this job, updates jobConf with hadoop arguments and returns all the non-hadoop + * arguments. * - * @param args from the command line. - * @return a Mode for the job (e.g. local, hdfs), config and the non-hadoop params + * @param args + * from the command line. + * @return + * a Mode for the job (e.g. 
local, hdfs), config and the non-hadoop params */ def parseModeArgs(args: Array[String]): ShellArgs = { val a = nonHadoopArgsFrom(args) @@ -121,7 +124,8 @@ trait BaseScaldingShell extends MainGenericRunner { /** * Runs an instance of the shell. * - * @param args from the command line. + * @param args + * from the command line. */ def main(args: Array[String]): Unit = { val retVal = process(ExpandLibJarsGlobs(args)) @@ -133,23 +137,26 @@ trait BaseScaldingShell extends MainGenericRunner { /** * Creates a jar file in a temporary directory containing the code thus far compiled by the REPL. * - * @return some file for the jar created, or `None` if the REPL is not running. + * @return + * some file for the jar created, or `None` if the REPL is not running. */ - private[scalding] def createReplCodeJar(): Option[File] = { + private[scalding] def createReplCodeJar(): Option[File] = scaldingREPL.map { repl => val virtualDirectory = repl.virtualDirectory - val tempJar = new File(Files.createTempDir(), - "scalding-repl-session-" + System.currentTimeMillis() + ".jar") + val tempJar = + new File(Files.createTempDir(), "scalding-repl-session-" + System.currentTimeMillis() + ".jar") createJar(virtualDirectory.asInstanceOf[VirtualDirectory], tempJar) } - } /** * Creates a jar file from the classes contained in a virtual directory. * - * @param virtualDirectory containing classes that should be added to the jar. - * @param jarFile that will be written. - * @return the jarFile specified and written. + * @param virtualDirectory + * containing classes that should be added to the jar. + * @param jarFile + * that will be written. + * @return + * the jarFile specified and written. */ private def createJar(virtualDirectory: VirtualDirectory, jarFile: File): File = { val jarStream = new JarOutputStream(new FileOutputStream(jarFile)) @@ -163,17 +170,21 @@ trait BaseScaldingShell extends MainGenericRunner { } /** - * Add the contents of the specified virtual directory to a jar. This method will recursively - * descend into subdirectories to add their contents. + * Add the contents of the specified virtual directory to a jar. This method will recursively descend into + * subdirectories to add their contents. * - * @param dir is a virtual directory whose contents should be added. - * @param entryPath for classes found in the virtual directory. - * @param jarStream for writing the jar file. + * @param dir + * is a virtual directory whose contents should be added. + * @param entryPath + * for classes found in the virtual directory. + * @param jarStream + * for writing the jar file. */ private def addVirtualDirectoryToJar( - dir: VirtualDirectory, - entryPath: String, - jarStream: JarOutputStream): Unit = { + dir: VirtualDirectory, + entryPath: String, + jarStream: JarOutputStream + ): Unit = dir.foreach { file => if (file.isDirectory) { // Recursively descend into subdirectories, adjusting the package name as we do. 
@@ -190,7 +201,6 @@ trait BaseScaldingShell extends MainGenericRunner { jarStream.closeEntry() } } - } } object ScaldingShell extends BaseScaldingShell diff --git a/scalding-repl/src/main/scala/com/twitter/scalding/ShellPipe.scala b/scalding-repl/src/main/scala/com/twitter/scalding/ShellPipe.scala index 6f614f2db8..de8800811a 100644 --- a/scalding-repl/src/main/scala/com/twitter/scalding/ShellPipe.scala +++ b/scalding-repl/src/main/scala/com/twitter/scalding/ShellPipe.scala @@ -17,7 +17,8 @@ package com.twitter.scalding /** * Enrichment on TypedPipes allowing them to be run locally, independent of the overall flow. - * @param pipe to wrap + * @param pipe + * to wrap */ class ShellTypedPipe[T](pipe: TypedPipe[T])(implicit state: BaseReplState) { import state.execute @@ -30,23 +31,24 @@ class ShellTypedPipe[T](pipe: TypedPipe[T])(implicit state: BaseReplState) { /** * Save snapshot of a typed pipe to a temporary sequence file. - * @return A TypedPipe to a new Source, reading from the sequence file. + * @return + * A TypedPipe to a new Source, reading from the sequence file. */ def snapshot: TypedPipe[T] = execute(pipe.forceToDiskExecution) /** - * Create a (local) iterator over the pipe. For non-trivial pipes (anything except - * a head-pipe reading from a source), a snapshot is automatically created and - * iterated over. - * @return local iterator + * Create a (local) iterator over the pipe. For non-trivial pipes (anything except a head-pipe reading from + * a source), a snapshot is automatically created and iterated over. + * @return + * local iterator */ def toIterator: Iterator[T] = execute(pipe.toIterableExecution).iterator /** - * Create a list from the pipe in memory. Uses `ShellTypedPipe.toIterator`. - * Warning: user must ensure that the results will actually fit in memory. + * Create a list from the pipe in memory. Uses `ShellTypedPipe.toIterator`. Warning: user must ensure that + * the results will actually fit in memory. */ def toList: List[T] = toIterator.toList diff --git a/scalding-repl/src/test/scala/com/twitter/scalding/ReplTest.scala b/scalding-repl/src/test/scala/com/twitter/scalding/ReplTest.scala index 6047e0d7d4..2006925f1a 100644 --- a/scalding-repl/src/test/scala/com/twitter/scalding/ReplTest.scala +++ b/scalding-repl/src/test/scala/com/twitter/scalding/ReplTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
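// A minimal REPL-session sketch of the ShellTypedPipe enrichment described
// above, assuming the standard REPL imports are in scope; "hello.txt" is a
// hypothetical local input file:
import com.twitter.scalding._
import com.twitter.scalding.ReplImplicits._
import com.twitter.scalding.ReplImplicitContext._

val words = TypedPipe.from(TextLine("hello.txt")).flatMap(_.split("\\s+"))

val snap: TypedPipe[String] = words.snapshot // runs only this pipe and reads the result back
val inMemory: List[String] = words.toList // snapshot plus local iterator; the result must fit in memory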
-*/ + */ package com.twitter.scalding import java.io.File @@ -35,7 +35,7 @@ class ReplTest extends WordSpec { val suffix = mode match { case _: CascadingLocal => "local" - case _: HadoopMode => "hadoop" + case _: HadoopMode => "hadoop" } val testPath = "/tmp/scalding-repl/test/" + suffix + "/" val helloRef = List("Hello world", "Goodbye world") @@ -58,13 +58,14 @@ class ReplTest extends WordSpec { // it's a TypedPipe from a MemorySink or SequenceFile) s match { case TypedPipe.IterablePipe(_) => succeed - case TypedPipe.SourcePipe(s) => assert(s.toString.contains("SequenceFile")) + case TypedPipe.SourcePipe(s) => assert(s.toString.contains("SequenceFile")) case _ => fail(s"expected an IterablePipe or source from a SequenceFile, found: $s") } } "can be mapped and saved -- TypedPipe[String]" in { - val s = TypedPipe.from(TextLine(helloPath)) + val s = TypedPipe + .from(TextLine(helloPath)) .flatMap(_.split("\\s+")) .snapshot @@ -78,7 +79,8 @@ class ReplTest extends WordSpec { } "tuples -- TypedPipe[(String,Int)]" in { - val s = TypedPipe.from(TextLine(helloPath)) + val s = TypedPipe + .from(TextLine(helloPath)) .flatMap(_.split("\\s+")) .map(w => (w.toLowerCase, w.length)) .snapshot @@ -87,8 +89,9 @@ class ReplTest extends WordSpec { assert(output === helloRef.flatMap(_.split("\\s+")).map(w => (w.toLowerCase, w.length))) } - "grouped -- Grouped[String,String]" which { - val grp = TypedPipe.from(TextLine(helloPath)) + "grouped -- Grouped[String,String]".which { + val grp = TypedPipe + .from(TextLine(helloPath)) .groupBy(_.toLowerCase) val correct = helloRef.map(l => (l.toLowerCase, l)) @@ -103,15 +106,14 @@ class ReplTest extends WordSpec { } } - "joined -- CoGrouped[String, Long]" which { - val linesByWord = TypedPipe.from(TextLine(helloPath)) + "joined -- CoGrouped[String, Long]".which { + val linesByWord = TypedPipe + .from(TextLine(helloPath)) .flatMap(_.split("\\s+")) .groupBy(_.toLowerCase) val wordScores = TypedPipe.from(TypedTsv[(String, Double)](tutorialData + "/word_scores.tsv")).group - val grp = linesByWord.join(wordScores) - .mapValues { case (text, score) => score } - .sum + val grp = linesByWord.join(wordScores).mapValues { case (text, score) => score }.sum val correct = Map("hello" -> 1.0, "goodbye" -> 3.0, "world" -> 4.0) @@ -139,7 +141,8 @@ class ReplTest extends WordSpec { "run entire flow" in { resetFlowDef() - val hello = TypedPipe.from(TextLine(helloPath)) + val hello = TypedPipe + .from(TextLine(helloPath)) .flatMap(_.split("\\s+")) .map(_.toLowerCase) .distinct diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Boxed.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Boxed.scala index b386cbd780..c8e01119f4 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Boxed.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Boxed.scala @@ -12,17 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization import java.util.concurrent.atomic.AtomicReference -import java.io.{ InputStream, OutputStream } +import java.io.{InputStream, OutputStream} /** - * This interface is a way of wrapping a value in a marker class - * whose class identity is used to control which serialization we - * use. 
This is an internal implementation detail about how we - * interact with cascading and hadoop. Users should never care. + * This interface is a way of wrapping a value in a marker class whose class identity is used to control which + * serialization we use. This is an internal implementation detail about how we interact with cascading and + * hadoop. Users should never care. */ trait Boxed[+K] { def get: K @@ -530,8 +529,8 @@ class Boxed249[K](override val get: K) extends Boxed[K] class Boxed250[K](override val get: K) extends Boxed[K] -case class BoxedOrderedSerialization[K](box: K => Boxed[K], - ord: OrderedSerialization[K]) extends OrderedSerialization[Boxed[K]] { +case class BoxedOrderedSerialization[K](box: K => Boxed[K], ord: OrderedSerialization[K]) + extends OrderedSerialization[Boxed[K]] { override def compare(a: Boxed[K], b: Boxed[K]) = ord.compare(a.get, b.get) override def hash(k: Boxed[K]) = ord.hash(k.get) @@ -687,7 +686,8 @@ object BoxedLambdas { ({ t: Any => new Boxed121(t) }, classOf[Boxed121[Any]]), ({ t: Any => new Boxed122(t) }, classOf[Boxed122[Any]]), ({ t: Any => new Boxed123(t) }, classOf[Boxed123[Any]]), - ({ t: Any => new Boxed124(t) }, classOf[Boxed124[Any]])) + ({ t: Any => new Boxed124(t) }, classOf[Boxed124[Any]]) + ) private[serialization] val boxes2 = List( ({ t: Any => new Boxed125(t) }, classOf[Boxed125[Any]]), @@ -815,7 +815,8 @@ object BoxedLambdas { ({ t: Any => new Boxed247(t) }, classOf[Boxed247[Any]]), ({ t: Any => new Boxed248(t) }, classOf[Boxed248[Any]]), ({ t: Any => new Boxed249(t) }, classOf[Boxed249[Any]]), - ({ t: Any => new Boxed250(t) }, classOf[Boxed250[Any]])) + ({ t: Any => new Boxed250(t) }, classOf[Boxed250[Any]]) + ) } object Boxed { @@ -828,7 +829,8 @@ object Boxed { def allClasses: Seq[Class[_ <: Boxed[_]]] = allBoxes.map(_._2) - private[this] val boxedCache = new java.util.concurrent.ConcurrentHashMap[AnyRef, (Any => Boxed[Any], Class[Boxed[Any]])]() + private[this] val boxedCache = + new java.util.concurrent.ConcurrentHashMap[AnyRef, (Any => Boxed[Any], Class[Boxed[Any]])]() private[scalding] def nextCached[K](cacheKey: Option[AnyRef]): (K => Boxed[K], Class[Boxed[K]]) = cacheKey match { @@ -848,8 +850,8 @@ object Boxed { case list @ (h :: tail) if boxes.compareAndSet(list, tail) => h.asInstanceOf[(K => Boxed[K], Class[Boxed[K]])] case (h :: tail) => next[K]() // Try again - case Nil => sys.error( - """|Scalding's ordered serialization logic exhausted the finite supply of boxed classes. + case Nil => + sys.error("""|Scalding's ordered serialization logic exhausted the finite supply of boxed classes. | |Explanation: Scalding's ordered serialization logic internally uses |a large, but fixed, supply of unique wrapper types to box values in diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Hasher.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Hasher.scala index 52600598d5..ce9b454c6c 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Hasher.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Hasher.scala @@ -12,15 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
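// A minimal sketch of how the pieces above are wired together internally: take
// a fresh Boxed wrapper class and pair it with a key's OrderedSerialization.
// Boxed.next may be package-private, so the sketch is assumed to live in the
// same package; user code never touches Boxed directly.
package com.twitter.scalding.serialization

object BoxedWiringSketch {
  def boxedOrdSer[K](ord: OrderedSerialization[K]): OrderedSerialization[Boxed[K]] = {
    val (box, _) = Boxed.next[K]() // (K => Boxed[K], Class[Boxed[K]])
    BoxedOrderedSerialization(box, ord)
  }
}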
-*/ + */ package com.twitter.scalding.serialization // Be careful using this, the product/array or similar will attempt to call system hash codes. import scala.util.hashing.MurmurHash3 + /** - * This is a specialized typeclass to make it easier to implement Serializations. - * The specialization *should* mean that there is no boxing and if the JIT - * does its work, Hasher should compose well (via collections, Tuple2, Option, Either) + * This is a specialized typeclass to make it easier to implement Serializations. The specialization *should* + * mean that there is no boxing and if the JIT does its work, Hasher should compose well (via collections, + * Tuple2, Option, Either) */ trait Hasher[@specialized(Boolean, Byte, Char, Short, Int, Long, Float, Double) -T] { @inline @@ -32,8 +33,8 @@ object Hasher { final val seed = 0xf7ca7fd2 @inline - def hash[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T]( - i: T)(implicit h: Hasher[T]): Int = h.hash(i) + def hash[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T](i: T)(implicit h: Hasher[T]): Int = + h.hash(i) /* * Instances below @@ -43,13 +44,12 @@ object Hasher { def hash(i: Unit) = 0 } implicit val boolean: Hasher[Boolean] = new Hasher[Boolean] { + /** - * Here we use the two large primes as the hash codes. - * We use primes because we want the probability of collision when - * we mod with some size (to fit into hash-buckets stored in an array) - * to be low. The choice of prime numbers means that they have no factors - * in common with any size, but they could have the same remainder. - * We actually just use the exact same values as Java here. + * Here we use the two large primes as the hash codes. We use primes because we want the probability of + * collision when we mod with some size (to fit into hash-buckets stored in an array) to be low. The + * choice of prime numbers means that they have no factors in common with any size, but they could have + * the same remainder. We actually just use the exact same values as Java here. */ @inline def hash(i: Boolean) = if (i) 1231 else 1237 diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/JavaStreamEnrichments.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/JavaStreamEnrichments.scala index 1405589620..e0dec9379a 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/JavaStreamEnrichments.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/JavaStreamEnrichments.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization import java.io._ @@ -26,8 +26,8 @@ object JavaStreamEnrichments { throw new IllegalArgumentException(s) /** - * Note this is only recommended for testing. - * You may want to use ByteArrayInputOutputStream for performance critical concerns + * Note this is only recommended for testing. 
You may want to use ByteArrayInputOutputStream for performance + * critical concerns */ implicit class RichByteArrayOutputStream(val baos: ByteArrayOutputStream) extends AnyVal { def toInputStream: ByteArrayInputStream = new ByteArrayInputStream(baos.toByteArray) @@ -41,10 +41,10 @@ object JavaStreamEnrichments { def wrapAsOutputStreamAt(pos: Int): ArrayWrappingOutputStream = new ArrayWrappingOutputStream(bytes, pos) } + /** - * Wraps an Array so that you can write into it as a stream without reallocations - * or copying at the end. Useful if you know an upper bound on the number of bytes - * you will write + * Wraps an Array so that you can write into it as a stream without reallocations or copying at the end. + * Useful if you know an upper bound on the number of bytes you will write */ class ArrayWrappingOutputStream(val buffer: Array[Byte], initPos: Int) extends OutputStream { if (buffer.length < initPos) { @@ -75,38 +75,31 @@ object JavaStreamEnrichments { } /** - * This has a lot of methods from DataInputStream without - * having to allocate to get them - * This code is similar to those algorithms + * This has a lot of methods from DataInputStream without having to allocate to get them This code is + * similar to those algorithms */ implicit class RichInputStream(val s: InputStream) extends AnyVal { + /** - * If s supports marking, we mark it. Otherwise we read the needed - * bytes out into a ByteArrayStream and return that. - * This is intended for the case where you need possibly - * read size bytes but may stop early, then skip this exact - * number of bytes. - * Intended use is: - * {code} - * val size = 100 - * val marked = s.markOrBuffer(size) - * val y = fn(marked) - * marked.reset - * marked.skipFully(size) - * {/code} + * If s supports marking, we mark it. Otherwise we read the needed bytes out into a ByteArrayStream and + * return that. This is intended for the case where you need possibly read size bytes but may stop early, + * then skip this exact number of bytes. Intended use is: {code} val size = 100 val marked = + * s.markOrBuffer(size) val y = fn(marked) marked.reset marked.skipFully(size) {/code} */ def markOrBuffer(size: Int): InputStream = { - val ms = if (s.markSupported) s else { - val buf = new Array[Byte](size) - s.readFully(buf) - new ByteArrayInputStream(buf) - } + val ms = + if (s.markSupported) s + else { + val buf = new Array[Byte](size) + s.readFully(buf) + new ByteArrayInputStream(buf) + } // Make sure we can reset after we read this many bytes ms.mark(size) ms } - def readBoolean: Boolean = (readUnsignedByte != 0) + def readBoolean: Boolean = readUnsignedByte != 0 /** * Like read, but throws eof on error @@ -145,9 +138,8 @@ object JavaStreamEnrichments { def readFloat: Float = java.lang.Float.intBitsToFloat(readInt) /** - * This is the algorithm from DataInputStream - * it was also benchmarked against the approach - * used in readLong and found to be faster + * This is the algorithm from DataInputStream it was also benchmarked against the approach used in + * readLong and found to be faster */ def readInt: Int = { val c1 = s.read @@ -189,11 +181,8 @@ object JavaStreamEnrichments { } /** - * This reads a varInt encoding that only encodes non-negative - * numbers. It uses: - * 1 byte for values 0 - 255, - * 3 bytes for 256 - 65535, - * 7 bytes for 65536 - Int.MaxValue + * This reads a varInt encoding that only encodes non-negative numbers. 
It uses: 1 byte for values 0 - + * 255, 3 bytes for 256 - 65535, 7 bytes for 65536 - Int.MaxValue */ final def readPosVarInt: Int = { val c1 = readUnsignedByte @@ -210,7 +199,8 @@ object JavaStreamEnrichments { def go(c: Long): Unit = { val skipped = s.skip(c) if (skipped == c) () - else if (skipped == 0L) throw new IOException(s"could not skipFully: count, c, skipped = ${(count, c, skipped)}") + else if (skipped == 0L) + throw new IOException(s"could not skipFully: count, c, skipped = ${(count, c, skipped)}") else go(c - skipped) } if (count != 0L) go(count) else () @@ -220,23 +210,19 @@ object JavaStreamEnrichments { implicit class RichOutputStream(val s: OutputStream) extends AnyVal { def writeBoolean(b: Boolean): Unit = if (b) s.write(1: Byte) else s.write(0: Byte) - def writeBytes(b: Array[Byte], off: Int, len: Int): Unit = { + def writeBytes(b: Array[Byte], off: Int, len: Int): Unit = s.write(b, off, len) - } def writeByte(b: Byte): Unit = s.write(b) def writeBytes(b: Array[Byte]): Unit = writeBytes(b, 0, b.length) /** - * This reads a varInt encoding that only encodes non-negative - * numbers. It uses: - * 1 byte for values 0 - 255, - * 3 bytes for 256 - 65535, - * 7 bytes for 65536 - Int.MaxValue + * This reads a varInt encoding that only encodes non-negative numbers. It uses: 1 byte for values 0 - + * 255, 3 bytes for 256 - 65535, 7 bytes for 65536 - Int.MaxValue */ def writePosVarInt(i: Int): Unit = { - if (i < 0) illegal(s"must be non-negative: ${i}") + if (i < 0) illegal(s"must be non-negative: $i") if (i < ((1 << 8) - 1)) s.write(i) else { s.write(-1: Byte) diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Laws.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Laws.scala index db4f6344bf..5613b9bf3a 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Laws.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Laws.scala @@ -12,12 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization /** - * This is a simple trait for describing laws on single parameter - * type classes (Serialization, Monoid, Ordering, etc...) + * This is a simple trait for describing laws on single parameter type classes (Serialization, Monoid, + * Ordering, etc...) */ sealed trait Law[T] { def name: String diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/MurmurHashUtils.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/MurmurHashUtils.scala index cacf8a6f60..d630135c5a 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/MurmurHashUtils.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/MurmurHashUtils.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
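// A minimal round-trip sketch of the posVarInt encoding described above, using
// the stream enrichments defined in this file:
import java.io.ByteArrayOutputStream
import com.twitter.scalding.serialization.JavaStreamEnrichments._

object PosVarIntSketch {
  def main(args: Array[String]): Unit = {
    val out = new ByteArrayOutputStream()
    out.writePosVarInt(300) // falls in the 256 - 65535 range, so 3 bytes per the scheme above
    val in = out.toInputStream // RichByteArrayOutputStream helper
    assert(in.readPosVarInt == 300)
  }
}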
-*/ + */ package com.twitter.scalding.serialization // Taking a few functions from: diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/OrderedSerialization.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/OrderedSerialization.scala index 3b9be5bc91..662e5cc624 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/OrderedSerialization.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/OrderedSerialization.scala @@ -12,40 +12,41 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization -import java.io.{ ByteArrayInputStream, InputStream, OutputStream } -import scala.util.{ Failure, Success, Try } +import java.io.{ByteArrayInputStream, InputStream, OutputStream} +import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal /** - * In large-scale partitioning algorithms, we often use sorting. - * This typeclass represents something we can efficiently serialize - * with an added law: that we can (hopefully fast) compare the raw - * data. + * In large-scale partitioning algorithms, we often use sorting. This typeclass represents something we can + * efficiently serialize with an added law: that we can (hopefully fast) compare the raw data. */ trait OrderedSerialization[T] extends Ordering[T] with Serialization[T] { + /** - * This compares two InputStreams. After this call, the position in - * the InputStreams is mutated to be the end of the record. + * This compares two InputStreams. After this call, the position in the InputStreams is mutated to be the + * end of the record. */ def compareBinary(a: InputStream, b: InputStream): OrderedSerialization.Result } object OrderedSerialization { + /** - * Represents the result of a comparison that might fail due - * to an error deserializing + * Represents the result of a comparison that might fail due to an error deserializing */ sealed trait Result { + /** * Throws if the items cannot be compared */ def unsafeToInt: Int def toTry: Try[Int] } + /** * Create a Result from an Int. 
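// A minimal sketch of compareBinary: serialize two values, then compare the raw
// bytes without deserializing them. The OrderedSerialization[T] instance is
// taken as a parameter rather than assumed to be implicitly available.
import java.io.ByteArrayOutputStream
import com.twitter.scalding.serialization.JavaStreamEnrichments._
import com.twitter.scalding.serialization.OrderedSerialization

object BinaryCompareSketch {
  def binaryCompare[T](a: T, b: T)(implicit ordSer: OrderedSerialization[T]): Int = {
    def serialize(t: T) = {
      val baos = new ByteArrayOutputStream()
      ordSer.write(baos, t).get // write returns a Try[Unit]
      baos.toInputStream
    }
    // Result.unsafeToInt throws if either stream could not be compared.
    ordSer.compareBinary(serialize(a), serialize(b)).unsafeToInt
  }
}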
*/ @@ -101,104 +102,112 @@ object OrderedSerialization { case NonFatal(e) => CompareFailure(e) } - private[this] def internalTransformer[T, U, V](packFn: T => U, - unpackFn: U => V, - presentFn: Try[V] => Try[T])(implicit otherOrdSer: OrderedSerialization[U]): OrderedSerialization[T] = - { - new OrderedSerialization[T] { - private[this] var cache: (T, U) = null - private[this] def packCache(t: T): U = { - val readCache = cache - if (readCache == null || readCache._1 != t) { - val u = packFn(t) - cache = (t, u) - u - } else { - readCache._2 - } + private[this] def internalTransformer[T, U, V]( + packFn: T => U, + unpackFn: U => V, + presentFn: Try[V] => Try[T] + )(implicit otherOrdSer: OrderedSerialization[U]): OrderedSerialization[T] = + new OrderedSerialization[T] { + private[this] var cache: (T, U) = null + private[this] def packCache(t: T): U = { + val readCache = cache + if (readCache == null || readCache._1 != t) { + val u = packFn(t) + cache = (t, u) + u + } else { + readCache._2 } + } - override def hash(t: T) = otherOrdSer.hash(packCache(t)) + override def hash(t: T) = otherOrdSer.hash(packCache(t)) - override def compareBinary(a: java.io.InputStream, b: java.io.InputStream): OrderedSerialization.Result = - otherOrdSer.compareBinary(a, b) + override def compareBinary( + a: java.io.InputStream, + b: java.io.InputStream + ): OrderedSerialization.Result = + otherOrdSer.compareBinary(a, b) - override def compare(x: T, y: T) = - otherOrdSer.compare(packFn(x), packFn(y)) + override def compare(x: T, y: T) = + otherOrdSer.compare(packFn(x), packFn(y)) - override def read(in: InputStream): Try[T] = - presentFn(otherOrdSer.read(in).map(unpackFn)) + override def read(in: InputStream): Try[T] = + presentFn(otherOrdSer.read(in).map(unpackFn)) - override def write(out: OutputStream, t: T): Try[Unit] = - otherOrdSer.write(out, packCache(t)) + override def write(out: OutputStream, t: T): Try[Unit] = + otherOrdSer.write(out, packCache(t)) - override def staticSize: Option[Int] = otherOrdSer.staticSize + override def staticSize: Option[Int] = otherOrdSer.staticSize - override def dynamicSize(t: T): Option[Int] = otherOrdSer.dynamicSize(packCache(t)) - } + override def dynamicSize(t: T): Option[Int] = otherOrdSer.dynamicSize(packCache(t)) } - def viaTransform[T, U]( - packFn: T => U, - unpackFn: U => T)(implicit otherOrdSer: OrderedSerialization[U]): OrderedSerialization[T] = + def viaTransform[T, U](packFn: T => U, unpackFn: U => T)(implicit + otherOrdSer: OrderedSerialization[U] + ): OrderedSerialization[T] = internalTransformer[T, U, T](packFn, unpackFn, identity) - def viaTryTransform[T, U]( - packFn: T => U, - unpackFn: U => Try[T])(implicit otherOrdSer: OrderedSerialization[U]): OrderedSerialization[T] = + def viaTryTransform[T, U](packFn: T => U, unpackFn: U => Try[T])(implicit + otherOrdSer: OrderedSerialization[U] + ): OrderedSerialization[T] = internalTransformer[T, U, Try[T]](packFn, unpackFn, _.flatMap(identity)) /** * The the serialized comparison matches the unserialized comparison */ def compareBinaryMatchesCompare[T](implicit ordb: OrderedSerialization[T]): Law2[T] = - Law2("compare(a, b) == compareBinary(aBin, bBin)", - { (a: T, b: T) => resultFrom(ordb.compare(a, b)) == writeThenCompare(a, b) }) + Law2( + "compare(a, b) == compareBinary(aBin, bBin)", + (a: T, b: T) => resultFrom(ordb.compare(a, b)) == writeThenCompare(a, b) + ) /** - * ordering must be transitive. If this is not so, sort-based partitioning - * will be broken + * ordering must be transitive. 
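// Sketch: lifting an existing OrderedSerialization onto a wrapper type with viaTransform as
// defined above. UserId is an illustrative type, not part of the library; the Long instance is
// taken as an implicit parameter so any available one (e.g. macro-derived) can be supplied.
import com.twitter.scalding.serialization.OrderedSerialization

object ViaTransformSketch {
  final case class UserId(value: Long)

  def userIdOrdSer(implicit longOrdSer: OrderedSerialization[Long]): OrderedSerialization[UserId] =
    OrderedSerialization.viaTransform[UserId, Long](_.value, UserId(_))
}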
If this is not so, sort-based partitioning will be broken */ def orderingTransitive[T](implicit ordb: OrderedSerialization[T]): Law3[T] = - Law3("transitivity", - { (a: T, b: T, c: T) => + Law3( + "transitivity", + (a: T, b: T, c: T) => if (ordb.lteq(a, b) && ordb.lteq(b, c)) { ordb.lteq(a, c) } else true - }) + ) + /** - * ordering must be antisymmetric. If this is not so, sort-based partitioning - * will be broken + * ordering must be antisymmetric. If this is not so, sort-based partitioning will be broken */ def orderingAntisymmetry[T](implicit ordb: OrderedSerialization[T]): Law2[T] = - Law2("antisymmetry", - { (a: T, b: T) => + Law2( + "antisymmetry", + (a: T, b: T) => if (ordb.lteq(a, b) && ordb.lteq(b, a)) { ordb.equiv(a, b) } else true - }) + ) + /** - * ordering must be total. If this is not so, sort-based partitioning - * will be broken + * ordering must be total. If this is not so, sort-based partitioning will be broken */ def orderingTotality[T](implicit ordb: OrderedSerialization[T]): Law2[T] = - Law2("totality", { (a: T, b: T) => (ordb.lteq(a, b) || ordb.lteq(b, a)) }) + Law2("totality", (a: T, b: T) => (ordb.lteq(a, b) || ordb.lteq(b, a))) def allLaws[T: OrderedSerialization]: Iterable[Law[T]] = - Serialization.allLaws ++ List[Law[T]](compareBinaryMatchesCompare[T], + Serialization.allLaws ++ List[Law[T]]( + compareBinaryMatchesCompare[T], orderingTransitive[T], orderingAntisymmetry[T], - orderingTotality[T]) + orderingTotality[T] + ) } /** - * This may be useful when a type is used deep in a tuple or case class, and in that case - * the earlier comparators will have likely already done the work. Be aware that avoiding - * deserialization on compare usually very helpful. + * This may be useful when a type is used deep in a tuple or case class, and in that case the earlier + * comparators will have likely already done the work. Be aware that avoiding deserialization on compare + * usually very helpful. * - * Note: it is your responsibility that the hash in serialization is consistent - * with the ordering (if equivalent in the ordering, the hash must match). + * Note: it is your responsibility that the hash in serialization is consistent with the ordering (if + * equivalent in the ordering, the hash must match). 
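// Sketch: the trade-off described above, made concrete. When earlier fields of a composite key
// usually decide the comparison, pairing a plain Serialization with an Ordering (and paying for
// deserialization only on the rare compares that reach this field) can be acceptable.
import com.twitter.scalding.serialization.{DeserializingOrderedSerialization, OrderedSerialization, Serialization}

object DeserializingSketch {
  def fallback[T](implicit ser: Serialization[T], ord: Ordering[T]): OrderedSerialization[T] =
    DeserializingOrderedSerialization(ser, ord)
}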
*/ -final case class DeserializingOrderedSerialization[T](serialization: Serialization[T], - ordering: Ordering[T]) extends OrderedSerialization[T] { +final case class DeserializingOrderedSerialization[T](serialization: Serialization[T], ordering: Ordering[T]) + extends OrderedSerialization[T] { final override def read(i: InputStream) = serialization.read(i) final override def write(o: OutputStream, t: T) = serialization.write(o, t) @@ -207,8 +216,7 @@ final case class DeserializingOrderedSerialization[T](serialization: Serializati final override def compareBinary(a: InputStream, b: InputStream) = try OrderedSerialization.resultFrom { compare(read(a).get, read(b).get) - } - catch { + } catch { case NonFatal(e) => OrderedSerialization.CompareFailure(e) } final override def staticSize = serialization.staticSize diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/PositionInputStream.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/PositionInputStream.scala index 88737b4857..0b61fc44b9 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/PositionInputStream.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/PositionInputStream.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization import java.io.InputStream @@ -21,7 +21,7 @@ import JavaStreamEnrichments._ object PositionInputStream { def apply(in: InputStream): PositionInputStream = in match { case p: PositionInputStream => p - case nonPos => new PositionInputStream(nonPos) + case nonPos => new PositionInputStream(nonPos) } } @@ -32,7 +32,7 @@ class PositionInputStream(val wraps: InputStream) extends InputStream { override def available = wraps.available - override def close(): Unit = { wraps.close() } + override def close(): Unit = wraps.close() override def mark(limit: Int): Unit = { wraps.mark(limit) diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Reader.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Reader.scala index 0e1a4e6199..03804c7048 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Reader.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Reader.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization import java.io.InputStream @@ -20,9 +20,9 @@ import scala.reflect.ClassTag import scala.collection.generic.CanBuildFrom /** - * This is a specialized typeclass to make it easier to implement Serializations. - * The specialization *should* mean that there is no boxing and if the JIT - * does its work, Reader should compose well (via collections, Tuple2, Option, Either) + * This is a specialized typeclass to make it easier to implement Serializations. 
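// Sketch: the apply above never wraps twice -- an already position-tracking stream is returned
// as-is, so callers can wrap defensively before doing any record-boundary arithmetic on it.
import java.io.ByteArrayInputStream
import com.twitter.scalding.serialization.PositionInputStream

object PositionSketch {
  val raw    = new ByteArrayInputStream(Array[Byte](10, 20, 30))
  val pos    = PositionInputStream(raw)
  val reused = PositionInputStream(pos) eq pos // true: no second wrapper is created
}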
The specialization *should* + * mean that there is no boxing and if the JIT does its work, Reader should compose well (via collections, + * Tuple2, Option, Either) */ trait Reader[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) +T] { def read(is: InputStream): T @@ -31,8 +31,9 @@ trait Reader[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) +T] { object Reader { import JavaStreamEnrichments._ - def read[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T]( - is: InputStream)(implicit r: Reader[T]): T = r.read(is) + def read[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T](is: InputStream)(implicit + r: Reader[T] + ): T = r.read(is) /* * Instances below */ @@ -90,7 +91,8 @@ object Reader { def read(is: InputStream) = (r1.read(is), r2.read(is)) } - implicit def array[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T: Reader: ClassTag]: Reader[Array[T]] = + implicit def array[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T: Reader: ClassTag] + : Reader[Array[T]] = new Reader[Array[T]] { val readerT = implicitly[Reader[T]] def read(is: InputStream) = { diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Serialization.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Serialization.scala index 5930260999..7c9b8704e6 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Serialization.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Serialization.scala @@ -12,57 +12,53 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization -import java.io.{ ByteArrayInputStream, ByteArrayOutputStream, InputStream, OutputStream, Serializable } +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream, OutputStream, Serializable} -import scala.util.{ Success, Try } +import scala.util.{Success, Try} import scala.util.hashing.Hashing /** - * This is a base Input/OutputStream-based serialization typeclass - * This is useful for value serialization in hadoop when we don't - * need to do key sorting for partitioning. + * This is a base Input/OutputStream-based serialization typeclass This is useful for value serialization in + * hadoop when we don't need to do key sorting for partitioning. * - * This serialization typeclass must serialize equivalent objects - * identically to be lawful. Serialization should be the same - * on all JVMs run at any time, in other words, Serialization is a - * pure function. Given that constraint, we can always - * get an Equiv and Hashing from a Serialization (by doing byte-wise + * This serialization typeclass must serialize equivalent objects identically to be lawful. Serialization + * should be the same on all JVMs run at any time, in other words, Serialization is a pure function. Given + * that constraint, we can always get an Equiv and Hashing from a Serialization (by doing byte-wise * equivalence or byte-wise hashing). * - * A serialization always gives a hash because one can just - * serialize and then hash the bytes. You might prefer another - * implementation. This must satisfy: - * (!equiv(a, b)) || (hash(a) == hash(b)) + * A serialization always gives a hash because one can just serialize and then hash the bytes. 
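// Sketch: composing the Reader typeclass above by hand for an illustrative case class. Point is
// not part of the library; the Reader[Int] dependency is taken as a parameter so whichever
// primitive instance is in scope can be supplied. Field order must mirror the Writer side.
import java.io.InputStream
import com.twitter.scalding.serialization.Reader

object ReaderSketch {
  final case class Point(x: Int, y: Int)

  implicit def pointReader(implicit intR: Reader[Int]): Reader[Point] =
    new Reader[Point] {
      def read(is: InputStream): Point = {
        val x = intR.read(is)
        val y = intR.read(is)
        Point(x, y)
      }
    }
}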
You might + * prefer another implementation. This must satisfy: (!equiv(a, b)) || (hash(a) == hash(b)) */ trait Serialization[T] extends Equiv[T] with Hashing[T] with Serializable { def read(in: InputStream): Try[T] def write(out: OutputStream, t: T): Try[Unit] + /** - * If all items have a static size, this returns Some, else None - * NOTE: lawful implementations that return Some here much return - * Some on dynamicSize so callers don't need to check both when - * they have an instance. + * If all items have a static size, this returns Some, else None NOTE: lawful implementations that return + * Some here much return Some on dynamicSize so callers don't need to check both when they have an instance. */ def staticSize: Option[Int] + /** - * returns Some if the size is cheap to calculate. - * otherwise the caller should just serialize into an ByteArrayOutputStream + * returns Some if the size is cheap to calculate. otherwise the caller should just serialize into an + * ByteArrayOutputStream */ def dynamicSize(t: T): Option[Int] } /** - * In order to cache Serializations having equality and hashes can be useful. - * Extend this trait when those two properties can be satisfied + * In order to cache Serializations having equality and hashes can be useful. Extend this trait when those two + * properties can be satisfied */ trait EquivSerialization[T] extends Serialization[T] object Serialization { import JavaStreamEnrichments._ + /** * This is a constant for us to reuse in Serialization.write */ @@ -80,7 +76,7 @@ object Serialization { def write[T](out: OutputStream, t: T)(implicit ser: Serialization[T]): Try[Unit] = ser.write(out, t) - def toBytes[T](t: T)(implicit ser: Serialization[T]): Array[Byte] = { + def toBytes[T](t: T)(implicit ser: Serialization[T]): Array[Byte] = ser.dynamicSize(t) match { case None => val baos = new ByteArrayOutputStream @@ -94,7 +90,6 @@ object Serialization { write(os, t).get // this should only throw on OOM bytes } - } def fromBytes[T: Serialization](b: Array[Byte]): Try[T] = read(new ByteArrayInputStream(b)) @@ -117,57 +112,65 @@ object Serialization { /** * write followed by read should give an equivalent T * - * This is a law that serialization must follow. It is here for - * documentation and for use within tests without any dependence on - * specific test frameworks. + * This is a law that serialization must follow. It is here for documentation and for use within tests + * without any dependence on specific test frameworks. 
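// Sketch: the round-trip property described above, written against the companion helpers.
// toBytes uses dynamicSize when available and falls back to a growable buffer otherwise, so this
// predicate exercises whichever path the supplied instance takes.
import com.twitter.scalding.serialization.Serialization

object RoundTripSketch {
  def roundTrips[T](t: T)(implicit ser: Serialization[T]): Boolean =
    Serialization.fromBytes[T](Serialization.toBytes(t)).toOption.exists(back => ser.equiv(back, t))
}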
* * forAll(roundTripLaw[T]) in a valid test in scalacheck style */ def roundTripLaw[T: Serialization]: Law1[T] = - Law1("roundTrip", { (t: T) => equiv(roundTrip(t), t) }) + Law1("roundTrip", (t: T) => equiv(roundTrip(t), t)) /** * If two items are equal, they should serialize byte for byte equivalently */ def serializationIsEquivalence[T: Serialization]: Law2[T] = - Law2("equiv(a, b) == (write(a) == write(b))", { (t1: T, t2: T) => - equiv(t1, t2) == writeEquiv(t1, t2) - }) + Law2( + "equiv(a, b) == (write(a) == write(b))", + (t1: T, t2: T) => + equiv(t1, t2) == writeEquiv(t1, t2) + ) def hashCodeImpliesEquality[T: Serialization]: Law2[T] = - Law2("equiv(a, b) => hash(a) == hash(b)", { (t1: T, t2: T) => - !equiv(t1, t2) || (hash(t1) == hash(t2)) - }) + Law2( + "equiv(a, b) => hash(a) == hash(b)", + (t1: T, t2: T) => + !equiv(t1, t2) || (hash(t1) == hash(t2)) + ) def reflexivity[T: Serialization]: Law1[T] = - Law1("equiv(a, a) == true", { (t1: T) => equiv(t1, t1) }) + Law1("equiv(a, a) == true", (t1: T) => equiv(t1, t1)) /** * The sizes must match and be correct if they are present */ def sizeLaw[T: Serialization]: Law1[T] = - Law1("staticSize.orElse(dynamicSize(t)).map { _ == toBytes(t).length }", + Law1( + "staticSize.orElse(dynamicSize(t)).map { _ == toBytes(t).length }", { (t: T) => val ser = implicitly[Serialization[T]] (ser.staticSize, ser.dynamicSize(t)) match { case (Some(s), Some(d)) if d == s => toBytes(t).length == s - case (Some(s), _) => false // if static exists it must match dynamic - case (None, Some(d)) => toBytes(t).length == d - case (None, None) => true // can't tell + case (Some(s), _) => false // if static exists it must match dynamic + case (None, Some(d)) => toBytes(t).length == d + case (None, None) => true // can't tell } - }) + } + ) def transitivity[T: Serialization]: Law3[T] = - Law3("equiv(a, b) && equiv(b, c) => equiv(a, c)", - { (t1: T, t2: T, t3: T) => + Law3( + "equiv(a, b) && equiv(b, c) => equiv(a, c)", + (t1: T, t2: T, t3: T) => !(equiv(t1, t2) && equiv(t2, t3)) || equiv(t1, t3) - }) + ) def allLaws[T: Serialization]: Iterable[Law[T]] = - List[Law[T]](roundTripLaw, + List[Law[T]]( + roundTripLaw, serializationIsEquivalence, hashCodeImpliesEquality, reflexivity, sizeLaw, - transitivity) + transitivity + ) } diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Serialization2.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Serialization2.scala index 27e75a7a67..eab33ed520 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Serialization2.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Serialization2.scala @@ -12,15 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
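// Sketch: spot-checking a Serialization instance over sample values without any test framework,
// in the spirit of the laws above. It evaluates the underlying predicates directly rather than
// going through the Law values, since only their names appear in this hunk.
import com.twitter.scalding.serialization.Serialization

object LawSpotCheck {
  def check[T](samples: Seq[T])(implicit ser: Serialization[T]): Boolean = {
    val pairs = for (a <- samples; b <- samples) yield (a, b)
    val roundTrip =
      samples.forall(t => Serialization.fromBytes[T](Serialization.toBytes(t)).toOption.exists(ser.equiv(_, t)))
    val hashLaw   = pairs.forall { case (a, b) => !ser.equiv(a, b) || ser.hash(a) == ser.hash(b) }
    val reflexive = samples.forall(t => ser.equiv(t, t))
    roundTrip && hashLaw && reflexive
  }
}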
-*/ + */ package com.twitter.scalding.serialization -import java.io.{ InputStream, OutputStream } +import java.io.{InputStream, OutputStream} -import scala.util.{ Failure, Success, Try } +import scala.util.{Failure, Success, Try} -class Serialization2[A, B](val serA: Serialization[A], val serB: Serialization[B]) extends Serialization[(A, B)] { +class Serialization2[A, B](val serA: Serialization[A], val serB: Serialization[B]) + extends Serialization[(A, B)] { override def hash(x: (A, B)) = { import MurmurHashUtils._ val h1 = mixH1(seed, serA.hash(x._1)) @@ -35,8 +36,8 @@ class Serialization2[A, B](val serA: Serialization[A], val serB: Serialization[B val b = serB.read(in) (a, b) match { case (Success(a), Success(b)) => Success((a, b)) - case (Failure(e), _) => Failure(e) - case (_, Failure(e)) => Failure(e) + case (Failure(e), _) => Failure(e) + case (_, Failure(e)) => Failure(e) } } @@ -52,24 +53,28 @@ class Serialization2[A, B](val serA: Serialization[A], val serB: Serialization[B } yield a + b override def dynamicSize(t: (A, B)) = if (staticSize.isDefined) staticSize - else for { - a <- serA.dynamicSize(t._1) - b <- serB.dynamicSize(t._2) - } yield a + b + else + for { + a <- serA.dynamicSize(t._1) + b <- serB.dynamicSize(t._2) + } yield a + b } object OrderedSerialization2 { - def maybeOrderedSerialization2[A, B](implicit ordA: Ordering[A], ordB: Ordering[B]): Ordering[(A, B)] = { + def maybeOrderedSerialization2[A, B](implicit ordA: Ordering[A], ordB: Ordering[B]): Ordering[(A, B)] = (ordA, ordB) match { case (ordA: OrderedSerialization[_], ordB: OrderedSerialization[_]) => - new OrderedSerialization2(ordA.asInstanceOf[OrderedSerialization[A]], ordB.asInstanceOf[OrderedSerialization[B]]) + new OrderedSerialization2( + ordA.asInstanceOf[OrderedSerialization[A]], + ordB.asInstanceOf[OrderedSerialization[B]] + ) case _ => Ordering.Tuple2(ordA, ordB) } - } } -class OrderedSerialization2[A, B](val ordA: OrderedSerialization[A], - val ordB: OrderedSerialization[B]) extends Serialization2[A, B](ordA, ordB) with OrderedSerialization[(A, B)] { +class OrderedSerialization2[A, B](val ordA: OrderedSerialization[A], val ordB: OrderedSerialization[B]) + extends Serialization2[A, B](ordA, ordB) + with OrderedSerialization[(A, B)] { override def compare(x: (A, B), y: (A, B)) = { val ca = ordA.compare(x._1, y._1) if (ca != 0) ca @@ -81,9 +86,9 @@ class OrderedSerialization2[A, B](val ordA: OrderedSerialization[A], // we have to read the second ones to skip val cB = ordB.compareBinary(a, b) cA match { - case OrderedSerialization.Equal => cB + case OrderedSerialization.Equal => cB case f @ OrderedSerialization.CompareFailure(_) => f - case _ => cA // the first is not equal + case _ => cA // the first is not equal } } } diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/StringOrderedSerialization.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/StringOrderedSerialization.scala index 23ab371c4d..96e662bca6 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/StringOrderedSerialization.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/StringOrderedSerialization.scala @@ -12,38 +12,40 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
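// Sketch: composing two component serializations into a tuple serialization with the classes
// above. staticSize is only defined when both halves are static, and OrderedSerialization2
// consults the second component only when the first compares equal.
import com.twitter.scalding.serialization.{OrderedSerialization, OrderedSerialization2, Serialization, Serialization2}

object PairSketch {
  def pairSer[A, B](implicit sa: Serialization[A], sb: Serialization[B]): Serialization[(A, B)] =
    new Serialization2[A, B](sa, sb)

  def pairOrdSer[A, B](implicit oa: OrderedSerialization[A], ob: OrderedSerialization[B]): OrderedSerialization[(A, B)] =
    new OrderedSerialization2[A, B](oa, ob)
}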
-*/ + */ package com.twitter.scalding.serialization -import java.io.{ InputStream, OutputStream } -import scala.util.{ Failure, Success } +import java.io.{InputStream, OutputStream} +import scala.util.{Failure, Success} import scala.util.control.NonFatal import JavaStreamEnrichments._ object StringOrderedSerialization { - final def binaryIntCompare(leftSize: Int, seekingLeft: InputStream, rightSize: Int, seekingRight: InputStream): Int = { + final def binaryIntCompare( + leftSize: Int, + seekingLeft: InputStream, + rightSize: Int, + seekingRight: InputStream + ): Int = { /* - * This algorithm only works if count in {0, 1, 2, 3}. Since we only - * call it that way below it is safe. - */ + * This algorithm only works if count in {0, 1, 2, 3}. Since we only + * call it that way below it is safe. + */ @inline def compareBytes(count: Int): Int = if ((count & 2) == 2) { // there are 2 or 3 bytes to read - val cmp = Integer.compare(seekingLeft.readUnsignedShort, - seekingRight.readUnsignedShort) + val cmp = Integer.compare(seekingLeft.readUnsignedShort, seekingRight.readUnsignedShort) if (cmp != 0) cmp - else if (count == 3) Integer.compare(seekingLeft.readUnsignedByte, - seekingRight.readUnsignedByte) + else if (count == 3) Integer.compare(seekingLeft.readUnsignedByte, seekingRight.readUnsignedByte) else 0 } else { // there are 0 or 1 bytes to read if (count == 0) 0 - else Integer.compare(seekingLeft.readUnsignedByte, - seekingRight.readUnsignedByte) + else Integer.compare(seekingLeft.readUnsignedByte, seekingRight.readUnsignedByte) } /** @@ -98,17 +100,18 @@ class StringOrderedSerialization extends OrderedSerialization[String] { val leftStart = seekingLeft.position val rightStart = seekingRight.position - val res = OrderedSerialization.resultFrom(binaryIntCompare(leftSize, seekingLeft, rightSize, seekingRight)) + val res = + OrderedSerialization.resultFrom(binaryIntCompare(leftSize, seekingLeft, rightSize, seekingRight)) seekingLeft.seekToPosition(leftStart + leftSize) seekingRight.seekToPosition(rightStart + rightSize) res } catch { case NonFatal(e) => OrderedSerialization.CompareFailure(e) } + /** - * generally there is no way to see how big a utf-8 string is without serializing. - * We could scan looking for all ascii characters, but it's hard to see if - * we'd get the balance right. + * generally there is no way to see how big a utf-8 string is without serializing. We could scan looking for + * all ascii characters, but it's hard to see if we'd get the balance right. */ override def staticSize = None override def dynamicSize(s: String) = None diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/UnsignedComparisons.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/UnsignedComparisons.scala index 86c0839c7d..482be29185 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/UnsignedComparisons.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/UnsignedComparisons.scala @@ -12,25 +12,25 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
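// Sketch: the idea behind binaryIntCompare above, over plain arrays -- compare bytes as unsigned
// values, left to right, and fall back to the lengths on a common prefix. The real implementation
// reads unsigned shorts/bytes directly from the streams and then seeks both past their records.
object ByteCompareSketch {
  def unsignedLexCompare(a: Array[Byte], b: Array[Byte]): Int = {
    val n   = math.min(a.length, b.length)
    var i   = 0
    var cmp = 0
    while (cmp == 0 && i < n) {
      cmp = Integer.compare(a(i) & 0xff, b(i) & 0xff)
      i += 1
    }
    if (cmp != 0) cmp else Integer.compare(a.length, b.length)
  }
}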
-*/ + */ package com.twitter.scalding.serialization object UnsignedComparisons { - final def unsignedLongCompare(a: Long, b: Long): Int = if (a == b) 0 else { - val xor = (a ^ b) + final def unsignedLongCompare(a: Long, b: Long): Int = if (a == b) 0 + else { + val xor = a ^ b // If xor >= 0, then a and b are on the same side of zero if (xor >= 0L) java.lang.Long.compare(a, b) else if (b >= 0L) 1 else -1 } final def unsignedIntCompare(a: Int, b: Int): Int = - java.lang.Long.compare(a.toLong & 0xFFFFFFFFL, b.toLong & 0xFFFFFFFFL) + java.lang.Long.compare(a.toLong & 0xffffffffL, b.toLong & 0xffffffffL) final def unsignedShortCompare(a: Short, b: Short): Int = - Integer.compare(a & 0xFFFF, b & 0xFFFF) + Integer.compare(a & 0xffff, b & 0xffff) final def unsignedByteCompare(a: Byte, b: Byte): Int = - Integer.compare(a & 0xFF, b & 0xFF) + Integer.compare(a & 0xff, b & 0xff) } - diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Writer.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Writer.scala index c99a4134dc..8ab4d1138a 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Writer.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Writer.scala @@ -12,15 +12,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization import java.io.OutputStream /** - * This is a specialized typeclass to make it easier to implement Serializations. - * The specialization *should* mean that there is no boxing and if the JIT - * does its work, Writer should compose well (via collections, Tuple2, Option, Either) + * This is a specialized typeclass to make it easier to implement Serializations. 
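// Sketch: why the masking above matters. As a signed byte 0x80 is negative and would sort below
// 0x7f, but as an unsigned wire byte it is 128 and must sort above 127.
import com.twitter.scalding.serialization.UnsignedComparisons

object UnsignedSketch {
  val signedView   = java.lang.Byte.compare(0x7f.toByte, 0x80.toByte)                  // positive: 127 > -128
  val unsignedView = UnsignedComparisons.unsignedByteCompare(0x7f.toByte, 0x80.toByte) // negative: 127 < 128
}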
The specialization *should* + * mean that there is no boxing and if the JIT does its work, Writer should compose well (via collections, + * Tuple2, Option, Either) */ trait Writer[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) -T] { def write(os: OutputStream, t: T): Unit @@ -29,8 +29,9 @@ trait Writer[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) -T] { object Writer { import JavaStreamEnrichments._ - def write[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T](os: OutputStream, - t: T)(implicit w: Writer[T]): Unit = + def write[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T](os: OutputStream, t: T)(implicit + w: Writer[T] + ): Unit = w.write(os, t) /* * Instances below @@ -100,7 +101,8 @@ object Writer { } } - implicit def array[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T: Writer]: Writer[Array[T]] = + implicit def array[@specialized(Boolean, Byte, Short, Int, Long, Float, Double) T: Writer] + : Writer[Array[T]] = new Writer[Array[T]] { val writerT = implicitly[Writer[T]] def write(os: OutputStream, a: Array[T]) = { @@ -127,4 +129,3 @@ object Writer { } } } - diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/BinaryOrdering.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/BinaryOrdering.scala index 5afa9d88b7..297256f81a 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/BinaryOrdering.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/BinaryOrdering.scala @@ -5,7 +5,8 @@ import com.twitter.scalding.serialization.OrderedSerialization import scala.language.experimental.macros trait BinaryOrdering { - implicit def ordSer[T]: OrderedSerialization[T] = macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] + implicit def ordSer[T]: OrderedSerialization[T] = + macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] } object BinaryOrdering extends BinaryOrdering diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/OrderedBufferableProviderImpl.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/OrderedBufferableProviderImpl.scala index 82927b80b8..cabd6528fd 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/OrderedBufferableProviderImpl.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/OrderedBufferableProviderImpl.scala @@ -23,12 +23,14 @@ import com.twitter.scalding.serialization.macros.impl.ordered_serialization.prov object OrderedSerializationProviderImpl { def normalizedDispatcher(c: Context)( - buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { case tpe if !(tpe.normalize == tpe) => buildDispatcher(tpe.normalize) } def scaldingBasicDispatchers(c: Context)( - buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { val primitiveDispatcher = PrimitiveOrderedBuf.dispatch(c) val optionDispatcher = 
OptionOrderedBuf.dispatch(c)(buildDispatcher) @@ -65,7 +67,7 @@ object OrderedSerializationProviderImpl { import c.universe._ scaldingBasicDispatchers(c)(OrderedSerializationProviderImpl.innerDispatcher(c)).orElse { case tpe: Type => - c.abort(c.enclosingPosition, s"""Unable to find OrderedSerialization for type ${tpe}""") + c.abort(c.enclosingPosition, s"""Unable to find OrderedSerialization for type $tpe""") } } diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/ProductLike.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/ProductLike.scala index f246d8547f..7ed5b0ed9c 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/ProductLike.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/ProductLike.scala @@ -19,17 +19,17 @@ import scala.reflect.macros.blackbox.Context object ProductLike { def compareBinary(c: Context)(inputStreamA: c.TermName, inputStreamB: c.TermName)( - elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])]): c.Tree = { + elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])] + ): c.Tree = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) elementData - .foldLeft(Option.empty[Tree]) { - case (existingTreeOpt, (tpe, accessorSymbol, tBuf)) => - existingTreeOpt match { - case Some(t) => - val lastCmp = freshT("lastCmp") - Some(q""" + .foldLeft(Option.empty[Tree]) { case (existingTreeOpt, (tpe, accessorSymbol, tBuf)) => + existingTreeOpt match { + case Some(t) => + val lastCmp = freshT("lastCmp") + Some(q""" val $lastCmp = $t if($lastCmp != 0) { $lastCmp @@ -37,48 +37,46 @@ object ProductLike { ${tBuf.compareBinary(inputStreamA, inputStreamB)} } """) - case None => - Some(tBuf.compareBinary(inputStreamA, inputStreamB)) - } + case None => + Some(tBuf.compareBinary(inputStreamA, inputStreamB)) + } } .getOrElse(q"0") } - def hash(c: Context)(element: c.TermName)( - elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])]): c.Tree = { + def hash(c: Context)( + element: c.TermName + )(elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])]): c.Tree = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) val currentHash = freshT("last") - val hashUpdates = elementData.map { - case (tpe, accessorSymbol, tBuf) => - val target = freshT("target") - q""" + val hashUpdates = elementData.map { case (tpe, accessorSymbol, tBuf) => + val target = freshT("target") + q""" val $target = $element.$accessorSymbol - $currentHash = _root_.com.twitter.scalding.serialization.MurmurHashUtils.mixH1($currentHash, ${ - tBuf - .hash(target) - }) + $currentHash = _root_.com.twitter.scalding.serialization.MurmurHashUtils.mixH1($currentHash, ${tBuf + .hash(target)}) """ } q""" var $currentHash: _root_.scala.Int = _root_.com.twitter.scalding.serialization.MurmurHashUtils.seed - ..${hashUpdates} + ..$hashUpdates _root_.com.twitter.scalding.serialization.MurmurHashUtils.fmix($currentHash, ${elementData.size}) """ } def put(c: Context)(inputStream: c.TermName, element: c.TermName)( - elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])]): c.Tree = { + elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])] + ): c.Tree = { import c.universe._ def 
freshT(id: String) = TermName(c.freshName(id)) val innerElement = freshT("innerElement") - elementData.foldLeft(q"") { - case (existingTree, (tpe, accessorSymbol, tBuf)) => - q""" + elementData.foldLeft(q"") { case (existingTree, (tpe, accessorSymbol, tBuf)) => + q""" $existingTree val $innerElement = $element.$accessorSymbol ${tBuf.put(inputStream, innerElement)} @@ -87,37 +85,42 @@ object ProductLike { } def length(c: Context)(element: c.Tree)( - elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])]): CompileTimeLengthTypes[c.type] = { + elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])] + ): CompileTimeLengthTypes[c.type] = { import c.universe._ import CompileTimeLengthTypes._ val (constSize, dynamicFunctions, maybeLength, noLength) = elementData.foldLeft((0, Vector[c.Tree](), Vector[c.Tree](), 0)) { - case ((constantLength, dynamicLength, maybeLength, noLength), - (tpe, accessorSymbol, tBuf)) => + case ((constantLength, dynamicLength, maybeLength, noLength), (tpe, accessorSymbol, tBuf)) => tBuf.length(q"$element.$accessorSymbol") match { case const: ConstantLengthCalculation[_] => - (constantLength + const.asInstanceOf[ConstantLengthCalculation[c.type]].toInt, + ( + constantLength + const.asInstanceOf[ConstantLengthCalculation[c.type]].toInt, dynamicLength, maybeLength, - noLength) + noLength + ) case f: FastLengthCalculation[_] => - (constantLength, + ( + constantLength, dynamicLength :+ f.asInstanceOf[FastLengthCalculation[c.type]].t, maybeLength, - noLength) + noLength + ) case m: MaybeLengthCalculation[_] => - (constantLength, + ( + constantLength, dynamicLength, maybeLength :+ m.asInstanceOf[MaybeLengthCalculation[c.type]].t, - noLength) + noLength + ) case _: NoLengthCalculationAvailable[_] => (constantLength, dynamicLength, maybeLength, noLength + 1) } } - val combinedDynamic = dynamicFunctions.foldLeft(q"""$constSize""") { - case (prev, t) => - q"$prev + $t" + val combinedDynamic = dynamicFunctions.foldLeft(q"""$constSize""") { case (prev, t) => + q"$prev + $t" } if (noLength > 0) { @@ -157,7 +160,8 @@ object ProductLike { } def compare(c: Context)(elementA: c.TermName, elementB: c.TermName)( - elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])]): c.Tree = { + elementData: List[(c.universe.Type, c.universe.TermName, TreeOrderedBuf[c.type])] + ): c.Tree = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) @@ -166,23 +170,21 @@ object ProductLike { val innerElementB = freshT("innerElementB") elementData - .map { - case (tpe, accessorSymbol, tBuf) => - val curCmp = freshT("curCmp") - val cmpTree = q""" + .map { case (tpe, accessorSymbol, tBuf) => + val curCmp = freshT("curCmp") + val cmpTree = q""" val $curCmp: _root_.scala.Int = { val $innerElementA = $elementA.$accessorSymbol val $innerElementB = $elementB.$accessorSymbol ${tBuf.compare(innerElementA, innerElementB)} } """ - (cmpTree, curCmp) + (cmpTree, curCmp) } .reverse // go through last to first .foldLeft(None: Option[Tree]) { case (Some(rest), (tree, valname)) => - Some( - q"""$tree; + Some(q"""$tree; if ($valname != 0) $valname else { $rest diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/SealedTraitLike.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/SealedTraitLike.scala index 8f2ffb4d94..3e8c04fb83 100644 --- 
a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/SealedTraitLike.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/SealedTraitLike.scala @@ -17,23 +17,22 @@ package com.twitter.scalding.serialization.macros.impl.ordered_serialization import scala.reflect.macros.blackbox.Context -import com.twitter.scalding.serialization.Hasher.int.{ hash => intHash } +import com.twitter.scalding.serialization.Hasher.int.{hash => intHash} object SealedTraitLike { /** - * Compare Binary for generating similar types of binary comparasion code - * Args: - * inputStreamA: should contain the variable name that has the input stream A bound to - * inputStreamB: should contain the variable name that has the input stream B bound to - * subData: Its a list of the sub components of this sealed trait, for each one - * we include an index of this sub type, the clase class/type of this sub type, - * and finally a means to compare two instances of this type. + * Compare Binary for generating similar types of binary comparasion code Args: inputStreamA: should contain + * the variable name that has the input stream A bound to inputStreamB: should contain the variable name + * that has the input stream B bound to subData: Its a list of the sub components of this sealed trait, for + * each one we include an index of this sub type, the clase class/type of this sub type, and finally a means + * to compare two instances of this type. */ // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) def compareBinary(c: Context)(inputStreamA: c.TermName, inputStreamB: c.TermName)( - subData: List[(Int, c.Type, TreeOrderedBuf[c.type])]): c.Tree = { + subData: List[(Int, c.Type, TreeOrderedBuf[c.type])] + ): c.Tree = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) val valueA = freshT("valueA") @@ -41,28 +40,28 @@ object SealedTraitLike { val idxCmp = freshT("idxCmp") val compareSameTypes: Tree = subData - .foldLeft(Option.empty[Tree]) { - case (existing, (idx, tpe, tBuf)) => - val commonCmp: Tree = tBuf.compareBinary(inputStreamA, inputStreamB) + .foldLeft(Option.empty[Tree]) { case (existing, (idx, tpe, tBuf)) => + val commonCmp: Tree = tBuf.compareBinary(inputStreamA, inputStreamB) - existing match { - case Some(t) => - Some(q""" + existing match { + case Some(t) => + Some(q""" if($valueA == $idx) { $commonCmp } else { $t } """) - case None => - Some(q""" + case None => + Some(q""" if($valueA == $idx) { $commonCmp } else { sys.error("unreachable code -- this could only be reached by corruption in serialization.") }""") - } - }.get // linter:ignore:wartermover:OptionPartial + } + } + .get // linter:ignore:wartermover:OptionPartial q""" val $valueA: Int = $inputStreamA.readByte.toInt @@ -79,38 +78,36 @@ object SealedTraitLike { // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - def hash(c: Context)(element: c.TermName)( - subData: List[(Int, c.Type, TreeOrderedBuf[c.type])]): c.Tree = { + def hash(c: Context)(element: c.TermName)(subData: List[(Int, c.Type, TreeOrderedBuf[c.type])]): c.Tree = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) subData - .foldLeft(Option.empty[Tree]) { - case (optiExisting, (idx, tpe, tBuf)) => - val innerArg = freshT("innerArg") - val elementHash: Tree 
= q""" + .foldLeft(Option.empty[Tree]) { case (optiExisting, (idx, tpe, tBuf)) => + val innerArg = freshT("innerArg") + val elementHash: Tree = q""" val $innerArg: $tpe = $element.asInstanceOf[$tpe] ${tBuf.hash(innerArg)} """ - optiExisting match { - case Some(s) => - Some(q""" + optiExisting match { + case Some(s) => + Some(q""" if($element.isInstanceOf[$tpe]) { $elementHash ^ ${intHash(idx)} } else { $s } """) - case None => - Some(q""" + case None => + Some(q""" if($element.isInstanceOf[$tpe]) { $elementHash ^ ${intHash(idx)} } else { _root_.scala.Int.MaxValue } """) - } + } } .get } @@ -118,21 +115,21 @@ object SealedTraitLike { // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) def put(c: Context)(inputStream: c.TermName, element: c.TermName)( - subData: List[(Int, c.Type, TreeOrderedBuf[c.type])]): c.Tree = { + subData: List[(Int, c.Type, TreeOrderedBuf[c.type])] + ): c.Tree = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) val innerArg = freshT("innerArg") subData - .foldLeft(Option.empty[Tree]) { - case (optiExisting, (idx, tpe, tBuf)) => - val commonPut: Tree = q"""val $innerArg: $tpe = $element.asInstanceOf[$tpe] + .foldLeft(Option.empty[Tree]) { case (optiExisting, (idx, tpe, tBuf)) => + val commonPut: Tree = q"""val $innerArg: $tpe = $element.asInstanceOf[$tpe] ${tBuf.put(inputStream, innerArg)} """ - optiExisting match { - case Some(s) => - Some(q""" + optiExisting match { + case Some(s) => + Some(q""" if($element.isInstanceOf[$tpe]) { $inputStream.writeByte($idx.toByte) $commonPut @@ -140,49 +137,47 @@ object SealedTraitLike { $s } """) - case None => - Some(q""" + case None => + Some(q""" if($element.isInstanceOf[$tpe]) { $inputStream.writeByte($idx.toByte) $commonPut } """) - } + } } .get } // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial", "org.wartremover.warts.Return")) - def length(c: Context)(element: c.Tree)( - subData: List[(Int, c.Type, TreeOrderedBuf[c.type])]): CompileTimeLengthTypes[c.type] = { + def length( + c: Context + )(element: c.Tree)(subData: List[(Int, c.Type, TreeOrderedBuf[c.type])]): CompileTimeLengthTypes[c.type] = { import CompileTimeLengthTypes._ import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) val prevSizeData = subData - .foldLeft(Option.empty[Tree]) { - case (optiTree, (idx, tpe, tBuf)) => - val baseLenT: Tree = tBuf.length(q"$element.asInstanceOf[$tpe]") match { - case m: MaybeLengthCalculation[_] => - m.asInstanceOf[MaybeLengthCalculation[c.type]].t - - case f: FastLengthCalculation[_] => - q"""_root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.DynamicLen(${ - f - .asInstanceOf[FastLengthCalculation[c.type]] - .t - })""" - - case _: NoLengthCalculationAvailable[_] => - return NoLengthCalculationAvailable(c) - case const: ConstantLengthCalculation[_] => - q"""_root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.ConstLen(${const.toInt})""" - case e => sys.error("unexpected input to union length code of " + e) - } - val tmpPreLen = freshT("tmpPreLen") + .foldLeft(Option.empty[Tree]) { case (optiTree, (idx, tpe, tBuf)) => + val baseLenT: Tree = tBuf.length(q"$element.asInstanceOf[$tpe]") match { + case m: MaybeLengthCalculation[_] => + m.asInstanceOf[MaybeLengthCalculation[c.type]].t + + case f: FastLengthCalculation[_] => + 
q"""_root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.DynamicLen(${f + .asInstanceOf[FastLengthCalculation[c.type]] + .t})""" + + case _: NoLengthCalculationAvailable[_] => + return NoLengthCalculationAvailable(c) + case const: ConstantLengthCalculation[_] => + q"""_root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.ConstLen(${const.toInt})""" + case e => sys.error("unexpected input to union length code of " + e) + } + val tmpPreLen = freshT("tmpPreLen") - val lenT = q""" + val lenT = q""" val $tmpPreLen: _root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.MaybeLength = $baseLenT ($tmpPreLen match { @@ -194,23 +189,23 @@ object SealedTraitLike { _root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.NoLengthCalculation }): _root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.MaybeLength """ - optiTree match { - case Some(t) => - Some(q""" + optiTree match { + case Some(t) => + Some(q""" if($element.isInstanceOf[$tpe]) { $lenT } else { $t } """) - case None => - Some(q""" + case None => + Some(q""" if($element.isInstanceOf[$tpe]) { $lenT } else { sys.error("Unreachable code, did not match sealed trait type") }""") - } + } } .get @@ -219,36 +214,36 @@ object SealedTraitLike { // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - def get(c: Context)(inputStream: c.TermName)( - subData: List[(Int, c.Type, TreeOrderedBuf[c.type])]): c.Tree = { + def get( + c: Context + )(inputStream: c.TermName)(subData: List[(Int, c.Type, TreeOrderedBuf[c.type])]): c.Tree = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) val valueA = freshT("valueA") val expandedOut = subData - .foldLeft(Option.empty[Tree]) { - case (existing, (idx, tpe, tBuf)) => - val extract = q"${tBuf.get(inputStream)}" + .foldLeft(Option.empty[Tree]) { case (existing, (idx, tpe, tBuf)) => + val extract = q"${tBuf.get(inputStream)}" - existing match { - case Some(t) => - Some(q""" + existing match { + case Some(t) => + Some(q""" if($valueA == $idx) { $extract : $tpe } else { $t } """) - case None => - Some(q""" + case None => + Some(q""" if($valueA == $idx) { $extract } else { sys.error("Did not understand sealed trait with idx: " + $valueA + ", this should only happen in a serialization failure.") } """) - } + } } .get @@ -261,7 +256,8 @@ object SealedTraitLike { // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) def compare(c: Context)(cmpType: c.Type, elementA: c.TermName, elementB: c.TermName)( - subData: List[(Int, c.Type, TreeOrderedBuf[c.type])]): c.Tree = { + subData: List[(Int, c.Type, TreeOrderedBuf[c.type])] + ): c.Tree = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) @@ -272,25 +268,24 @@ object SealedTraitLike { val idxB = freshT("idxB") val toIdOpt: Tree = subData - .foldLeft(Option.empty[Tree]) { - case (existing, (idx, tpe, _)) => - existing match { - case Some(t) => - Some(q""" + .foldLeft(Option.empty[Tree]) { case (existing, (idx, tpe, _)) => + existing match { + case Some(t) => + Some(q""" if($arg.isInstanceOf[$tpe]) { $idx } else { $t } """) - case None => - Some(q""" + case None => + Some(q""" if($arg.isInstanceOf[$tpe]) { $idx } else { sys.error("This should be unreachable code, failure in 
serializer or deserializer to reach here.") }""") - } + } } .get @@ -327,7 +322,7 @@ object SealedTraitLike { val compareFn = q""" def instanceToIdx($arg: $cmpType): Int = { - ${toIdOpt}: Int + $toIdOpt: Int } val $idxA: Int = instanceToIdx($elementA) diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/TreeOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/TreeOrderedBuf.scala index 231c9c6d28..d06c4a85bc 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/TreeOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/TreeOrderedBuf.scala @@ -27,45 +27,42 @@ object CommonCompareBinary { val minSizeForFulBinaryCompare = 24 /** - * This method will compare two InputStreams of given lengths - * If the inputsteam supports mark/reset (such as those backed by Array[Byte]), - * and the lengths are equal and longer than minSizeForFulBinaryCompare we first - * check if they are byte-for-byte identical, which is a cheap way to avoid doing - * potentially complex logic in binary comparators + * This method will compare two InputStreams of given lengths If the inputsteam supports mark/reset (such as + * those backed by Array[Byte]), and the lengths are equal and longer than minSizeForFulBinaryCompare we + * first check if they are byte-for-byte identical, which is a cheap way to avoid doing potentially complex + * logic in binary comparators */ @SuppressWarnings(Array("org.wartremover.warts.Return")) - final def earlyEqual(inputStreamA: InputStream, - lenA: Int, - inputStreamB: InputStream, - lenB: Int): Boolean = + final def earlyEqual(inputStreamA: InputStream, lenA: Int, inputStreamB: InputStream, lenB: Int): Boolean = (lenA > minSizeForFulBinaryCompare && (lenA == lenB) && inputStreamA.markSupported && inputStreamB.markSupported) && { - inputStreamA.mark(lenA) - inputStreamB.mark(lenB) - - var pos: Int = 0 - while (pos < lenA) { - val a = inputStreamA.read - val b = inputStreamB.read - pos += 1 - if (a != b) { - inputStreamA.reset() - inputStreamB.reset() - // yeah, return sucks, but trying to optimize here - return false - } - else if (a < 0) return JavaStreamEnrichments.eof - // a == b, but may be eof - } - // we consumed all the bytes, and they were all equal - true + inputStreamA.mark(lenA) + inputStreamB.mark(lenB) + + var pos: Int = 0 + while (pos < lenA) { + val a = inputStreamA.read + val b = inputStreamB.read + pos += 1 + if (a != b) { + inputStreamA.reset() + inputStreamB.reset() + // yeah, return sucks, but trying to optimize here + return false + } else if (a < 0) return JavaStreamEnrichments.eof + // a == b, but may be eof } + // we consumed all the bytes, and they were all equal + true + } } object TreeOrderedBuf { import CompileTimeLengthTypes._ - def toOrderedSerialization[T](c: Context)(t: TreeOrderedBuf[c.type])(implicit T: t.ctx.WeakTypeTag[T]): t.ctx.Expr[OrderedSerialization[T]] = { + def toOrderedSerialization[T]( + c: Context + )(t: TreeOrderedBuf[c.type])(implicit T: t.ctx.WeakTypeTag[T]): t.ctx.Expr[OrderedSerialization[T]] = { import t.ctx.universe._ def freshT(id: String) = TermName(c.freshName(s"fresh_$id")) val outputLength = freshT("outputLength") @@ -74,29 +71,35 @@ object TreeOrderedBuf { val element = freshT("element") val fnBodyOpt = t.length(q"$element") match { - case _: 
NoLengthCalculationAvailable[_] => None + case _: NoLengthCalculationAvailable[_] => None case const: ConstantLengthCalculation[_] => None - case f: FastLengthCalculation[_] => Some(q""" - _root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.DynamicLen(${f.asInstanceOf[FastLengthCalculation[c.type]].t}) + case f: FastLengthCalculation[_] => + Some(q""" + _root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.DynamicLen(${f + .asInstanceOf[FastLengthCalculation[c.type]] + .t}) """) case m: MaybeLengthCalculation[_] => Some(m.asInstanceOf[MaybeLengthCalculation[c.type]].t) } - fnBodyOpt.map { fnBody => - q""" + fnBodyOpt + .map { fnBody => + q""" private[this] def payloadLength($element: $T): _root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.MaybeLength = { lengthCalculationAttempts += 1 $fnBody } """ - }.getOrElse(q"()") + } + .getOrElse(q"()") } def binaryLengthGen(typeName: Tree): (Tree, Tree) = { val tempLen = freshT("tempLen") val lensLen = freshT("lensLen") val element = freshT("element") - val callDynamic = (q"""override def staticSize: _root_.scala.Option[_root_.scala.Int] = _root_.scala.None""", + val callDynamic = ( + q"""override def staticSize: _root_.scala.Option[_root_.scala.Int] = _root_.scala.None""", q""" override def dynamicSize($element: $typeName): _root_.scala.Option[_root_.scala.Int] = { @@ -116,16 +119,25 @@ object TreeOrderedBuf { } else _root_.scala.None): _root_.scala.Option[_root_.scala.Int] } } - """) + """ + ) t.length(q"$element") match { - case _: NoLengthCalculationAvailable[_] => (q""" - override def staticSize: _root_.scala.Option[_root_.scala.Int] = _root_.scala.None""", q""" - override def dynamicSize($element: $typeName): _root_.scala.Option[_root_.scala.Int] = _root_.scala.None""") - case const: ConstantLengthCalculation[_] => (q""" - override val staticSize: _root_.scala.Option[_root_.scala.Int] = _root_.scala.Some(${const.toInt})""", q""" - override def dynamicSize($element: $typeName): _root_.scala.Option[_root_.scala.Int] = staticSize""") - case f: FastLengthCalculation[_] => callDynamic + case _: NoLengthCalculationAvailable[_] => + ( + q""" + override def staticSize: _root_.scala.Option[_root_.scala.Int] = _root_.scala.None""", + q""" + override def dynamicSize($element: $typeName): _root_.scala.Option[_root_.scala.Int] = _root_.scala.None""" + ) + case const: ConstantLengthCalculation[_] => + ( + q""" + override val staticSize: _root_.scala.Option[_root_.scala.Int] = _root_.scala.Some(${const.toInt})""", + q""" + override def dynamicSize($element: $typeName): _root_.scala.Option[_root_.scala.Int] = staticSize""" + ) + case f: FastLengthCalculation[_] => callDynamic case m: MaybeLengthCalculation[_] => callDynamic } } @@ -137,9 +149,8 @@ object TreeOrderedBuf { val len = freshT("len") /** - * This is the worst case: we have to serialize in a side buffer - * and then see how large it actually is. This happens for cases, like - * string, where the cost to see the serialized size is not cheaper than + * This is the worst case: we have to serialize in a side buffer and then see how large it actually is. + * This happens for cases, like string, where the cost to see the serialized size is not cheaper than * directly serializing. 
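// Sketch: the worst-case path described above, written outside the macro -- when a value cannot
// report its size cheaply, serialize it into a side buffer first, then emit the learned length
// followed by the buffered bytes. A plain int prefix stands in for the posVarInt the generated
// code uses.
import java.io.{ByteArrayOutputStream, DataOutputStream, OutputStream}

object SideBufferSketch {
  def writeWithLength(out: DataOutputStream)(writePayload: OutputStream => Unit): Unit = {
    val side = new ByteArrayOutputStream()
    writePayload(side)      // pay for a full serialization just to learn the size
    out.writeInt(side.size) // length prefix
    side.writeTo(out)       // then copy the buffered bytes through
  }
}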
*/ q""" @@ -157,9 +168,10 @@ object TreeOrderedBuf { def putFnGen(outerbaos: TermName, element: TermName) = { val oldPos = freshT("oldPos") val len = freshT("len") + /** - * This is the case where the length is cheap to compute, either - * constant or easily computable from an instance. + * This is the case where the length is cheap to compute, either constant or easily computable from an + * instance. */ def withLenCalc(lenC: Tree) = q""" val $len = $lenC @@ -194,24 +206,21 @@ object TreeOrderedBuf { } } - def readLength(inputStream: TermName) = { + def readLength(inputStream: TermName) = t.length(q"e") match { case const: ConstantLengthCalculation[_] => q"${const.toInt}" - case _ => q"$inputStream.readPosVarInt" + case _ => q"$inputStream.readPosVarInt" } - } - def discardLength(inputStream: TermName) = { + def discardLength(inputStream: TermName) = t.length(q"e") match { case const: ConstantLengthCalculation[_] => q"()" - case _ => q"$inputStream.readPosVarInt" + case _ => q"$inputStream.readPosVarInt" } - } - val lazyVariables = t.lazyOuterVariables.map { - case (n, t) => - val termName = TermName(n) - q"""lazy val $termName = $t""" + val lazyVariables = t.lazyOuterVariables.map { case (n, t) => + val termName = TermName(n) + q"""lazy val $termName = $t""" } val element = freshT("element") diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/CaseClassOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/CaseClassOrderedBuf.scala index a9a81d66f5..f3c3733e6f 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/CaseClassOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/CaseClassOrderedBuf.scala @@ -17,20 +17,22 @@ package com.twitter.scalding.serialization.macros.impl.ordered_serialization.pro import scala.reflect.macros.blackbox.Context -import com.twitter.scalding.serialization.macros.impl.ordered_serialization.{ - ProductLike, - TreeOrderedBuf -} +import com.twitter.scalding.serialization.macros.impl.ordered_serialization.{ProductLike, TreeOrderedBuf} @SuppressWarnings(Array("org.wartremover.warts.MergeMaps")) object CaseClassOrderedBuf { - def dispatch(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { - case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass && !tpe.typeSymbol.asClass.isModuleClass => + def dispatch(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + case tpe + if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass && !tpe.typeSymbol.asClass.isModuleClass => CaseClassOrderedBuf(c)(buildDispatcher, tpe) } - def apply(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], - outerType: c.Type): TreeOrderedBuf[c.type] = { + def apply(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], + outerType: c.Type + ): TreeOrderedBuf[c.type] = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) @@ -60,15 +62,14 @@ object CaseClassOrderedBuf { override def get(inputStream: ctx.TermName): ctx.Tree = { - val getValProcessor = elementData.map { - case (tpe, accessorSymbol, tBuf) => - val 
curR = freshT("curR") - val builderTree = q""" + val getValProcessor = elementData.map { case (tpe, accessorSymbol, tBuf) => + val curR = freshT("curR") + val builderTree = q""" val $curR: ${tBuf.tpe} = { ${tBuf.get(inputStream)} } """ - (builderTree, curR) + (builderTree, curR) } q""" diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/CaseObjectOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/CaseObjectOrderedBuf.scala index e49891235e..bfa9a6be7f 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/CaseObjectOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/CaseObjectOrderedBuf.scala @@ -25,7 +25,8 @@ import CompileTimeLengthTypes._ object CaseObjectOrderedBuf { def dispatch(c: Context)(): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { - case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass && tpe.typeSymbol.asClass.isModuleClass && !tpe.typeConstructor.takesTypeArgs => + case tpe + if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass && tpe.typeSymbol.asClass.isModuleClass && !tpe.typeConstructor.takesTypeArgs => CaseObjectOrderedBuf(c)(tpe) } diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/EitherOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/EitherOrderedBuf.scala index d70e88a49c..53543b7cd9 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/EitherOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/EitherOrderedBuf.scala @@ -24,13 +24,17 @@ import com.twitter.scalding.serialization.macros.impl.ordered_serialization.{ import CompileTimeLengthTypes._ object EitherOrderedBuf { - def dispatch(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + def dispatch(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { case tpe if tpe.erasure =:= c.universe.typeOf[Either[Any, Any]] => EitherOrderedBuf(c)(buildDispatcher, tpe) } - def apply(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], - outerType: c.Type): TreeOrderedBuf[c.type] = { + def apply(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], + outerType: c.Type + ): TreeOrderedBuf[c.type] = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) val dispatcher = buildDispatcher @@ -83,7 +87,8 @@ object EitherOrderedBuf { val tmpGetHolder = freshT("tmpGetHolder") q""" val $tmpGetHolder: _root_.scala.Byte = $inputStreamA.readByte - if($tmpGetHolder == (0: _root_.scala.Byte)) _root_.scala.util.Left[${leftBuf.tpe}, ${rightBuf.tpe}](${leftBuf.get(inputStreamA)}) + if($tmpGetHolder == (0: _root_.scala.Byte)) _root_.scala.util.Left[${leftBuf.tpe}, ${rightBuf.tpe}](${leftBuf + .get(inputStreamA)}) else _root_.scala.util.Right[${leftBuf.tpe}, ${rightBuf.tpe}](${rightBuf.get(inputStreamA)}) """ } @@ -151,7 +156,8 
@@ object EitherOrderedBuf { q"""_root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.DynamicLen""" (leftBuf.length(q"$element.left.get"), rightBuf.length(q"$element.right.get")) match { - case (lconst: ConstantLengthCalculation[_], rconst: ConstantLengthCalculation[_]) if lconst.toInt == rconst.toInt => + case (lconst: ConstantLengthCalculation[_], rconst: ConstantLengthCalculation[_]) + if lconst.toInt == rconst.toInt => // We got lucky, they are the same size: ConstantLengthCalculation(c)(1 + rconst.toInt) case (_: NoLengthCalculationAvailable[_], _) => NoLengthCalculationAvailable(c) diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/ImplicitOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/ImplicitOrderedBuf.scala index e357b27c22..e8de0e82ca 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/ImplicitOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/ImplicitOrderedBuf.scala @@ -26,8 +26,8 @@ import com.twitter.scalding.serialization.macros.impl.ordered_serialization._ object ImplicitOrderedBuf { def dispatch(c: Context): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { - val pf: PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { - case tpe => ImplicitOrderedBuf(c)(tpe) + val pf: PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { case tpe => + ImplicitOrderedBuf(c)(tpe) } pf } @@ -41,7 +41,7 @@ object ImplicitOrderedBuf { val variableName = TermName(variableNameStr) val implicitInstanciator = q""" - implicitly[_root_.com.twitter.scalding.serialization.OrderedSerialization[${outerType}]]""" + implicitly[_root_.com.twitter.scalding.serialization.OrderedSerialization[$outerType]]""" new TreeOrderedBuf[c.type] { override val ctx: c.type = c diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/OptionOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/OptionOrderedBuf.scala index f452fd129a..cab77f8825 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/OptionOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/OptionOrderedBuf.scala @@ -24,13 +24,17 @@ import com.twitter.scalding.serialization.macros.impl.ordered_serialization.{ import CompileTimeLengthTypes._ object OptionOrderedBuf { - def dispatch(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + def dispatch(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { case tpe if tpe.erasure =:= c.universe.typeOf[Option[Any]] => OptionOrderedBuf(c)(buildDispatcher, tpe) } - def apply(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], - outerType: c.Type): TreeOrderedBuf[c.type] = { + def apply(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], + outerType: c.Type + ): TreeOrderedBuf[c.type] = { import c.universe._ def 
freshT(id: String) = TermName(c.freshName(id)) val dispatcher = buildDispatcher diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/PrimitiveOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/PrimitiveOrderedBuf.scala index c96a3b7203..00c982b86f 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/PrimitiveOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/PrimitiveOrderedBuf.scala @@ -59,11 +59,9 @@ object PrimitiveOrderedBuf { PrimitiveOrderedBuf(c)(tpe, "Double", 8, true) } - def apply(c: Context)( - outerType: c.Type, - javaTypeStr: String, - lenInBytes: Int, - boxed: Boolean): TreeOrderedBuf[c.type] = { + def apply( + c: Context + )(outerType: c.Type, javaTypeStr: String, lenInBytes: Int, boxed: Boolean): TreeOrderedBuf[c.type] = { import c.universe._ val javaType = TermName(javaTypeStr) diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/ProductOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/ProductOrderedBuf.scala index 29da177cad..e6e431052a 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/ProductOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/ProductOrderedBuf.scala @@ -17,14 +17,12 @@ package com.twitter.scalding.serialization.macros.impl.ordered_serialization.pro import scala.reflect.macros.blackbox.Context -import com.twitter.scalding.serialization.macros.impl.ordered_serialization.{ -ProductLike, - TreeOrderedBuf -} +import com.twitter.scalding.serialization.macros.impl.ordered_serialization.{ProductLike, TreeOrderedBuf} object ProductOrderedBuf { - def dispatch(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]) - : PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + def dispatch(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { import c.universe._ val validTypes: List[Type] = List( typeOf[Product1[Any]], @@ -42,131 +40,101 @@ object ProductOrderedBuf { typeOf[Product13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], typeOf[Product14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], typeOf[Product15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Product16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Product17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], typeOf[ - Product16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], - typeOf[ - Product17[Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any]], - typeOf[ - Product18[Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any]], - typeOf[ - Product19[Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - 
Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any]], - typeOf[ - Product20[Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any]], - typeOf[ - Product21[Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any]], - typeOf[ - Product22[Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any, - Any]] + Product18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any] + ], + typeOf[Product19[ + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]], + typeOf[Product20[ + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]], + typeOf[Product21[ + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]], + typeOf[Product22[ + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]] ) def validType(curType: Type): Boolean = @@ -191,9 +159,11 @@ object ProductOrderedBuf { pf } - def apply(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], - originalType: c.Type, - outerType: c.Type): TreeOrderedBuf[c.type] = { + def apply(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], + originalType: c.Type, + outerType: c.Type + ): TreeOrderedBuf[c.type] = { import c.universe._ def freshT(id: String) = TermName(c.freshName(id)) @@ -224,19 +194,18 @@ object ProductOrderedBuf { override def get(inputStream: ctx.TermName): ctx.Tree = { - val getValProcessor = elementData.map { - case (tpe, accessorSymbol, tBuf) => - val curR = freshT("curR") - val builderTree = q""" + val getValProcessor = elementData.map { case (tpe, accessorSymbol, tBuf) => + val curR = freshT("curR") + val builderTree = q""" val $curR: ${tBuf.tpe} = { ${tBuf.get(inputStream)} } """ - (builderTree, curR) + (builderTree, curR) } q""" ..${getValProcessor.map(_._1)} - new ${originalType}(..${getValProcessor.map(_._2)}) + new $originalType(..${getValProcessor.map(_._2)}) """ } override def compare(elementA: ctx.TermName, elementB: ctx.TermName): ctx.Tree = diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/SealedTraitOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/SealedTraitOrderedBuf.scala index 56695ee484..3375108ad2 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/SealedTraitOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/SealedTraitOrderedBuf.scala @@ -20,17 +20,22 @@ import com.twitter.scalding.serialization.macros.impl.ordered_serialization._ import scala.reflect.macros.blackbox.Context object SealedTraitOrderedBuf { - def dispatch(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + def dispatch(c: Context)( + 
buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { val pf: PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { - case tpe if (tpe.typeSymbol.isClass && (tpe.typeSymbol.asClass.isAbstractClass || tpe.typeSymbol.asClass.isTrait)) => + case tpe + if tpe.typeSymbol.isClass && (tpe.typeSymbol.asClass.isAbstractClass || tpe.typeSymbol.asClass.isTrait) => SealedTraitOrderedBuf(c)(buildDispatcher, tpe) } pf } - def apply(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], - outerType: c.Type): TreeOrderedBuf[c.type] = { + def apply(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], + outerType: c.Type + ): TreeOrderedBuf[c.type] = { import c.universe._ def freshT(id: String) = TermName(c.freshName(s"$id")) @@ -38,13 +43,15 @@ object SealedTraitOrderedBuf { if (knownDirectSubclasses.isEmpty) sys.error( - s"Unable to access any knownDirectSubclasses for $outerType , a bug in scala 2.10/2.11 makes this unreliable. -- ${c.enclosingPosition}") + s"Unable to access any knownDirectSubclasses for $outerType , a bug in scala 2.10/2.11 makes this unreliable. -- ${c.enclosingPosition}" + ) // 22 is a magic number, so pick it aligning with usual size for case class fields // could be bumped, but the getLength method may get slow, or fail to compile at some point. if (knownDirectSubclasses.size > 22) sys.error( - s"More than 22 subclasses($outerType). This code is inefficient for this and may cause jvm errors. Supply code manually. -- ${c.enclosingPosition}") + s"More than 22 subclasses($outerType). This code is inefficient for this and may cause jvm errors. Supply code manually. -- ${c.enclosingPosition}" + ) val subClassesValid = knownDirectSubclasses.forall { sc => scala.util.Try(sc.asType.asClass.isCaseClass).getOrElse(false) @@ -52,7 +59,8 @@ object SealedTraitOrderedBuf { if (!subClassesValid) sys.error( - s"We only support the extension of a sealed trait with case classes, for type $outerType -- ${c.enclosingPosition}") + s"We only support the extension of a sealed trait with case classes, for type $outerType -- ${c.enclosingPosition}" + ) val dispatcher = buildDispatcher @@ -66,8 +74,10 @@ object SealedTraitOrderedBuf { .zipWithIndex .map { case ((tpe, tbuf), idx) => (idx, tpe, tbuf) } - require(subData.nonEmpty, - "Unable to parse any subtypes for the sealed trait, error. This must be an error.") + require( + subData.nonEmpty, + "Unable to parse any subtypes for the sealed trait, error. This must be an error." 
+ ) new TreeOrderedBuf[c.type] { override val ctx: c.type = c diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/StableKnownDirectSubclasses.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/StableKnownDirectSubclasses.scala index 3c9cd39e62..cdf88d55b7 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/StableKnownDirectSubclasses.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/StableKnownDirectSubclasses.scala @@ -3,13 +3,10 @@ package com.twitter.scalding.serialization.macros.impl.ordered_serialization.pro import scala.reflect.macros.whitebox.Context /** - * The `knownDirectSubclasses` method doesn't provide stable ordering - * since it returns an unordered `Set` and the `Type` AST nodes don't - * override the `hashCode` method, relying on the default identity - * `hashCode`. + * The `knownDirectSubclasses` method doesn't provide stable ordering since it returns an unordered `Set` and + * the `Type` AST nodes don't override the `hashCode` method, relying on the default identity `hashCode`. * - * This function makes the ordering stable using a list ordered by the - * full name of the types. + * This function makes the ordering stable using a list ordered by the full name of the types. */ object StableKnownDirectSubclasses { diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/TraversablesOrderedBuf.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/TraversablesOrderedBuf.scala index 378f6d6a99..865afa5921 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/TraversablesOrderedBuf.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/TraversablesOrderedBuf.scala @@ -21,8 +21,8 @@ import com.twitter.scalding.serialization.macros.impl.ordered_serialization.{ TreeOrderedBuf } import CompileTimeLengthTypes._ -import scala.{ collection => sc } -import scala.collection.{ immutable => sci } +import scala.{collection => sc} +import scala.collection.{immutable => sci} sealed trait ShouldSort case object DoSort extends ShouldSort @@ -33,7 +33,9 @@ case object IsArray extends MaybeArray case object NotArray extends MaybeArray object TraversablesOrderedBuf { - def dispatch(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + def dispatch(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { case tpe if tpe.erasure =:= c.universe.typeOf[Iterable[Any]] => TraversablesOrderedBuf(c)(buildDispatcher, tpe, NoSort, NotArray) case tpe if tpe.erasure =:= c.universe.typeOf[sci.Iterable[Any]] => @@ -86,10 +88,11 @@ object TraversablesOrderedBuf { } def apply(c: Context)( - buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], - outerType: c.Type, - maybeSort: ShouldSort, - maybeArray: MaybeArray): TreeOrderedBuf[c.type] = { + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], + outerType: c.Type, + maybeSort: 
ShouldSort, + maybeArray: MaybeArray + ): TreeOrderedBuf[c.type] = { import c.universe._ def freshT(id: String) = TermName(c.freshName(s"fresh_$id")) @@ -101,8 +104,10 @@ object TraversablesOrderedBuf { // When dealing with a map we have 2 type args, and need to generate the tuple type // it would correspond to if we .toList the Map. val innerType = if (outerType.asInstanceOf[TypeRefApi].args.size == 2) { - val (tpe1, tpe2) = (outerType.asInstanceOf[TypeRefApi].args.head, - outerType.asInstanceOf[TypeRefApi].args(1)) // linter:ignore + val (tpe1, tpe2) = ( + outerType.asInstanceOf[TypeRefApi].args.head, + outerType.asInstanceOf[TypeRefApi].args(1) + ) // linter:ignore val containerType = typeOf[Tuple2[Any, Any]].asInstanceOf[TypeRef] import compat._ TypeRef.apply(containerType.pre, containerType.sym, List(tpe1, tpe2)) @@ -193,10 +198,8 @@ object TraversablesOrderedBuf { $element.foreach { t => val $target = t $currentHash = - _root_.com.twitter.scalding.serialization.MurmurHashUtils.mixH1($currentHash, ${ - innerBuf - .hash(target) - }) + _root_.com.twitter.scalding.serialization.MurmurHashUtils.mixH1($currentHash, ${innerBuf + .hash(target)}) // go ahead and compute the length so we don't traverse twice for lists $len += 1 } diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/LengthCalculations.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/LengthCalculations.scala index f69f83dfa2..5f6ea20ec6 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/LengthCalculations.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/LengthCalculations.scala @@ -16,8 +16,7 @@ package com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers /** - * There is a Monoid on MaybeLength, with - * ConstLen(0) being the zero. + * There is a Monoid on MaybeLength, with ConstLen(0) being the zero. 
*/ sealed trait MaybeLength { def +(that: MaybeLength): MaybeLength @@ -28,15 +27,15 @@ case object NoLengthCalculation extends MaybeLength { } final case class ConstLen(toInt: Int) extends MaybeLength { def +(that: MaybeLength): MaybeLength = that match { - case ConstLen(c) => ConstLen(toInt + c) - case DynamicLen(d) => DynamicLen(toInt + d) + case ConstLen(c) => ConstLen(toInt + c) + case DynamicLen(d) => DynamicLen(toInt + d) case NoLengthCalculation => NoLengthCalculation } } final case class DynamicLen(toInt: Int) extends MaybeLength { def +(that: MaybeLength): MaybeLength = that match { - case ConstLen(c) => DynamicLen(toInt + c) - case DynamicLen(d) => DynamicLen(toInt + d) + case ConstLen(c) => DynamicLen(toInt + c) + case DynamicLen(d) => DynamicLen(toInt + d) case NoLengthCalculation => NoLengthCalculation } } diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/MacroEqualityOrderedSerialization.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/MacroEqualityOrderedSerialization.scala index 91c90e3c00..534fcd5790 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/MacroEqualityOrderedSerialization.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/MacroEqualityOrderedSerialization.scala @@ -15,19 +15,19 @@ */ package com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers -import com.twitter.scalding.serialization.{ EquivSerialization, OrderedSerialization } +import com.twitter.scalding.serialization.{EquivSerialization, OrderedSerialization} object MacroEqualityOrderedSerialization { private val seed = "MacroEqualityOrderedSerialization".hashCode } abstract class MacroEqualityOrderedSerialization[T] - extends OrderedSerialization[T] - with EquivSerialization[T] { + extends OrderedSerialization[T] + with EquivSerialization[T] { def uniqueId: String override def hashCode = MacroEqualityOrderedSerialization.seed ^ uniqueId.hashCode override def equals(other: Any): Boolean = other match { case o: MacroEqualityOrderedSerialization[_] => o.uniqueId == uniqueId - case _ => false + case _ => false } } diff --git a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/TraversableHelpers.scala b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/TraversableHelpers.scala index a754de1f82..4c720ddea8 100644 --- a/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/TraversableHelpers.scala +++ b/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/TraversableHelpers.scala @@ -22,7 +22,8 @@ object TraversableHelpers { import com.twitter.scalding.serialization.JavaStreamEnrichments._ final def rawCompare(inputStreamA: InputStream, inputStreamB: InputStream)( - consume: (InputStream, InputStream) => Int): Int = { + consume: (InputStream, InputStream) => Int + ): Int = { val lenA = inputStreamA.readPosVarInt val lenB = inputStreamB.readPosVarInt @@ -38,8 +39,9 @@ object TraversableHelpers { else java.lang.Integer.compare(lenA, lenB) } - final def 
iteratorCompare[T](iteratorA: Iterator[T], iteratorB: Iterator[T])( - implicit ord: Ordering[T]): Int = { + final def iteratorCompare[T](iteratorA: Iterator[T], iteratorB: Iterator[T])(implicit + ord: Ordering[T] + ): Int = { @annotation.tailrec def result: Int = if (iteratorA.isEmpty) { @@ -57,8 +59,9 @@ object TraversableHelpers { result } - final def iteratorEquiv[T](iteratorA: Iterator[T], iteratorB: Iterator[T])( - implicit eq: Equiv[T]): Boolean = { + final def iteratorEquiv[T](iteratorA: Iterator[T], iteratorB: Iterator[T])(implicit + eq: Equiv[T] + ): Boolean = { @annotation.tailrec def result: Boolean = if (iteratorA.isEmpty) iteratorB.isEmpty @@ -71,15 +74,12 @@ object TraversableHelpers { /** * This returns the same result as * - * implicit val o = ord - * Ordering[Iterable[T]].compare(travA.toList.sorted, travB.toList.sorted) + * implicit val o = ord Ordering[Iterable[T]].compare(travA.toList.sorted, travB.toList.sorted) * - * but it does not do a full sort. Instead it uses a partial quicksort approach - * the complexity should be O(N + M) rather than O(N log N + M log M) for the full - * sort case + * but it does not do a full sort. Instead it uses a partial quicksort approach the complexity should be O(N + * + M) rather than O(N log N + M log M) for the full sort case */ - final def sortedCompare[T](travA: Iterable[T], travB: Iterable[T])( - implicit ord: Ordering[T]): Int = { + final def sortedCompare[T](travA: Iterable[T], travB: Iterable[T])(implicit ord: Ordering[T]): Int = { def compare(startA: Int, endA: Int, a: Buffer[T], startB: Int, endB: Int, b: Buffer[T]): Int = if (startA == endA) { if (startB == endB) 0 // both empty @@ -87,11 +87,7 @@ object TraversableHelpers { } else if (startB == endB) 1 // non-empty is bigger than empty else { @annotation.tailrec - def partition(pivot: T, - pivotStart: Int, - pivotEnd: Int, - endX: Int, - x: Buffer[T]): (Int, Int) = + def partition(pivot: T, pivotStart: Int, pivotEnd: Int, endX: Int, x: Buffer[T]): (Int, Int) = if (pivotEnd >= endX) (pivotStart, pivotEnd) else { val t = x(pivotEnd) @@ -153,9 +149,11 @@ object TraversableHelpers { val minpsize = math.min(apsize, bpsize) val acheck = aps + minpsize val bcheck = bps + minpsize - if (apsize != bpsize && + if ( + apsize != bpsize && acheck < endA && - bcheck < endB) { + bcheck < endB + ) { // exactly one of them has a pivot value ord.compare(a(acheck), b(bcheck)) } else { @@ -167,8 +165,7 @@ object TraversableHelpers { } /** - * If we are equal unsorted, we are equal. - * this is useful because often scala will build identical sets + * If we are equal unsorted, we are equal. this is useful because often scala will build identical sets * exactly the same way, so this fast check will work. */ if (iteratorEquiv(travA.iterator, travB.iterator)(ord)) 0 diff --git a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/JavaStreamEnrichmentsProperties.scala b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/JavaStreamEnrichmentsProperties.scala index 71cec6cccf..79f825f5bb 100644 --- a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/JavaStreamEnrichmentsProperties.scala +++ b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/JavaStreamEnrichmentsProperties.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization import org.scalacheck.Arbitrary @@ -44,10 +44,9 @@ object JavaStreamEnrichmentsProperties extends Properties("JavaStreamEnrichments } } implicit def teq[T1: Equiv, T2: Equiv]: Equiv[(T1, T2)] = new Equiv[(T1, T2)] { - def equiv(a: (T1, T2), b: (T1, T2)) = { + def equiv(a: (T1, T2), b: (T1, T2)) = Equiv[T1].equiv(a._1, b._1) && Equiv[T2].equiv(a._2, b._2) - } } def writeRead[T: Equiv](g: Gen[T], w: (T, OutputStream) => Unit, r: InputStream => T): Prop = @@ -59,11 +58,10 @@ object JavaStreamEnrichmentsProperties extends Properties("JavaStreamEnrichments def writeRead[T: Equiv: Arbitrary](w: (T, OutputStream) => Unit, r: InputStream => T): Prop = writeRead(implicitly[Arbitrary[T]].arbitrary, w, r) - property("Can (read/write)Size") = writeRead(Gen.chooseNum(0, Int.MaxValue), - { (i: Int, os) => os.writePosVarInt(i) }, { _.readPosVarInt }) + property("Can (read/write)Size") = + writeRead(Gen.chooseNum(0, Int.MaxValue), (i: Int, os) => os.writePosVarInt(i), _.readPosVarInt) - property("Can (read/write)Float") = writeRead( - { (i: Float, os) => os.writeFloat(i) }, { _.readFloat }) + property("Can (read/write)Float") = writeRead((i: Float, os) => os.writeFloat(i), _.readFloat) property("Can (read/write)Array[Byte]") = writeRead( // Use list because Array has a shitty toString @@ -72,26 +70,25 @@ object JavaStreamEnrichmentsProperties extends Properties("JavaStreamEnrichments val bytes = new Array[Byte](is.readPosVarInt) is.readFully(bytes) bytes.toList - }) + } + ) - property("Can (read/write)Boolean") = writeRead( - { (i: Boolean, os) => os.writeBoolean(i) }, { _.readBoolean }) + property("Can (read/write)Boolean") = writeRead((i: Boolean, os) => os.writeBoolean(i), _.readBoolean) - property("Can (read/write)Double") = writeRead( - { (i: Double, os) => os.writeDouble(i) }, { _.readDouble }) + property("Can (read/write)Double") = writeRead((i: Double, os) => os.writeDouble(i), _.readDouble) - property("Can (read/write)Int") = writeRead(Gen.chooseNum(Int.MinValue, Int.MaxValue), - { (i: Int, os) => os.writeInt(i) }, { _.readInt }) + property("Can (read/write)Int") = + writeRead(Gen.chooseNum(Int.MinValue, Int.MaxValue), (i: Int, os) => os.writeInt(i), _.readInt) - property("Can (read/write)Long") = writeRead(Gen.chooseNum(Long.MinValue, Long.MaxValue), - { (i: Long, os) => os.writeLong(i) }, { _.readLong }) + property("Can (read/write)Long") = + writeRead(Gen.chooseNum(Long.MinValue, Long.MaxValue), (i: Long, os) => os.writeLong(i), _.readLong) - property("Can (read/write)Short") = writeRead(Gen.chooseNum(Short.MinValue, Short.MaxValue), - { (i: Short, os) => os.writeShort(i) }, { _.readShort }) + property("Can (read/write)Short") = + writeRead(Gen.chooseNum(Short.MinValue, Short.MaxValue), (i: Short, os) => os.writeShort(i), _.readShort) - property("Can (read/write)UnsignedByte") = writeRead(Gen.chooseNum(0, (1 << 8) - 1), - { (i: Int, os) => os.write(i.toByte) }, { _.readUnsignedByte }) + property("Can (read/write)UnsignedByte") = + writeRead(Gen.chooseNum(0, (1 << 8) - 1), (i: Int, os) => os.write(i.toByte), _.readUnsignedByte) - property("Can (read/write)UnsignedShort") = writeRead(Gen.chooseNum(0, (1 << 16) - 1), - { (i: Int, os) => os.writeShort(i.toShort) }, { _.readUnsignedShort }) + property("Can (read/write)UnsignedShort") = + writeRead(Gen.chooseNum(0, (1 << 16) - 1), (i: Int, os) => os.writeShort(i.toShort), _.readUnsignedShort) } diff 
--git a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/SerializationProperties.scala b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/SerializationProperties.scala index fea20f433a..ad91009c1b 100644 --- a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/SerializationProperties.scala +++ b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/SerializationProperties.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization import org.scalacheck.Arbitrary @@ -24,7 +24,7 @@ import org.scalacheck.Prop._ import JavaStreamEnrichments._ import java.io._ -import scala.util.{ Try, Success } +import scala.util.{Success, Try} object LawTester { def apply[T: Arbitrary](base: String, laws: Iterable[Law[T]]): Properties = @@ -64,7 +64,8 @@ object SerializationProperties extends Properties("SerializationProperties") { class IntTryWrapperClass(val x: Int) implicit val myTryIntWrapperOrdSer: OrderedSerialization[IntTryWrapperClass] = - OrderedSerialization.viaTryTransform[IntTryWrapperClass, Int](_.x, { x: Int => Success(new IntTryWrapperClass(x)) }) + OrderedSerialization + .viaTryTransform[IntTryWrapperClass, Int](_.x, { x: Int => Success(new IntTryWrapperClass(x)) }) implicit val arbIntWrapperClass: Arbitrary[IntWrapperClass] = Arbitrary(implicitly[Arbitrary[Int]].arbitrary.map(new IntWrapperClass(_))) diff --git a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/UnsignedComparisonLaws.scala b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/UnsignedComparisonLaws.scala index 1e47f44aba..3af9e440fd 100644 --- a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/UnsignedComparisonLaws.scala +++ b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/UnsignedComparisonLaws.scala @@ -9,18 +9,18 @@ object UnsignedComparisonLaws extends Properties("UnsignedComparisonLaws") { property("UnsignedLongCompare works") = forAll { (l1: Long, l2: Long) => val cmp = UnsignedComparisons.unsignedLongCompare(l1, l2) (l1 >= 0, l2 >= 0) match { - case (true, true) => cmp == java.lang.Long.compare(l1, l2) - case (true, false) => cmp < 0 // negative is bigger - case (false, true) => cmp > 0 + case (true, true) => cmp == java.lang.Long.compare(l1, l2) + case (true, false) => cmp < 0 // negative is bigger + case (false, true) => cmp > 0 case (false, false) => cmp == java.lang.Long.compare(l1 & Long.MaxValue, l2 & Long.MaxValue) } } property("UnsignedIntCompare works") = forAll { (l1: Int, l2: Int) => val cmp = UnsignedComparisons.unsignedIntCompare(l1, l2) (l1 >= 0, l2 >= 0) match { - case (true, true) => cmp == java.lang.Integer.compare(l1, l2) - case (true, false) => cmp < 0 // negative is bigger - case (false, true) => cmp > 0 + case (true, true) => cmp == java.lang.Integer.compare(l1, l2) + case (true, false) => cmp < 0 // negative is bigger + case (false, true) => cmp > 0 case (false, false) => cmp == java.lang.Integer.compare(l1 & Int.MaxValue, l2 & Int.MaxValue) } } @@ -28,7 +28,7 @@ object UnsignedComparisonLaws extends Properties("UnsignedComparisonLaws") { def clamp(i: Int) = if (i > 0) 1 else if (i < 0) -1 else 0 val cmp = clamp(UnsignedComparisons.unsignedByteCompare(l1, l2)) (l1 >= 
0, l2 >= 0) match { - case (true, true) => cmp == clamp(java.lang.Byte.compare(l1, l2)) + case (true, true) => cmp == clamp(java.lang.Byte.compare(l1, l2)) case (true, false) => cmp < 0 // negative is bigger case (false, true) => cmp > 0 // Convert to positive ints @@ -36,4 +36,3 @@ object UnsignedComparisonLaws extends Properties("UnsignedComparisonLaws") { } } } - diff --git a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/WriterReaderProperties.scala b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/WriterReaderProperties.scala index 8705541535..e1deb2dcf6 100644 --- a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/WriterReaderProperties.scala +++ b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/WriterReaderProperties.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization import org.scalacheck.Arbitrary @@ -46,10 +46,9 @@ object WriterReaderProperties extends Properties("WriterReaderProperties") { } } implicit def teq[T1: Equiv, T2: Equiv]: Equiv[(T1, T2)] = new Equiv[(T1, T2)] { - def equiv(a: (T1, T2), b: (T1, T2)) = { + def equiv(a: (T1, T2), b: (T1, T2)) = Equiv[T1].equiv(a._1, b._1) && Equiv[T2].equiv(a._2, b._2) - } } def writerReader[T: Writer: Reader: Equiv](g: Gen[T]): Prop = @@ -61,12 +60,13 @@ object WriterReaderProperties extends Properties("WriterReaderProperties") { def writerReader[T: Writer: Reader: Equiv: Arbitrary]: Prop = writerReader(implicitly[Arbitrary[T]].arbitrary) - def writerReaderCollection[T: Writer: Reader, C <: Iterable[T]: Arbitrary: Equiv](implicit cbf: CanBuildFrom[Nothing, T, C]): Prop = - { - implicit val cwriter: Writer[C] = Writer.collection[T, C] - implicit val creader: Reader[C] = Reader.collection[T, C] - writerReader(implicitly[Arbitrary[C]].arbitrary) - } + def writerReaderCollection[T: Writer: Reader, C <: Iterable[T]: Arbitrary: Equiv](implicit + cbf: CanBuildFrom[Nothing, T, C] + ): Prop = { + implicit val cwriter: Writer[C] = Writer.collection[T, C] + implicit val creader: Reader[C] = Reader.collection[T, C] + writerReader(implicitly[Arbitrary[C]].arbitrary) + } /* * Test the Writer/Reader type-classes @@ -83,20 +83,14 @@ object WriterReaderProperties extends Properties("WriterReaderProperties") { property("Array[Byte] Writer/Reader") = writerReader[Array[Byte]] property("Array[Int] Writer/Reader") = writerReader[Array[Int]] property("Array[String] Writer/Reader") = writerReader[Array[String]] - property("List[String] Writer/Reader") = - writerReaderCollection[String, List[String]] - property("(Int, Array[String]) Writer/Reader") = - writerReader[(Int, Array[String])] + property("List[String] Writer/Reader") = writerReaderCollection[String, List[String]] + property("(Int, Array[String]) Writer/Reader") = writerReader[(Int, Array[String])] - property("Option[(Int, Double)] Writer/Reader") = - writerReader[Option[(Int, Double)]] + property("Option[(Int, Double)] Writer/Reader") = writerReader[Option[(Int, Double)]] - property("Option[Option[Unit]] Writer/Reader") = - writerReader[Option[Option[Unit]]] + property("Option[Option[Unit]] Writer/Reader") = writerReader[Option[Option[Unit]]] - property("Either[Int, String] Writer/Reader") = - writerReader[Either[Int, String]] + 
property("Either[Int, String] Writer/Reader") = writerReader[Either[Int, String]] - property("Map[Long, Byte] Writer/Reader") = - writerReaderCollection[(Long, Byte), Map[Long, Byte]] + property("Map[Long, Byte] Writer/Reader") = writerReaderCollection[(Long, Byte), Map[Long, Byte]] } diff --git a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/MacroOrderingProperties.scala b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/MacroOrderingProperties.scala index 971bd681bf..7b566d36fd 100644 --- a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/MacroOrderingProperties.scala +++ b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/MacroOrderingProperties.scala @@ -12,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization.macros import scala.language.higherKinds -import java.io.{ ByteArrayOutputStream, InputStream } +import java.io.{ByteArrayOutputStream, InputStream} import java.nio.ByteBuffer import com.twitter.scalding.serialization.{ @@ -28,9 +28,9 @@ import com.twitter.scalding.serialization.{ OrderedSerialization, Serialization } -import org.scalacheck.Arbitrary.{ arbitrary => arb } -import org.scalacheck.{ Arbitrary, Gen, Prop } -import org.scalatest.prop.{ Checkers, PropertyChecks } +import org.scalacheck.Arbitrary.{arbitrary => arb} +import org.scalacheck.{Arbitrary, Gen, Prop} +import org.scalatest.prop.{Checkers, PropertyChecks} import org.scalatest.FunSuite //, ShouldMatchers } import com.twitter.scalding.some.other.space.space._ import scala.collection.immutable.Queue @@ -147,17 +147,18 @@ case object A extends TestSealedAbstractClass(None) case object B extends TestSealedAbstractClass(Some("b")) sealed trait SealedTraitTest -case class TestCC(a: Int, - b: Long, - c: Option[Int], - d: Double, - e: Option[String], - f: Option[List[String]], - aBB: ByteBuffer) - extends SealedTraitTest +case class TestCC( + a: Int, + b: Long, + c: Option[Int], + d: Double, + e: Option[String], + f: Option[List[String]], + aBB: ByteBuffer +) extends SealedTraitTest case class TestCaseClassB(a: Int, b: Long, c: Option[Int], d: Double, e: Option[String]) - extends SealedTraitTest + extends SealedTraitTest case class TestCaseClassD(a: Int) extends SealedTraitTest @@ -203,19 +204,18 @@ object MyData { } } -class MyData(override val _1: Int, override val _2: Option[Long]) - extends Product2[Int, Option[Long]] { +class MyData(override val _1: Int, override val _2: Option[Long]) extends Product2[Int, Option[Long]] { override def canEqual(that: Any): Boolean = that match { case o: MyData => true - case _ => false + case _ => false } override def equals(obj: scala.Any): Boolean = obj match { case o: MyData => (o._2, _2) match { case (Some(l), Some(r)) => r == l && _1 == o._1 - case (None, None) => _1 == o._1 - case _ => false + case (None, None) => _1 == o._1 + case _ => false } case _ => false } @@ -261,7 +261,7 @@ class MacroOpaqueContainer(val myField: Int) { override def equals(obj: scala.Any): Boolean = obj match { case that: MacroOpaqueContainer => that.myField == myField - case _ => false + case _ => false } } @@ -275,26 +275,24 @@ object Container { type SetAlias = Set[Double] case class InnerCaseClass(e: SetAlias) } -class 
MacroOrderingProperties - extends FunSuite - with PropertyChecks - with BinaryOrdering { +class MacroOrderingProperties extends FunSuite with PropertyChecks with BinaryOrdering { type SetAlias = Set[Double] import ByteBufferArb._ import Container.arbitraryInnerCaseClass - import OrderedSerialization.{ compare => oBufCompare } + import OrderedSerialization.{compare => oBufCompare} def gen[T: Arbitrary]: Gen[T] = implicitly[Arbitrary[T]].arbitrary def arbMap[T: Arbitrary, U](fn: T => U): Arbitrary[U] = Arbitrary(gen[T].map(fn)) - def collectionArb[C[_], T: Arbitrary]( - implicit cbf: collection.generic.CanBuildFrom[Nothing, T, C[T]]): Arbitrary[C[T]] = + def collectionArb[C[_], T: Arbitrary](implicit + cbf: collection.generic.CanBuildFrom[Nothing, T, C[T]] + ): Arbitrary[C[T]] = Arbitrary { gen[List[T]].map { l => val builder = cbf() - l.foreach { builder += _ } + l.foreach(builder += _) builder.result } } @@ -306,9 +304,9 @@ class MacroOrderingProperties import JavaStreamEnrichments._ val baos = new ByteArrayOutputStream - t.foreach({ e => + t.foreach { e => orderedBuffer.write(baos, e) - }) + } baos.toInputStream } @@ -323,19 +321,20 @@ class MacroOrderingProperties def checkManyExplicit[T](i: List[(T, T)])(implicit obuf: OrderedSerialization[T]) = { val serializedA = serializeSeq(i.map(_._1)) val serializedB = serializeSeq(i.map(_._2)) - i.foreach { - case (a, b) => - val compareBinary = obuf.compareBinary(serializedA, serializedB).unsafeToInt - val compareMem = obuf.compare(a, b) - if (compareBinary < 0) { - assert( - compareMem < 0, - s"Compare binary: $compareBinary, and compareMem : $compareMem must have the same sign") - } else if (compareBinary > 0) { - assert( - compareMem > 0, - s"Compare binary: $compareBinary, and compareMem : $compareMem must have the same sign") - } + i.foreach { case (a, b) => + val compareBinary = obuf.compareBinary(serializedA, serializedB).unsafeToInt + val compareMem = obuf.compare(a, b) + if (compareBinary < 0) { + assert( + compareMem < 0, + s"Compare binary: $compareBinary, and compareMem : $compareMem must have the same sign" + ) + } else if (compareBinary > 0) { + assert( + compareMem > 0, + s"Compare binary: $compareBinary, and compareMem : $compareMem must have the same sign" + ) + } } } @@ -357,10 +356,14 @@ class MacroOrderingProperties assert(oBufCompare(rta, a) === 0, s"A should be equal to itself after an RT -- ${rt(a)}") assert(oBufCompare(rtb, b) === 0, s"B should be equal to itself after an RT-- ${rt(b)}") assert(oBufCompare(a, b) + oBufCompare(b, a) === 0, "In memory comparasons make sense") - assert(rawCompare(a, b) + rawCompare(b, a) === 0, - "When adding the raw compares in inverse order they should sum to 0") - assert(oBufCompare(rta, rtb) === oBufCompare(a, b), - "Comparing a and b with ordered bufferables compare after a serialization RT") + assert( + rawCompare(a, b) + rawCompare(b, a) === 0, + "When adding the raw compares in inverse order they should sum to 0" + ) + assert( + oBufCompare(rta, rtb) === oBufCompare(a, b), + "Comparing a and b with ordered bufferables compare after a serialization RT" + ) } def checkAreSame[T](a: T, b: T)(implicit obuf: OrderedSerialization[T]): Unit = { @@ -370,12 +373,12 @@ class MacroOrderingProperties assert(oBufCompare(rtb, b) === 0, s"B should be equal to itself after an RT-- ${rt(b)}") assert(oBufCompare(a, b) === 0, "In memory comparasons make sense") assert(oBufCompare(b, a) === 0, "In memory comparasons make sense") - assert(rawCompare(a, b) === 0, - "When adding the raw compares in inverse 
order they should sum to 0") - assert(rawCompare(b, a) === 0, - "When adding the raw compares in inverse order they should sum to 0") - assert(oBufCompare(rta, rtb) === 0, - "Comparing a and b with ordered bufferables compare after a serialization RT") + assert(rawCompare(a, b) === 0, "When adding the raw compares in inverse order they should sum to 0") + assert(rawCompare(b, a) === 0, "When adding the raw compares in inverse order they should sum to 0") + assert( + oBufCompare(rta, rtb) === 0, + "Comparing a and b with ordered bufferables compare after a serialization RT" + ) } def check[T: Arbitrary](implicit obuf: OrderedSerialization[T]) = { @@ -413,7 +416,7 @@ class MacroOrderingProperties } check[java.lang.Boolean] } - test("Test out Byte") { check[Byte] } + test("Test out Byte")(check[Byte]) test("Test out jl.Byte") { implicit val a = arbMap { b: Byte => java.lang.Byte.valueOf(b) @@ -421,7 +424,7 @@ class MacroOrderingProperties check[java.lang.Byte] checkCollisions[java.lang.Byte] } - test("Test out Short") { check[Short] } + test("Test out Short")(check[Short]) test("Test out jl.Short") { implicit val a = arbMap { b: Short => java.lang.Short.valueOf(b) @@ -429,7 +432,7 @@ class MacroOrderingProperties check[java.lang.Short] checkCollisions[java.lang.Short] } - test("Test out Char") { check[Char] } + test("Test out Char")(check[Char]) test("Test out jl.Char") { implicit val a = arbMap { b: Char => java.lang.Character.valueOf(b) @@ -475,7 +478,7 @@ class MacroOrderingProperties checkCollisions[java.lang.Integer] } - test("Test out Float") { check[Float] } + test("Test out Float")(check[Float]) test("Test out jl.Float") { implicit val a = arbMap { b: Float => java.lang.Float.valueOf(b) @@ -483,7 +486,7 @@ class MacroOrderingProperties check[java.lang.Float] checkCollisions[java.lang.Float] } - test("Test out Long") { check[Long] } + test("Test out Long")(check[Long]) test("Test out jl.Long") { implicit val a = arbMap { b: Long => java.lang.Long.valueOf(b) @@ -491,7 +494,7 @@ class MacroOrderingProperties check[java.lang.Long] checkCollisions[java.lang.Long] } - test("Test out Double") { check[Double] } + test("Test out Double")(check[Double]) test("Test out jl.Double") { implicit val a = arbMap { b: Double => java.lang.Double.valueOf(b) @@ -676,7 +679,8 @@ class MacroOrderingProperties "堒凳媨쉏떽㶥⾽샣井ㆠᇗ裉깴辫࠷᤭塈䎙寫㸉ᶴ䰄똇䡷䥞㷗䷱赫懓䷏剆祲ᝯ졑쐯헢鷴ӕ秔㽰ퟡ㏉鶖奚㙰银䮌ᕗ膾买씋썴행䣈丶偝쾕鐗쇊ኋ넥︇瞤䋗噯邧⹆♣ἷ铆玼⪷沕辤ᠥ⥰箼䔄◗", "騰쓢堷뛭ᣣﰩ嚲ﲯ㤑ᐜ檊೦⠩奯ᓩ윇롇러ᕰెꡩ璞﫼᭵礀閮䈦椄뾪ɔ믻䖔᪆嬽フ鶬曭꣍ᆏ灖㐸뗋ㆃ녵ퟸ겵晬礙㇩䫓ᘞ昑싨", "좃ఱ䨻綛糔唄࿁劸酊᫵橻쩳괊筆ݓ淤숪輡斋靑耜঄骐冠㝑⧠떅漫곡祈䵾ᳺ줵됵↲搸虂㔢Ꝅ芆٠풐쮋炞哙⨗쾄톄멛癔짍避쇜畾㣕剼⫁়╢ꅢ澛氌ᄚ㍠ꃫᛔ匙㜗詇閦單錖⒅瘧崥", - "獌癚畇") + "獌癚畇" + ) checkManyExplicit(c.map { i => (i, i) }) @@ -714,9 +718,9 @@ class MacroOrderingProperties val oser = BinaryOrdering.ordSer[Either[Int, String]] assert(oser.staticSize === None, "can't get the size statically") assert( - Some(Serialization.toBytes[Either[Int, String]](Left(1)).length) === oser.dynamicSize( - Left(1)), - "serialization size matches dynamic size") + Some(Serialization.toBytes[Either[Int, String]](Left(1)).length) === oser.dynamicSize(Left(1)), + "serialization size matches dynamic size" + ) check[Either[Int, String]] checkCollisions[Either[Int, String]] } @@ -762,9 +766,7 @@ class MacroOrderingProperties } test("test specific tuple 3") { - val c = List( - ("", None, ""), - ("a", Some(1), "b")) + val c = List(("", None, ""), ("a", Some(1), "b")) checkManyExplicit(c.map { i => (i, i) }) @@ -851,8 +853,7 @@ class MacroOrderingProperties noOrderedSerialization[BigTrait] } - def fn[A]( - implicit or: OrderedSerialization[A]): 
OrderedSerialization[TypedParameterCaseClass[A]] = + def fn[A](implicit or: OrderedSerialization[A]): OrderedSerialization[TypedParameterCaseClass[A]] = BinaryOrdering.ordSer[TypedParameterCaseClass[A]] test("Test out MacroOpaqueContainer inside a case class as an abstract type") { diff --git a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/TraversableHelperLaws.scala b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/TraversableHelperLaws.scala index 5be2427c7f..bed1d688b4 100644 --- a/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/TraversableHelperLaws.scala +++ b/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/TraversableHelperLaws.scala @@ -12,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization.macros -import org.scalatest.{ FunSuite, Matchers } +import org.scalatest.{FunSuite, Matchers} import org.scalatest.prop.PropertyChecks import impl.ordered_serialization.runtime_helpers.TraversableHelpers._ @@ -24,26 +24,38 @@ import impl.ordered_serialization.runtime_helpers.TraversableHelpers._ class TraversableHelperLaws extends FunSuite with PropertyChecks with Matchers { test("Iterator ordering should be Iterable ordering") { forAll { (l1: List[Int], l2: List[Int]) => - assert(iteratorCompare[Int](l1.iterator, l2.iterator) === - Ordering[Iterable[Int]].compare(l1, l2), "Matches scala's Iterable compare") + assert( + iteratorCompare[Int](l1.iterator, l2.iterator) === + Ordering[Iterable[Int]].compare(l1, l2), + "Matches scala's Iterable compare" + ) } } test("Iterator equiv should be Iterable ordering") { forAll { (l1: List[Int], l2: List[Int]) => - assert(iteratorEquiv[Int](l1.iterator, l2.iterator) === - Ordering[Iterable[Int]].equiv(l1, l2), "Matches scala's Iterable compare") + assert( + iteratorEquiv[Int](l1.iterator, l2.iterator) === + Ordering[Iterable[Int]].equiv(l1, l2), + "Matches scala's Iterable compare" + ) } } test("sortedCompare matches sort followed by compare List[Int]") { forAll(minSuccessful(1000)) { (l1: List[Int], l2: List[Int]) => - assert(sortedCompare[Int](l1, l2) === - Ordering[Iterable[Int]].compare(l1.sorted, l2.sorted), "Matches scala's Iterable compare") + assert( + sortedCompare[Int](l1, l2) === + Ordering[Iterable[Int]].compare(l1.sorted, l2.sorted), + "Matches scala's Iterable compare" + ) } } test("sortedCompare matches sort followed by compare Set[Int]") { forAll(minSuccessful(1000)) { (l1: Set[Int], l2: Set[Int]) => - assert(sortedCompare[Int](l1, l2) === - Ordering[Iterable[Int]].compare(l1.toList.sorted, l2.toList.sorted), "Matches scala's Iterable compare") + assert( + sortedCompare[Int](l1, l2) === + Ordering[Iterable[Int]].compare(l1.toList.sorted, l2.toList.sorted), + "Matches scala's Iterable compare" + ) } } } diff --git a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/Iterators.scala b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/Iterators.scala index 24d1b24769..c0e4304027 100644 --- a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/Iterators.scala +++ b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/Iterators.scala @@ -3,40 +3,33 @@ package com.twitter.scalding.spark_backend object Iterators { /** - * 
Partitions the iterator into runs of equivalent keys, then - * returns an iterator of each distinct key followed by an - * sub-iterator of its values. + * Partitions the iterator into runs of equivalent keys, then returns an iterator of each distinct key + * followed by an sub-iterator of its values. * * For example: * - * val lst = (1 to 8).map(i => (i / 3, i)).toList - * // List((0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (2, 6), (2, 7), (2, 8)) + * val lst = (1 to 8).map(i => (i / 3, i)).toList // List((0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (2, 6), + * (2, 7), (2, 8)) * - * groupSequential(lst.iterator).map(_.toList).toList - * // List((0, List(1, 2)), (1, List(3, 4, 5)), (2, List(6, 7, 8))) + * groupSequential(lst.iterator).map(_.toList).toList // List((0, List(1, 2)), (1, List(3, 4, 5)), (2, + * List(6, 7, 8))) * - * groupSequential() does not load the iterator's contents into - * memory more than is absolutely necessary. This means it is safe - * to use on very large (or potentially infinite) iterators as long - * as the downstream consumer handles the results carefully. + * groupSequential() does not load the iterator's contents into memory more than is absolutely necessary. + * This means it is safe to use on very large (or potentially infinite) iterators as long as the downstream + * consumer handles the results carefully. * - * Note that the sub-iterators are fragile. That means that as soon - * as you call .hasNext or .next on the top-level iterator, the - * previous sub-iterator is invalidated (and will become empty). A - * consequence of this is that you can only operate on one - * sub-iterator at a time. + * Note that the sub-iterators are fragile. That means that as soon as you call .hasNext or .next on the + * top-level iterator, the previous sub-iterator is invalidated (and will become empty). A consequence of + * this is that you can only operate on one sub-iterator at a time. * * Laws: * - * 0. groupSequential(it).map(_._2.length).sum - * ~ it.length + * 0. groupSequential(it).map(_._2.length).sum ~ it.length * - * 1. groupSequential(it).flatMap { case (k, vs) => vs.map(v => (k, v)) } - * ~ it + * 1. groupSequential(it).flatMap { case (k, vs) => vs.map(v => (k, v)) } ~ it * - * 2. given xs = lst.sortBy(_._1): - * groupSequential(xs.iterator).map { case (k, vs) => (k, vs.toList) }.toList - * ~ xs.groupBy(_._1).mapValues(_.map(_._2)).toList.sortBy(_._1) + * 2. given xs = lst.sortBy(_._1): groupSequential(xs.iterator).map { case (k, vs) => (k, vs.toList) + * }.toList ~ xs.groupBy(_._1).mapValues(_.map(_._2)).toList.sortBy(_._1) */ def groupSequential[K, V](it: Iterator[(K, V)]): Iterator[(K, Iterator[V])] = // we need to look ahead to see if we have a key/value to start @@ -51,12 +44,11 @@ object Iterators { /** * This is the internal class that powers Iterators.groupSequential. * - * This process always requires one item worth of look-ahead, so - * this class' constructor assumes the caller has already pulled - * `(k0, v0)` from the front of `it`. + * This process always requires one item worth of look-ahead, so this class' constructor assumes the caller + * has already pulled `(k0, v0)` from the front of `it`. 
*/ private class GroupSequentialIterator[K, V](k0: K, v0: V, it: Iterator[(K, V)]) - extends Iterator[(K, Iterator[V])] { parent => + extends Iterator[(K, Iterator[V])] { parent => var ready: InnerIterator = new InnerIterator(k0, v0, it) var child: InnerIterator = null diff --git a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/Op.scala b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/Op.scala index 3dbf17acb5..ffde9e84e7 100644 --- a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/Op.scala +++ b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/Op.scala @@ -11,7 +11,7 @@ import com.twitter.scalding.typed.TypedSource import SparkPlanner.PartitionComputer sealed abstract class Op[+A] { - import Op.{Transformed, fakeClassTag} + import Op.{fakeClassTag, Transformed} def run(session: SparkSession)(implicit ec: ExecutionContext): Future[RDD[_ <: A]] @@ -36,53 +36,76 @@ object Op extends Serializable { Transformed[(K, V), (K, U)](op, _.flatMapValues(fn)) def mapValues[U](fn: V => U): Op[(K, U)] = Transformed[(K, V), (K, U)](op, _.mapValues(fn)) - def mapGroup[U](partitionComputer: PartitionComputer)(fn: (K, Iterator[V]) => Iterator[U])(implicit ordK: Ordering[K]): Op[(K, U)] = - Transformed[(K, V), (K, U)](op, { rdd: RDD[(K, V)] => - val numPartitions = partitionComputer(rdd.getNumPartitions) - val partitioner = rdd.partitioner.getOrElse(new HashPartitioner(numPartitions)) - val partitioned = rdd.repartitionAndSortWithinPartitions(partitioner) - partitioned.mapPartitions({ its => - // since we are sorted, the adjacent keys are next to each other - val grouped = Iterators.groupSequential(its) - grouped.flatMap { case (k, vs) => fn(k, vs).map((k, _)) } - }, preservesPartitioning = true) - }) + def mapGroup[U](partitionComputer: PartitionComputer)(fn: (K, Iterator[V]) => Iterator[U])(implicit + ordK: Ordering[K] + ): Op[(K, U)] = + Transformed[(K, V), (K, U)]( + op, + { rdd: RDD[(K, V)] => + val numPartitions = partitionComputer(rdd.getNumPartitions) + val partitioner = rdd.partitioner.getOrElse(new HashPartitioner(numPartitions)) + val partitioned = rdd.repartitionAndSortWithinPartitions(partitioner) + partitioned.mapPartitions( + { its => + // since we are sorted, the adjacent keys are next to each other + val grouped = Iterators.groupSequential(its) + grouped.flatMap { case (k, vs) => fn(k, vs).map((k, _)) } + }, + preservesPartitioning = true + ) + } + ) def hashJoin[U, W](right: Op[(K, U)])(fn: (K, V, Iterable[U]) => Iterator[W]): Op[(K, W)] = HashJoinOp(op, right, fn) - def sorted(partitionComputer: PartitionComputer)(implicit ordK: Ordering[K], ordV: Ordering[V]): Op[(K, V)] = - Transformed[(K, V), (K, V)](op, { rdd: RDD[(K, V)] => - // The idea here is that we put the key and the value in - // logical key, but partition only on the left part of the key - val numPartitions = partitionComputer(rdd.getNumPartitions) - val partitioner = rdd.partitioner.getOrElse(new HashPartitioner(numPartitions)) - val keyOnlyPartioner = KeyHashPartitioner(partitioner) - val unitValue: RDD[((K, V), Unit)] = rdd.map { kv => (kv, ()) } - val partitioned = unitValue.repartitionAndSortWithinPartitions(keyOnlyPartioner) - partitioned.mapPartitions({ its => - // discard the unit value - its.map { case (kv, _) => kv } - }, preservesPartitioning = true) // the keys haven't changed - }) - - def sortedMapGroup[U](partitionComputer: PartitionComputer)(fn: (K, Iterator[V]) => Iterator[U])(implicit ordK: Ordering[K], ordV: Ordering[V]): Op[(K, U)] = - 
Transformed[(K, V), (K, U)](op, { rdd: RDD[(K, V)] => - // The idea here is that we put the key and the value in - // logical key, but partition only on the left part of the key - val numPartitions = partitionComputer(rdd.getNumPartitions) - val partitioner = rdd.partitioner.getOrElse(new HashPartitioner(numPartitions)) - val keyOnlyPartioner = KeyHashPartitioner(partitioner) - val unitValue: RDD[((K, V), Unit)] = rdd.map { kv => (kv, ()) } - val partitioned = unitValue.repartitionAndSortWithinPartitions(keyOnlyPartioner) - partitioned.mapPartitions({ its => - // discard the unit value - val kviter = its.map { case (kv, _) => kv } - // since we are sorted first by key, then value, the keys are grouped - val grouped = Iterators.groupSequential(kviter) - grouped.flatMap { case (k, vs) => fn(k, vs).map((k, _)) } - }, preservesPartitioning = true) // the keys haven't changed - }) + def sorted( + partitionComputer: PartitionComputer + )(implicit ordK: Ordering[K], ordV: Ordering[V]): Op[(K, V)] = + Transformed[(K, V), (K, V)]( + op, + { rdd: RDD[(K, V)] => + // The idea here is that we put the key and the value in + // logical key, but partition only on the left part of the key + val numPartitions = partitionComputer(rdd.getNumPartitions) + val partitioner = rdd.partitioner.getOrElse(new HashPartitioner(numPartitions)) + val keyOnlyPartioner = KeyHashPartitioner(partitioner) + val unitValue: RDD[((K, V), Unit)] = rdd.map(kv => (kv, ())) + val partitioned = unitValue.repartitionAndSortWithinPartitions(keyOnlyPartioner) + partitioned.mapPartitions( + its => + // discard the unit value + its.map { case (kv, _) => kv }, + preservesPartitioning = true + ) // the keys haven't changed + } + ) + + def sortedMapGroup[U](partitionComputer: PartitionComputer)( + fn: (K, Iterator[V]) => Iterator[U] + )(implicit ordK: Ordering[K], ordV: Ordering[V]): Op[(K, U)] = + Transformed[(K, V), (K, U)]( + op, + { rdd: RDD[(K, V)] => + // The idea here is that we put the key and the value in + // logical key, but partition only on the left part of the key + val numPartitions = partitionComputer(rdd.getNumPartitions) + val partitioner = rdd.partitioner.getOrElse(new HashPartitioner(numPartitions)) + val keyOnlyPartioner = KeyHashPartitioner(partitioner) + val unitValue: RDD[((K, V), Unit)] = rdd.map(kv => (kv, ())) + val partitioned = unitValue.repartitionAndSortWithinPartitions(keyOnlyPartioner) + partitioned.mapPartitions( + { its => + // discard the unit value + val kviter = its.map { case (kv, _) => kv } + // since we are sorted first by key, then value, the keys are grouped + val grouped = Iterators.groupSequential(kviter) + grouped.flatMap { case (k, vs) => fn(k, vs).map((k, _)) } + }, + preservesPartitioning = true + ) // the keys haven't changed + } + ) } private case class KeyHashPartitioner(partitioner: Partitioner) extends Partitioner { @@ -124,10 +147,12 @@ object Op extends Serializable { Future(session.sparkContext.makeRDD(iterable.toSeq, 1)) } - final case class Source[A](conf: Config, original: TypedSource[A], input: Option[SparkSource[A]]) extends Op[A] { + final case class Source[A](conf: Config, original: TypedSource[A], input: Option[SparkSource[A]]) + extends Op[A] { def run(session: SparkSession)(implicit ec: ExecutionContext): Future[RDD[_ <: A]] = input match { - case None => Future.failed(new IllegalArgumentException(s"source $original was not connected to a spark source")) + case None => + Future.failed(new IllegalArgumentException(s"source $original was not connected to a spark source")) case 
Some(src) => src.read(session, conf) } } @@ -141,67 +166,75 @@ object Op extends Serializable { @transient private val cache = new FutureCache[SparkSession, RDD[_ <: A]] def run(session: SparkSession)(implicit ec: ExecutionContext): Future[RDD[_ <: A]] = - cache.getOrElseUpdate(session, input.run(session).map { rdd => fn(widen(rdd)) }) + cache.getOrElseUpdate(session, input.run(session).map(rdd => fn(widen(rdd)))) } final case class Merged[A](pc: PartitionComputer, left: Op[A], tail: List[Op[A]]) extends Op[A] { @transient private val cache = new FutureCache[SparkSession, RDD[_ <: A]] def run(session: SparkSession)(implicit ec: ExecutionContext): Future[RDD[_ <: A]] = - cache.getOrElseUpdate(session, { - // start running in parallel - val lrdd = left.run(session) - val tailRunning = tail.map(_.run(session)) - tailRunning match { - case Nil => lrdd - case nonEmpty => - // start all the upstream in parallel: - Future.sequence(lrdd :: nonEmpty).map { rdds => - val rddAs = rdds.map(widen[A](_)) - val merged = new UnionRDD(session.sparkContext, rddAs) - val partitions = merged.getNumPartitions - val newPartitions = pc(partitions) - if (newPartitions < partitions) { - merged.coalesce(newPartitions) - } else { - merged + cache.getOrElseUpdate( + session, { + // start running in parallel + val lrdd = left.run(session) + val tailRunning = tail.map(_.run(session)) + tailRunning match { + case Nil => lrdd + case nonEmpty => + // start all the upstream in parallel: + Future.sequence(lrdd :: nonEmpty).map { rdds => + val rddAs = rdds.map(widen[A](_)) + val merged = new UnionRDD(session.sparkContext, rddAs) + val partitions = merged.getNumPartitions + val newPartitions = pc(partitions) + if (newPartitions < partitions) { + merged.coalesce(newPartitions) + } else { + merged + } } - } + } } - }) + ) } - final case class HashJoinOp[A, B, C, D](left: Op[(A, B)], right: Op[(A, C)], joiner: (A, B, Iterable[C]) => Iterator[D]) extends Op[(A, D)] { + final case class HashJoinOp[A, B, C, D]( + left: Op[(A, B)], + right: Op[(A, C)], + joiner: (A, B, Iterable[C]) => Iterator[D] + ) extends Op[(A, D)] { @transient private val cache = new FutureCache[SparkSession, RDD[_ <: (A, D)]] def run(session: SparkSession)(implicit ec: ExecutionContext): Future[RDD[_ <: (A, D)]] = - cache.getOrElseUpdate(session, { - // start running in parallel - val rrdd = right.run(session) - val lrdd = left.run(session) - - rrdd.flatMap { rightRdd => - // TODO: spark has some thing to send replicated data to nodes - // we should materialize the small side, use the above, then - // implement a join using mapPartitions - val rightMap: Map[A, List[C]] = rightRdd - .toLocalIterator - .toList - .groupBy(_._1) - .map { case (k, vs) => (k, vs.map(_._2)) } - - val bcastMap = session.sparkContext.broadcast(rightMap) - lrdd.map { leftrdd => - - val localJoiner = joiner - - leftrdd.mapPartitions({ it: Iterator[(A, B)] => - val rightMap = bcastMap.value - it.flatMap { case (a, b) => localJoiner(a, b, rightMap.getOrElse(a, Nil)).map((a, _)) } - }, preservesPartitioning = true) + cache.getOrElseUpdate( + session, { + // start running in parallel + val rrdd = right.run(session) + val lrdd = left.run(session) + + rrdd.flatMap { rightRdd => + // TODO: spark has some thing to send replicated data to nodes + // we should materialize the small side, use the above, then + // implement a join using mapPartitions + val rightMap: Map[A, List[C]] = rightRdd.toLocalIterator.toList + .groupBy(_._1) + .map { case (k, vs) => (k, vs.map(_._2)) } + + val bcastMap = 
session.sparkContext.broadcast(rightMap) + lrdd.map { leftrdd => + val localJoiner = joiner + + leftrdd.mapPartitions( + { it: Iterator[(A, B)] => + val rightMap = bcastMap.value + it.flatMap { case (a, b) => localJoiner(a, b, rightMap.getOrElse(a, Nil)).map((a, _)) } + }, + preservesPartitioning = true + ) + } } } - }) + ) } } diff --git a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkBackend.scala b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkBackend.scala index 4e2c444ab2..62cb3f9864 100644 --- a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkBackend.scala +++ b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkBackend.scala @@ -1,13 +1,13 @@ package com.twitter.scalding.spark_backend -import com.stripe.dagon.{ FunctionK, Memoize } +import com.stripe.dagon.{FunctionK, Memoize} import com.twitter.algebird.Semigroup import com.twitter.scalding.Config import com.twitter.scalding.typed._ -import com.twitter.scalding.typed.functions.{ DebugFn, FilterKeysToFilter } -import java.util.{ LinkedHashMap => JLinkedHashMap, Map => JMap } +import com.twitter.scalding.typed.functions.{DebugFn, FilterKeysToFilter} +import java.util.{LinkedHashMap => JLinkedHashMap, Map => JMap} import org.apache.spark.storage.StorageLevel -import scala.collection.mutable.{ Map => MMap, ArrayBuffer } +import scala.collection.mutable.{ArrayBuffer, Map => MMap} object SparkPlanner { import SparkMode.SparkConfigMethods @@ -20,18 +20,18 @@ object SparkPlanner { } /** - * A PartitionComputer which returns the desired number of partitions given the configured - * max partition count, reducer scaling factor, number of scalding reducers, and current number of partitions. - * We calculate the desired number of partitions in two stages: - * 1. If the number of scalding reducers is provided, we scale this number by the reducer scaling factor. - * If it is <= 0 or missing, we use the current number of partitions. - * 2. If we have a configured a max number of partitions, we cap the result of 1 by this number. Otherwise, - * just return the result of 1. + * A PartitionComputer which returns the desired number of partitions given the configured max partition + * count, reducer scaling factor, number of scalding reducers, and current number of partitions. We + * calculate the desired number of partitions in two stages: + * 1. If the number of scalding reducers is provided, we scale this number by the reducer scaling factor. + * If it is <= 0 or missing, we use the current number of partitions. 2. If we have a configured a max + * number of partitions, we cap the result of 1 by this number. Otherwise, just return the result of 1. */ - final case class ConfigPartitionComputer(config: Config, scaldingReducers: Option[Int]) extends PartitionComputer { + final case class ConfigPartitionComputer(config: Config, scaldingReducers: Option[Int]) + extends PartitionComputer { def apply(currentNumPartitions: Int): Int = { val maxPartitions = config.getMaxPartitionCount - val getReducerScaling = config.getReducerScaling.getOrElse(1.0D) + val getReducerScaling = config.getReducerScaling.getOrElse(1.0d) val candidate = scaldingReducers match { case None => currentNumPartitions @@ -52,7 +52,9 @@ object SparkPlanner { // we probably always want at least 1 partition 1 } else { - throw new IllegalArgumentException("Got a negative partition count. 
Check configured maxPartitionCount or reducerScaling.") + throw new IllegalArgumentException( + "Got a negative partition count. Check configured maxPartitionCount or reducerScaling." + ) } } } @@ -67,9 +69,8 @@ object SparkPlanner { def toFunction[A] = { case (cp @ CounterPipe(_), rec) => // TODO: counters not yet supported - def go[A](p: CounterPipe[A]): Op[A] = { + def go[A](p: CounterPipe[A]): Op[A] = rec(p.pipe).map(_._1) - } go(cp) case (cp @ CrossPipe(_, _), rec) => def go[A, B](cp: CrossPipe[A, B]): Op[(A, B)] = @@ -112,7 +113,7 @@ object SparkPlanner { val sparkPipe = rec(pipe) config.getForceToDiskPersistMode.getOrElse(StorageLevel.DISK_ONLY) match { case StorageLevel.NONE => sparkPipe - case notNone => sparkPipe.persist(notNone) + case notNone => sparkPipe.persist(notNone) } case (Fork(pipe), rec) => val sparkPipe = rec(pipe) @@ -122,7 +123,7 @@ object SparkPlanner { // or be careful about using forceToDisk config.getForkPersistMode.getOrElse(StorageLevel.NONE) match { case StorageLevel.NONE => sparkPipe - case notNone => sparkPipe.persist(notNone) + case notNone => sparkPipe.persist(notNone) } case (IterablePipe(iterable), _) => Op.FromIterable(iterable) @@ -138,7 +139,7 @@ object SparkPlanner { // but won't otherwise optimize if not given in // one batch OptimizationRules.unrollMerge(m) match { - case Nil => rec(EmptyTypedPipe) + case Nil => rec(EmptyTypedPipe) case h :: Nil => rec(h) case h :: rest => val pc = ConfigPartitionComputer(config, None) @@ -228,7 +229,8 @@ object SparkPlanner { def next = it.next } - case class CachingSum[K, V](capacity: Int, semigroup: Semigroup[V]) extends Function1[Iterator[(K, V)], Iterator[(K, V)]] { + case class CachingSum[K, V](capacity: Int, semigroup: Semigroup[V]) + extends Function1[Iterator[(K, V)], Iterator[(K, V)]] { def newCache(evicted: MMap[K, V]): JMap[K, V] = new JLinkedHashMap[K, V](capacity + 1, 0.75f, true) { override protected def removeEldestEntry(eldest: JMap.Entry[K, V]) = if (super.size > capacity) { @@ -247,7 +249,7 @@ object SparkPlanner { def hasNext = kvs.hasNext || resultIterator.hasNext @annotation.tailrec - def next: (K, V) = { + def next: (K, V) = if (resultIterator.hasNext) { resultIterator.next } else if (kvs.hasNext) { @@ -268,11 +270,10 @@ object SparkPlanner { } else { // time to flush the cache import scala.collection.JavaConverters._ - val cacheIter = currentCache.entrySet.iterator.asScala.map { e => (e.getKey, e.getValue) } + val cacheIter = currentCache.entrySet.iterator.asScala.map(e => (e.getKey, e.getValue)) resultIterator = OnEmptyIterator(cacheIter, () => currentCache.clear()) next } - } } } } @@ -280,7 +281,11 @@ object SparkPlanner { private def planHashJoinable[K, V](hj: HashJoinable[K, V], rec: FunctionK[TypedPipe, Op]): Op[(K, V)] = rec(TypedPipe.ReduceStepPipe(HashJoinable.toReduceStep(hj))) - private def planCoGroup[K, V](config: Config, cg: CoGrouped[K, V], rec: FunctionK[TypedPipe, Op]): Op[(K, V)] = { + private def planCoGroup[K, V]( + config: Config, + cg: CoGrouped[K, V], + rec: FunctionK[TypedPipe, Op] + ): Op[(K, V)] = { import CoGrouped._ cg match { @@ -301,7 +306,7 @@ object SparkPlanner { def planSide[A, B](cg: CoGroupable[A, B]): Op[(A, B)] = cg match { case hg: HashJoinable[A, B] => planHashJoinable(hg, rec) - case cg: CoGrouped[A, B] => planCoGroup(config, cg, rec) + case cg: CoGrouped[A, B] => planCoGroup(config, cg, rec) } def planPair[A, B, C, D](p: Pair[A, B, C, D]): Op[(A, D)] = { @@ -310,14 +315,14 @@ object SparkPlanner { val joinFn = p.fn val pc = 
ConfigPartitionComputer(config, p.reducers) // we repartition in sorted, so no need to repartition in merge - (eleft.merge(IdentityPartitionComputer, eright)) + (eleft + .merge(IdentityPartitionComputer, eright)) .sorted(pc)(p.keyOrdering, JoinOrdering()) .mapPartitions { it => val grouped = Iterators.groupSequential(it) - grouped.flatMap { - case (k, eithers) => - val kfn: Function2[Iterator[B], Iterable[C], Iterator[D]] = joinFn(k, _, _) - JoinIterator[B, C, D](kfn)(eithers).map((k, _)) + grouped.flatMap { case (k, eithers) => + val kfn: Function2[Iterator[B], Iterable[C], Iterator[D]] = joinFn(k, _, _) + JoinIterator[B, C, D](kfn)(eithers).map((k, _)) } } } @@ -340,20 +345,20 @@ object SparkPlanner { else 0 } - case class JoinIterator[A, B, C](fn: (Iterator[A], Iterable[B]) => Iterator[C]) extends Function1[Iterator[Either[A, B]], Iterator[C]] { + case class JoinIterator[A, B, C](fn: (Iterator[A], Iterable[B]) => Iterator[C]) + extends Function1[Iterator[Either[A, B]], Iterator[C]] { @SuppressWarnings(Array("org.wartremover.warts.EitherProjectionPartial")) def apply(eitherIter: Iterator[Either[A, B]]) = { val buffered = eitherIter.buffered val bs: Iterable[B] = { @annotation.tailrec - def loop(buf: ArrayBuffer[B]): Iterable[B] = { + def loop(buf: ArrayBuffer[B]): Iterable[B] = if (buffered.isEmpty) buf else if (buffered.head.isLeft) buf else { buf += buffered.next.right.get loop(buf) } - } loop(ArrayBuffer()) } val iterA: Iterator[A] = buffered.map(_.left.get) diff --git a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkMode.scala b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkMode.scala index f5f67374d2..29e5ff0593 100644 --- a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkMode.scala +++ b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkMode.scala @@ -1,31 +1,34 @@ package com.twitter.scalding.spark_backend -import com.twitter.scalding.{Config, Mode, WritableSequenceFile, TextLine} -import com.twitter.scalding.typed.{ Resolver, TypedSource, TypedSink } +import com.twitter.scalding.{Config, Mode, TextLine, WritableSequenceFile} +import com.twitter.scalding.typed.{Resolver, TypedSink, TypedSource} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.SparkSession -import scala.concurrent.{ Future, ExecutionContext } +import scala.concurrent.{ExecutionContext, Future} import scala.reflect.ClassTag import org.apache.hadoop.io.Writable -case class SparkMode(session: SparkSession, sources: Resolver[TypedSource, SparkSource], sink: Resolver[TypedSink, SparkSink]) extends Mode { +case class SparkMode( + session: SparkSession, + sources: Resolver[TypedSource, SparkSource], + sink: Resolver[TypedSink, SparkSink] +) extends Mode { def newWriter(): SparkWriter = new SparkWriter(this) } object SparkMode { + /** - * A Sparkmode with no sources or sink support. - * Only useful for testing, or working flows that never - * read or write from disk + * A Sparkmode with no sources or sink support. 
Only useful for testing, or working flows that never read or + * write from disk */ def empty(session: SparkSession): SparkMode = SparkMode(session, Resolver.empty, Resolver.empty) /** - * set up the default sources and sinks, which support - * some of scalding's built in sources and sinks + * set up the default sources and sinks, which support some of scalding's built in sources and sinks */ def default(session: SparkSession): SparkMode = SparkMode(session, SparkSource.Default, SparkSink.Default) @@ -67,52 +70,61 @@ trait SparkSource[+A] extends Serializable { object SparkSource extends Serializable { def textLine(path: String, parts: Option[Int]): SparkSource[String] = new SparkSource[String] { - override def read(session: SparkSession, config: Config)(implicit ec: ExecutionContext): Future[RDD[_ <: String]] = { + override def read(session: SparkSession, config: Config)(implicit + ec: ExecutionContext + ): Future[RDD[_ <: String]] = { val partitions = parts.orElse(config.getNumReducers).getOrElse(10) Future(session.sparkContext.textFile(path, partitions)) } } - def writableSequenceFile[K <: Writable, V <: Writable](path: String, - kclass: Class[K], - vclass: Class[V]): SparkSource[(K, V)] = new SparkSource[(K, V)] { - override def read(session: SparkSession, config: Config)(implicit ec: ExecutionContext): Future[RDD[_ <: (K, V)]] = { + def writableSequenceFile[K <: Writable, V <: Writable]( + path: String, + kclass: Class[K], + vclass: Class[V] + ): SparkSource[(K, V)] = new SparkSource[(K, V)] { + override def read(session: SparkSession, config: Config)(implicit + ec: ExecutionContext + ): Future[RDD[_ <: (K, V)]] = Future(session.sparkContext.sequenceFile[K, V](path, kclass, vclass)) - } } /** - * This has a mappings for some built in scalding sources - * currently only WritableSequenceFile and TextLine are supported + * This has a mappings for some built in scalding sources currently only WritableSequenceFile and TextLine + * are supported * * users can add their own implementations and compose Resolvers using orElse */ val Default: Resolver[TypedSource, SparkSource] = new Resolver[TypedSource, SparkSource] { - def apply[A](i: TypedSource[A]): Option[SparkSource[A]] = { + def apply[A](i: TypedSource[A]): Option[SparkSource[A]] = i match { case ws @ WritableSequenceFile(path, _, _) => - Some(writableSequenceFile(path, ws.keyType, ws.valueType)) + Some(writableSequenceFile(path, ws.keyType, ws.valueType)) case tl: TextLine => // actually only one path: Some(textLine(tl.localPaths.head, None)) case _ => None } - } } } trait SparkSink[-A] extends Serializable { - def write(session: SparkSession, config: Config, rdd: RDD[_ <: A])(implicit ec: ExecutionContext): Future[Unit] + def write(session: SparkSession, config: Config, rdd: RDD[_ <: A])(implicit + ec: ExecutionContext + ): Future[Unit] } object SparkSink extends Serializable { def writableSequenceFile[K <: Writable, V <: Writable]( - path: String, - keyClass: Class[K], - valClass: Class[V]): SparkSink[(K, V)] = new SparkSink[(K, V)] { - override def write(session: SparkSession, config: Config, rdd: RDD[_ <: (K, V)])(implicit ec: ExecutionContext): Future[Unit] = { + path: String, + keyClass: Class[K], + valClass: Class[V] + ): SparkSink[(K, V)] = new SparkSink[(K, V)] { + override def write(session: SparkSession, config: Config, rdd: RDD[_ <: (K, V)])(implicit + ec: ExecutionContext + ): Future[Unit] = { // first widen to (K, V) implicit val ck: ClassTag[K] = ClassTag[K](keyClass) implicit val cv: ClassTag[V] = ClassTag[V](valClass) 
@@ -122,19 +134,21 @@ object SparkSink extends Serializable { def textLine(path: String): SparkSink[String] = new SparkSink[String] { - override def write(session: SparkSession, config: Config, rdd: RDD[_ <: String])(implicit ec: ExecutionContext): Future[Unit] = { + override def write(session: SparkSession, config: Config, rdd: RDD[_ <: String])(implicit + ec: ExecutionContext + ): Future[Unit] = Future(rdd.saveAsTextFile(path)) - } } + /** - * This has a mappings for some built in scalding sinks - * currently only WritableSequenceFile and TextLine are supported + * This has a mappings for some built in scalding sinks currently only WritableSequenceFile and TextLine are + * supported * * users can add their own implementations and compose Resolvers using orElse */ val Default: Resolver[TypedSink, SparkSink] = new Resolver[TypedSink, SparkSink] { - def apply[A](i: TypedSink[A]): Option[SparkSink[A]] = { + def apply[A](i: TypedSink[A]): Option[SparkSink[A]] = i match { case ws @ WritableSequenceFile(path, fields, sinkMode) => Some(writableSequenceFile(path, ws.keyType, ws.valueType).asInstanceOf[SparkSink[A]]) @@ -143,7 +157,5 @@ object SparkSink extends Serializable { case _ => None } - } } } - diff --git a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkWriter.scala b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkWriter.scala index 776c32d7e4..fa93a11169 100644 --- a/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkWriter.scala +++ b/scalding-spark/src/main/scala/com/twitter/scalding/spark_backend/SparkWriter.scala @@ -1,18 +1,18 @@ package com.twitter.scalding.spark_backend import cascading.flow.FlowDef -import com.stripe.dagon.{ HMap, Rule } +import com.stripe.dagon.{HMap, Rule} import com.twitter.scalding.typed._ import com.twitter.scalding.Mode import com.twitter.scalding.typed.memory_backend.AtomicBox -import com.twitter.scalding.{ Config, Execution, ExecutionCounters, CancellationHandler, CFuture} +import com.twitter.scalding.{CFuture, CancellationHandler, Config, Execution, ExecutionCounters} import org.apache.spark.sql.SparkSession import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel -import scala.concurrent.{ Future, ExecutionContext, Promise } +import scala.concurrent.{ExecutionContext, Future, Promise} import java.util.concurrent.atomic.AtomicLong -import Execution.{ ToWrite, Writer } +import Execution.{ToWrite, Writer} import SparkMode.SparkConfigMethods @@ -36,28 +36,29 @@ class SparkWriter(val sparkMode: SparkMode) extends Writer { type WorkVal[+A] = (TypedSource[A], Future[RDD[_ <: A]]) private[this] case class State( - id: Long, - sources: Resolver[TypedSource, SparkSource], - initToOpt: HMap[StateKey, TypedPipe], - forcedPipes: HMap[StateKey, WorkVal]) { + id: Long, + sources: Resolver[TypedSource, SparkSource], + initToOpt: HMap[StateKey, TypedPipe], + forcedPipes: HMap[StateKey, WorkVal] + ) { /** - * Returns true if we actually add this optimized pipe. We do this - * because we don't want to take the side effect twice. + * Returns true if we actually add this optimized pipe. We do this because we don't want to take the side + * effect twice. 
*/ def addForce[T]( - c: Config, - init: TypedPipe[T], - opt: TypedPipe[T], - rdd: Future[RDD[_ <: T]], - persist: Option[StorageLevel])(implicit ec: ExecutionContext): (State, Boolean) = - + c: Config, + init: TypedPipe[T], + opt: TypedPipe[T], + rdd: Future[RDD[_ <: T]], + persist: Option[StorageLevel] + )(implicit ec: ExecutionContext): (State, Boolean) = forcedPipes.get((c, opt)) match { case None => // we have not previously forced this source val forcedRdd: Future[RDD[_ <: T]] = persist match { - case None => rdd + case None => rdd case Some(level) => rdd.map(_.persist(level)) } val ssrc: SparkSource[T] = materializedSource[T](forcedRdd) @@ -68,10 +69,7 @@ class SparkWriter(val sparkMode: SparkMode) extends Writer { val newForced = forcedPipes + ((c, opt) -> workVal) val newInitToOpt = initToOpt + ((c, init) -> opt) - (copy( - sources = newSources, - forcedPipes = newForced, - initToOpt = newInitToOpt), true) + (copy(sources = newSources, forcedPipes = newForced, initToOpt = newInitToOpt), true) case Some(_) => (copy(initToOpt = initToOpt + ((c, init) -> opt)), false) } @@ -99,7 +97,9 @@ class SparkWriter(val sparkMode: SparkMode) extends Writer { } // This should be called after a pipe has been forced - def write[T](c: Config, init: TypedPipe[T], sink: TypedSink[T])(implicit ec: ExecutionContext): Future[Unit] = + def write[T](c: Config, init: TypedPipe[T], sink: TypedSink[T])(implicit + ec: ExecutionContext + ): Future[Unit] = sparkMode.sink(sink) match { case None => Future.failed(new Exception(s"unknown sink: $sink when writing $init")) case Some(ssink) => @@ -118,33 +118,40 @@ class SparkWriter(val sparkMode: SparkMode) extends Writer { private def materializedSource[A](persisted: Future[RDD[_ <: A]]): SparkSource[A] = new SparkSource[A] { def read(s: SparkSession, config: Config)(implicit ec: ExecutionContext): Future[RDD[_ <: A]] = - if (session != s) Future.failed(new Exception("SparkSession has changed, illegal state. You must not share TypedPipes across Execution runs")) + if (session != s) + Future.failed( + new Exception( + "SparkSession has changed, illegal state. You must not share TypedPipes across Execution runs" + ) + ) else { persisted } } - def finished(): Unit = { + def finished(): Unit = state.set(null) - } - def getForced[T](conf: Config, initial: TypedPipe[T])(implicit cec: ExecutionContext): Future[TypedPipe[T]] = + def getForced[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ExecutionContext + ): Future[TypedPipe[T]] = state.get().getForced(conf, initial) - def getIterable[T](conf: Config, initial: TypedPipe[T])(implicit cec: ExecutionContext): Future[Iterable[T]] = + def getIterable[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ExecutionContext + ): Future[Iterable[T]] = state.get().getIterable(conf, initial) def start(): Unit = () /** - * do a batch of writes, possibly optimizing, and return a new unique - * Long. + * do a batch of writes, possibly optimizing, and return a new unique Long. 
* * empty writes are legitimate and should still return a Long */ - def execute( - conf: Config, - writes: List[ToWrite[_]])(implicit cec: ExecutionContext): CFuture[(Long, ExecutionCounters)] = { + def execute(conf: Config, writes: List[ToWrite[_]])(implicit + cec: ExecutionContext + ): CFuture[(Long, ExecutionCounters)] = { val planner = SparkPlanner.plan(conf, sparkMode.sources.orElse(state.get().sources)) @@ -165,7 +172,8 @@ class SparkWriter(val sparkMode: SparkMode) extends Writer { init = keyPipe, opt = opt, promise.future, - conf.getForceToDiskPersistMode.orElse(Some(StorageLevel.DISK_ONLY))) + conf.getForceToDiskPersistMode.orElse(Some(StorageLevel.DISK_ONLY)) + ) def action = () => { // actually run val op = planner(opt) @@ -175,7 +183,12 @@ class SparkWriter(val sparkMode: SparkMode) extends Writer { } (newState, if (added) action else emptyAction) } - def write[T](opt: TypedPipe[T], keyPipe: TypedPipe[T], sink: TypedSink[T], oldState: State): (State, Action) = { + def write[T]( + opt: TypedPipe[T], + keyPipe: TypedPipe[T], + sink: TypedSink[T], + oldState: State + ): (State, Action) = { val promise = Promise[RDD[_ <: T]]() val (newState, added) = oldState.addForce[T](conf, init = keyPipe, opt = opt, promise.future, None) val action = () => { @@ -186,8 +199,7 @@ class SparkWriter(val sparkMode: SparkMode) extends Writer { val rddF = op.run(session) promise.completeWith(rddF) rddF.map(_ => ()) - } - else Future.successful(()) + } else Future.successful(()) rddF.flatMap(_ => newState.write(conf, keyPipe, sink)) } @@ -195,15 +207,15 @@ class SparkWriter(val sparkMode: SparkMode) extends Writer { } /** - * We keep track of the actions to avoid calling run on any RDDs - * until we have fully built the entire next state + * We keep track of the actions to avoid calling run on any RDDs until we have fully built the entire next + * state */ val (id: Long, acts) = state.update { s => val (nextState, acts) = optimizedWrites.foldLeft((s, List.empty[Action])) { - case (old@(state, acts), OptimizedWrite(pipe, Force(opt))) => + case (old @ (state, acts), OptimizedWrite(pipe, Force(opt))) => val (st, a) = force(opt, pipe, state) (st, a :: acts) - case (old@(state, acts), OptimizedWrite(pipe, ToIterable(opt))) => + case (old @ (state, acts), OptimizedWrite(pipe, ToIterable(opt))) => val (st, a) = force(opt, pipe, state) (st, a :: acts) case ((state, acts), OptimizedWrite(pipe, ToWrite.SimpleWrite(opt, sink))) => @@ -213,6 +225,6 @@ class SparkWriter(val sparkMode: SparkMode) extends Writer { (nextState.copy(id = nextState.id + 1), (nextState.id, acts)) } // now we run the actions: - CFuture.uncancellable(Future.traverse(acts) { fn => fn() }.map(_ => (id, ExecutionCounters.empty))) + CFuture.uncancellable(Future.traverse(acts)(fn => fn()).map(_ => (id, ExecutionCounters.empty))) } } diff --git a/scalding-spark/src/test/scala/com/twitter/scalding/spark_backend/SparkBackendTests.scala b/scalding-spark/src/test/scala/com/twitter/scalding/spark_backend/SparkBackendTests.scala index 5b68fe6782..f6638d1e2d 100644 --- a/scalding-spark/src/test/scala/com/twitter/scalding/spark_backend/SparkBackendTests.scala +++ b/scalding-spark/src/test/scala/com/twitter/scalding/spark_backend/SparkBackendTests.scala @@ -1,11 +1,11 @@ package com.twitter.scalding.spark_backend -import org.scalatest.{ BeforeAndAfter, FunSuite, PropSpec } +import org.scalatest.{BeforeAndAfter, FunSuite, PropSpec} import org.apache.hadoop.io.IntWritable import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import 
com.twitter.algebird.Monoid -import com.twitter.scalding.{ Config, Execution, TextLine, WritableSequenceFile } +import com.twitter.scalding.{Config, Execution, TextLine, WritableSequenceFile} import com.twitter.scalding.typed._ import com.twitter.scalding.typed.memory_backend.MemoryMode import java.io.File @@ -37,7 +37,10 @@ class SparkBackendTests extends FunSuite with BeforeAndAfter { new SparkConf() .setMaster(master) .setAppName(appName) - .set("spark.driver.host", "localhost") // this is needed to work on OSX when disconnected from the network + .set( + "spark.driver.host", + "localhost" + ) // this is needed to work on OSX when disconnected from the network session = SparkSession.builder.config(conf).getOrCreate() } @@ -47,7 +50,11 @@ class SparkBackendTests extends FunSuite with BeforeAndAfter { session = null } - def sparkMatchesIterable[A: Ordering](t: Execution[Iterable[A]], iter: Iterable[A], conf: Config = Config.empty) = { + def sparkMatchesIterable[A: Ordering]( + t: Execution[Iterable[A]], + iter: Iterable[A], + conf: Config = Config.empty + ) = { val smode = SparkMode.default(session) val semit = t.waitFor(conf, smode).get @@ -55,17 +62,22 @@ class SparkBackendTests extends FunSuite with BeforeAndAfter { } def sparkMatchesMemory[A: Ordering](t: TypedPipe[A]) = - sparkMatchesIterable(t.toIterableExecution, - t.toIterableExecution.waitFor(Config.empty, MemoryMode.empty).get) + sparkMatchesIterable( + t.toIterableExecution, + t.toIterableExecution.waitFor(Config.empty, MemoryMode.empty).get + ) test("some basic map-only operations work") { sparkMatchesMemory(TypedPipe.from(0 to 100)) sparkMatchesMemory(TypedPipe.from(0 to 100).map(_ * 2)) - sparkMatchesMemory(TypedPipe.from(0 to 100).map { x => (x, x * Int.MaxValue) }) - - sparkMatchesMemory(TypedPipe.from(0 to 100) - .map { x => (x, x * Int.MaxValue) } - .filter { case (k, v) => k > v }) + sparkMatchesMemory(TypedPipe.from(0 to 100).map(x => (x, x * Int.MaxValue))) + + sparkMatchesMemory( + TypedPipe + .from(0 to 100) + .map(x => (x, x * Int.MaxValue)) + .filter { case (k, v) => k > v } + ) } test("test with map-only with merge") { @@ -79,7 +91,7 @@ class SparkBackendTests extends FunSuite with BeforeAndAfter { sparkMatchesMemory { val input = TypedPipe.from(0 to 1000) // many merges - Monoid.sum((2 to 8).map { i => input.filter(_ % i == 0) }) + Monoid.sum((2 to 8).map(i => input.filter(_ % i == 0))) } } @@ -134,85 +146,98 @@ class SparkBackendTests extends FunSuite with BeforeAndAfter { } def tmpPath(suffix: String): String = - Paths.get(System.getProperty("java.io.tmpdir"), - "scalding", - "spark_backend", - suffix).toString + Paths.get(System.getProperty("java.io.tmpdir"), "scalding", "spark_backend", suffix).toString test("writeExecution works with TextLine") { val path = tmpPath("textline") - sparkMatchesIterable({ - val loc = TextLine(path) - val input = TypedPipe.from(0 to 100000) - input.groupBy(_ % 2) - .sorted - .foldLeft(0)(_ - _) - .toTypedPipe - .map(_.toString) - .writeExecution(loc) - .flatMap { _ => - TypedPipe.from(loc).toIterableExecution - } - - }, (0 to 100000).groupBy(_ % 2).mapValues(_.foldLeft(0)(_ - _)).map(_.toString)) + sparkMatchesIterable( + { + val loc = TextLine(path) + val input = TypedPipe.from(0 to 100000) + input + .groupBy(_ % 2) + .sorted + .foldLeft(0)(_ - _) + .toTypedPipe + .map(_.toString) + .writeExecution(loc) + .flatMap { _ => + TypedPipe.from(loc).toIterableExecution + } + + }, + (0 to 100000).groupBy(_ % 2).mapValues(_.foldLeft(0)(_ - _)).map(_.toString) + ) removeDir(path) } 
test("writeExecution works with IntWritable") { val path = tmpPath("int_writable") - sparkMatchesIterable({ - val loc = WritableSequenceFile[IntWritable, IntWritable](path) - val input = TypedPipe.from(0 to 100000) - input.groupBy(_ % 2) - .sorted - .foldLeft(0)(_ - _) - .toTypedPipe - .map { case (k, v) => (new IntWritable(k), new IntWritable(v)) } - .writeExecution(loc) - .flatMap { _ => - TypedPipe.from(loc) - .map { case (k, v) => (k.get, v.get) } - .toIterableExecution - } - - }, (0 to 100000).groupBy(_ % 2).mapValues(_.foldLeft(0)(_ - _))) + sparkMatchesIterable( + { + val loc = WritableSequenceFile[IntWritable, IntWritable](path) + val input = TypedPipe.from(0 to 100000) + input + .groupBy(_ % 2) + .sorted + .foldLeft(0)(_ - _) + .toTypedPipe + .map { case (k, v) => (new IntWritable(k), new IntWritable(v)) } + .writeExecution(loc) + .flatMap { _ => + TypedPipe.from(loc).map { case (k, v) => (k.get, v.get) }.toIterableExecution + } + + }, + (0 to 100000).groupBy(_ % 2).mapValues(_.foldLeft(0)(_ - _)) + ) removeDir(path) } test("forceToDisk works") { - sparkMatchesIterable({ - val input = TypedPipe.from(0 to 100000) - input.groupBy(_ % 2) - .sorted - .foldLeft(0)(_ - _) - .toTypedPipe - .map(_.toString) - .forceToDiskExecution - .flatMap(_.toIterableExecution) - - }, (0 to 100000).groupBy(_ % 2).mapValues(_.foldLeft(0)(_ - _)).map(_.toString)) + sparkMatchesIterable( + { + val input = TypedPipe.from(0 to 100000) + input + .groupBy(_ % 2) + .sorted + .foldLeft(0)(_ - _) + .toTypedPipe + .map(_.toString) + .forceToDiskExecution + .flatMap(_.toIterableExecution) + + }, + (0 to 100000).groupBy(_ % 2).mapValues(_.foldLeft(0)(_ - _)).map(_.toString) + ) } test("forceToDisk works with no persistance") { - sparkMatchesIterable({ - val input = TypedPipe.from(0 to 100000) - input.groupBy(_ % 2) - .sorted - .foldLeft(0)(_ - _) - .toTypedPipe - .map(_.toString) - .forceToDisk - .toIterableExecution - - }, (0 to 100000).groupBy(_ % 2).mapValues(_.foldLeft(0)(_ - _)).map(_.toString), - Config.empty.setForceToDiskPersistMode("NONE")) + sparkMatchesIterable( + { + val input = TypedPipe.from(0 to 100000) + input + .groupBy(_ % 2) + .sorted + .foldLeft(0)(_ - _) + .toTypedPipe + .map(_.toString) + .forceToDisk + .toIterableExecution + + }, + (0 to 100000).groupBy(_ % 2).mapValues(_.foldLeft(0)(_ - _)).map(_.toString), + Config.empty.setForceToDiskPersistMode("NONE") + ) } } class ConfigPartitionComputerTest extends PropSpec with PropertyChecks { - property("when no config or number of reducers are given, returns the current number of partitions (or 1)") { + property( + "when no config or number of reducers are given, returns the current number of partitions (or 1)" + ) { val pc = ConfigPartitionComputer(Config.empty, None) forAll { i: Int => if (i >= 1) assert(pc(i) == i) diff --git a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/Macros.scala b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/Macros.scala index c4473694cb..654a083d04 100644 --- a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/Macros.scala +++ b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/Macros.scala @@ -18,7 +18,7 @@ package com.twitter.scalding.thrift.macros import com.twitter.scalding.serialization.OrderedSerialization import com.twitter.scalding.thrift.macros.impl.ScroogeInternalOrderedSerializationImpl -import scala.language.experimental.{ macros => sMacros } +import scala.language.experimental.{macros => sMacros} object 
Macros { implicit def scroogeOrdSer[T]: OrderedSerialization[T] = macro ScroogeInternalOrderedSerializationImpl[T] diff --git a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/RequiredBinaryComparators.scala b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/RequiredBinaryComparators.scala index 3d5878d8c4..cceb2caf01 100644 --- a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/RequiredBinaryComparators.scala +++ b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/RequiredBinaryComparators.scala @@ -1,13 +1,14 @@ package com.twitter.scalding.thrift.macros -import com.twitter.scalding.serialization.{ OrderedSerialization, RequiredBinaryComparatorsConfig } +import com.twitter.scalding.serialization.{OrderedSerialization, RequiredBinaryComparatorsConfig} import com.twitter.scalding.thrift.macros.impl.ScroogeInternalOrderedSerializationImpl -import scala.language.experimental.{ macros => smacros } +import scala.language.experimental.{macros => smacros} /** - * Provides support for Scrooge classes in addition to primitives, cases classes, tuples etc. Use this - * if you use Scrooge classes as `key` in your scalding job. - * @author Mansur Ashraf. + * Provides support for Scrooge classes in addition to primitives, cases classes, tuples etc. Use this if you + * use Scrooge classes as `key` in your scalding job. + * @author + * Mansur Ashraf. */ trait RequiredBinaryComparators extends RequiredBinaryComparatorsConfig { implicit def ordSer[T]: OrderedSerialization[T] = macro ScroogeInternalOrderedSerializationImpl[T] diff --git a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ScroogeInternalOrderedSerializationImpl.scala b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ScroogeInternalOrderedSerializationImpl.scala index 3147d3902f..c930ad0a17 100644 --- a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ScroogeInternalOrderedSerializationImpl.scala +++ b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ScroogeInternalOrderedSerializationImpl.scala @@ -18,7 +18,12 @@ package com.twitter.scalding.thrift.macros.impl import com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl import com.twitter.scalding.serialization.macros.impl.ordered_serialization._ import com.twitter.scalding.serialization.OrderedSerialization -import com.twitter.scalding.thrift.macros.impl.ordered_serialization.{ ScroogeEnumOrderedBuf, ScroogeUnionOrderedBuf, ScroogeOrderedBuf, ScroogeOuterOrderedBuf } +import com.twitter.scalding.thrift.macros.impl.ordered_serialization.{ + ScroogeEnumOrderedBuf, + ScroogeOrderedBuf, + ScroogeOuterOrderedBuf, + ScroogeUnionOrderedBuf +} import scala.reflect.macros.Context @@ -34,12 +39,14 @@ object ScroogeInternalOrderedSerializationImpl { // which will inject an implicit lazy val for a new OrderedSerialization and then exit the macro. // This avoids methods becoming too long via inlining. 
private def baseScroogeDispatcher(c: Context): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { - def buildDispatcher: PartialFunction[c.Type, TreeOrderedBuf[c.type]] = ScroogeInternalOrderedSerializationImpl.innerDispatcher(c) + def buildDispatcher: PartialFunction[c.Type, TreeOrderedBuf[c.type]] = + ScroogeInternalOrderedSerializationImpl.innerDispatcher(c) val scroogeEnumDispatcher = ScroogeEnumOrderedBuf.dispatch(c) val scroogeUnionDispatcher = ScroogeUnionOrderedBuf.dispatch(c)(buildDispatcher) val scroogeOuterOrderedBuf = ScroogeOuterOrderedBuf.dispatch(c) - OrderedSerializationProviderImpl.normalizedDispatcher(c)(buildDispatcher) + OrderedSerializationProviderImpl + .normalizedDispatcher(c)(buildDispatcher) .orElse(scroogeEnumDispatcher) .orElse(scroogeUnionDispatcher) .orElse(scroogeOuterOrderedBuf) @@ -50,8 +57,8 @@ object ScroogeInternalOrderedSerializationImpl { import c.universe._ baseScroogeDispatcher(c) .orElse(OrderedSerializationProviderImpl.fallbackImplicitDispatcher(c)) - .orElse { - case tpe: Type => c.abort(c.enclosingPosition, s"""Unable to find OrderedSerialization for type ${tpe}""") + .orElse { case tpe: Type => + c.abort(c.enclosingPosition, s"""Unable to find OrderedSerialization for type $tpe""") } } @@ -61,11 +68,12 @@ object ScroogeInternalOrderedSerializationImpl { private def outerDispatcher(c: Context): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { import c.universe._ - OrderedSerializationProviderImpl.normalizedDispatcher(c)(ScroogeInternalOrderedSerializationImpl.outerDispatcher(c)) + OrderedSerializationProviderImpl + .normalizedDispatcher(c)(ScroogeInternalOrderedSerializationImpl.outerDispatcher(c)) .orElse(ScroogeOrderedBuf.dispatch(c)(baseScroogeDispatcher(c))) .orElse(baseScroogeDispatcher(c)) - .orElse { - case tpe: Type => c.abort(c.enclosingPosition, s"""Unable to find OrderedSerialization for type ${tpe}""") + .orElse { case tpe: Type => + c.abort(c.enclosingPosition, s"""Unable to find OrderedSerialization for type $tpe""") } } diff --git a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeEnumOrderedBuf.scala b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeEnumOrderedBuf.scala index ee666d153f..4d7f4c64d8 100644 --- a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeEnumOrderedBuf.scala +++ b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeEnumOrderedBuf.scala @@ -55,9 +55,9 @@ object ScroogeEnumOrderedBuf { _root_.java.lang.Integer.compare($elementA.value, $elementB.value) : _root_.scala.Int """ - override def length(element: Tree): CompileTimeLengthTypes[c.type] = CompileTimeLengthTypes.FastLengthCalculation(c)(q"posVarIntSize($element.value)") + override def length(element: Tree): CompileTimeLengthTypes[c.type] = + CompileTimeLengthTypes.FastLengthCalculation(c)(q"posVarIntSize($element.value)") override val lazyOuterVariables: Map[String, ctx.Tree] = Map.empty } } } - diff --git a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeOrderedBuf.scala b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeOrderedBuf.scala index 4e70f98b12..c7a5301f4f 100644 --- a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeOrderedBuf.scala +++ 
b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeOrderedBuf.scala @@ -16,7 +16,7 @@ package com.twitter.scalding.thrift.macros.impl.ordered_serialization import com.twitter.scalding.serialization.macros.impl.ordered_serialization._ -import com.twitter.scrooge.{ ThriftStruct, ThriftUnion } +import com.twitter.scrooge.{ThriftStruct, ThriftUnion} import scala.reflect.macros.Context @@ -26,18 +26,27 @@ import scala.reflect.macros.Context fields named after the thrift fields. So we look at the companion object to figure out those fields names. Then we scan the trait for those methods to build the similar listing as is used in products. Other than that we use the same constructor approach to case classes in calling the companion object over calling new on the trait - */ + */ object ScroogeOrderedBuf { - def dispatch(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + def dispatch(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { import c.universe._ val pf: PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { - case tpe if tpe <:< typeOf[ThriftStruct] && !(tpe =:= typeOf[ThriftStruct]) && !(tpe <:< typeOf[ThriftUnion]) => ScroogeOrderedBuf(c)(buildDispatcher, tpe) + case tpe + if tpe <:< typeOf[ThriftStruct] && !(tpe =:= typeOf[ThriftStruct]) && !(tpe <:< typeOf[ + ThriftUnion + ]) => + ScroogeOrderedBuf(c)(buildDispatcher, tpe) } pf } - def apply(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], outerType: c.Type): TreeOrderedBuf[c.type] = { + def apply(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], + outerType: c.Type + ): TreeOrderedBuf[c.type] = { import c.universe._ def freshT(id: String) = newTermName(c.fresh(id)) @@ -45,8 +54,7 @@ object ScroogeOrderedBuf { val companionSymbol = outerType.typeSymbol.companionSymbol - val fieldNames: List[String] = companionSymbol.asModule.moduleClass.asType.toType - .declarations + val fieldNames: List[String] = companionSymbol.asModule.moduleClass.asType.toType.declarations .filter(_.name.decoded.endsWith("Field ")) .collect { case s: TermSymbol => s } .filter(_.isStatic) @@ -54,18 +62,19 @@ object ScroogeOrderedBuf { .map { t => val decodedName = t.name.decoded // Looks like "MethodNameField " decodedName.dropRight(6).toLowerCase // These things end in "Field " , yes there is a space in there - }.toList + } + .toList val elementData: List[(c.universe.Type, TermName, TreeOrderedBuf[c.type])] = - outerType - .declarations + outerType.declarations .collect { case m: MethodSymbol => m } .filter(m => fieldNames.contains(m.name.toString.toLowerCase)) .map { accessorMethod => val fieldType = accessorMethod.returnType.asSeenFrom(outerType, outerType.typeSymbol.asClass) val b: TreeOrderedBuf[c.type] = dispatcher(fieldType) (fieldType, accessorMethod.name, b) - }.toList + } + .toList new TreeOrderedBuf[c.type] { override val ctx: c.type = c @@ -80,19 +89,18 @@ object ScroogeOrderedBuf { override def get(inputStream: ctx.TermName): ctx.Tree = { - val getValProcessor = elementData.map { - case (tpe, accessorSymbol, tBuf) => - val curR = freshT("curR") - val builderTree = q""" + val getValProcessor = elementData.map { case (tpe, accessorSymbol, tBuf) => + val curR = freshT("curR") + val builderTree = q""" val $curR: ${tBuf.tpe} = { ${tBuf.get(inputStream)} } """ - 
(builderTree, curR) + (builderTree, curR) } q""" ..${getValProcessor.map(_._1)} - ${companionSymbol}(..${getValProcessor.map(_._2)}) : $outerType + $companionSymbol(..${getValProcessor.map(_._2)}) : $outerType """ } override def compare(elementA: ctx.TermName, elementB: ctx.TermName): ctx.Tree = @@ -106,4 +114,3 @@ object ScroogeOrderedBuf { } } } - diff --git a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeOuterOrderedBuf.scala b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeOuterOrderedBuf.scala index b5239f9919..a2011c8776 100644 --- a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeOuterOrderedBuf.scala +++ b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeOuterOrderedBuf.scala @@ -16,7 +16,7 @@ package com.twitter.scalding.thrift.macros.impl.ordered_serialization import com.twitter.scalding.serialization.macros.impl.ordered_serialization._ -import com.twitter.scrooge.{ ThriftStruct, ThriftUnion } +import com.twitter.scrooge.{ThriftStruct, ThriftUnion} import scala.reflect.macros.Context @@ -24,7 +24,7 @@ import scala.reflect.macros.Context ScroogeOuterOrderedBuf is a short cut to stop the macro's recursing onto nested thrift structs. An inner one like this puts an outer implicit variable in the current closure. The next pass from the compiler will trigger the macro again to build a new class for it. -*/ + */ object ScroogeOuterOrderedBuf { // This intentionally handles thrift structs, but not unions, since we want to break out in the struct but not the union // That way we can inject all the sub types of the union as implicits into the outer thrift struct. 
@@ -32,7 +32,8 @@ object ScroogeOuterOrderedBuf { import c.universe._ val pf: PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { - case tpe if tpe <:< typeOf[ThriftStruct] && !(tpe <:< typeOf[ThriftUnion]) => ScroogeOuterOrderedBuf(c)(tpe) + case tpe if tpe <:< typeOf[ThriftStruct] && !(tpe <:< typeOf[ThriftUnion]) => + ScroogeOuterOrderedBuf(c)(tpe) } pf } @@ -44,7 +45,8 @@ object ScroogeOuterOrderedBuf { val variableID = (outerType.typeSymbol.fullName.hashCode.toLong + Int.MaxValue.toLong).toString val variableNameStr = s"bufferable_$variableID" val variableName = newTermName(variableNameStr) - val implicitInstanciator = q"""_root_.scala.Predef.implicitly[_root_.com.twitter.scalding.serialization.OrderedSerialization[$outerType]]""" + val implicitInstanciator = + q"""_root_.scala.Predef.implicitly[_root_.com.twitter.scalding.serialization.OrderedSerialization[$outerType]]""" new TreeOrderedBuf[c.type] { override val ctx: c.type = c @@ -80,4 +82,3 @@ object ScroogeOuterOrderedBuf { } } } - diff --git a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeUnionOrderedBuf.scala b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeUnionOrderedBuf.scala index 80b760e2d4..52e2c26fcd 100644 --- a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeUnionOrderedBuf.scala +++ b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/ScroogeUnionOrderedBuf.scala @@ -22,18 +22,25 @@ import com.twitter.scrooge.ThriftUnion import scala.reflect.macros.Context object ScroogeUnionOrderedBuf { - def dispatch(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]]): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { + def dispatch(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]] + ): PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { import c.universe._ val pf: PartialFunction[c.Type, TreeOrderedBuf[c.type]] = { - case tpe if tpe <:< typeOf[ThriftUnion] && - (tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isTrait) && - !tpe.typeSymbol.asClass.knownDirectSubclasses.isEmpty => ScroogeUnionOrderedBuf(c)(buildDispatcher, tpe) + case tpe + if tpe <:< typeOf[ThriftUnion] && + (tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isTrait) && + !tpe.typeSymbol.asClass.knownDirectSubclasses.isEmpty => + ScroogeUnionOrderedBuf(c)(buildDispatcher, tpe) } pf } - def apply(c: Context)(buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], outerType: c.Type): TreeOrderedBuf[c.type] = { + def apply(c: Context)( + buildDispatcher: => PartialFunction[c.Type, TreeOrderedBuf[c.type]], + outerType: c.Type + ): TreeOrderedBuf[c.type] = { import c.universe._ def freshT(id: String) = newTermName(c.fresh(s"$id")) @@ -41,28 +48,35 @@ object ScroogeUnionOrderedBuf { val subClasses: List[Type] = StableKnownDirectSubclasses(c)(outerType).map(_.toType) - val subData: List[(Int, Type, Option[TreeOrderedBuf[c.type]])] = subClasses.map { t => - if (t.typeSymbol.name.toString == "UnknownUnionField") { - (t, None) - } else { - (t, Some(dispatcher(t))) + val subData: List[(Int, Type, Option[TreeOrderedBuf[c.type]])] = subClasses + .map { t => + if (t.typeSymbol.name.toString == "UnknownUnionField") { + (t, None) + } else { + (t, Some(dispatcher(t))) + } } - }.zipWithIndex.map { case ((tpe, tbuf), idx) => (idx, tpe, tbuf) } + .zipWithIndex + .map { case ((tpe, 
tbuf), idx) => (idx, tpe, tbuf) } require(subData.nonEmpty, "Must have some sub types on a union?") new TreeOrderedBuf[c.type] { override val ctx: c.type = c override val tpe = outerType - override def compareBinary(inputStreamA: ctx.TermName, inputStreamB: ctx.TermName) = UnionLike.compareBinary(c)(inputStreamA, inputStreamB)(subData) + override def compareBinary(inputStreamA: ctx.TermName, inputStreamB: ctx.TermName) = + UnionLike.compareBinary(c)(inputStreamA, inputStreamB)(subData) override def hash(element: ctx.TermName): ctx.Tree = UnionLike.hash(c)(element)(subData) - override def put(inputStream: ctx.TermName, element: ctx.TermName) = UnionLike.put(c)(inputStream, element)(subData) + override def put(inputStream: ctx.TermName, element: ctx.TermName) = + UnionLike.put(c)(inputStream, element)(subData) override def get(inputStream: ctx.TermName): ctx.Tree = UnionLike.get(c)(inputStream)(subData) - override def compare(elementA: ctx.TermName, elementB: ctx.TermName): ctx.Tree = UnionLike.compare(c)(outerType, elementA, elementB)(subData) - override def length(element: Tree): CompileTimeLengthTypes[c.type] = UnionLike.length(c)(element)(subData) - override val lazyOuterVariables: Map[String, ctx.Tree] = subData.flatMap(_._3).map(_.lazyOuterVariables).reduce(_ ++ _) + override def compare(elementA: ctx.TermName, elementB: ctx.TermName): ctx.Tree = + UnionLike.compare(c)(outerType, elementA, elementB)(subData) + override def length(element: Tree): CompileTimeLengthTypes[c.type] = + UnionLike.length(c)(element)(subData) + override val lazyOuterVariables: Map[String, ctx.Tree] = + subData.flatMap(_._3).map(_.lazyOuterVariables).reduce(_ ++ _) } } } - diff --git a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/UnionLike.scala b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/UnionLike.scala index a865ee4ff4..b3a1ad4dd0 100644 --- a/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/UnionLike.scala +++ b/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/impl/ordered_serialization/UnionLike.scala @@ -23,19 +23,22 @@ object UnionLike { // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - def compareBinary(c: Context)(inputStreamA: c.TermName, inputStreamB: c.TermName)(subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])]): c.Tree = { + def compareBinary(c: Context)(inputStreamA: c.TermName, inputStreamB: c.TermName)( + subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])] + ): c.Tree = { import c.universe._ def freshT(id: String) = newTermName(c.fresh(id)) val valueA = freshT("valueA") val valueB = freshT("valueB") val idxCmp = freshT("idxCmp") - val compareSameTypes: Tree = subData.foldLeft(Option.empty[Tree]) { - case (existing, (idx, tpe, optiTBuf)) => - - val commonCmp: Tree = optiTBuf.map{ tBuf => - tBuf.compareBinary(inputStreamA, inputStreamB) - }.getOrElse[Tree](q"0") + val compareSameTypes: Tree = subData + .foldLeft(Option.empty[Tree]) { case (existing, (idx, tpe, optiTBuf)) => + val commonCmp: Tree = optiTBuf + .map { tBuf => + tBuf.compareBinary(inputStreamA, inputStreamB) + } + .getOrElse[Tree](q"0") existing match { case Some(t) => @@ -54,7 +57,8 @@ object UnionLike { _root_.scala.sys.error("Unable to compare unknown type") }""") } - }.get + } + .get q""" val $valueA: _root_.scala.Int = 
$inputStreamA.readByte.toInt @@ -70,20 +74,24 @@ object UnionLike { // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - def hash(c: Context)(element: c.TermName)(subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])]): c.Tree = { + def hash( + c: Context + )(element: c.TermName)(subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])]): c.Tree = { import c.universe._ def freshT(id: String) = newTermName(c.fresh(id)) val innerArg = freshT("innerArg") - subData.foldLeft(Option.empty[Tree]) { - case (optiExisting, (idx, tpe, optiTBuf)) => - val commonPut: Tree = optiTBuf.map { tBuf => - q"""{ + subData + .foldLeft(Option.empty[Tree]) { case (optiExisting, (idx, tpe, optiTBuf)) => + val commonPut: Tree = optiTBuf + .map { tBuf => + q"""{ val $innerArg: $tpe = $element.asInstanceOf[$tpe] ${tBuf.hash(innerArg)} } """ - }.getOrElse[Tree](q"_root_.scala.Int.MaxValue") + } + .getOrElse[Tree](q"_root_.scala.Int.MaxValue") optiExisting match { case Some(s) => @@ -103,23 +111,28 @@ object UnionLike { } """) } - }.get + } + .get } // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - def put(c: Context)(inputStream: c.TermName, element: c.TermName)(subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])]): c.Tree = { + def put(c: Context)(inputStream: c.TermName, element: c.TermName)( + subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])] + ): c.Tree = { import c.universe._ def freshT(id: String) = newTermName(c.fresh(id)) val innerArg = freshT("innerArg") - subData.foldLeft(Option.empty[Tree]) { - case (optiExisting, (idx, tpe, optiTBuf)) => - val commonPut: Tree = optiTBuf.map { tBuf => - q"""val $innerArg: $tpe = $element.asInstanceOf[$tpe] + subData + .foldLeft(Option.empty[Tree]) { case (optiExisting, (idx, tpe, optiTBuf)) => + val commonPut: Tree = optiTBuf + .map { tBuf => + q"""val $innerArg: $tpe = $element.asInstanceOf[$tpe] ${tBuf.put(inputStream, innerArg)} """ - }.getOrElse[Tree](q"()") + } + .getOrElse[Tree](q"()") optiExisting match { case Some(s) => @@ -139,32 +152,40 @@ object UnionLike { } """) } - }.get + } + .get } // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial", "org.wartremover.warts.Return")) - def length(c: Context)(element: c.Tree)(subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])]): CompileTimeLengthTypes[c.type] = { + def length(c: Context)( + element: c.Tree + )(subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])]): CompileTimeLengthTypes[c.type] = { import CompileTimeLengthTypes._ import c.universe._ def freshT(id: String) = newTermName(c.fresh(id)) - val prevSizeData = subData.foldLeft(Option.empty[Tree]) { - case (optiTree, (idx, tpe, tBufOpt)) => - - val baseLenT: Tree = tBufOpt.map{ tBuf => - tBuf.length(q"$element.asInstanceOf[$tpe]") match { - case m: MaybeLengthCalculation[_] => - m.asInstanceOf[MaybeLengthCalculation[c.type]].t - - case f: FastLengthCalculation[_] => - q"""_root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.DynamicLen(${f.asInstanceOf[FastLengthCalculation[c.type]].t})""" - - case _: NoLengthCalculationAvailable[_] => - return NoLengthCalculationAvailable(c) - case e => sys.error("unexpected input to union length code of " + e) + val prevSizeData = subData + .foldLeft(Option.empty[Tree]) { 
case (optiTree, (idx, tpe, tBufOpt)) => + val baseLenT: Tree = tBufOpt + .map { tBuf => + tBuf.length(q"$element.asInstanceOf[$tpe]") match { + case m: MaybeLengthCalculation[_] => + m.asInstanceOf[MaybeLengthCalculation[c.type]].t + + case f: FastLengthCalculation[_] => + q"""_root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.DynamicLen(${f + .asInstanceOf[FastLengthCalculation[c.type]] + .t})""" + + case _: NoLengthCalculationAvailable[_] => + return NoLengthCalculationAvailable(c) + case e => sys.error("unexpected input to union length code of " + e) + } } - }.getOrElse(q"_root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.DynamicLen(1)") + .getOrElse( + q"_root_.com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.DynamicLen(1)" + ) val tmpPreLen = freshT("tmpPreLen") val lenT = q""" @@ -196,28 +217,33 @@ object UnionLike { sys.error("Did not understand thrift union type") }""") } - }.get + } + .get - MaybeLengthCalculation(c) (prevSizeData) + MaybeLengthCalculation(c)(prevSizeData) } // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - def get(c: Context)(inputStream: c.TermName)(subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])]): c.Tree = { + def get( + c: Context + )(inputStream: c.TermName)(subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])]): c.Tree = { import c.universe._ def freshT(id: String) = newTermName(c.fresh(id)) val valueA = freshT("valueA") - val expandedOut = subData.foldLeft(Option.empty[Tree]) { - case (existing, (idx, tpe, optiTBuf)) => - val extract = optiTBuf.map { tBuf => - q""" + val expandedOut = subData + .foldLeft(Option.empty[Tree]) { case (existing, (idx, tpe, optiTBuf)) => + val extract = optiTBuf + .map { tBuf => + q""" ${tBuf.get(inputStream)} """ - }.getOrElse { - q"""(new Object).asInstanceOf[$tpe]""" - } + } + .getOrElse { + q"""(new Object).asInstanceOf[$tpe]""" + } existing match { case Some(t) => @@ -237,7 +263,8 @@ object UnionLike { } """) } - }.get + } + .get q""" val $valueA: Int = $inputStream.readByte.toInt @@ -247,7 +274,9 @@ object UnionLike { // This `_.get` could be removed by switching `subData` to a non-empty list type @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) - def compare(c: Context)(cmpType: c.Type, elementA: c.TermName, elementB: c.TermName)(subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])]): c.Tree = { + def compare(c: Context)(cmpType: c.Type, elementA: c.TermName, elementB: c.TermName)( + subData: List[(Int, c.Type, Option[TreeOrderedBuf[c.type]])] + ): c.Tree = { import c.universe._ def freshT(id: String) = newTermName(c.fresh(id)) @@ -257,8 +286,8 @@ object UnionLike { val idxA = freshT("idxA") val idxB = freshT("idxB") - val toIdOpt: Tree = subData.foldLeft(Option.empty[Tree]) { - case (existing, (idx, tpe, _)) => + val toIdOpt: Tree = subData + .foldLeft(Option.empty[Tree]) { case (existing, (idx, tpe, _)) => existing match { case Some(t) => Some(q""" @@ -276,19 +305,22 @@ object UnionLike { sys.error("Unable to compare unknown type") }""") } - }.get + } + .get val compareSameTypes: Option[Tree] = subData.foldLeft(Option.empty[Tree]) { case (existing, (idx, tpe, optiTBuf)) => - val commonCmp = optiTBuf.map { tBuf => - val aTerm = freshT("aTerm") - val bTerm = freshT("bTerm") - q""" + val commonCmp = optiTBuf + .map { tBuf => + val aTerm = freshT("aTerm") + val bTerm = 
freshT("bTerm") + q""" val $aTerm: $tpe = $elementA.asInstanceOf[$tpe] val $bTerm: $tpe = $elementB.asInstanceOf[$tpe] ${tBuf.compare(aTerm, bTerm)} """ - }.getOrElse(q"0") + } + .getOrElse(q"0") existing match { case Some(t) => @@ -311,7 +343,7 @@ object UnionLike { val compareFn = q""" def instanceToIdx($arg: $cmpType): Int = { - ${toIdOpt}: Int + $toIdOpt: Int } val $idxA: Int = instanceToIdx($elementA) @@ -328,4 +360,3 @@ object UnionLike { compareFn } } - diff --git a/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/PlatformTest.scala b/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/PlatformTest.scala index 74e7010e47..2ae671e09b 100644 --- a/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/PlatformTest.scala +++ b/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/PlatformTest.scala @@ -12,26 +12,31 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.thrift.macros import com.twitter.scalding._ -import com.twitter.scalding.platform.{ HadoopPlatformJobTest, HadoopSharedPlatformTest } +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} import com.twitter.scalding.serialization.OrderedSerialization import com.twitter.scalding.thrift.macros.impl.ScroogeInternalOrderedSerializationImpl import com.twitter.scalding.thrift.macros.scalathrift._ import org.scalacheck.Arbitrary -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} -import scala.language.experimental.{ macros => sMacros } +import scala.language.experimental.{macros => sMacros} class CompareJob[T: OrderedSerialization](in: Iterable[T], args: Args) extends Job(args) { - TypedPipe.from(in).flatMap{ i => - (0 until 1).map (_ => i) - }.map(_ -> 1L).sumByKey.map { - case (k, v) => + TypedPipe + .from(in) + .flatMap { i => + (0 until 1).map(_ => i) + } + .map(_ -> 1L) + .sumByKey + .map { case (k, v) => (k.hashCode, v) - }.write(TypedTsv[(Int, Long)]("output")) + } + .write(TypedTsv[(Int, Long)]("output")) } private[macros] trait InstanceProvider[T] { def g(idx: Int): T @@ -39,7 +44,8 @@ private[macros] trait InstanceProvider[T] { class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest { org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(org.apache.log4j.Level.FATAL) org.apache.log4j.Logger.getLogger("org.mortbay").setLevel(org.apache.log4j.Level.FATAL) - implicit def toScroogeInternalOrderedSerialization[T]: OrderedSerialization[T] = macro ScroogeInternalOrderedSerializationImpl[T] + implicit def toScroogeInternalOrderedSerialization[T]: OrderedSerialization[T] = + macro ScroogeInternalOrderedSerializationImpl[T] import ScroogeGenerators._ @@ -57,7 +63,7 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest val expected = input .groupBy(identity) - .map{ case (k, v) => (k.hashCode, v.size) } + .map { case (k, v) => (k.hashCode, v.size) } out.toSet shouldBe expected.toSet } @@ -84,7 +90,10 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest "Expected items should match : Internal Serializer / TestUnion" in { toScroogeInternalOrderedSerialization[TestUnion] - runCompareTest[TestUnion](toScroogeInternalOrderedSerialization[TestUnion], 
arbitraryInstanceProvider[TestUnion]) + runCompareTest[TestUnion]( + toScroogeInternalOrderedSerialization[TestUnion], + arbitraryInstanceProvider[TestUnion] + ) } "Expected items should match : Internal Serializer / Enum" in { @@ -101,9 +110,8 @@ class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest "Expected items should match : Internal Serializer / (Long, TestTypes)" in { case object Container { - def ord[T](implicit oSer: OrderedSerialization[T]): OrderedSerialization[(Long, T)] = { + def ord[T](implicit oSer: OrderedSerialization[T]): OrderedSerialization[(Long, T)] = implicitly[OrderedSerialization[(Long, T)]] - } } val ordSer = Container.ord[TestTypes] diff --git a/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/ScroogeGenerators.scala b/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/ScroogeGenerators.scala index 54cd68a787..b4cfe29654 100644 --- a/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/ScroogeGenerators.scala +++ b/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/ScroogeGenerators.scala @@ -1,14 +1,14 @@ package com.twitter.scalding.thrift.macros import com.twitter.scalding.thrift.macros.scalathrift._ -import org.scalacheck.{ Arbitrary, Gen } -import org.scalacheck.Arbitrary.{ arbitrary => arb } +import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.Arbitrary.{arbitrary => arb} import java.nio.ByteBuffer import org.scalacheck.Gen.Parameters import org.scalacheck.rng.Seed private object Perturbers { - def perturb(t0: TestStruct, t1: TestStruct, i: Int): TestStruct = { + def perturb(t0: TestStruct, t1: TestStruct, i: Int): TestStruct = i match { case 1 => t0.copy(aString = t1.aString) case 2 => t0.copy(aI32 = t1.aI32) @@ -16,73 +16,68 @@ private object Perturbers { case 4 => t0 case x => sys.error("Can't perturb TestStruct field: " + x) } - } - def perturb(t0: TestTypes, t1: TestTypes, i: Int): TestTypes = { + def perturb(t0: TestTypes, t1: TestTypes, i: Int): TestTypes = i match { - case 1 => t0.copy(aBool = t1.aBool) - case 2 => t0.copy(aByte = t1.aByte) - case 3 => t0.copy(aI16 = t1.aI16) - case 4 => t0.copy(aI32 = t1.aI32) - case 5 => t0.copy(aI64 = t1.aI64) - case 6 => t0.copy(aDouble = t1.aDouble) - case 7 => t0.copy(aString = t1.aString) - case 8 => t0.copy(aEnum = t1.aEnum) - case 9 => t0.copy(aBinary = t1.aBinary) + case 1 => t0.copy(aBool = t1.aBool) + case 2 => t0.copy(aByte = t1.aByte) + case 3 => t0.copy(aI16 = t1.aI16) + case 4 => t0.copy(aI32 = t1.aI32) + case 5 => t0.copy(aI64 = t1.aI64) + case 6 => t0.copy(aDouble = t1.aDouble) + case 7 => t0.copy(aString = t1.aString) + case 8 => t0.copy(aEnum = t1.aEnum) + case 9 => t0.copy(aBinary = t1.aBinary) case 10 => t0 - case x => sys.error("Can't perturb TestTypes field: " + x) + case x => sys.error("Can't perturb TestTypes field: " + x) } - } - def perturb(t0: TestLists, t1: TestLists, i: Int): TestLists = { + def perturb(t0: TestLists, t1: TestLists, i: Int): TestLists = i match { - case 1 => t0.copy(aBoolList = t1.aBoolList) - case 2 => t0.copy(aByteList = t1.aByteList) - case 3 => t0.copy(aI16List = t1.aI16List) - case 4 => t0.copy(aI32List = t1.aI32List) - case 5 => t0.copy(aI64List = t1.aI64List) - case 6 => t0.copy(aDoubleList = t1.aDoubleList) - case 7 => t0.copy(aStringList = t1.aStringList) - case 8 => t0.copy(aStructList = t1.aStructList) - case 9 => t0.copy(aListList = t1.aListList) + case 1 => t0.copy(aBoolList = t1.aBoolList) + case 2 => t0.copy(aByteList = 
t1.aByteList) + case 3 => t0.copy(aI16List = t1.aI16List) + case 4 => t0.copy(aI32List = t1.aI32List) + case 5 => t0.copy(aI64List = t1.aI64List) + case 6 => t0.copy(aDoubleList = t1.aDoubleList) + case 7 => t0.copy(aStringList = t1.aStringList) + case 8 => t0.copy(aStructList = t1.aStructList) + case 9 => t0.copy(aListList = t1.aListList) case 10 => t0.copy(aSetList = t1.aSetList) case 11 => t0.copy(aMapList = t1.aMapList) case 12 => t0 - case x => sys.error("Can't perturb TestLists field: " + x) + case x => sys.error("Can't perturb TestLists field: " + x) } - } - def perturb(t0: TestSets, t1: TestSets, i: Int): TestSets = { + def perturb(t0: TestSets, t1: TestSets, i: Int): TestSets = i match { - case 1 => t0.copy(aBoolSet = t1.aBoolSet) - case 2 => t0.copy(aByteSet = t1.aByteSet) - case 3 => t0.copy(aI16Set = t1.aI16Set) - case 4 => t0.copy(aI32Set = t1.aI32Set) - case 5 => t0.copy(aI64Set = t1.aI64Set) - case 6 => t0.copy(aDoubleSet = t1.aDoubleSet) - case 7 => t0.copy(aStringSet = t1.aStringSet) - case 8 => t0.copy(aStructSet = t1.aStructSet) - case 9 => t0.copy(aListSet = t1.aListSet) + case 1 => t0.copy(aBoolSet = t1.aBoolSet) + case 2 => t0.copy(aByteSet = t1.aByteSet) + case 3 => t0.copy(aI16Set = t1.aI16Set) + case 4 => t0.copy(aI32Set = t1.aI32Set) + case 5 => t0.copy(aI64Set = t1.aI64Set) + case 6 => t0.copy(aDoubleSet = t1.aDoubleSet) + case 7 => t0.copy(aStringSet = t1.aStringSet) + case 8 => t0.copy(aStructSet = t1.aStructSet) + case 9 => t0.copy(aListSet = t1.aListSet) case 10 => t0.copy(aSetSet = t1.aSetSet) case 11 => t0.copy(aMapSet = t1.aMapSet) case 12 => t0 - case x => sys.error("Can't perturb TestSets field: " + x) + case x => sys.error("Can't perturb TestSets field: " + x) } - } - def perturb(t0: TestMaps, t1: TestMaps, i: Int): TestMaps = { + def perturb(t0: TestMaps, t1: TestMaps, i: Int): TestMaps = i match { - case 1 => t0.copy(aBoolMap = t1.aBoolMap) - case 2 => t0.copy(aByteMap = t1.aByteMap) - case 3 => t0.copy(aI16Map = t1.aI16Map) - case 4 => t0.copy(aI32Map = t1.aI32Map) - case 5 => t0.copy(aI64Map = t1.aI64Map) - case 6 => t0.copy(aDoubleMap = t1.aDoubleMap) - case 7 => t0.copy(aStringMap = t1.aStringMap) - case 8 => t0.copy(aStructMap = t1.aStructMap) - case 9 => t0.copy(aListMap = t1.aListMap) + case 1 => t0.copy(aBoolMap = t1.aBoolMap) + case 2 => t0.copy(aByteMap = t1.aByteMap) + case 3 => t0.copy(aI16Map = t1.aI16Map) + case 4 => t0.copy(aI32Map = t1.aI32Map) + case 5 => t0.copy(aI64Map = t1.aI64Map) + case 6 => t0.copy(aDoubleMap = t1.aDoubleMap) + case 7 => t0.copy(aStringMap = t1.aStringMap) + case 8 => t0.copy(aStructMap = t1.aStructMap) + case 9 => t0.copy(aListMap = t1.aListMap) case 10 => t0.copy(aSetMap = t1.aSetMap) case 11 => t0.copy(aMapMap = t1.aMapMap) case 12 => t0 - case x => sys.error("Can't perturb TestMaps field: " + x) + case x => sys.error("Can't perturb TestMaps field: " + x) } - } } object ScroogeGenerators { @@ -92,9 +87,9 @@ object ScroogeGenerators { def g(innerI: Int, loops: Int): T = { val p = Parameters.default.withSize(2) implicitly[Arbitrary[T]].arbitrary(p, Seed(innerI)) match { - case Some(s) => s + case Some(s) => s case None if loops < 5 => g(innerI + 1, loops + 1) - case None => sys.error("Cannot appear to get Some for this generator.") + case None => sys.error("Cannot appear to get Some for this generator.") } } @@ -166,7 +161,19 @@ object ScroogeGenerators { aListList <- Gen.listOf(Gen.listOf(Gen.alphaStr)) aSetList <- Gen.listOf(Gen.listOf(Gen.alphaStr).map(_.toSet)) aMapList <- Gen.listOf(Gen.listOf(arb[(Int, 
Int)]).map(_.toMap)) - } yield TestLists(aBoolList, aByteList, aI16List, aI32List, aI64List, aDoubleList, aStringList, aStructList, aListList, aSetList, aMapList) + } yield TestLists( + aBoolList, + aByteList, + aI16List, + aI32List, + aI64List, + aDoubleList, + aStringList, + aStructList, + aListList, + aSetList, + aMapList + ) } case class TestListsPair(a: TestLists, b: TestLists) implicit def arbitraryTestListsPair: Arbitrary[TestListsPair] = Arbitrary { @@ -186,10 +193,26 @@ object ScroogeGenerators { aDoubleSet <- Gen.listOf(arb[Double]).map(_.toSet) aStringSet <- Gen.listOf(Gen.alphaStr).map(_.toSet) aStructSet <- Gen.listOf(arb[TestStruct]).map(_.toSet) - aListSet <- Gen.listOf(Gen.listOf(Gen.alphaStr).map(l => l.to[collection.Seq])).map(_.to[collection.Set]) + aListSet <- Gen + .listOf(Gen.listOf(Gen.alphaStr).map(l => l.to[collection.Seq])) + .map(_.to[collection.Set]) aSetSet <- Gen.listOf(Gen.listOf(Gen.alphaStr).map(l => l.to[collection.Set])).map(_.to[collection.Set]) - aMapSet <- Gen.listOf(Gen.listOf(arb[(Int, Int)]).map(l => l.toMap.asInstanceOf[collection.Map[Int, Int]])).map(_.to[collection.Set]) - } yield TestSets(aBoolSet, aByteSet, aI16Set, aI32Set, aI64Set, aDoubleSet, aStringSet, aStructSet, aListSet, aSetSet, aMapSet) + aMapSet <- Gen + .listOf(Gen.listOf(arb[(Int, Int)]).map(l => l.toMap.asInstanceOf[collection.Map[Int, Int]])) + .map(_.to[collection.Set]) + } yield TestSets( + aBoolSet, + aByteSet, + aI16Set, + aI32Set, + aI64Set, + aDoubleSet, + aStringSet, + aStructSet, + aListSet, + aSetSet, + aMapSet + ) } case class TestSetsPair(a: TestSets, b: TestSets) implicit def arbitraryTestSetsPair: Arbitrary[TestSetsPair] = Arbitrary { @@ -209,10 +232,42 @@ object ScroogeGenerators { aDoubleMap <- Gen.listOf(arb[(Double, Byte)]).map(_.toMap) aStringMap <- Gen.listOf(arb[(String, Boolean)]).map(_.toMap) aStructMap <- Gen.listOf(arb[(TestStruct, List[String])]).map(_.toMap) - aListMap <- Gen.listOf(arb[(List[String], TestStruct)]).map(_.toMap.map { case (k, v) => k.to[collection.Seq] -> v }.asInstanceOf[collection.Map[collection.Seq[String], TestStruct]]) - aSetMap <- Gen.listOf(arb[(Set[String], Set[String])]).map(_.toMap.map { case (k, v) => k.to[collection.Set] -> v.to[collection.Set] }.asInstanceOf[collection.Map[collection.Set[String], collection.Set[String]]]) - aMapMap <- Gen.listOf(arb[(Map[Int, Int], Map[Int, Int])]).map(_.toMap.map { case (k, v) => k.asInstanceOf[collection.Map[Int, Int]] -> v.asInstanceOf[collection.Map[Int, Int]] }.asInstanceOf[collection.Map[collection.Map[Int, Int], collection.Map[Int, Int]]]) - } yield TestMaps(aBoolMap, aByteMap, aI16Map, aI32Map, aI64Map, aDoubleMap, aStringMap, aStructMap, aListMap, aSetMap, aMapMap) + aListMap <- Gen + .listOf(arb[(List[String], TestStruct)]) + .map( + _.toMap + .map { case (k, v) => k.to[collection.Seq] -> v } + .asInstanceOf[collection.Map[collection.Seq[String], TestStruct]] + ) + aSetMap <- Gen + .listOf(arb[(Set[String], Set[String])]) + .map( + _.toMap + .map { case (k, v) => k.to[collection.Set] -> v.to[collection.Set] } + .asInstanceOf[collection.Map[collection.Set[String], collection.Set[String]]] + ) + aMapMap <- Gen + .listOf(arb[(Map[Int, Int], Map[Int, Int])]) + .map( + _.toMap + .map { case (k, v) => + k.asInstanceOf[collection.Map[Int, Int]] -> v.asInstanceOf[collection.Map[Int, Int]] + } + .asInstanceOf[collection.Map[collection.Map[Int, Int], collection.Map[Int, Int]]] + ) + } yield TestMaps( + aBoolMap, + aByteMap, + aI16Map, + aI32Map, + aI64Map, + aDoubleMap, + aStringMap, + 
aStructMap, + aListMap, + aSetMap, + aMapMap + ) } case class TestMapsPair(a: TestMaps, b: TestMaps) implicit def arbitraryTestMapsPair: Arbitrary[TestMapsPair] = Arbitrary { diff --git a/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/ScroogeMacrosUnitTests.scala b/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/ScroogeMacrosUnitTests.scala index b2089a98d9..ff6528bdc1 100644 --- a/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/ScroogeMacrosUnitTests.scala +++ b/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/ScroogeMacrosUnitTests.scala @@ -18,7 +18,7 @@ package com.twitter.scalding.thrift.macros import com.twitter.scalding.serialization.OrderedSerialization import com.twitter.scalding.thrift.macros.scalathrift._ import org.scalatest.prop.PropertyChecks -import org.scalatest.{ Matchers, WordSpec } +import org.scalatest.{Matchers, WordSpec} class ScroogeMacrosUnitTests extends WordSpec with Matchers with PropertyChecks { import ScroogeGenerators._ @@ -56,9 +56,8 @@ class ScroogeMacrosUnitTests extends WordSpec with Matchers with PropertyChecks "Should RT correctly" in { class Container[T](implicit oSer: OrderedSerialization[T]) { - def ord: OrderedSerialization[(Long, T)] = { + def ord: OrderedSerialization[(Long, T)] = implicitly[OrderedSerialization[(Long, T)]] - } } val ordSer = (new Container[TestLists]).ord diff --git a/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/TestHelper.scala b/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/TestHelper.scala index 73efc0603f..0cad2fc327 100644 --- a/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/TestHelper.scala +++ b/scalding-thrift-macros/src/test/scala/com/twitter/scalding/thrift/macros/TestHelper.scala @@ -34,30 +34,29 @@ object TestHelper extends Matchers { r.head } - def oBufCompare[T](a: T, b: T)(implicit obuf: OrderedSerialization[T]): Int = { + def oBufCompare[T](a: T, b: T)(implicit obuf: OrderedSerialization[T]): Int = obuf.compare(a, b) - } - def deserializeSeq[T](items: Int, buf: InputStream)(implicit orderedBuffer: OrderedSerialization[T]): Seq[T] = { + def deserializeSeq[T](items: Int, buf: InputStream)(implicit + orderedBuffer: OrderedSerialization[T] + ): Seq[T] = (0 until items).map { _ => orderedBuffer.read(buf).get }.toList - } def serialize[T](t: T)(implicit orderedBuffer: OrderedSerialization[T]): InputStream = serializeSeq(List(t)) def serializeSeq[T](t: Seq[T])(implicit orderedBuffer: OrderedSerialization[T]): InputStream = { val baos = new ByteArrayOutputStream - t.foreach({ e => + t.foreach { e => orderedBuffer.write(baos, e) - }) + } baos.toInputStream } - def rawCompare[T](a: T, b: T)(implicit obuf: OrderedSerialization[T]): Int = { + def rawCompare[T](a: T, b: T)(implicit obuf: OrderedSerialization[T]): Int = obuf.compareBinary(serialize(a), serialize(b)).unsafeToInt - } def checkManyExplicit[T](i: List[T])(implicit obuf: OrderedSerialization[T]) = { val serializedA = serializeSeq(i) @@ -67,7 +66,9 @@ object TestHelper extends Matchers { } } - def compareSerialized[T](a: T, b: T)(implicit orderedBuffer: OrderedSerialization[T]): OrderedSerialization.Result = { + def compareSerialized[T](a: T, b: T)(implicit + orderedBuffer: OrderedSerialization[T] + ): OrderedSerialization.Result = { val bufA = serializeSeq[T]((0 until 20).map(_ => a)) val bufB = serializeSeq[T]((0 until 20).map(_ => b)) val r = (0 until 20).map { _ => diff --git 
a/tutorial/execution-tutorial/ExecutionTutorial.scala b/tutorial/execution-tutorial/ExecutionTutorial.scala index d3fe2133c2..4d7f19bd51 100644 --- a/tutorial/execution-tutorial/ExecutionTutorial.scala +++ b/tutorial/execution-tutorial/ExecutionTutorial.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.tutorial import java.io._ @@ -21,27 +21,26 @@ import scala.util.{Failure, Success} import com.twitter.scalding._ /** -Tutorial of using Execution - -This tutorial gives an example of use Execution to do MapReduce word count. -Instead of writing the results in reducers, it writes the data at submitter node. - -To test it, first build the assembly jar from root directory: - ./sbt execution-tutorial/assembly - -Run: - scala -classpath tutorial/execution-tutorial/target/execution-tutorial-assembly-0.17.2.jar \ - com.twitter.scalding.tutorial.MyExecJob --local \ - --input tutorial/data/hello.txt \ - --output tutorial/data/execution_output.txt -**/ + * Tutorial of using Execution + * + * This tutorial gives an example of use Execution to do MapReduce word count. Instead of writing the results + * in reducers, it writes the data at submitter node. + * + * To test it, first build the assembly jar from root directory: ./sbt execution-tutorial/assembly + * + * Run: scala -classpath tutorial/execution-tutorial/target/execution-tutorial-assembly-0.17.2.jar \ + * com.twitter.scalding.tutorial.MyExecJob --local \ + * --input tutorial/data/hello.txt \ + * --output tutorial/data/execution_output.txt + */ object MyExecJob extends ExecutionApp { override def job = Execution.getConfig.flatMap { config => val args = config.getArgs - TypedPipe.from(TextLine(args("input"))) + TypedPipe + .from(TextLine(args("input"))) .flatMap(_.split("\\s+")) .map((_, 1L)) .sumByKey @@ -52,7 +51,7 @@ object MyExecJob extends ExecutionApp { case Success(iter) => val file = new PrintWriter(new File(args("output"))) iter.foreach { case (k, v) => - file.write(s"$k\t$v\n") + file.write(s"$k\t$v\n") } file.close case Failure(e) => println("Error: " + e.toString) @@ -61,5 +60,3 @@ object MyExecJob extends ExecutionApp { .unit } } - - diff --git a/version.sbt b/version.sbt index 558ffa9a60..ae781a91f0 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -ThisBuild / version := "0.18.0-SNAPSHOT" \ No newline at end of file +ThisBuild / version := "0.18.0-SNAPSHOT"
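
For readers tracing the thrift-macros test changes above, the reformatted pieces compose as follows: PlatformTest materializes an OrderedSerialization[T] through the ScroogeInternalOrderedSerializationImpl macro, and TestHelper exercises that serialization both in memory (oBufCompare) and on the serialized bytes (rawCompare). The sketch below is illustrative only and is not part of this patch; it assumes the test fixtures exactly as they appear in the diffs above (TestHelper, the generated scalathrift classes, and the macro implicit), and the object and method names here are invented for the example.

    package com.twitter.scalding.thrift.macros

    import com.twitter.scalding.serialization.OrderedSerialization
    import com.twitter.scalding.thrift.macros.impl.ScroogeInternalOrderedSerializationImpl

    import scala.language.experimental.{macros => sMacros}

    object OrderedSerializationSketch {
      // Materialize an OrderedSerialization via the Scrooge macro, as PlatformTest does.
      implicit def scroogeOrdSer[T]: OrderedSerialization[T] =
        macro ScroogeInternalOrderedSerializationImpl[T]

      // One property these helpers are used to check: the in-memory comparison and the
      // comparison of the serialized representations agree in sign for a pair of values.
      def comparisonsAgree[T: OrderedSerialization](a: T, b: T): Boolean =
        TestHelper.oBufCompare(a, b).signum == TestHelper.rawCompare(a, b).signum
    }

For instance, comparisonsAgree could be fed pairs of TestStruct or TestUnion values drawn from the Arbitrary instances defined in ScroogeGenerators.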